In [1]:
import graph_tool.all as gt



In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import random
import time
from collections import deque

%matplotlib inline

# メモリで比較

In [3]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/Pokec/WC.csv")
network.head()

Unnamed: 0,# FromNodeId,ToNodeId,WC
0,1,13,0.037037
1,1,11,0.027027
2,1,6,0.071429
3,1,3,0.333333
4,1,4,1.0


In [4]:
# Plain NumPy array of the edge table (one row per edge); show its dimensions
network_np = network.values
network_np.shape

(30622564, 3)

In [5]:
# Keep only columns 0 and 1 of every row (the two endpoint ids), dropping the third column
network_np.T[[0,1]].T

array([[1.000000e+00, 1.300000e+01],
       [1.000000e+00, 1.100000e+01],
       [1.000000e+00, 6.000000e+00],
       ...,
       [1.632802e+06, 1.632637e+06],
       [1.632802e+06, 1.632736e+06],
       [1.632803e+06, 1.632405e+06]])

In [6]:
# Disabled alternative kept for reference: add the edges one at a time and
# store a per-edge "weight" property.
# NOTE(review): the disabled loop stores edge[1] (the second endpoint column) as
# the weight; the weight column is presumably edge[2] - confirm before re-enabling.
# %%time
# G_gt = gt.Graph(directed = True)
# edge_weights = G_gt.new_edge_property('double')
# G_gt.properties[("e","weight")] = edge_weights

# for edge in tqdm(network_np):
#     e = G_gt.add_edge(edge[0], edge[1])
#     edge_weights[e] = edge[1]

# Active version: bulk-load every endpoint pair in a single call and time it.
# No edge weights are attached to G_gt here.
G_gt = gt.Graph(directed = True)
%time G_gt.add_edge_list(network_np.T[[0,1]].T)

CPU times: user 5.26 s, sys: 400 ms, total: 5.66 s
Wall time: 4.96 s


In [7]:
%%time
# Build the networkx counterpart of the graph; every row of network_np is passed
# as one weighted edge (first endpoint, second endpoint, weight).
G_nx = nx.DiGraph()
G_nx.add_weighted_edges_from(network_np)

CPU times: user 2min 12s, sys: 4.38 s, total: 2min 17s
Wall time: 2min 17s


In [8]:
# Small scratch graph used below to try out the graph-tool query API
G_practice = gt.Graph()

In [9]:
# Six edges over the vertices 0..4, one [source, target] pair per row
edge = np.array([[0,1],
                 [1,2],
                 [2,3],
                 [4,1],
                 [3,2],
                 [3,4]
                ])
G_practice.add_edge_list(edge)

In [10]:
# Get every edge of the graph
G_practice.get_edges()

array([[0, 1],
       [1, 2],
       [2, 3],
       [3, 2],
       [3, 4],
       [4, 1]])

In [11]:
# Get every vertex of the graph
G_practice.get_vertices()

array([0, 1, 2, 3, 4])

In [12]:
# Get all vertices adjacent to vertex 3
G_practice.get_all_neighbors(3)

array([2, 4, 2])

In [13]:
# Get all in-neighbours of vertex 1
G_practice.get_in_neighbors(1)

array([0, 4])

In [14]:
# Get all out-neighbours of vertex 1
G_practice.get_out_neighbors(1)

array([2])

In [15]:
# Get the in-degree of vertex 1
# Note: the vertices must be passed as a list
G_practice.get_in_degrees([1])

array([2], dtype=uint64)

In [16]:
# Get the out-degree of vertex 1
# Note: the vertices must be passed as a list
G_practice.get_out_degrees([1])

array([1], dtype=uint64)

In [17]:
# Get the total (in + out) degree of vertex 1
# Note: the vertices must be passed as a list
G_practice.get_total_degrees([1])

array([3], dtype=uint64)

# BFS

## graph-tool VS 自作関数 VS networkx

In [18]:
# Iterate over the edges yielded by a breadth-first traversal started at vertex 3,
# printing each edge followed by its two endpoints
for e in gt.bfs_iterator(G_practice, G_practice.vertex(3)):
    print(e)
    print(e.source(), e.target())

(3, 2)
3 2
(3, 4)
3 4
(4, 1)
4 1


In [19]:
%timeit gt.bfs_iterator(G_gt, G_gt.vertex(3))

33.3 µs ± 43.9 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [20]:
# Breadth-first search returning every node reachable from the seed set S
def bfs(G, S):
    """Run a breadth-first search over ``G`` starting from all nodes in ``S``.

    Parameters
    ----------
    G : graph-like object where ``G[v]`` can be iterated to obtain the
        out-neighbours of ``v``.
    S : iterable of seed nodes.

    Returns
    -------
    dict
        Maps each reached node to the node it was first discovered from;
        seed nodes map to ``None``.
    """
    parent = dict.fromkeys(S)
    frontier = deque(S)
    while frontier:
        current = frontier.popleft()
        for neighbour in G[current]:
            if neighbour in parent:
                continue
            frontier.append(neighbour)
            parent[neighbour] = current
    return parent

%time bfs(G_nx, [3])

CPU times: user 13.1 s, sys: 44 ms, total: 13.1 s
Wall time: 13.1 s


{3: None,
 1.0: 3,
 807726.0: 3,
 1381344.0: 3,
 13.0: 1.0,
 11.0: 1.0,
 6.0: 1.0,
 4.0: 1.0,
 5.0: 1.0,
 15.0: 1.0,
 14.0: 1.0,
 7.0: 1.0,
 8.0: 1.0,
 12.0: 1.0,
 9.0: 1.0,
 10.0: 1.0,
 16.0: 1.0,
 516090.0: 1381344.0,
 123384.0: 1381344.0,
 392.0: 13.0,
 407.0: 13.0,
 408.0: 13.0,
 396.0: 13.0,
 397.0: 13.0,
 398.0: 13.0,
 411.0: 13.0,
 399.0: 13.0,
 412.0: 13.0,
 402.0: 13.0,
 403.0: 13.0,
 404.0: 13.0,
 413.0: 13.0,
 405.0: 13.0,
 9090.0: 13.0,
 10150.0: 13.0,
 159316.0: 13.0,
 310.0: 11.0,
 309.0: 11.0,
 308.0: 11.0,
 313.0: 11.0,
 316.0: 11.0,
 333.0: 11.0,
 331.0: 11.0,
 319.0: 11.0,
 322.0: 11.0,
 320.0: 11.0,
 307.0: 11.0,
 221.0: 11.0,
 332.0: 11.0,
 306.0: 11.0,
 315.0: 11.0,
 314.0: 11.0,
 324.0: 11.0,
 323.0: 11.0,
 336.0: 11.0,
 334.0: 11.0,
 335.0: 11.0,
 321.0: 11.0,
 327.0: 11.0,
 213.0: 11.0,
 330.0: 11.0,
 326.0: 11.0,
 325.0: 11.0,
 318.0: 11.0,
 328.0: 11.0,
 317.0: 11.0,
 338.0: 11.0,
 337.0: 11.0,
 311.0: 11.0,
 339.0: 11.0,
 312.0: 11.0,
 340.0: 11.0,
 341.0: 11

In [21]:
%timeit nx.bfs_edges(G_nx, 3)

350 ns ± 2.04 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


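計測値の上では networkx が一番高速に見えるが、上に表示されている 350 ns(networkx)や 33.3 µs(graph-tool)という値は、遅延評価のジェネレータ/イテレータを生成する時間であり、探索全体の時間ではない点に注意(公平に比較するには、返されたイテレータを最後まで消費して計測する必要がある)。また networkx には、グラフの保存領域にメモリを使いすぎてしまっていたり、そもそもグラフの作成が遅い等のデメリットもある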

## 強連結成分分解

In [22]:
%timeit nx.strongly_connected_components(G_nx)

1.3 µs ± 2.56 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [23]:
# Time graph-tool's component labelling on the directed graph G_gt
%timeit gt.label_components(G_gt)

1.57 s ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
# 自作関数
# 強連結成分分解に使うための関数
from collections import deque

# Helper for the strongly-connected-component decomposition (first pass)
def dfs_go(G):
    """Depth-first pass over ``G`` that records the order in which vertices finish.

    Parameters
    ----------
    G : graph-like object providing ``G.nodes()`` (iterable of vertices) and
        ``G[v]`` (iterable of the out-neighbours of ``v``).

    Returns
    -------
    visited : dict
        Maps every vertex to the root of the DFS tree that discovered it;
        the roots themselves map to ``None``.
    vs : collections.deque
        Every vertex exactly once, in reverse post-order: the vertex whose
        exploration finished last is leftmost. This is the order in which the
        second pass must pick its start vertices.
    """
    visited = {}
    # vs: vertices in reverse post-order (latest finished vertex first)
    vs = deque()
    for s in G.nodes():
        if s in visited:
            continue
        visited[s] = None
        # Explicit stack instead of recursion so deep graphs cannot hit the
        # recursion limit. Each frame pairs a vertex with an iterator over the
        # successors that have not been examined yet.
        stack = [(s, iter(G[s]))]
        while stack:
            v, successors = stack[-1]
            for u in successors:
                if u not in visited:
                    visited[u] = s
                    # Descend into u first; v's iterator resumes afterwards
                    stack.append((u, iter(G[u])))
                    break
            else:
                # Every successor of v has been handled: v is finished
                stack.pop()
                vs.appendleft(v)
    return visited, vs

# Helper for the strongly-connected-component decomposition (second pass)
def dfs_back(G, vs):
    """Group vertices by walking edges backwards, starting from ``vs`` in order.

    Parameters
    ----------
    G : graph-like object providing ``G.predecessors(v)`` (iterable of the
        in-neighbours of ``v``).
    vs : iterable of vertices giving the order in which start vertices are
        tried; vertices that already belong to a group are skipped.

    Returns
    -------
    group : dict
        Maps every reached vertex to its integer group number (0, 1, 2, ...
        in order of creation).
    DAG : nx.DiGraph
        One node per group carrying a ``weight`` attribute equal to the number
        of vertices in the group.
    Nodes : dict
        Maps group number to the number of vertices in the group.
    Edges : list of tuple
        ``(group_num, other_group)`` pairs, one per graph edge that enters the
        current group from a vertex of an earlier group (duplicates possible).

    Notes
    -----
    For a graph edge ``u -> v`` with ``u`` in an earlier group, the recorded
    pair is ``(group of v, group of u)``, i.e. it points against the direction
    of the graph edge.
    NOTE(review): confirm that this orientation is what downstream code expects.
    """
    group = {}
    group_num = 0
    DAG = nx.DiGraph()
    Edges = []
    Nodes = {}
    for s in vs:
        if s in group:
            continue
        w = 1  # size of the current group, counting s itself
        group[s] = group_num
        Stack = deque([s])
        while Stack:
            v = Stack.pop()
            for u in G.predecessors(v):
                if u not in group:
                    # Assign the final label at discovery time. A provisional
                    # label would be mistaken for a different group if u is
                    # seen again before it is popped.
                    group[u] = group_num
                    w += 1
                    Stack.append(u)
                elif group[u] != group_num:
                    DAG.add_edge(group_num, group[u])
                    Edges.append((group_num, group[u]))
        DAG.add_node(group_num, weight=w)
        Nodes[group_num] = w
        group_num += 1
    return group, DAG, Nodes, Edges

def scc_DAG(G):
    """Run the two-pass decomposition on ``G``.

    The vertex ordering produced by ``dfs_go`` is fed to ``dfs_back``; the
    first return value of ``dfs_go`` is not used.

    Returns
    -------
    tuple
        ``(group, DAG, Nodes, Edges)`` exactly as returned by ``dfs_back``.
    """
    _, ordering = dfs_go(G)
    return dfs_back(G, ordering)

%time scc_DAG(G_nx)

CPU times: user 1min 13s, sys: 624 ms, total: 1min 14s
Wall time: 1min 14s


({1632795.0: 0,
  1632790.0: 1,
  1632789.0: 2,
  1632788.0: 3,
  1632771.0: 4,
  1632799.0: 5,
  1632778.0: 6,
  1632776.0: 7,
  1632775.0: 8,
  1632774.0: 9,
  1632773.0: 10,
  1632767.0: 11,
  1632765.0: 12,
  1632763.0: 13,
  1632622.0: 14,
  1632742.0: 15,
  1632759.0: 16,
  1632756.0: 17,
  1632755.0: 18,
  1632746.0: 19,
  1632741.0: 20,
  1632736.0: 21,
  1632802.0: 21,
  1632733.0: 22,
  1632732.0: 23,
  1632723.0: 24,
  1632721.0: 25,
  1632719.0: 26,
  1632714.0: 27,
  1632712.0: 28,
  1632711.0: 29,
  1632710.0: 30,
  1632709.0: 31,
  1632703.0: 32,
  1632701.0: 33,
  1632699.0: 34,
  1632694.0: 35,
  1632693.0: 36,
  1632692.0: 37,
  1632688.0: 38,
  1632686.0: 39,
  1632675.0: 40,
  1632674.0: 41,
  1632673.0: 42,
  1632667.0: 43,
  1632663.0: 44,
  1632657.0: 45,
  1632655.0: 46,
  1632648.0: 47,
  1632642.0: 48,
  1632636.0: 49,
  1632634.0: 50,
  1632630.0: 51,
  1632626.0: 52,
  1632621.0: 53,
  1632620.0: 54,
  1632618.0: 55,
  1632617.0: 56,
  1632615.0: 57,
  16326

In [27]:
# Time networkx's built-in condensation of G_nx for comparison with scc_DAG
%time nx.condensation(G_nx)

CPU times: user 1min 17s, sys: 404 ms, total: 1min 17s
Wall time: 1min 17s


<networkx.classes.digraph.DiGraph at 0x7f79664554d0>