# Pruned Monte-Carlo

* 強連結成分分解の関数を作る
* GAINの再帰部分をどうするか？
* Cでシュミレーションだけやってしまう

* deltaとlatestが上手く動作していないため、クラスで書くか、返り値等を変更する必要あり

In [41]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import random
import time
from collections import deque

%matplotlib inline

In [42]:
# データの読み込み
# 枝確率を計算済みのネットワークを読み込む
network = pd.read_csv("data.csv")
network.head()

Unnamed: 0,# FromNodeId,ToNodeId,p
0,0,4,0.008
1,0,5,0.005682
2,0,7,0.033333
3,0,8,0.009615
4,0,9,0.066667


In [43]:
# numpy型に変換
network_np = network.values

In [44]:
# 入力は隣接リスト(numpy)[[from_node, to_node, edge_prob],...]
def live_edge_graph_edges(p, p_len):
    rand = np.random.uniform(0, 1, p_len)
    return np.array([[p[i][0], p[i][1]] for i in range(p_len) if rand[i] < p[i][2]])

In [231]:
# 幅優先探索によって、Sから到達可能なノードを返す
def bfs(G, S):
    visited = {s:None for s in S}
    queue = deque(S)
    while queue:
        v = queue.popleft()
        out_node = G.successors(v)
        for u in out_node:
            if not (u in visited):
                queue.append(u)
                visited[u] = v
    return visited

In [46]:
# 幅優先探索によって、Sから到達可能なノードを返す
def bfs_reverse(G, S):
    visited = {s:None for s in S}
    queue = deque(S)
    while queue:
        v = queue.popleft()
        in_node = G.predecessors(v)
        for u in in_node:
            if not (u in visited):
                queue.append(u)
                visited[u] = v
    return visited

In [53]:
# 強連結成分分解に使うための関数
def dfs_go(G):
    visited = dict([])
    # vs:帰りがけ順
    vs = deque([])
    for s in G.node():
        if s not in visited:
            visited[s] = None
            Stack = deque([s])
            vs_tmp = deque([s])
            while Stack:
                v = Stack.pop()
                #if v not in vs_tmp:
                vs_tmp.appendleft(v)
                out_node = G[v]
                for u in out_node:
                    if u not in visited:
                        visited[u] = s
                        Stack.append(u)
            #vs = vs_tmp + vs
            vs.extendleft(vs_tmp)
    return visited, vs

# 強連結成分分解に使うための関数
def dfs_back(G, vs):
    group = dict([])
    group_num = 0
    DAG = nx.DiGraph()
    for s in vs:
        if s not in group:
            w = 1
            Stack = deque([s])
            members = []
            while Stack:
                v = Stack.pop()
                members.append(v)
                group[v] = group_num
                in_node = G.predecessors(v)
                for u in in_node:
                    if u not in group:
                        group[u] = group_num
                        w += 1
                        Stack.append(u)
                    else:
                        if group_num !=  group[u]:
                            DAG.add_edge(group_num, group[u])
            DAG.add_node(group_num, weight=w, members=members)
            group_num += 1
    return group, DAG

def scc_DAG(G):
    visited ,vs = dfs_go(G)
    group, DAG = dfs_back(G, vs)
    return group, DAG

In [54]:
%time simulation = live_edge_graph_edges(network_np, len(network_np))

CPU times: user 447 ms, sys: 7.65 ms, total: 454 ms
Wall time: 455 ms


In [55]:
len(simulation)

51989

In [296]:
# i回目のシュミレーションでv_Vが到達できる頂点数
def GAIN(i, v_V, comp, G, A, S, h, D):
    global latest
    global delta
    # v:i回目のシュミレーションで作成されたグラフのv_Vを含む強連結成分
    v = comp[i][v_V]
    
    # V[i]にvがない場合0(後に消されていくため)
    if v not in G[i].nodes():
        delta[i][v] = 0
        print(i,v,v_V)
        return 0
    
    # ? delta
    if latest[i][v]:
        return delta[i][v]
        
    latest[i][v] = True
    
    # len(S)==0の理由は初回のみ行えば良いため(h(ハブ)以降の到達頂点数は一回行えば十分であるため)
    # vがhのacestorだった場合、hの到達頂点数を計算して、他のacestorの時にも使い回す
    if (v in A[i]) and (len(S) == 0):
        # GAINの引数は後で変える
        # hのGAINをはじめから足しておく
        # print(v,h[i])
        h_Vs = G[i].nodes[h[i]]["members"]
        for h_V in h_Vs:
            delta[i][v] = GAIN(i, h_V, comp, G, A, S, h, D)
    else:
        delta[i][v] = 0
    
    # bfs
    Q = deque([v])
    # Xは探索済みの強連結成分
    X = set([v])
    while Q:
        u = Q.popleft()
        
        # 意味不明
        #if (v in A[i]) and (u in D[i]) and (len(S) == 0):
            #continue
        delta[i][v] += G[i].nodes[u]["weight"]
        
        
        Edges = G[i].out_edges(u)
        for u_, w in Edges:
            # uから出ている任意の枝について探索する
            if u_ == u:
                # 探索済みの強連結成分は探索しなくていいので、w not in X
                # w in V[i]はのちのupdateでV[i]が変化するため
                if (w not in X) and (w in G[i].nodes()):
                    Q.append(w)
                    X.add(w)
        
    return delta[i][v]

In [300]:
def UPDATEDAG(i, t_V, comp, G):
    global latest
    t = comp[i][t_V]
    print("update",t)
    # t -> u
    u = list(bfs(G[i], [t]))
    # v -> u:上で求めたuにだどりつくvを求める
    v = list(bfs_reverse(G[i], u))
    for v_ in v:
        if v_ in G[i].nodes():
            latest[i][v_] = False
    
    G[i].remove_nodes_from(u)
    return G[i], latest

In [301]:
def PMC_greedy(network_np, k, R):
    E_ = dict([])
    G_ = dict([])
    G = dict([])
    comp = dict([])
    h = dict([])
    D = dict([])
    A = dict([])
    V = dict([])
    global latest
    latest = dict([])
    global delta
    delta = dict([])
    
    # 全体のグラフ
    G_all = nx.DiGraph()
    G_all.add_weighted_edges_from(network_np)
    
    # 元グラフの頂点集合
    G_V = [int(node) for node in G_all.node()]
    
    # copy用に作っておく
    G_copy = nx.DiGraph()
    G_copy.add_nodes_from(G_V)
    
    for i in range(R):
        # 各辺 e を確率 pe で残すことで得られる辺集合
        E_[i] = live_edge_graph_edges(network_np, len(network_np))
        
        G_[i] = G_copy.copy()
        G_[i].add_edges_from(E_[i])
        # G_i_ = (V, E_i_) の強連結成分を計算
        # G_i_より構築された頂点重み付き DAG
        # comp[元ノード]→DAGノード
        # G[i]:DAG
        # V[i]:{DAGノード:weight}
        # E[i]:DAG枝(作成段階で重複があるためsetをしている)
        comp[i], G[i] = scc_DAG(G_[i])
        
        """
        G[i] = nx.condensation(G_[i])
        comp[i] = dict(G[i].node)
        V[i] = G[i].node()
        E[i] = G[i].edges()
        
        # 縮約後の各強連結成分のノード数
        G_i_item = dict(G[i].nodes).items()
        weight[i] = {node:len(list(w.values())[0]) for node, w in G_i_item}
        """
        
        # h_i ← V_i において最大の次数をもつ頂点
        G_i_deg = dict(G[i].degree())
        h[i] = max(G_i_deg, key=G_i_deg.get)
        print("hi",h[i])
        
        # h_iから到達可能な頂点集合
        # ? 型は未定
        # D:Descendant
        D[i] = set(bfs(G[i], [h[i]]))
        
        # h_iに到達可能な頂点集合
        # ? 型は未定
        # A:Ancestor
        A[i] = set(bfs_reverse(G[i], [h[i]])) - set([h[i]])
        
        V[i] = G[i].node()
        latest[i] = [False] * len(V[i])
        delta[i] = [False] * len(G_V)
        
    
    S = []
    for j in range(k):
        t = np.argmax([sum([GAIN(i, v, comp, G, A, S, h, D) for i in range(R)]) / R for v in tqdm(G_V)])
        print(t,[sum([GAIN(i, v, comp, G, A, S, h, D) for i in range(R)]) / R for v in G_V][t])
        
        S.append(t)
        
        for i in range(R):
            G[i], latest = UPDATEDAG(i, t, comp, G)
    return S

In [302]:
k = 3
R = 2
PMC_greedy(network_np, k, R)

hi 61255
hi 71435


HBox(children=(IntProgress(value=0, max=75879), HTML(value='')))

6068 98.0
update 27116
update 27085


HBox(children=(IntProgress(value=0, max=75879), HTML(value='')))

0 27116 6068
1 27085 6068
0 27116 6068
1 27085 6068
6068 98.0
update 27116


NetworkXError: The node 27116 is not in the digraph.

In [None]:
# 自作関数(強連結成分分解)との比較

In [49]:
%%time

network_np = network.values
k = 3
R = 1


E_ = dict([])
G_ = dict([])
G = dict([])
V = dict([])
E = dict([])
comp = dict([])
weight = dict([])
h = dict([])
D = dict([])
A = dict([])
latest =dict([])

for i in range(R):
    # 各辺 e を確率 pe で残すことで得られる辺集合
    start = time.time()
    E_[i] = live_edge_graph_edges(network_np, len(network_np))
    print("1",time.time() - start)

    start = time.time()
    G_[i] = nx.DiGraph()
    G_[i].add_edges_from(E_[i])
    print("2",time.time() - start)
    
    # G_i_ = (V, E_i_) の強連結成分を計算
    # G_i_より構築された頂点重み付き DAG
    start = time.time()
    G[i] = nx.condensation(G_[i])
    print("3",time.time() - start)
    
    start = time.time()
    comp[i] = dict(G[i].node)
    V[i] = G[i].node()
    E[i] = G[i].edges()
    print("4",time.time() - start)

    # 縮約後の各強連結成分のノード数
    start = time.time()
    G_i_item = dict(G[i].nodes).items()
    weight[i] = {node:len(list(w.values())[0]) for node, w in G_i_item}
    print("5",time.time() - start)

    # h_i ← V_i において最大の次数をもつ頂点
    start = time.time()
    G_i_deg = dict(G[i].degree())
    h[i] = max(G_i_deg, key=G_i_deg.get)
    print("6",time.time() - start)

    # h_iから到達可能な頂点集合
    # ? 型は未定
    # D:Descendant
    start = time.time()
    D[i] = set(bfs(G[i], [h[i]]))
    print("7",time.time() - start)

    # h_iに到達可能な頂点集合
    # ? 型は未定
    # A:Ancestor
    start = time.time()
    A[i] = set(bfs_reverse(G[i], [h[i]]))
    print("8",time.time() - start)

    start = time.time()
    latest[i] = [False for v in V[i]]
    print("9",time.time() - start)

1 0.5074639320373535
2 0.24335122108459473
3 0.9596779346466064
4 0.02229785919189453
5 0.07672572135925293
6 0.037210941314697266
7 0.003609180450439453
8 1.5020370483398438e-05
9 0.003695964813232422
CPU times: user 2.06 s, sys: 51.4 ms, total: 2.11 s
Wall time: 1.94 s


In [20]:
%%time

network_np = network.values
k = 3
R = 1


E_ = dict([])
G_ = dict([])
G = dict([])
V = dict([])
E = dict([])
comp = dict([])
weight = dict([])
h = dict([])
D = dict([])
A = dict([])
latest =dict([])

for i in range(R):
    # 各辺 e を確率 pe で残すことで得られる辺集合
    start = time.time()
    E_[i] = live_edge_graph_edges(network_np, len(network_np))
    print("1",time.time() - start)

    start = time.time()
    G_[i] = nx.DiGraph()
    G_[i].add_edges_from(E_[i])
    print("2",time.time() - start)
    
    start = time.time()
    comp[i], G[i], V[i], E[i] = scc_DAG(G_[i])
    E[i] = list(set(E[i]))
    print("3",time.time() - start)
    
    """
    # G_i_ = (V, E_i_) の強連結成分を計算
    # G_i_より構築された頂点重み付き DAG
    start = time.time()
    G[i] = nx.condensation(G_[i])
    print("3",time.time() - start)
    
    start = time.time()
    comp[i] = dict(G[i].node)
    V[i] = G[i].node()
    E[i] = G[i].edges()
    print("4",time.time() - start)

    # 縮約後の各強連結成分のノード数
    start = time.time()
    G_i_item = dict(G[i].nodes).items()
    weight[i] = {node:len(list(w.values())[0]) for node, w in G_i_item}
    print("5",time.time() - start)

    """

    # h_i ← V_i において最大の次数をもつ頂点
    start = time.time()
    G_i_deg = dict(G[i].degree())
    h[i] = max(G_i_deg, key=G_i_deg.get)
    print("6",time.time() - start)
    
    # h_iから到達可能な頂点集合
    # ? 型は未定
    # D:Descendant
    start = time.time()
    D[i] = set(bfs(G[i], [h[i]]))
    print("7",time.time() - start)

    # h_iに到達可能な頂点集合
    # ? 型は未定
    # A:Ancestor
    start = time.time()
    A[i] = set(bfs_reverse(G[i], [h[i]]))
    print("8",time.time() - start)

    start = time.time()
    latest[i] = [False for v in V[i]]
    print("9",time.time() - start)

1 0.5972480773925781
2 0.3634610176086426
3 0.6327078342437744
6 0.0563511848449707
7 5.8650970458984375e-05
8 0.0016491413116455078
9 0.003326892852783203
CPU times: user 1.79 s, sys: 49.9 ms, total: 1.84 s
Wall time: 1.75 s


In [100]:
%%time
G_all = nx.DiGraph()
G_all.add_weighted_edges_from(network_np)

CPU times: user 2.86 s, sys: 47.8 ms, total: 2.91 s
Wall time: 2.94 s


In [101]:
%%time
H = G_all.copy()

CPU times: user 1.76 s, sys: 107 ms, total: 1.86 s
Wall time: 1.94 s
