# Pruned Monte-Carlo

* Cでシュミレーションだけやってしまう

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import random
import time
from collections import deque
from InfMaxProblem import Monte_Carlo as mc

%matplotlib inline

In [2]:
# 入力は隣接リスト(numpy)[[from_node, to_node, edge_prob],...]
def live_edge_graph_edges(p, p_len):
    rand = np.random.uniform(0, 1, p_len)
    prob = p.T[2]
    l = np.where(rand < prob)[0]
    return np.array([[p[i][0], p[i][1]] for i in l])
# %time live_edge_graph_edges(network_np, len(network_np))
# Wall time: 86.1 ms

In [3]:
# 幅優先探索によって、Sから到達可能なノードを返す
def bfs(G, S):
    visited = {s:None for s in S}
    queue = deque(S)
    while queue:
        v = queue.popleft()
        out_node = G.successors(v)
        for u in out_node:
            if not (u in visited):
                queue.append(u)
                visited[u] = v
    return visited

In [4]:
# 幅優先探索によって、Sから到達可能なノードを返す
def bfs_reverse(G, S):
    visited = {s:None for s in S}
    queue = deque(S)
    while queue:
        v = queue.popleft()
        in_node = G.predecessors(v)
        for u in in_node:
            if not (u in visited):
                queue.append(u)
                visited[u] = v
    return visited

In [5]:
# 強連結成分分解に使うための関数
def dfs_go(G):
    visited = dict([])
    # vs:帰りがけ順
    vs = deque([])
    for s in G.nodes():
        if s not in visited:
            visited[s] = None
            Stack = deque([s])
            vs_tmp = deque([s])
            while Stack:
                v = Stack.pop()
                #if v not in vs_tmp:
                vs_tmp.appendleft(v)
                out_node = G[v]
                for u in out_node:
                    if u not in visited:
                        visited[u] = s
                        Stack.append(u)
            #vs = vs_tmp + vs
            vs.extendleft(vs_tmp)
    return visited, vs

# 強連結成分分解に使うための関数
def dfs_back(G, vs):
    group = dict([])
    group_num = 0
    DAG = nx.DiGraph()
    for s in vs:
        if s not in group:
            w = 1
            Stack = deque([s])
            members = []
            while Stack:
                v = Stack.pop()
                members.append(v)
                group[v] = group_num
                in_node = G.predecessors(v)
                for u in in_node:
                    if u not in group:
                        group[u] = group_num
                        w += 1
                        Stack.append(u)
                    else:
                        if group_num !=  group[u]:
                            DAG.add_edge(group[u], group_num)
            DAG.add_node(group_num, weight=w, members=members)
            group_num += 1
    return group, DAG

def scc_DAG(G):
    visited ,vs = dfs_go(G)
    group, DAG = dfs_back(G, vs)
    return group, DAG

In [6]:
# i回目のシュミレーションでv_Vが到達できる頂点数
def GAIN(i, v_V, comp, G, A, S, h, D):
    # v:i回目のシュミレーションで作成されたグラフのv_Vを含む強連結成分
    v = comp[i][v_V]
    
    # V[i]にvがない場合0(後に消されていくため)
    if v not in G[i].nodes():
        delta[i][v] = 0
        return 0
    
    # ? delta
    if latest[i][v]:
        return delta[i][v]
        
    latest[i][v] = True
    
    # len(S)==0の理由は初回のみ行えば良いため(h(ハブ)以降の到達頂点数は一回行えば十分であるため)
    # vがhのacestorだった場合、hの到達頂点数を計算して、他のacestorの時にも使い回す
    if (v in A[i]) and (len(S) == 0):
        # GAINの引数は後で変える
        # hのGAINをはじめから足しておく
        h_Vs = G[i].nodes[h[i]]["members"]
        for h_V in h_Vs:
            delta[i][v] = GAIN(i, h_V, comp, G, A, S, h, D)
    else:
        delta[i][v] = 0
    
    # bfs
    Q = deque([v])
    # Xは探索済みの強連結成分
    X = set([v])
    while Q:
        u = Q.popleft()
        
        if (v in A[i]) and (u in D[i]) and (len(S) == 0):
            continue
            
        delta[i][v] += G[i].nodes[u]["weight"]
        
        
        Edges = G[i].out_edges(u)
        for u_, w in Edges:
            # 探索済みの強連結成分は探索しなくていいので、w not in X
            # w in V[i]はのちのupdateでV[i]が変化するため
            if (w not in X) and (w in G[i].nodes()):
                Q.append(w)
                X.add(w)
        
    return delta[i][v]

In [7]:
def UPDATEDAG(i, t_V, comp, G):
    # t:DAG上でのノードid
    t = comp[i][t_V]
    
    if t in G[i]:
        # t -> u
        # u = list(bfs(G[i], [t]))
        u = list(dict(nx.bfs_edges(G[i], t)))
        # v -> u:上で求めたuにだどりつくvを求める
        # v = list(bfs_reverse(G[i], u))
        v = list(dict(nx.bfs_edges(G[i], t, reverse=False)))

        # 上の２条件を満たす集合がv
        v = set(u) & set(v)

        # v かつ Viに存在する頂点
        v_ = list(v & set(G[i].nodes()))
        latest[i].update(zip(v_, [False]*len(v_)))

        G[i].remove_nodes_from(u)
        return G[i]
    else:
        return G[i]

In [8]:
def PMC_greedy(network_np, k, R):
    time_list = []
    # E_ = dict([])
    # G_ = dict([])
    G = dict([])
    comp = dict([])
    h = dict([])
    D = dict([])
    A = dict([])
    V = dict([])
    global latest
    latest = dict([])
    global delta
    delta = {i:dict([]) for i in range(R)}
    
    # 全体のグラフ
    Node = np.unique(network_np.T[[0,1]])
    
    # 元グラフの頂点集合
    G_V = [int(node) for node in Node]
    
    # copy用に作っておく
    G_copy = nx.DiGraph()
    G_copy.add_nodes_from(G_V)
    
    for i in tqdm(range(R)):
        
        E_ = live_edge_graph_edges(network_np, len(network_np))
        
        G_ = G_copy.copy()
        G_.add_edges_from(E_)
        
        comp[i], G[i] = scc_DAG(G_)
        
        G_i_deg = dict(G[i].degree())
        h[i] = max(G_i_deg, key=G_i_deg.get)
        
        # D[i] = set(bfs(G[i], [h[i]]))
        D[i] = set(dict(nx.bfs_edges(G[i], h[i])))
        
        A[i] = set(dict(nx.bfs_edges(G[i], h[i], reverse=True))) - set([h[i]])
        
        V[i] = G[i].nodes()
        latest[i] = {v:False for v in V[i]}
    print("comp init")
    
    S = []
    for j in range(k):
        start = time.time()
        v_gain = {v:sum([GAIN(i, v, comp, G, A, S, h, D) for i in range(R)])/R for v in tqdm(G_V)}
        t = max(v_gain, key=v_gain.get)
        
        S.append(t)
        
        for i in range(R):
            G[i] = UPDATEDAG(i, t, comp, G)
            
        time_list.append(time.time())
        
    return S

In [9]:
# k = 5
# R = 200
# %time PMC_greedy(network_np, k, R)

# CPU times: user 16min, sys: 9min 10s, total: 25min 10s
# Wall time: 32min 56s
# [763, 645, 634, 71399, 3924]

In [10]:
# G = nx.DiGraph()
# G.add_weighted_edges_from(network_np)

In [11]:
# %time mc.approx_inf_size_IC_T(G, [0, 1, 29, 27, 26], 10000)

# 計算実験

In [12]:
def PMC_greedy_time(network_np, k, R):
    time_list = []
    # E_ = dict([])
    # G_ = dict([])
    G = dict([])
    comp = dict([])
    h = dict([])
    D = dict([])
    A = dict([])
    V = dict([])
    global latest
    latest = dict([])
    global delta
    delta = {i:dict([]) for i in range(R)}
    
    # 全体のグラフ
    Node = np.unique(network_np.T[[0,1]])
    
    # 元グラフの頂点集合
    G_V = [int(node) for node in Node]
    
    # copy用に作っておく
    G_copy = nx.DiGraph()
    G_copy.add_nodes_from(G_V)
    
    for i in tqdm(range(R)):
        
        E_ = live_edge_graph_edges(network_np, len(network_np))
        
        G_ = G_copy.copy()
        G_.add_edges_from(E_)
        
        comp[i], G[i] = scc_DAG(G_)
        
        G_i_deg = dict(G[i].degree())
        h[i] = max(G_i_deg, key=G_i_deg.get)
        
        # D[i] = set(bfs(G[i], [h[i]]))
        D[i] = set(dict(nx.bfs_edges(G[i], h[i])))
        
        A[i] = set(dict(nx.bfs_edges(G[i], h[i], reverse=True))) - set([h[i]])
        
        V[i] = G[i].nodes()
        latest[i] = {v:False for v in V[i]}
    print("comp init")
    
    S = []
    for j in range(k):
        start = time.time()
        v_gain = {v:sum([GAIN(i, v, comp, G, A, S, h, D) for i in range(R)])/R for v in tqdm(G_V)}
        t = max(v_gain, key=v_gain.get)
        
        S.append(t)
        
        for i in range(R):
            G[i] = UPDATEDAG(i, t, comp, G)
            
        time_list.append(time.time())
        
    return S, time_list

## Epinions

In [13]:
run_time = dict([])
S = dict([])

### WC

In [14]:
# 枝確率を計算済みのネットワークを読み込む
network = pd.read_csv("data/Epinions/WC.csv")
network.head()

Unnamed: 0,# FromNodeId,ToNodeId,WC
0,0,4,0.008
1,0,5,0.005682
2,0,7,0.033333
3,0,8,0.009615
4,0,9,0.066667


In [15]:
# numpy型に変換
network_np = network.values

In [None]:
start = time.time()
S["WC"], time_list = PMC_greedy_time(network_np, 50, 200)
run_time["WC"] = time.time() - start

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


comp init


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=75879.0), HTML(value='')))

In [None]:
WC_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
WC_df_seed_time.to_csv("WC_df_seed_time.csv")

In [None]:
WC_df_seed_time.plot()

In [None]:
run_time["WC"]
# 2870.9144158363342

In [None]:
S["WC"]

# [763,
#  634,
#  645,
#  71399,
#  5232,
#  3924,
#  637,
#  2969,
#  1835,
#  145,
#  44,
#  1059,
#  2066,
#  1669,
#  824,
#  1172,
#  5227,
#  1539,
#  2118,
#  5144,
#  1596,
#  2704,
#  1225,
#  4931,
#  1501,
#  546,
#  629,
#  9412,
#  1720,
#  1409,
#  661,
#  1533,
#  1638,
#  5224,
#  3850,
#  6003,
#  3952,
#  5905,
#  1849,
#  751,
#  71388,
#  1626,
#  1189,
#  7427,
#  6489,
#  426,
#  7047,
#  3065,
#  3234,
#  770]

### TR

In [None]:
# 枝確率を計算済みのネットワークを読み込む
network = pd.read_csv("data/Epinions/TR.csv")
network.head()

In [None]:
# numpy型に変換
network_np = network.values

In [None]:
start = time.time()
S["TR"], time_list = PMC_greedy_time(network_np, 50, 200)
run_time["TR"] = time.time() - start

In [None]:
TR_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
TR_df_seed_time.to_csv("TR_df_seed_time.csv")

In [None]:
TR_df_seed_time.plot()

In [None]:
run_time["TR"]
# 14991.995094537735

In [None]:
S["TR"]
# [5175,
#  3805,
#  1209,
#  4077,
#  5432,
#  962,
#  12948,
#  622,
#  4154,
#  5803,
#  1805,
#  6529,
#  762,
#  9419,
#  3023,
#  4092,
#  1643,
#  183,
#  3987,
#  4716,
#  10070,
#  2200,
#  7642,
#  8975,
#  188,
#  6701,
#  7544,
#  19561,
#  5752,
#  178,
#  260,
#  459,
#  2274,
#  2199,
#  10997,
#  591,
#  1931,
#  3778,
#  4832,
#  950,
#  572,
#  19583,
#  3515,
#  246,
#  3519,
#  5832,
#  71401,
#  821,
#  5359,
#  2736]

### P=0.1

In [None]:
# 枝確率を計算済みのネットワークを読み込む
network = pd.read_csv("data/Epinions/P=0_1.csv")
network.head()

In [None]:
# numpy型に変換
network_np = network.values

In [None]:
start = time.time()
S["P=0_1"], time_list = PMC_greedy_time(network_np, 50, 200)
run_time["P=0_1"] = time.time() - start

In [None]:
P01_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
P01_df_seed_time.to_csv("P01_df_seed_time.csv")

In [None]:
P01_df_seed_time.plot()

In [None]:
run_time["P=0_1"]

In [None]:
S["P=0_1"]