In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import random
import time
from collections import deque
from InfMaxProblem import Monte_Carlo as mc

%matplotlib inline

# データの読み込み

In [3]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/HepPh/WC.csv")
network.head()

Unnamed: 0,# FromNodeId,ToNodeId,WC
0,9907233,9301253,0.016949
1,9907233,9504304,0.030303
2,9907233,9505235,0.043478
3,9907233,9506257,0.1
4,9907233,9606402,0.047619


In [4]:
# Rebind `network` to the contents of data.csv; this replaces the frame loaded in the previous cell
network = pd.read_csv("data.csv")
network.head()

Unnamed: 0,# FromNodeId,ToNodeId,WC
0,0,4,0.015625
1,0,5,0.011236
2,0,7,0.1
3,0,8,0.03125
4,0,9,0.111111


In [5]:
# Convert to a NumPy array
network_np = network.values

In [6]:
# Create an empty directed graph
G = nx.DiGraph()

In [7]:
# Add the weighted edges (one edge per row of network_np)
G.add_weighted_edges_from(network_np)

In [8]:
# Keyword arguments shared by the networkx drawing calls
options = dict(
    node_color='cyan',
    edge_color='gray',
    node_size=1000,
    width=0.5,
)

In [9]:
# pos = nx.spring_layout(G, k=1)

In [10]:
# fig = plt.figure(figsize=(25,12))
# # nx.draw_networkx_edges(G, pos, **options)
# # nx.draw_networkx_nodes(G, pos, **options)
# nx.draw_networkx(G, pos, **options)
# plt.show()

# greedy

In [11]:
def Greedy_Approx(G, k, T):
    """Pick up to ``k`` seed nodes with the plain greedy strategy.

    In every round each not-yet-selected node ``s`` is scored with
    ``mc.approx_inf_size_IC_T(G, S + [s], T)`` and the node with the largest
    score is appended to the seed list.

    Args:
        G: Graph object. Only ``G.nodes()`` is used here; the graph itself is
            handed through to ``mc.approx_inf_size_IC_T``.
        k: Number of seeds requested.
        T: Forwarded unchanged to ``mc.approx_inf_size_IC_T`` (presumably the
            number of Monte Carlo runs -- confirm in ``InfMaxProblem``).

    Returns:
        list: The selected seeds in selection order. It holds fewer than ``k``
        entries when the graph has fewer than ``k`` nodes.
    """
    S = []
    V = set(G.nodes())
    # Never run more rounds than there are nodes: with no candidate left,
    # max() over an empty score table would raise ValueError.
    for _ in range(min(k, len(V))):
        # V only holds unselected nodes (the winner is removed below), so no
        # additional "not in S" filter is required.
        sigma = {s: mc.approx_inf_size_IC_T(G, S + [s], T) for s in tqdm(V)}
        max_s = max(sigma, key=sigma.get)
        S.append(max_s)
        V.remove(max_s)
    return S

# Pick 5 seeds; T=10 is forwarded to mc.approx_inf_size_IC_T for every candidate evaluation
Greedy_Approx(G, 5, 10)
# [0.0, 25.0, 1.0, 22.0, 27.0]  <- presumably the result of an earlier run; the saved cell output differs

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=498.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




[1.0, 145.0, 383.0, 308.0, 208.0]

In [40]:
def CELF(G, k, T):
    """Select up to ``k`` seeds with a lazy-evaluation (CELF-style) greedy search.

    Round 0 scores every node on its own with
    ``mc.approx_inf_size_IC_T(G, [s], T)`` and takes the best one.  In each
    later round the gain recorded for a node in the previous round is used as
    a bound: the node is re-evaluated only while that bound is at least the
    best gain found so far in the current round, otherwise the old value is
    carried over.  (Treating the old gain as an upper bound presumably relies
    on the spread function being submodular.)

    Args:
        G: Graph object. Only ``G.nodes()`` is used here; the graph itself is
            handed through to ``mc.approx_inf_size_IC_T``.
        k: Number of seeds requested.
        T: Forwarded unchanged to ``mc.approx_inf_size_IC_T`` (presumably the
            number of Monte Carlo runs -- confirm in ``InfMaxProblem``).

    Returns:
        tuple: ``(S, max_spread)``.  ``S`` is the list of selected seeds in
        selection order.  ``max_spread`` starts with ``0`` and then holds the
        estimated spread of the seed set after each selection.  Both are
        shorter than requested when the graph has fewer than ``k`` nodes.
    """
    S = []
    V = set(G.nodes())
    lamda = [dict() for _ in range(k)]   # lamda[i][s]: gain recorded for s in round i
    spread = [dict() for _ in range(k)]  # spread[i][s]: estimate for S + [s] made in round i
    max_spread = [0]
    pre_spread = 0
    for i in tqdm(range(k)):
        if not V:
            # More seeds requested than nodes available: nothing left to pick.
            break
        if i == 0:
            lamda[i] = {s: mc.approx_inf_size_IC_T(G, [s], T) for s in tqdm(V)}
            s_ = max(lamda[i], key=lamda[i].get)
            pre_spread = lamda[i][s_]
        else:
            # Reset the winner every round; a stale winner from the previous
            # round is no longer in V and has no entry in spread[i].
            s_ = None
            max_gain = 0
            for s in tqdm(V):
                if max_gain <= lamda[i-1][s]:
                    spread[i][s] = mc.approx_inf_size_IC_T(G, S + [s], T)
                    lamda[i][s] = spread[i][s] - pre_spread
                    if max_gain <= lamda[i][s]:
                        s_ = s
                        max_gain = lamda[i][s]
                else:
                    lamda[i][s] = lamda[i-1][s]

            if s_ is None:
                # No candidate reached a non-negative gain (possible when the
                # spread estimates are noisy).  Fall back to the candidate
                # with the largest recorded gain instead of failing with a
                # KeyError, evaluating it if it was skipped above.
                s_ = max(V, key=lamda[i].get)
                if s_ not in spread[i]:
                    spread[i][s_] = mc.approx_inf_size_IC_T(G, S + [s_], T)
            pre_spread = spread[i][s_]
        max_spread.append(pre_spread)
        S.append(s_)
        V.remove(s_)
    return S, max_spread

# CELF(G, 5, 10)
# [0.0, 29.0, 27.0, 28.0, 5.0]




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=498.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))





([1.0, 145.0, 5.0, 127.0, 170.0], [0, 64.7, 101.4, 124.0, 135.6, 137.9])

In [37]:
def LTLG(G, k, T, R_len):
    """Select up to ``k`` seeds, lazily evaluating only a random sample per round.

    Round 0 scores every node on its own with
    ``mc.approx_inf_size_IC_T(G, [s], T)`` and takes the best one.  Every later
    round draws ``R_len`` remaining nodes without replacement via
    ``np.random.choice`` and applies the same lazy rule as ``CELF`` to the
    sampled nodes only; all other nodes keep the gain recorded in the previous
    round.  (The name presumably stands for "lazier than lazy greedy" --
    confirm with the author.)

    Args:
        G: Graph object. Only ``G.nodes()`` is used here; the graph itself is
            handed through to ``mc.approx_inf_size_IC_T``.
        k: Number of seeds requested.
        T: Forwarded unchanged to ``mc.approx_inf_size_IC_T`` (presumably the
            number of Monte Carlo runs -- confirm in ``InfMaxProblem``).
        R_len: Number of candidates sampled per round.  Values larger than the
            number of remaining nodes are clamped to that number.

    Returns:
        list: The selected seeds in selection order; shorter than ``k`` when
        the graph has fewer than ``k`` nodes.
    """
    S = []
    V = set(G.nodes())
    lamda = [dict() for _ in range(k)]   # lamda[i][s]: gain recorded for s in round i
    spread = [dict() for _ in range(k)]  # spread[i][s]: estimate for S + [s] made in round i
    pre_spread = 0
    for i in tqdm(range(k)):
        if not V:
            # More seeds requested than nodes available: nothing left to pick.
            break
        if i == 0:
            lamda[i] = {s: mc.approx_inf_size_IC_T(G, [s], T) for s in tqdm(V)}
            s_ = max(lamda[i], key=lamda[i].get)
            pre_spread = lamda[i][s_]
        else:
            # Reset the winner every round; a stale winner from the previous
            # round is no longer in V and has no entry in spread[i].
            s_ = None
            max_gain = 0
            candidates = list(V)
            # V shrinks by one node per round, so clamp the sample size:
            # choice(..., replace=False) raises when asked for more items
            # than the population holds.
            sample_size = min(R_len, len(candidates))
            # Sample positions rather than node values so the original node
            # objects are kept, and store them in a set for O(1) membership.
            picked = np.random.choice(len(candidates), sample_size, replace=False)
            R = {candidates[j] for j in picked}
            for s in tqdm(V):
                if s in R and max_gain <= lamda[i-1][s]:
                    spread[i][s] = mc.approx_inf_size_IC_T(G, S + [s], T)
                    lamda[i][s] = spread[i][s] - pre_spread
                    if max_gain <= lamda[i][s]:
                        s_ = s
                        max_gain = lamda[i][s]
                else:
                    # Not sampled, or the old bound cannot beat the current best.
                    lamda[i][s] = lamda[i-1][s]

            if s_ is None:
                # No sampled candidate reached a non-negative gain.  Fall back
                # to the sampled node (or any remaining node if the sample is
                # empty) with the largest recorded gain instead of failing
                # with a KeyError, evaluating it if it was skipped above.
                s_ = max(R or V, key=lamda[i].get)
                if s_ not in spread[i]:
                    spread[i][s_] = mc.approx_inf_size_IC_T(G, S + [s_], T)
            pre_spread = spread[i][s_]
        S.append(s_)
        V.remove(s_)
    return S

In [38]:
k = 5
n = G.number_of_nodes()
eps = 0.1
mini_batch = int((n / k)*np.log(1/eps))
print(n)
print(mini_batch)

# %time S_Greedy = Greedy_Approx(G, 5, 100)
%time S_LTLG = LTLG(G, k, 20, mini_batch)
%time S_CELS = CELF(G, k, 20)

500
230


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=498.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))



CPU times: user 45.2 s, sys: 2.65 s, total: 47.9 s
Wall time: 43.9 s


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))

1
1
1
1



HBox(children=(FloatProgress(value=0.0, max=498.0), HTML(value='')))

2



HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))

KeyboardInterrupt: 

## seed simulation

In [54]:
from multiprocessing import Pool

In [55]:
def experiment_IC(G, seed, T=10000):
    """Repeat ``mc.IC_simulation(G, seed)`` ``T`` times and track the running mean.

    Args:
        G: Graph handed to ``mc.IC_simulation``.
        seed: Seed set handed to ``mc.IC_simulation``.
        T: Number of repetitions.

    Returns:
        tuple: ``(running_means, overall_mean)`` where ``running_means[j]`` is
        the mean of the first ``j + 1`` simulation results and
        ``overall_mean`` is the mean over all ``T`` results.
    """
    running_means = []
    total = 0
    for trial in tqdm(range(T)):
        # Accumulate the result of one simulation (the influence count)
        total += mc.IC_simulation(G, seed)
        running_means.append(total / (trial + 1))
    overall_mean = total / T
    return running_means, overall_mean

In [56]:
def IC_simulation_wrapper(args):
    """Call ``mc.IC_simulation`` with a single packed argument sequence.

    ``Pool.map`` passes exactly one item to the worker function, so the
    arguments travel as one sequence and are unpacked here.
    """
    outcome = mc.IC_simulation(*args)
    return outcome

In [57]:
def experiment_IC_mult(G, seed, T=10000):
    """Average ``T`` simulations of ``seed`` on ``G`` using 4 worker processes.

    Each task is the pair ``(G, seed)``; the workers run
    ``IC_simulation_wrapper`` on it and the mean of all results is returned.
    """
    tasks = [(G, seed)] * T
    with Pool(processes=4) as workers:
        outcomes = workers.map(IC_simulation_wrapper, tasks)
    total = sum(outcomes)
    return total / T

In [58]:
# mc.approx_inf_size_IC_T(G, S_Greedy, 10000)

In [74]:
# Estimate the spread of the LTLG seed set (T=200)
mc.approx_inf_size_IC_T(G, S_LTLG, 200)

943.635

In [75]:
# Estimate the spread of the CELF seed set (T=200)
# NOTE(review): S_CELS must hold only the seed list here, not the (seeds, spreads) pair that CELF returns -- verify
mc.approx_inf_size_IC_T(G, S_CELS, 200)

883.645

# 計算実験

In [68]:
def CELF_time(G, k, T=10000):
    """Run the lazy-evaluation (CELF-style) greedy search and time every round.

    Round 0 scores every node on its own with
    ``mc.approx_inf_size_IC_T(G, [s], T)`` and takes the best one.  In each
    later round the gain recorded for a node in the previous round is used as
    a bound: the node is re-evaluated only while that bound is at least the
    best gain found so far in the current round, otherwise the old value is
    carried over.

    Args:
        G: Graph object. Only ``G.nodes()`` is used here; the graph itself is
            handed through to ``mc.approx_inf_size_IC_T``.
        k: Number of seeds requested.
        T: Forwarded unchanged to ``mc.approx_inf_size_IC_T`` (presumably the
            number of Monte Carlo runs -- confirm in ``InfMaxProblem``).

    Returns:
        tuple: ``(S, max_spread, time_list)``.  ``S`` is the list of selected
        seeds in selection order, ``max_spread`` starts with ``0`` and then
        holds the estimated spread after each selection, and ``time_list``
        holds the wall-clock seconds spent in each round.  All three are
        shorter than requested when the graph has fewer than ``k`` nodes.
    """
    S = []
    V = set(G.nodes())
    lamda = [dict() for _ in range(k)]   # lamda[i][s]: gain recorded for s in round i
    spread = [dict() for _ in range(k)]  # spread[i][s]: estimate for S + [s] made in round i
    max_spread = [0]
    time_list = []
    pre_spread = 0
    for i in tqdm(range(k)):
        if not V:
            # More seeds requested than nodes available: nothing left to pick.
            break
        start = time.time()
        if i == 0:
            lamda[i] = {s: mc.approx_inf_size_IC_T(G, [s], T) for s in tqdm(V)}
            s_ = max(lamda[i], key=lamda[i].get)
            pre_spread = lamda[i][s_]
        else:
            # Reset the winner every round; a stale winner from the previous
            # round is no longer in V and has no entry in spread[i].
            s_ = None
            max_gain = 0
            for s in tqdm(V):
                if max_gain <= lamda[i-1][s]:
                    spread[i][s] = mc.approx_inf_size_IC_T(G, S + [s], T)
                    lamda[i][s] = spread[i][s] - pre_spread
                    if max_gain <= lamda[i][s]:
                        s_ = s
                        max_gain = lamda[i][s]
                else:
                    lamda[i][s] = lamda[i-1][s]

            if s_ is None:
                # No candidate reached a non-negative gain (possible when the
                # spread estimates are noisy).  Fall back to the candidate
                # with the largest recorded gain instead of failing with a
                # KeyError, evaluating it if it was skipped above.
                s_ = max(V, key=lamda[i].get)
                if s_ not in spread[i]:
                    spread[i][s_] = mc.approx_inf_size_IC_T(G, S + [s_], T)
            pre_spread = spread[i][s_]
        max_spread.append(pre_spread)
        S.append(s_)
        V.remove(s_)

        time_list.append(time.time() - start)
    return S, max_spread, time_list

## HepPh

In [69]:
# Per-setting results, keyed by edge-probability setting name
run_time = {}   # total wall-clock time of each run
S = {}          # selected seed sets
spread = {}     # spread history of each run

### WC

In [70]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/HepPh/WC.csv")
network.head()

Unnamed: 0,# FromNodeId,ToNodeId,WC
0,9907233,9301253,0.016949
1,9907233,9504304,0.030303
2,9907233,9505235,0.043478
3,9907233,9506257,0.1
4,9907233,9606402,0.047619


In [71]:
# Convert to a NumPy array
network_np = network.values

# Create an empty directed graph
G = nx.DiGraph()

# Add the weighted edges (one edge per row of network_np)
G.add_weighted_edges_from(network_np)

In [None]:
# Time the complete 50-seed run; CELF_time additionally returns the time spent per round
start = time.time()
S["WC"], spread["WC"], time_list = CELF_time(G, 50)
run_time["WC"] = time.time() - start

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=34546.0), HTML(value='')))

In [None]:
# Tabulate the time spent on each of the 50 seed selections and save it as CSV
# NOTE(review): the output directory is spelled "CEFL", not "CELF" -- confirm this is intentional
WC_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
WC_df_seed_time.to_csv("result/HepPh/CEFL/WC_df_seed_time.csv")

In [None]:
WC_df_seed_time.plot()

In [None]:
run_time["WC"]

In [None]:
S["WC"]

In [None]:
spread["WC"]

### TR

In [None]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/HepPh/TR.csv")
network.head()

In [None]:
# Convert to a NumPy array
network_np = network.values

# Create an empty directed graph
G = nx.DiGraph()

# Add the weighted edges (one edge per row of network_np)
G.add_weighted_edges_from(network_np)

In [None]:
# Time the complete 50-seed run; CELF_time additionally returns the time spent per round
start = time.time()
S["TR"], spread["TR"], time_list = CELF_time(G, 50)
run_time["TR"] = time.time() - start

In [None]:
# Tabulate the time spent on each of the 50 seed selections and save it as CSV
# NOTE(review): the output directory is spelled "CEFL", not "CELF" -- confirm this is intentional
TR_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
TR_df_seed_time.to_csv("result/HepPh/CEFL/TR_df_seed_time.csv")

In [None]:
TR_df_seed_time.plot()

In [None]:
run_time["TR"]

In [None]:
S["TR"]

In [None]:
spread["TR"]

### P=0.1

In [None]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/HepPh/P=0_1.csv")
network.head()

In [None]:
# Convert to a NumPy array
network_np = network.values

# Create an empty directed graph
G = nx.DiGraph()

# Add the weighted edges (one edge per row of network_np)
G.add_weighted_edges_from(network_np)

In [None]:
# Time the complete 50-seed run; CELF_time additionally returns the time spent per round
start = time.time()
S["P=0_1"], spread["P=0_1"], time_list = CELF_time(G, 50)
run_time["P=0_1"] = time.time() - start

In [None]:
# Tabulate the time spent on each of the 50 seed selections and save it as CSV
# NOTE(review): the output directory is spelled "CEFL", not "CELF" -- confirm this is intentional
P01_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
P01_df_seed_time.to_csv("result/HepPh/CEFL/P01_df_seed_time.csv")

In [None]:
P01_df_seed_time.plot()

In [None]:
run_time["P=0_1"]

In [None]:
S["P=0_1"]

In [None]:
spread["P=0_1"]

### P=0.01

In [None]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/HepPh/P=0_01.csv")
network.head()

In [None]:
# Convert to a NumPy array
network_np = network.values

# Create an empty directed graph
G = nx.DiGraph()

# Add the weighted edges (one edge per row of network_np)
G.add_weighted_edges_from(network_np)

In [None]:
# Time the complete 50-seed run; CELF_time additionally returns the time spent per round
start = time.time()
S["P=0_01"], spread["P=0_01"], time_list = CELF_time(G, 50)
run_time["P=0_01"] = time.time() - start

In [None]:
# Tabulate the time spent on each of the 50 seed selections and save it as CSV
# NOTE(review): the output directory is spelled "CEFL", not "CELF" -- confirm this is intentional
P001_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
P001_df_seed_time.to_csv("result/HepPh/CEFL/P001_df_seed_time.csv")

In [None]:
P001_df_seed_time.plot()

In [None]:
run_time["P=0_01"]

In [None]:
S["P=0_01"]

In [None]:
spread["P=0_01"]

### P=0.001

In [None]:
# Load the network whose edge probabilities have already been computed
network = pd.read_csv("data/HepPh/P=0_001.csv")
network.head()

In [None]:
# Convert to a NumPy array
network_np = network.values

# Create an empty directed graph
G = nx.DiGraph()

# Add the weighted edges (one edge per row of network_np)
G.add_weighted_edges_from(network_np)

In [None]:
# Time the complete 50-seed run; CELF_time additionally returns the time spent per round
start = time.time()
S["P=0_001"], spread["P=0_001"], time_list = CELF_time(G, 50)
run_time["P=0_001"] = time.time() - start

In [None]:
# Tabulate the time spent on each of the 50 seed selections and save it as CSV
# NOTE(review): the output directory is spelled "CEFL", not "CELF" -- confirm this is intentional
P0001_df_seed_time = pd.DataFrame({"Size of Seed Set":range(1,51), 
                                "Running Time[s]":time_list}).set_index("Size of Seed Set")
P0001_df_seed_time.to_csv("result/HepPh/CEFL/P0001_df_seed_time.csv")

In [None]:
P0001_df_seed_time.plot()

In [None]:
run_time["P=0_001"]

In [None]:
S["P=0_001"]

In [None]:
spread["P=0_001"]

# 結果

In [None]:
pd.DataFrame(S)

In [None]:
pd.DataFrame([run_time])

In [None]:
# Save to the result folder: one column per edge-probability setting
# NOTE(review): the output directory is spelled "CEFL", not "CELF" -- confirm this is intentional
pd.DataFrame(S).to_csv("result/HepPh/CEFL/seed.csv")
pd.DataFrame([run_time]).to_csv("result/HepPh/CEFL/run_time.csv")

pd.DataFrame(spread).to_csv("result/HepPh/CEFL/spread.csv")