In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import random
import glob

from functions import *

In [2]:
big_res = pd.DataFrame({"Targeting Method": ["Friend","Pair","Random"]*5,
              "Measure": list(np.repeat(['Betweeness', "Degree",'Eigenvector','Katz',"Page Rank"],3))})
big_res

Unnamed: 0,Targeting Method,Measure
0,Friend,Betweeness
1,Pair,Betweeness
2,Random,Betweeness
3,Friend,Degree
4,Pair,Degree
5,Random,Degree
6,Friend,Eigenvector
7,Pair,Eigenvector
8,Random,Eigenvector
9,Friend,Katz


In [8]:
#the five largest villages (by N) in Banerjee are 
villages = [60, 59, 52, 71, 65]

n_samples = 200
p = 0.10

for vil in villages:
    path = 'banerjee12_data/datav4.0/Data/1. Network Data/Adjacency Matrices/adj_allVillageRelationships_HH_vilno_{}.csv'.format(vil)
    adj_mat = pd.read_csv(path,header = None)
    current_g = nx.from_pandas_adjacency(adj_mat) #network for the village
    
    probs, centralities = get_prob_centralities(current_g, n_samples = n_samples, p = p) #over 500 seed iterations
    
    combinedDataProbs = pd.concat([centralities, probs], axis=1)

    seedNames = ["Random", "Friend", "Pair"]
    meltedDataProbs = pd.melt(combinedDataProbs.reset_index(), id_vars = [*seedNames, "index"], value_name = 'Measure Value', var_name = "Measure")
    meltedDataProbs = pd.melt(meltedDataProbs, id_vars = ["index", "Measure", "Measure Value"], value_name = 'Selection Probability', var_name = "Targeting Method")
    
    # Calculating Spearman Correlation
    spearmanData = meltedDataProbs.groupby(['Targeting Method', 'Measure'])[['Selection Probability','Measure Value']].corr(method="spearman").iloc[0::2,-1]
    # Dropping level_2 which was artificially created by grouping
    spearmanData = spearmanData.reset_index().drop('level_2', axis = 1).query("`Measure Value` != 1")
    
    print('village {}'.format(vil))
    res = spearmanData.sort_values("Measure").reset_index().drop("index", axis = 1)
    #print(res)
    big_res["Vil" + str(vil) + " N={}".format(nx.number_of_nodes(current_g))] = res['Measure Value']
    

village 60
village 59
village 52
village 71
village 65


In [9]:
big_res

Unnamed: 0,Targeting Method,Measure,Vil60 N=356,Vil59 N=329,Vil52 N=327,Vil71 N=298,Vil65 N=297
0,Friend,Betweeness,0.913873,0.915381,0.929804,0.914863,0.92832
1,Pair,Betweeness,0.825622,0.829512,0.84947,0.807951,0.846116
2,Random,Betweeness,0.116211,0.211235,0.043392,0.015885,0.129158
3,Friend,Degree,0.915042,0.91916,0.935624,0.938591,0.924149
4,Pair,Degree,0.828116,0.826052,0.857667,0.829859,0.833607
5,Random,Degree,0.118028,0.248777,0.030672,0.040911,0.167919
6,Friend,Eigenvector,0.702892,0.795534,0.857756,0.829753,0.77254
7,Pair,Eigenvector,0.667422,0.700527,0.7568,0.721879,0.685434
8,Random,Eigenvector,0.146341,0.234716,0.024539,0.065571,0.18737
9,Friend,Katz,0.53092,0.500946,0.260951,0.276563,0.091169


In [5]:
path = "mass_data/less_than_1500_nodes"
data_paths = [x.replace("{}/".format(path), "") for x in glob.glob("{}/*".format(path))]
data_paths

['contacts-prox-high-school-2013',
 'inf-euroroad',
 'bt.csv',
 'swingers',
 'Karate',
 'fb-pages-tvshow',
 'kor.2015.mers.1.00',
 'uni_email',
 'contacts-prox-high-school-2013-attr',
 'fb-pages-politician',
 'sgp.2003.sars.1.00',
 'law_firm',
 'fb-messages',
 'soc-hamsterster',
 'webkb-wisc',
 'usa.2020.covid.6.00',
 'primary-school-proximity',
 'KKI',
 'infect-hyper',
 'infect-dublin',
 'fb_friends.csv',
 'high_tech_company',
 'usa.2009.flu.1.00',
 'soc-firm-hi-tech',
 'physician_trust',
 '7th_graders',
 'aves-weaver-social',
 'kidnappings',
 'email_company',
 'moreno_taro']

In [6]:
#remove fb-pages tvshow and politician
data_paths.remove("fb-pages-tvshow")
data_paths.remove("fb-pages-politician")

In [7]:
G_list = []
for d_path in data_paths:
    
    if any(fname.endswith('.csv') for fname in  glob.glob('{}/{}/*'.format(path,d_path))):
        df = pd.read_csv('{}/{}/edges.csv'.format(path,d_path))
        G = nx.from_pandas_edgelist(df, "# source", " target")
        
    elif any(fname.endswith('.edgelist') for fname in glob.glob('{}/{}/*'.format(path,d_path))):
        G = nx.read_edgelist("{}/{}/edges.edgelist".format(path, d_path), nodetype=int)
        
    elif not any(fname.endswith('txt.gz') for fname in  glob.glob('{}/{}/*'.format(path,d_path))):
        G = nx.read_adjlist("{}/{}/edges.txt".format(path, d_path), nodetype=int)

    else:
        G = nx.read_adjlist(gzip.open('{}/{}/edges.txt.gz'.format(path,d_path)), nodetype=int)
    print(d_path, "N = " + str(nx.number_of_nodes(G)))
    G_list.append(G)

contacts-prox-high-school-2013 N = 327
inf-euroroad N = 1174
bt.csv N = 692
swingers N = 96
Karate N = 34
kor.2015.mers.1.00 N = 186
uni_email N = 1133
contacts-prox-high-school-2013-attr N = 327
sgp.2003.sars.1.00 N = 172
law_firm N = 71
fb-messages N = 1899
soc-hamsterster N = 2426
webkb-wisc N = 265
usa.2020.covid.6.00 N = 92
primary-school-proximity N = 242
KKI N = 2238
infect-hyper N = 113
infect-dublin N = 410
fb_friends.csv N = 800
high_tech_company N = 21
usa.2009.flu.1.00 N = 286
soc-firm-hi-tech N = 33
physician_trust N = 241
7th_graders N = 29
aves-weaver-social N = 445
kidnappings N = 351
email_company N = 167
moreno_taro N = 22


In [12]:
%%time

for i,G in enumerate(G_list):    

    probs, centralities = get_prob_centralities(G, n_samples, p) 
    
    combinedDataProbs = pd.concat([centralities, probs], axis=1)

    seedNames = ["Random", "Friend", "Pair"]
    meltedDataProbs = pd.melt(combinedDataProbs.reset_index(), id_vars = [*seedNames, "index"], value_name = 'Measure Value', var_name = "Measure")
    meltedDataProbs = pd.melt(meltedDataProbs, id_vars = ["index", "Measure", "Measure Value"], value_name = 'Selection Probability', var_name = "Targeting Method")

    # Calculating Spearman Correlation
    spearmanData = meltedDataProbs.groupby(['Targeting Method', 'Measure'])[['Selection Probability','Measure Value']].corr(method="spearman").iloc[0::2,-1]
    # Dropping level_2 which was artificially created by grouping
    spearmanData = spearmanData.reset_index().drop('level_2', axis = 1).query("`Measure Value` != 1")

    print(data_paths[i], "N = ", str(nx.number_of_nodes(G)))
    res = spearmanData.sort_values("Measure").reset_index().drop("index", axis = 1)
    print(res)
    big_res[data_paths[i] + " N={}".format(nx.number_of_nodes(G))] = res['Measure Value']
    

contacts-prox-high-school-2013 N =  327
   Targeting Method      Measure  Measure Value
0            Friend   Betweeness       0.738967
1              Pair   Betweeness       0.503822
2            Random   Betweeness      -0.027612
3            Friend       Degree       0.760983
4              Pair       Degree       0.542885
5            Random       Degree      -0.029237
6            Friend  Eigenvector       0.643252
7              Pair  Eigenvector       0.461283
8            Random  Eigenvector      -0.023290
9            Friend         Katz       0.547974
10             Pair         Katz       0.342276
11           Random         Katz       0.014812
12           Friend    Page Rank       0.785744
13             Pair    Page Rank       0.555524
14           Random    Page Rank      -0.028232
inf-euroroad N =  1174
   Targeting Method      Measure  Measure Value
0            Friend   Betweeness       0.461755
1              Pair   Betweeness       0.377699
2            Random   Bet

soc-hamsterster N =  2426
   Targeting Method      Measure  Measure Value
0            Friend   Betweeness       0.517848
1              Pair   Betweeness       0.496517
2            Random   Betweeness       0.007706
3            Friend       Degree       0.514010
4              Pair       Degree       0.469176
5            Random       Degree       0.003513
6            Friend  Eigenvector       0.185272
7              Pair  Eigenvector       0.221953
8            Random  Eigenvector       0.005114
9            Friend         Katz       0.011282
10             Pair         Katz       0.031432
11           Random         Katz       0.009961
12           Friend    Page Rank       0.828041
13             Pair    Page Rank       0.675913
14           Random    Page Rank       0.017414
webkb-wisc N =  265
   Targeting Method      Measure  Measure Value
0            Friend   Betweeness       0.666396
1              Pair   Betweeness       0.666202
2            Random   Betweeness       0.0

physician_trust N =  241
   Targeting Method      Measure  Measure Value
0            Friend   Betweeness       0.712756
1              Pair   Betweeness       0.600182
2            Random   Betweeness       0.028980
3            Friend       Degree       0.837740
4              Pair       Degree       0.693692
5            Random       Degree      -0.018310
6            Friend  Eigenvector      -0.130349
7              Pair  Eigenvector      -0.131049
8            Random  Eigenvector      -0.022980
9            Friend         Katz       0.342682
10             Pair         Katz       0.343566
11           Random         Katz       0.022223
12           Friend    Page Rank       0.882248
13             Pair    Page Rank       0.707700
14           Random    Page Rank      -0.006989
7th_graders N =  29
   Targeting Method      Measure  Measure Value
0            Friend   Betweeness       0.819288
1              Pair   Betweeness       0.553114
2            Random   Betweeness      -0.08

In [31]:
#big_res.to_csv("correlations_dat.csv", index = False)
big_res.columns

Index(['Targeting Method', 'Measure', 'Vil60 N=356', 'Vil59 N=329',
       'Vil52 N=327', 'Vil71 N=298', 'Vil65 N=297',
       'contacts-prox-high-school-2013 N=327', 'inf-euroroad N=1174',
       'bt.csv N=692', 'swingers N=96', 'Karate N=34',
       'kor.2015.mers.1.00 N=186', 'uni_email N=1133',
       'contacts-prox-high-school-2013-attr N=327', 'sgp.2003.sars.1.00 N=172',
       'law_firm N=71', 'fb-messages N=1899', 'soc-hamsterster N=2426',
       'webkb-wisc N=265', 'usa.2020.covid.6.00 N=92',
       'primary-school-proximity N=242', 'KKI N=2238', 'infect-hyper N=113',
       'infect-dublin N=410', 'fb_friends.csv N=800', 'high_tech_company N=21',
       'usa.2009.flu.1.00 N=286', 'soc-firm-hi-tech N=33',
       'physician_trust N=241', '7th_graders N=29', 'aves-weaver-social N=445',
       'kidnappings N=351', 'email_company N=167', 'moreno_taro N=22'],
      dtype='object')

In [37]:
import plotly.graph_objects as go

column = 'KKI N=2238' #place the colname here to choose which network
spearmanData = big_res[["Targeting Method", "Measure", column]]

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = spearmanData.pivot("Targeting Method", "Measure", column).columns,
        y = spearmanData.pivot("Targeting Method", "Measure", column).index,
        z = np.array(spearmanData.pivot("Targeting Method", "Measure", column)),
        text = spearmanData.pivot("Targeting Method", "Measure", column).values,
        texttemplate = '%{text:.2f}'
    )
)
fig.show()