In [1]:
import pandas as pd
import glob
import networkx as nx
from functions import *
import os
import scipy as sp
import scipy.io  
import io
import snap

In [2]:
path = "less_than_1500_nodes"
[x.replace("{}\\".format(path), "") for x in glob.glob("{}/*".format(path))]

['7th_graders',
 'aves-weaver-social',
 'bt.csv',
 'contacts-prox-high-school-2013',
 'contacts-prox-high-school-2013-attr',
 'email_company',
 'fb-messages',
 'fb-pages-politician',
 'fb-pages-tvshow',
 'fb_friends.csv',
 'high_tech_company',
 'inf-euroroad',
 'infect-dublin',
 'infect-hyper',
 'Karate',
 'kidnappings',
 'KKI',
 'kor.2015.mers.1.00',
 'law_firm',
 'moreno_taro',
 'physician_trust',
 'primary-school-proximity',
 'sgp.2003.sars.1.00',
 'soc-firm-hi-tech',
 'soc-hamsterster',
 'swingers',
 'uni_email',
 'usa.2009.flu.1.00',
 'usa.2020.covid.6.00',
 'webkb-wisc']

In [3]:
data_paths = [x.replace("{}\\".format(path), "") for x in glob.glob("{}/*".format(path))]

In [4]:
G_list = []
for d_path in data_paths:
    print(d_path)
    if any(fname.endswith('.csv') for fname in  glob.glob('{}/{}/*'.format(path,d_path))):
        df = pd.read_csv('{}/{}/edges.csv'.format(path,d_path))
        G = nx.from_pandas_edgelist(df, "# source", " target")
        
    elif any(fname.endswith('.edgelist') for fname in glob.glob('{}/{}/*'.format(path,d_path))):
        G = nx.read_edgelist("{}/{}/edges.edgelist".format(path, d_path), nodetype=int)
        
    elif not any(fname.endswith('txt.gz') for fname in  glob.glob('{}/{}/*'.format(path,d_path))):
        G = nx.read_adjlist("{}/{}/edges.txt".format(path, d_path), nodetype=int)

    else:
        G = nx.read_adjlist(gzip.open('{}/{}/edges.txt.gz'.format(path,d_path)), nodetype=int)
            
    G_list.append(G)

7th_graders
aves-weaver-social
bt.csv
contacts-prox-high-school-2013
contacts-prox-high-school-2013-attr
email_company
fb-messages
fb-pages-politician
fb-pages-tvshow
fb_friends.csv
high_tech_company
inf-euroroad
infect-dublin
infect-hyper
Karate
kidnappings
KKI
kor.2015.mers.1.00
law_firm
moreno_taro
physician_trust
primary-school-proximity
sgp.2003.sars.1.00
soc-firm-hi-tech
soc-hamsterster
swingers
uni_email
usa.2009.flu.1.00
usa.2020.covid.6.00
webkb-wisc


In [5]:
G_list

[<networkx.classes.graph.Graph at 0x1fdb953c070>,
 <networkx.classes.graph.Graph at 0x1fdb854efd0>,
 <networkx.classes.graph.Graph at 0x1fdb953ce20>,
 <networkx.classes.graph.Graph at 0x1fdbb259f10>,
 <networkx.classes.graph.Graph at 0x1fdbb1d82b0>,
 <networkx.classes.graph.Graph at 0x1fdbae8b3d0>,
 <networkx.classes.graph.Graph at 0x1fdb9534c70>,
 <networkx.classes.graph.Graph at 0x1fdb953c6a0>,
 <networkx.classes.graph.Graph at 0x1fdb953c760>,
 <networkx.classes.graph.Graph at 0x1fdbb24dfd0>,
 <networkx.classes.graph.Graph at 0x1fdbb1d8280>,
 <networkx.classes.graph.Graph at 0x1fdb953c0a0>,
 <networkx.classes.graph.Graph at 0x1fdbae8b310>,
 <networkx.classes.graph.Graph at 0x1fdbae8b4c0>,
 <networkx.classes.graph.Graph at 0x1fdbb28dcd0>,
 <networkx.classes.graph.Graph at 0x1fdbb0b0bb0>,
 <networkx.classes.graph.Graph at 0x1fdb9534dc0>,
 <networkx.classes.graph.Graph at 0x1fdb953c5e0>,
 <networkx.classes.graph.Graph at 0x1fdb953c040>,
 <networkx.classes.graph.Graph at 0x1fdbb28dbe0>,


In [6]:
%%time
# Parameters 
nsamp = 10
p = 0.05
big_dat = pd.DataFrame({"seeding method": ['random']*nsamp + ['friend']*nsamp + ['pair']*nsamp})

for i,G in enumerate(G_list):    
    try:
        print(data_paths[i])
        print(nx.number_of_nodes(G_list[i]))

        (rtran, ftran, ptran) = sim_2seed_transitivity(G_list[i], p = p, nsamp = nsamp)
        # append results as columns
        big_dat["{}_num_nodes:{}".format(data_paths[i],  nx.number_of_nodes(G_list[i]))] = rtran + ftran + ptran
    except:
        print(data_paths[i])
    

7th_graders
29
aves-weaver-social
445
bt.csv
692
contacts-prox-high-school-2013
327
contacts-prox-high-school-2013-attr
327
email_company
167
fb-messages
1899
fb-pages-politician
5908
fb-pages-tvshow
3892
fb_friends.csv
800
high_tech_company
21
inf-euroroad
1174
infect-dublin
410
infect-hyper
113
Karate
34
kidnappings
351
kidnappings
KKI
2238
kor.2015.mers.1.00
186
kor.2015.mers.1.00
law_firm
71
moreno_taro
22
physician_trust
241
primary-school-proximity
242
sgp.2003.sars.1.00
172
sgp.2003.sars.1.00
soc-firm-hi-tech
33
soc-hamsterster
2426
swingers
96
swingers
uni_email
1133
usa.2009.flu.1.00
286
usa.2009.flu.1.00
usa.2020.covid.6.00
92
usa.2020.covid.6.00
webkb-wisc
265
Wall time: 14min 22s


In [7]:
big_dat.to_csv("2seedTransivityData1.csv")

# Random Graphs

In [101]:
G_random_BA = []
G_random_configuration = []
G_random_poisson = []

for i, G_empirical in enumerate(G_list):

    print(data_paths[i])

    try:
        G_BA = nx.barabasi_albert_graph(len(G_empirical.nodes()), int(np.average(G_empirical.degree())/2))
        G_random_BA.append(G_BA)

    except:
        print("BA")
        G_random_BA.append(None)

    try:
        G_poisson =  nx.erdos_renyi_graph(len(G_empirical.nodes()), 
                                      np.mean(list(dict(G_empirical.degree()).values()))/(len(G_empirical.nodes()) - 1), seed = 2022)
        largest_component = sorted(nx.connected_components(G_poisson), key=len, reverse=True)
        G_poisson = G_poisson.subgraph(largest_component[0])
        G_random_poisson.append(G_poisson)
    except:
        print("Poisson")
        G_random_poisson.append(None)

    try:
        G_config = nx.configuration_model(list(dict(G_empirical.degree()).values()), seed = 2022)
        largest_component = sorted(nx.connected_components(G_config), key=len, reverse=True)
        G_config = G_config.subgraph(largest_component[0])
        G_random_configuration.append(G_config)
    except:
        print("Configuration")
        G_random_configuration.append(None)



7th_graders
aves-weaver-social
bt.csv
contacts-prox-high-school-2013
contacts-prox-high-school-2013-attr
email_company
fb-messages
fb-pages-politician
fb-pages-tvshow
fb_friends.csv
high_tech_company
inf-euroroad
infect-dublin
infect-hyper
Karate
kidnappings
KKI
kor.2015.mers.1.00
BA
law_firm
moreno_taro
physician_trust
primary-school-proximity
sgp.2003.sars.1.00
BA
soc-firm-hi-tech
soc-hamsterster
swingers
uni_email
usa.2009.flu.1.00
BA
usa.2020.covid.6.00
BA
webkb-wisc


In [None]:
%%time
# Parameters 
nsamp = 10
p = 0.05
big_dat_random_BA = pd.DataFrame({"seeding method": ['random']*nsamp + ['friend']*nsamp + ['pair']*nsamp})
big_dat_random_configuration = pd.DataFrame({"seeding method": ['random']*nsamp + ['friend']*nsamp + ['pair']*nsamp})
big_dat_random_poisson = pd.DataFrame({"seeding method": ['random']*nsamp + ['friend']*nsamp + ['pair']*nsamp})

for i,G in enumerate(G_random_BA): 
    
    print(data_paths[i])
    
    if len(G_list[i].nodes()) < 3500:
        print(nx.number_of_nodes(G_list[i]))
    
        try:
            print("BA")
            (rtran, ftran, ptran) = sim_2seed_transitivity(G_random_BA[i], p = p, nsamp = nsamp)
            big_dat_random_BA["{}_num_nodes:{}".format(data_paths[i],  nx.number_of_nodes(G_random_BA[i]))] = rtran + ftran + ptran
        except:
            print("Failed")
        try:
            print("Configuration")
            (rtran, ftran, ptran) = sim_2seed_transitivity(G_random_configuration[i], p = p, nsamp = nsamp)
            big_dat_random_configuration["{}_num_nodes:{}".format(data_paths[i],  nx.number_of_nodes(G_random_configuration[i]))] = rtran + ftran + ptran
        except:
            print("Failed")
        try:
            print("Poisson")
            (rtran, ftran, ptran) = sim_2seed_transitivity(G_random_poisson[i], p = p, nsamp = nsamp)
            big_dat_random_poisson["{}_num_nodes:{}".format(data_paths[i],  nx.number_of_nodes(G_random_poisson[i]))] = rtran + ftran + ptran
        except:
            print("Failed")


7th_graders
29
BA
Configuration
Failed
Poisson
aves-weaver-social
445
BA
Configuration
Failed
Poisson
bt.csv
692
BA
Configuration
Failed
Poisson
contacts-prox-high-school-2013
327
BA
Configuration
Failed
Poisson
contacts-prox-high-school-2013-attr
327
BA
Configuration
Failed
Poisson
email_company
167
BA
Configuration
Failed
Poisson
fb-messages
1899
BA
Configuration
Failed
Poisson
fb-pages-politician
fb-pages-tvshow
fb_friends.csv
800
BA
Configuration
Failed
Poisson
high_tech_company
21
BA
Configuration
Failed
Poisson
inf-euroroad
1174
BA
Configuration
Failed
Poisson
infect-dublin
410
BA
Configuration
Failed
Poisson
infect-hyper
113
BA
Configuration
Failed
Poisson
Karate
34
BA
Configuration
Failed
Poisson
kidnappings
351
BA
Configuration
Failed
Poisson
KKI
2238
BA


In [10]:
big_dat_random.to_csv("2seedTransivityDataRandom1.csv")

# Calculate Clustering Coefficients of Networks

In [43]:
# individualClustering_empirical = [nx.clustering(G) for G in G_list]
# individualClustering_random = [nx.clustering(G) if G is not None else None for G in G_random]


averageClustering_empirical = [np.mean(list(G.values())) for G in individualClustering_empirical]
averageClustering_random = [np.mean(list(G.values())) if G is not None else None for G in individualClustering_random]

sdClustering_empirical = [np.std(list(G.values())) for G in individualClustering_empirical]
sdClustering_random = [np.std(list(G.values())) if G is not None else None for G in individualClustering_random]

In [47]:
clusteringData = pd.DataFrame([data_paths, 
                               averageClustering_empirical, 
                               averageClustering_random,
                               sdClustering_empirical,
                               sdClustering_random]).transpose()
clusteringData.columns = ["network", "empirical_clust", "random_clust", "empirical_clustSD", "random_clustSD"]
clusteringData
clusteringData.to_csv("clusteringData.csv")

In [50]:
averageClustering_random

[0.4727556472565547,
 0.31565787484542834,
 0.4256024565776378,
 0.35155732423906255,
 0.35401488059421243,
 0.3965018174550641,
 0.314639725201374,
 0.3118757159226849,
 0.31222487932712867,
 0.3186501413354587,
 0.5170087226809915,
 0.3118680844264864,
 0.32490110882890133,
 0.4286979767717517,
 0.38758350453852186,
 0.3087857124263623,
 0.3119081551447534,
 None,
 0.4107647383425519,
 0.3019250951069133,
 0.3358348440665224,
 0.41548201320908923,
 None,
 0.44319660543978023,
 0.31352861391943665,
 0.32859659478054476,
 0.31404789759034185,
 None,
 None,
 0.31600445878076827]

0.7766783097534905

In [42]:
averageClustering_empirical

[0.7766783097534905,
 0.6685047954844804,
 0.5251012948583238,
 0.5035048191728448,
 0.5035048191728448,
 0.591863208548695,
 0.10939892385364362,
 0.38509612579327435,
 0.37373843245973964,
 0.3153509697401213,
 0.8032036811780289,
 0.016731564857629593,
 0.45582424184357156,
 0.5347555956050045,
 0.5409685086155674,
 0.0,
 0.3872007143917601,
 0.0,
 0.5715333938756393,
 0.33939393939393936,
 0.3115751356464073,
 0.5255415410620273,
 0.0,
 0.6705116697267809,
 0.5375333362074076,
 0.0,
 0.22017608650411602,
 0.0,
 0.0,
 0.2080282179610873]