# Percolation analysis

* read a network
* drop the links
* add links based on some measure (for example, link weight)

* we measure on scale (0-1) how quickly they make a one complete component
* Percolation = |N_LCC|/|N|


In [1]:
import pandas as pd 
import networkx as nx
import os
import random
from random import sample

def read_graph(N):
    G = nx.Graph()
    return nx.read_gml(f'Networks/{N}.gml')

names = [ 'dolphins',
          'polbooks',
          'word_adjacencies',
          'arenas-email',
             'Karate',
             'Erdos Renyi',
             'circuits s208',
             'circuits s420',
             'circuits s838',
             'E. coli',
             'Barabasi_albert_graph',
             'facebook 0',
             'facebook 107',
             'facebook 348',
             'facebook 414',
             'facebook 686',
             'facebook 1684',
             'bn-macaque-rhesus_brain_2',
             'soc-tribes',
             'fb-pages-food',
             'bn-cat-mixed-species_brain_1',
             'soc-firm-hi-tech']


# read the networks
networks = []
for N in names:
    network = {}
    network['name'] = N
    network['graph'] = read_graph(N)
    networks.append(network)
networks

standard networks dataset\dolphins\dolphins.gml
standard networks dataset\polbooks\out2.txt
standard networks dataset\word_adjacencies.gml\word_adjacencies.gml
standard networks dataset\arenas-email\out2.txt
standard networks datasetKarate
standard networks datasetErdos Renyi
standard networks dataset\USAir97\USAir97.mtx
standard networks dataset\circuits\s208_st.txt
standard networks dataset\circuits\s420_st.txt
standard networks dataset\circuits\s838_st.txt
standard networks dataset\E. Coli\E. Coli.txt
standard networks datasetBarabasi_albert_graph
standard networks dataset\facebook\0.edges
standard networks dataset\facebook\107.edges
standard networks dataset\facebook\348.edges
standard networks dataset\facebook\414.edges
standard networks dataset\facebook\686.edges
standard networks dataset\facebook\1684.edges
standard networks dataset\bio-celegans\bio-celegans.mtx
standard networks dataset\bn-macaque-rhesus_brain_2\bn-macaque-rhesus_brain_2.txt
standard networks dataset\soc-tribes

In [4]:
nodes = [len(networks[n].nodes()) for n in networks]
edges = [len(networks[n].edges()) for n in networks]

pd.DataFrame({'$|N|$': nodes, '$|E|$': edges})

Unnamed: 0,$|N|$,$|E|$
0,62,159
1,190,441
2,112,425
3,1893,5451
4,34,78
5,500,1500
6,122,189
7,252,399
8,512,819
9,1699,3758


In [5]:
def weighted_edges(G, C):
    '''return a weighted edges'''
    W = []
    for u,v in G.edges():
        W.append([u, v, C[u]*C[v]])
    return sorted(W, key=lambda x: x[2])

def batch_list(lst):
    """
    Divide a list into batches of an equal number of items (as close to 50 as possible).
    """
    batch_size = (len(lst) + 49) // 50  # Calculate the batch size
    num_batches = (len(lst) + batch_size - 1) // batch_size
    batches = [lst[i*batch_size:(i+1)*batch_size] for i in range(num_batches)]
    return batches

def simulation(centr):
    results = []
    for network in networks.keys():
        print(network)
        G0 = networks[network]
        bc_G0 = centr(G0)
        W = weighted_edges(G0, bc_G0)
        G = nx.Graph()
        G.add_nodes_from(G0.nodes())
        batches = batch_list(W)

        result = []
        for b in range(len(batches)):
            for u,v,_ in batches[b]:
                G.add_edge(u, v)
            largest_component = G.subgraph(max(nx.connected_components(G), key=len))
            result.append([b, len(largest_component)/len(G)])
        results.append(result)
    return results


In [6]:
def plot(results, title):
    import matplotlib.pyplot as plt
    fig = plt.figure(dpi=600)
    fig, ax = plt.subplots()
    markers = ['+', 'x', 'o', 's', 'd', 'D', '*'] # Add your desired markers here
    for d in range(len(results)):
        data = results[d]
        x = [item[0] for item in data]
        y = [item[1] for item in data]
        marker_idx = d % len(markers) # Choose marker based on index of the result
        ax.plot(x, y, marker=markers[marker_idx], linewidth=0.5, markersize=3, label=name[d])

    ax.set_xlabel('edges')
    ax.set_ylabel(r'$|N_{LCC}| / |N| $')
    ax.set_title(f'{title}-based weighted edges')
    plt.legend()
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


In [9]:
def properties(G):
    GCC = nx.transitivity(G)
#     ACC = nx.average_clustering(G)
    d = nx.density(G)
    r = nx.degree_assortativity_coefficient(G)    
    lcg = sorted(nx.connected_components(G), key=len, reverse=True)
    LCG = G.subgraph(lcg[0])    
    ASP = nx.average_shortest_path_length(LCG)
    diam = nx.diameter(LCG)

    communities = greedy_modularity_communities(G)
    mod = nx.community.modularity(G, communities)
    eff = round(nx.global_efficiency(G), 12)
    return  GCC, 0, d, r, ASP, diam, mod, eff

In [10]:
network_properties = [properties(networks[g]) for g in networks]

In [12]:
GCCs = [i for i, _,_,_,_,_,_,_ in network_properties]
ACCs = [i for _, i,_,_,_,_,_,_ in network_properties]
ds   = [i for _, _,i,_,_,_,_,_ in network_properties]
rs   = [i for _, _,_,i,_,_,_,_ in network_properties]
ASPs = [i for _, _,_,_,i,_,_,_ in network_properties]
diam = [i for _, _,_,_,_,i,_,_ in network_properties]
mod = [i for _, _,_,_,_,_,i,_ in network_properties]
eff  = [i for _, _,_,_,_,_,_,i in network_properties]

df2 = pd.DataFrame({'Networks': name, 'GCC': GCCs, 'Density': ds, 'r': rs, 'Diameter': diam, 'M': mod, 'e' : eff}) 
df2.to_csv('Data/networks properties.csv', index=False)

In [15]:
df2 = pd.read_csv('Data/networks properties.csv')
print(df2)

numeric_cols = df2.select_dtypes(include='number').columns
df_quartiles = df2[numeric_cols].apply(lambda x: pd.qcut(x.dropna(), q=[0, 0.25, 0.5, 0.75, 1.0], labels=[0.25,0.5,0.75,1]) if x.dtype != object else x)
df_quartiles['Networks'] = df2['Networks']
df_quartiles

                        Networks       GCC   Density         r  Diameter  \
0                       dolphins  0.308776  0.084082 -0.043594         8   
1                       polbooks  0.000000  0.024561 -0.252596        10   
2               word_adjacencies  0.156935  0.068372 -0.129348         5   
3                   arenas-email  0.000000  0.003044 -0.130757        13   
4                         Karate  0.255682  0.139037 -0.475613         5   
5                    Erdos Renyi  0.011938  0.012024  0.002105         8   
6               circuits s208_st  0.057361  0.025606 -0.002013        11   
7               circuits s420_st  0.051680  0.012616 -0.005911        13   
8               circuits s838_st  0.048368  0.006261 -0.030017        15   
9                        E. Coli  0.000000  0.002605 -0.338126        10   
10         Barabasi_albert_graph  0.026599  0.011952 -0.086755         5   
11                     facebook0  0.425869  0.045570  0.236039        11   
12          

Unnamed: 0,GCC,Density,r,Diameter,M,e,Networks
0,0.75,0.75,0.5,0.5,0.75,0.5,dolphins
1,0.25,0.5,0.25,0.75,1.0,0.25,polbooks
2,0.75,0.75,0.5,0.25,0.25,0.75,word_adjacencies
3,0.25,0.25,0.25,1.0,1.0,0.25,arenas-email
4,0.75,1.0,0.25,0.25,0.5,1.0,Karate
5,0.5,0.25,0.75,0.5,0.5,0.5,Erdos Renyi
6,0.5,0.5,0.75,1.0,1.0,0.25,circuits s208_st
7,0.5,0.5,0.75,1.0,1.0,0.25,circuits s420_st
8,0.5,0.25,0.75,1.0,1.0,0.25,circuits s838_st
9,0.25,0.25,0.25,0.75,1.0,0.25,E. Coli


In [18]:

medians = df1.median()
print(medians)
for column in df1.columns :
    if column!= 'Networks':
        median = medians[column]  # Retrieve the median for the column
        df1[column] = (df1[column] < median).astype(int)


Degree         0.5
Betweenness    0.5
Closeness      0.5
Clustering     0.0
Random         0.0
Inverted PA    0.0
dtype: float64


  medians = df1.median()


In [20]:
df1 = pd.read_csv('Data/R.csv')

Dataset = pd.merge(df_quartiles, df1, on='Networks')
Dataset = Dataset.reindex(columns = ['Networks', 'GCC', 'M', 'Density', 'r', 'e', 'Diameter', 'Degree',
       'Betweenness', 'Closeness', 'Clustering', 'Random', 'Inverted PA'])
Dataset.to_csv('Data/vulnerability output.csv', index=False)
Dataset

Unnamed: 0,Networks,GCC,M,Density,r,e,Diameter,Degree,Betweenness,Closeness,Clustering,Random,Inverted PA
0,dolphins,0.75,0.75,0.75,0.5,0.5,0.5,1,1,1,1,1,1
1,polbooks,0.25,1.0,0.5,0.25,0.25,0.75,0,0,0,0,0,0
2,word_adjacencies,0.75,0.25,0.75,0.5,0.75,0.25,1,1,1,1,1,1
3,arenas-email,0.25,1.0,0.25,0.25,0.25,1.0,0,1,0,0,0,0
4,Karate,0.75,0.5,1.0,0.25,1.0,0.25,1,1,1,1,1,1
5,Erdos Renyi,0.5,0.5,0.25,0.75,0.5,0.5,0,1,0,0,1,0
6,circuits s208_st,0.5,1.0,0.5,0.75,0.25,1.0,0,1,0,1,1,1
7,circuits s420_st,0.5,1.0,0.5,0.75,0.25,1.0,0,0,0,0,0,0
8,circuits s838_st,0.5,1.0,0.25,0.75,0.25,1.0,0,0,0,1,1,0
9,E. Coli,0.25,1.0,0.25,0.25,0.25,0.75,0,0,0,0,0,0


In [23]:
from scipy.signal import savgol_filter
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('Data/vulnerability output.csv')
oldR = {}
X = data.loc[:, ['GCC', 'M', 'Density', 'r', 'Diameter', 'e']]
for c in [ 'Degree', 'Betweenness', 'Closeness', 'Clustering', 'Random', 'Inverted PA']:
    Y = data[c]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=19)
    oldR[c] = pls_da1(X_train, y_train, X_test).tolist()

oldR


{'Degree': [[0.05995933753391198],
  [-0.12477738914710855],
  [0.1551630288425342],
  [-0.09863093490684777],
  [-0.027317382280960888],
  [0.13348452722438384]],
 'Betweenness': [[-0.08025546041626173],
  [-0.09769551134120143],
  [-0.018301232674523066],
  [-0.12699453809988978],
  [-0.13426144910824434],
  [0.07433977929146635]],
 'Closeness': [[0.26283991332385276],
  [-0.019797713417283522],
  [0.12087058949024736],
  [0.10051921144536435],
  [-0.0072009139738630715],
  [0.0286475738612056]],
 'Clustering': [[0.0422321496081812],
  [-0.030319163258991238],
  [0.058725747612403185],
  [-0.11954616909061516],
  [-0.030681568069593938],
  [0.05356041628237219]],
 'Random': [[0.01729892484273604],
  [-0.08583580188351589],
  [-0.03698584843842086],
  [-0.0458083464008918],
  [-0.0649561487434998],
  [0.04160684725508881]],
 'Inverted PA': [[0.2536268730615397],
  [-0.022133098855755925],
  [-0.0595075654484576],
  [-0.21883406213253181],
  [-0.2796648098469584],
  [-0.123918907051552

In [25]:
R = {c: [i[0] for i in oldR[c]] for c in oldR}
R['Measures'] = ['GCC', 'M', 'Density', 'r', 'e', 'Diameter']
result = pd.DataFrame(R)
result.to_csv('Data/final.csv', index=False)

In [26]:
result

Unnamed: 0,Degree,Betweenness,Closeness,Clustering,Random,Inverted PA,Measures
0,0.059959,-0.080255,0.26284,0.042232,0.017299,0.253627,GCC
1,-0.124777,-0.097696,-0.019798,-0.030319,-0.085836,-0.022133,M
2,0.155163,-0.018301,0.120871,0.058726,-0.036986,-0.059508,Density
3,-0.098631,-0.126995,0.100519,-0.119546,-0.045808,-0.218834,r
4,-0.027317,-0.134261,-0.007201,-0.030682,-0.064956,-0.279665,e
5,0.133485,0.07434,0.028648,0.05356,0.041607,-0.123919,Diameter
