# Preprocessing with more NetworkX features

This experiment computes most of the node-level features available in NetworkX. The resulting data was tested in our main linear and non-linear models as well as in some of the experimental models, but it did not lead to a significant improvement: the features preselected in the main preprocessing step already gave the best separability. Thus, only the data-generation part of this experiment remains.

In [10]:
import pandas as pd
import ast
import networkx as nx
from tqdm import tqdm

In [11]:

def centralities(edgelist):
    """
    Compute per-node NetworkX metrics for the graph described by ``edgelist``.

    Parameters
    ----------
    edgelist : iterable of edges
        Passed unchanged to ``nx.from_edgelist`` to build the graph.

    Returns
    -------
    dict
        ``{node: {metric_name: value}}`` with one entry per node of the graph.
        Each inner dict holds the 21 metric names listed in ``per_metric``
        below, in that order.

    NOTE(review): some of the NetworkX routines called here (current-flow
    measures, eccentricity, ...) presumably require a connected graph --
    confirm that every input edge list satisfies this.
    """
    graph = nx.from_edgelist(edgelist)

    # One node -> value mapping per metric. The insertion order of this table
    # is both the order in which the metrics are evaluated and the order of
    # the keys in every per-node result.
    per_metric = {
        # Core centralities
        'degree_centrality':             nx.degree_centrality(graph),
        'closeness_centrality':          nx.closeness_centrality(graph),
        'current_flow_closeness':        nx.current_flow_closeness_centrality(graph),
        'harmonic_centrality':           nx.harmonic_centrality(graph),
        # NOTE(review): seed presumably only matters when sampling with k -- confirm.
        'betweenness_centrality':        nx.betweenness_centrality(graph, seed=1),
        'communicability_betweenness':   nx.communicability_betweenness_centrality(graph),
        'current_flow_betweenness':      nx.current_flow_betweenness_centrality(graph),
        'load_centrality':               nx.load_centrality(graph),
        'pagerank':                      nx.pagerank(graph),
        'eigenvector_centrality':        nx.eigenvector_centrality_numpy(graph),
        'katz_centrality':               nx.katz_centrality_numpy(graph),
        'information_centrality':        nx.information_centrality(graph),
        'subgraph_centrality':           nx.subgraph_centrality(graph),
        'percolation_centrality':        nx.percolation_centrality(graph),
        'second_order_centrality':       nx.second_order_centrality(graph),
        'laplacian_centrality':          nx.laplacian_centrality(graph),
        # Local clustering / core / triangle metrics
        'clustering_coefficient':        nx.clustering(graph),
        'square_clustering':             nx.square_clustering(graph),
        'triangles':                     nx.triangles(graph),
        'core_number':                   nx.core_number(graph),
        'eccentricity':                  nx.eccentricity(graph),
    }

    # Pivot metric -> node -> value into node -> metric -> value.
    return {
        node: {name: scores[node] for name, scores in per_metric.items()}
        for node in graph
    }

In [12]:
# Read the training table and turn the 'edgelist' column from its string
# representation back into Python objects. Done as a single chain so that
# re-running the cell never tries to parse an already-parsed column.
train_df = (
    pd.read_csv('../../data/train.csv')
      .assign(edgelist=lambda frame: frame['edgelist'].apply(ast.literal_eval))
)

In [13]:
# Expand every training graph into one output row per vertex: the graph-level
# columns are repeated, followed by the vertex id, its metrics from
# `centralities`, and a 0/1 flag marking whether the vertex equals 'root'.
rows = []
for _, record in tqdm(train_df.iterrows(), total=len(train_df), desc="Expanding train"):
    node_metrics = centralities(record['edgelist'])
    root_vertex = record['root']
    # Columns copied unchanged onto every vertex row of this graph.
    shared_cols = {col: record[col] for col in ('id', 'language', 'sentence', 'n')}
    rows.extend(
        {
            **shared_cols,
            'vertex': vertex,
            **metrics,
            'is_root': int(vertex == root_vertex),
        }
        for vertex, metrics in node_metrics.items()
    )

Expanding train: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 10500/10500 [06:31<00:00, 26.81it/s]


In [16]:
# Materialise the accumulated per-vertex rows as a table and write it to disk
# without the positional index.
expanded_train = pd.DataFrame(data=rows)
expanded_train.to_csv(
    path_or_buf='../../data/expanded_train_all_feats.csv',
    index=False,
)

In [25]:
# Read the test table and turn the 'edgelist' column from its string
# representation back into Python objects. Done as a single chain so that
# re-running the cell never tries to parse an already-parsed column.
test_df = (
    pd.read_csv('../../data/test.csv')
      .assign(edgelist=lambda frame: frame['edgelist'].apply(ast.literal_eval))
)

In [26]:
# Expand every test graph into one output row per vertex: the graph-level
# columns are repeated, followed by the vertex id and its metrics from
# `centralities`. No 'is_root' column is written here.
rows = []
for _, record in tqdm(test_df.iterrows(), total=len(test_df), desc="Expanding test"):
    node_metrics = centralities(record['edgelist'])
    # Columns copied unchanged onto every vertex row of this graph.
    shared_cols = {col: record[col] for col in ('id', 'language', 'sentence', 'n')}
    rows.extend(
        {**shared_cols, 'vertex': vertex, **metrics}
        for vertex, metrics in node_metrics.items()
    )

Expanding test: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10395/10395 [26:35<00:00,  6.52it/s]


In [27]:
# Materialise the accumulated per-vertex rows as a table and write it to disk
# without the positional index.
expanded_test = pd.DataFrame(data=rows)
expanded_test.to_csv(
    path_or_buf='../../data/expanded_test_all_feats.csv',
    index=False,
)