In [1]:
import pandas as pd
import networkx as nx
import ast
from math import log

In [2]:
# Per-vertex training table; later cells read its 'vertex' and 'n' columns.
df = pd.read_csv(filepath_or_buffer='../../data/expanded_train_with_leaf.csv')

In [3]:
# Normalised position: the row's vertex label divided by its 'n' value.
df['pos_norm'] = df['vertex'].div(df['n'])

In [4]:
# Raw training data. The 'edgelist' column is read as text, so each cell is
# parsed back into a Python object with ast.literal_eval.
raw = pd.read_csv(filepath_or_buffer='../../data/train.csv')
raw['edgelist'] = raw['edgelist'].map(ast.literal_eval)

In [5]:
# Attach each sentence's parsed edge list to every per-vertex row.
# - Any 'edgelist' column left over from a previous run of this cell is
#   dropped first, so re-running does not create 'edgelist_x'/'edgelist_y'.
# - validate='many_to_one' makes pandas raise MergeError if `raw` repeats a
#   (language, sentence) key, instead of silently duplicating rows of df.
df = df.drop(columns=['edgelist'], errors='ignore').merge(
    raw[['language','sentence','edgelist']],
    on=['language','sentence'],
    how='left',
    validate='many_to_one',
)

In [6]:
# Compute max_branch_size & subtree_entropy per node
def balance_features(group):
    """Compute per-vertex branch-balance features for one sentence graph.

    The graph is built from the edge list stored in the first row of
    ``group['edgelist']``. For a vertex ``v`` and each neighbour ``nbr``, the
    branch size is the number of nodes in the connected component that
    contains ``nbr`` once the edge ``(v, nbr)`` is removed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group. Must have an ``'edgelist'`` column; if it also has
        a ``'vertex'`` column the output is aligned to it.

    Returns
    -------
    pandas.DataFrame
        Columns ``vertex``, ``max_branch_size`` (largest branch size, 0 when
        the vertex has no neighbours) and ``subtree_entropy``
        (``-sum(p * log2(p))`` with ``p = branch size / number of graph
        nodes``).
        If ``group`` has a ``'vertex'`` column there is exactly one row per
        row of ``group``, in the same order and with the same index; a vertex
        that does not occur in the edge list gets ``0`` and ``0.0``.
        Otherwise there is one row per graph node, in the graph's node order.
    """
    edges = group['edgelist'].iloc[0]
    T = nx.Graph(edges)
    n = T.number_of_nodes()

    # A single scratch copy is cut and restored for every (v, nbr) pair, so
    # the graph is not copied once per neighbour. T itself is never mutated,
    # which keeps iteration over T.nodes() / T[v] safe.
    scratch = T.copy()
    features = {}
    for v in T.nodes():
        sizes = []
        for nbr in T[v]:
            scratch.remove_edge(v, nbr)
            sizes.append(len(nx.node_connected_component(scratch, nbr)))
            scratch.add_edge(v, nbr)
        max_branch = max(sizes) if sizes else 0
        ent = 0.0
        for s in sizes:
            # s >= 1 because the component always contains nbr, so p > 0.
            p = s / n
            ent -= p * log(p, 2)
        features[v] = (max_branch, ent)

    columns = ['vertex', 'max_branch_size', 'subtree_entropy']
    if 'vertex' in group:
        # Look features up by vertex label and keep the group's index, so the
        # result stays tied to the input rows instead of to the order in which
        # nodes happen to appear in the edge list.
        rows = [
            (v, *features.get(v, (0, 0.0)))
            for v in group['vertex'].tolist()
        ]
        return pd.DataFrame(rows, columns=columns, index=group.index)

    rows = [(v, *feat) for v, feat in features.items()]
    return pd.DataFrame(rows, columns=columns)

In [7]:
# apply per (language, sentence)
key_cols = ['language', 'sentence']
node_key = key_cols + ['vertex']

# Tag every feature row with its group key so it can be matched to df by key
# rather than by row position (group order and node order need not follow
# df's row order). Iterating the groupby also hands each group to
# balance_features with all of its columns.
per_node = pd.concat(
    [
        balance_features(grp).assign(language=language, sentence=sentence)
        for (language, sentence), grp in df.groupby(key_cols, sort=False)
    ],
    ignore_index=True,
).drop_duplicates(subset=node_key)  # keeps the merge below one row per df row

# Left-merge onto df's own keys: `bal` has exactly one row per row of df, in
# df's order, and only the two feature columns (no second 'vertex' column),
# so the positional concat in the next cell lines up.
bal = (
    df[node_key]
      .merge(per_node, on=node_key, how='left')
      .loc[:, ['max_branch_size', 'subtree_entropy']]
      .fillna(0)  # vertex absent from its edge list -> no branches
      .astype({'max_branch_size': int, 'subtree_entropy': float})
)

  .apply(balance_features)


In [8]:
# concat
# `bal` is attached by row position, so it must have one row per row of df;
# fail loudly instead of writing shifted features.
if len(bal) != len(df):
    raise ValueError(
        f"bal has {len(bal)} rows but df has {len(df)}; cannot join by position"
    )

# df already has 'vertex'; skipping bal's copy avoids a duplicated column
# label, which would make df[export_cols] export 'vertex' twice.
feature_cols = [c for c in bal.columns if c != 'vertex']
df = pd.concat(
    [
        # Drop feature columns left by an earlier run so re-running this cell
        # does not append them a second time.
        df.drop(columns=feature_cols, errors='ignore').reset_index(drop=True),
        bal[feature_cols].reset_index(drop=True),
    ],
    axis=1,
)

In [9]:
# Columns written to the exported CSV, in output order.
export_cols = (
    # row identifiers
    ['language', 'sentence', 'vertex', 'n']
    # centrality-style columns already present in the input table
    + ['degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank']
    + ['eigenvector', 'katz', 'information', 'current_flow_betweeness']
    + ['percolation', 'second_order', 'laplacian']
    # leaf flag plus the features added in this notebook
    + ['is_leaf', 'pos_norm', 'max_branch_size', 'subtree_entropy']
    # final column
    + ['is_root']
)

In [10]:
# Write the selected columns to disk without the index, then confirm.
output_path = '../../data/expanded_with_features_non-linear.csv'
df.loc[:, export_cols].to_csv(output_path, index=False)
print("Exported features")

Exported features
