### 1. import the modules


In [7]:
from torch_geometric.datasets import TUDataset
import pandas as pd
import numpy as np

from gtda.homology import VietorisRipsPersistence
import networkx as nx
from gtda.diagrams import BettiCurve

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

from scipy.stats import rankdata
from itertools import combinations


### 2. load and prep the data

Each row of a given file is a graph, with:
- edge_index (list: 2 x #edges): pairs of nodes constituting edges
- edge_attr (list: #edges x #edge-features): for the aforementioned edges, contains their features
- y (list: 1 x #labels): contains the number of labels available to predict (here 1, equal to zero or one)

here, each entry in dataset, is a graph AND NOT tabular data


In [8]:
dataset = TUDataset(root="data/TUDataset", name="MUTAG")
print (dataset[0])


Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])


### 3. Forman-Ricci curvature computation for edge filtration
- Forman-Ricci curvature captures the local geometry around each edge
- We'll use these curvature values to create distance matrices for edge-based filtration
- Higher curvature indicates well-connected regions, lower curvature indicates bridges/bottlenecks


In [9]:
def compute_forman_ricci_curvature(G):
    """
    Compute Forman-Ricci curvature for each edge in the graph.
    
    Formula: Ric_F(u,v) = 4 - deg(u) - deg(v) + 3*triangles(u,v) + 2*quadrangles(u,v)
    For simplicity, we'll use a simplified version focusing on degrees and triangles.
    """
    curvatures = {}
    
    for u, v in G.edges():
        deg_u = G.degree(u)
        deg_v = G.degree(v)
        
        # Count triangles containing edge (u,v)
        common_neighbors = set(G.neighbors(u)) & set(G.neighbors(v))
        triangles = len(common_neighbors)
        
        # Simplified Forman-Ricci curvature (focusing on degrees and triangles)
        curvature = 4 - deg_u - deg_v + 3 * triangles
        curvatures[(u, v)] = curvature
    
    return curvatures

def create_edge_distance_matrix(G, curvatures):
    """
    Create a distance matrix based on edge curvature differences.
    We'll map edges to nodes in a new graph where edges become nodes.
    """
    edges = list(G.edges())
    n_edges = len(edges)
    
    if n_edges == 0:
        return np.zeros((1, 1))
    
    # Create distance matrix based on curvature differences
    dist_matrix = np.zeros((n_edges, n_edges))
    
    for i, edge1 in enumerate(edges):
        for j, edge2 in enumerate(edges):
            if i == j:
                dist_matrix[i, j] = 0
            else:
                curv1 = curvatures.get(edge1, curvatures.get((edge1[1], edge1[0]), 0))
                curv2 = curvatures.get(edge2, curvatures.get((edge2[1], edge2[0]), 0))
                dist_matrix[i, j] = abs(curv1 - curv2)
    
    return dist_matrix

VR = VietorisRipsPersistence(metric="precomputed", homology_dimensions=[0,1], n_jobs=-1)

graph_distance_matrices = []
y = []
for data in dataset:
    G = nx.Graph()
    edges = data.edge_index.numpy().T
    G.add_edges_from(edges)
    y.append(data.y.item())

    # Compute Forman-Ricci curvature for all edges
    curvatures = compute_forman_ricci_curvature(G)
    
    # Create distance matrix based on edge curvature differences
    dist_matrix = create_edge_distance_matrix(G, curvatures)
    
    graph_distance_matrices.append(dist_matrix)


### 4. perform persistent homology based on edge curvature filtration


In [10]:
dgms = VR.fit_transform(graph_distance_matrices)

# Single Betti curve computation for faster processing
BC = BettiCurve(n_bins=100)
X = BC.fit_transform(dgms).reshape(len(dgms), -1)
y = np.array(y)


### 5. hyperparam tuning & training the classifier


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
param_grid = {
    "max_depth":[3,5,7],
    "booster":['gbtree', 'dart'],
    "learning_rate": [0.01, 0.1, 0.2]
}

grid = GridSearchCV(XGBClassifier(), param_grid, cv = 5, scoring = 'accuracy')
grid.fit(X_train, y_train)
print("Best params:\n", grid.best_params_)


Best params:
 {'booster': 'gbtree', 'eta': 0.3, 'learning_rate': 0.01, 'max_depth': 5, 'tree_method': 'auto'}


In [20]:
clf = XGBClassifier(
    booster='gbtree', 
    learning_rate=0.1, 
    max_depth=3, 
    n_estimators=200, 
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print("XGB accuracy: ", accuracy_score(y_test, y_pred))
print("Mean cross validation score: ", scores.mean())


XGB accuracy:  0.631578947368421
Mean cross validation score:  0.7023391812865498
