### 1. import the modules


In [1]:
from torch_geometric.datasets import TUDataset
import pandas as pd
import numpy as np

from gtda.homology import VietorisRipsPersistence
import networkx as nx
from gtda.diagrams import BettiCurve

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

from scipy.stats import rankdata


### 2. load and prep the data

Each row of a given file is a graph, with:
- edge_index (list: 2 x #edges): pairs of nodes constituting edges
- edge_attr (list: #edges x #edge-features): for the aforementioned edges, contains their features
- y (list: 1 x #labels): contains the number of labels available to predict (here 1, equal to zero or one)

here, each entry in dataset, is a graph AND NOT tabular data


In [2]:
dataset = TUDataset(root="data/TUDataset", name="MUTAG")
print (dataset[0])


Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])


### 3. initialize a constructor for simplicial complexes
- homology_dimensions decide how many betti vectors we want to compute, for now, we'll stick to Betti-0 and Betti-1
- we will also compute distance matrix based on differences in atomic numbers


In [4]:
VR = VietorisRipsPersistence(metric="precomputed", homology_dimensions=[0,1], n_jobs=-1)

# Mapping from MUTAG node labels to atomic numbers
# 0=C(6), 1=N(7), 2=O(8), 3=F(9), 4=I(53), 5=Cl(17), 6=Br(79)
atomic_number_map = {0: 6, 1: 7, 2: 8, 3: 9, 4: 53, 5: 17, 6: 79}

graph_distance_matrices = []
y = []
for data in dataset:
    G = nx.Graph()
    edges = data.edge_index.numpy().T
    G.add_edges_from(edges)
    y.append(data.y.item())

    # Get node labels (atomic types) and map to atomic numbers
    node_labels = data.x[:, 0].numpy()  # First column contains node labels
    atomic_numbers = np.array([atomic_number_map[label] for label in node_labels])

    # Create distance matrix based on differences in atomic numbers
    dist_matrix = np.abs(atomic_numbers[:, None] - atomic_numbers[None, :])

    graph_distance_matrices.append(dist_matrix)


### 4. perform persistent homology
- convert graphs to filtrations (nested sequence of simplicial complexes)
- calculate persistence

### computing Betti Vectors and storing them in  a dataframe 
- betti_features essentially stores a numpy array, where each row corresponds to the betti-0 and betti-1 for each graph
- number of intervals/testing spaces in each betti vector is the count of n_bins from min birth to max death
- betti vectorization renders fixed set data, that can be converted into tabular form that is needed for a pd datafram


In [5]:
dgms = VR.fit_transform(graph_distance_matrices)

# Single Betti curve computation for faster processing
BC = BettiCurve(n_bins=100)
X = BC.fit_transform(dgms).reshape(len(dgms), -1)
y = np.array(y)


### 5. hyperparam tuning & training the classifier


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
param_grid = {
    "max_depth":[5, 10, 20],
    "booster":['gbtree', 'dart'],
    #"learning_rate": [0.01, 0.1, 0.2], 
    "eta": [0.3, 0.5, 0.7, 0.9],
    "tree_method": ["auto", "exact", "approx"]
}

grid = GridSearchCV(XGBClassifier(), param_grid, cv = 5, scoring = 'accuracy')
grid.fit(X_train, y_train)
print("Best params:\n", grid.best_params_)


Best params:
 {'booster': 'gbtree', 'eta': 0.3, 'max_depth': 5, 'tree_method': 'auto'}


In [8]:
clf = XGBClassifier(
    booster='gbtree', 
    #learning_rate=0.1, 
    max_depth=5, 
    #n_estimators=200, 
    #subsample=0.8,
    #colsample_bytree=0.8,
    eta=0.3,
    tree_method= 'auto'
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print("XGB accuracy: ", accuracy_score(y_test, y_pred))
print("Mean cross validation score: ", scores.mean())


XGB accuracy:  0.6842105263157895
Mean cross validation score:  0.6649122807017545
