### 1. import the modules

In [8]:
from torch_geometric.datasets import TUDataset
import pandas as pd
import numpy as np

from gtda.homology import VietorisRipsPersistence
import networkx as nx
from gtda.diagrams import BettiCurve

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

### 2. load and prep the data

Each row of a given file is a graph, with:
- edge_index (list: 2 x #edges): pairs of nodes constituting edges
- edge_attr (list: #edges x #edge-features): for the aforementioned edges, contains their features
- y (list: 1 x #labels): contains the label(s) to predict for the graph (here a single binary label, equal to zero or one)

Here, each entry in the dataset is a graph, not a row of tabular data.

In [9]:
# Load the MUTAG graph collection (rooted at data/TUDataset) and print the
# first graph to show the tensor shapes every entry carries.
dataset = TUDataset(name="MUTAG", root="data/TUDataset")
print(dataset[0])

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])


### 3. initialize a constructor for simplicial complexes
- `homology_dimensions` sets which homology dimensions (and hence which Betti vectors) are computed; for now, we'll stick to Betti-0 and Betti-1
- for each graph, we will also compute a distance matrix whose (i, j) entry is the absolute difference between the degrees of nodes i and j

In [10]:
# Vietoris-Rips persistence over precomputed distance matrices; homology
# dimensions 0 and 1 yield the Betti-0 and Betti-1 information used later.
VR = VietorisRipsPersistence(metric="precomputed", homology_dimensions=[0,1])

graph_distance_matrices = []  # one (n_nodes x n_nodes) matrix per graph
y = []                        # one class label per graph, in dataset order
for data in dataset:
    G = nx.Graph()
    # Register every node before adding edges: a graph built from edge_index
    # alone silently drops isolated nodes, which would shrink (or empty) the
    # distance matrix for that graph.
    G.add_nodes_from(range(data.num_nodes))
    # edge_index is 2 x #edges; transpose to get one (u, v) pair per row.
    G.add_edges_from(data.edge_index.numpy().T.tolist())
    y.append(data.y.item())

    # Degree of each node, in G's node order (0 .. num_nodes - 1).
    values = np.array([degree for _, degree in G.degree()])

    # Pairwise |deg(i) - deg(j)| via broadcasting: symmetric, zero diagonal.
    dist_matrix = np.abs(values[:, None] - values[None, :])

    graph_distance_matrices.append(dist_matrix)


### 4. perform persistent homology
- convert graphs to filtrations (nested sequence of simplicial complexes)
- calculate persistence

### computing Betti vectors and storing them in a dataframe
- betti_features essentially stores a numpy array, where each row corresponds to the betti-0 and betti-1 for each graph
- each Betti vector has `n_bins` entries, one per sampled filtration value between the minimum birth and the maximum death
- Betti vectorization yields fixed-length feature vectors, which can be arranged in the tabular form needed for a pandas DataFrame

In [11]:
# Persistence diagrams, one per graph, from the degree-difference matrices.
dgms = VR.fit_transform(graph_distance_matrices)

# Turn every diagram into Betti curves sampled on 100 bins.
# NOTE(review): the vectorizer is fitted on all graphs, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
BC = BettiCurve(n_bins=100)
betti_features = BC.fit_transform(dgms)

# Labels as an array; each graph's curves flattened into one feature row.
y = np.asarray(y)
X = pd.DataFrame(betti_features.reshape(betti_features.shape[0], -1))

### 5. hyperparam tuning & training the classifier

In [12]:
# Hold out 20% of the graphs for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Candidate hyperparameters; every combination is tried.
param_grid = dict(
    max_depth=[3, 5, 7],
    booster=["gbtree", "dart"],
    learning_rate=[0.01, 0.1, 0.2],
)

# 5-fold cross-validated search on the training split, scored by accuracy.
grid = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid.fit(X_train, y_train)
print("Best params:\n", grid.best_params_)

Best params:
 {'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 3}


In [13]:
# Build the final classifier from the grid search's winning hyperparameters
# instead of retyping them by hand, so this cell cannot go stale when the
# grid, the data, or the split changes.
clf = XGBClassifier(**grid.best_params_)
clf.fit(X_train, y_train)

# Accuracy on the held-out 20% split, which the grid search never saw.
y_pred = clf.predict(X_test)
print("XGB accuracy: ", accuracy_score(y_test, y_pred))

XGB accuracy:  0.6842105263157895
