### 1. import the modules

In [4]:
from torch_geometric.datasets import TUDataset
import pandas as pd
import numpy as np

from gtda.homology import VietorisRipsPersistence
import networkx as nx
from gtda.diagrams import BettiCurve

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from scipy.stats import rankdata

### 2. load and prep the data

Each row of a given file is a graph, with:
- edge_index (list: 2 x #edges): pairs of nodes constituting edges
- edge_attr (list: #edges x #edge-features): for the aforementioned edges, contains their features
- y (list: 1 x #labels): contains the label(s) to predict for the graph (here a single binary label, equal to zero or one)

Here, each entry in the dataset is a graph, NOT a row of tabular data.

In [5]:
# Load the MUTAG graph dataset from the TUDataset collection (stored under data/TUDataset).
dataset = TUDataset(name="MUTAG", root="data/TUDataset")

# Print the first entry as a quick sanity check of what one graph looks like.
first_graph = dataset[0]
print(first_graph)

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])


### 3. initialize a constructor for simplicial complexes
- homology_dimensions decides which Betti vectors we compute; for now, we'll stick to Betti-0 and Betti-1
- we will also compute, for each graph, a distance matrix based on the absolute differences between the ranks of its nodes' degree centralities

In [6]:
# Vietoris-Rips persistence over precomputed distance matrices, for H0 and H1.
VR = VietorisRipsPersistence(metric="precomputed", homology_dimensions=[0, 1])


def degree_rank_distance_matrix(data):
    """Build the node-by-node distance matrix of one graph.

    The distance between two nodes is the absolute difference between the
    ranks (ties averaged) of their degree centralities.

    Parameters
    ----------
    data : graph entry of the dataset; must expose ``edge_index`` (2 x #edges)
        and ``num_nodes``.

    Returns
    -------
    numpy.ndarray of shape (#nodes, #nodes), symmetric with a zero diagonal.
    """
    G = nx.Graph()
    # Register every node up front: building the graph from the edge list
    # alone would silently drop nodes that have no incident edge.
    G.add_nodes_from(range(data.num_nodes))
    G.add_edges_from(map(tuple, data.edge_index.t().tolist()))

    # degree dictionary
    deg_central_dict = nx.degree_centrality(G)
    values = np.array([deg_central_dict[node] for node in G.nodes()])

    # converting degree values into ranks
    ranks = rankdata(values, method="average")

    return np.abs(ranks[:, None] - ranks[None, :])


graph_distance_matrices = []
y = []
for data in dataset:
    graph_distance_matrices.append(degree_rank_distance_matrix(data))
    y.append(data.y.item())

### 4. perform persistent homology
- convert graphs to filtrations (nested sequence of simplicial complexes)
- calculate persistence

### computing Betti vectors and storing them in a feature matrix
- the feature matrix `X` is a numpy array, where each row holds the betti-0 and betti-1 curves of one graph
- the number of intervals/sample points in each betti vector is given by n_bins, spread from min birth to max death
- betti vectorization yields fixed-size data that can be converted into the tabular form needed for a pd DataFrame

In [7]:
# Persistence diagrams for every graph's distance matrix.
dgms = VR.fit_transform(graph_distance_matrices)

# One Betti-curve vectoriser per resolution.
BC_50, BC_100, BC_200 = (BettiCurve(n_bins=bins) for bins in (50, 100, 200))

# Vectorise the diagrams at each resolution.
betti_50, betti_100, betti_200 = (
    curve.fit_transform(dgms) for curve in (BC_50, BC_100, BC_200)
)

# Flatten each graph's curves to a single row and place the three
# resolutions side by side (50-bin, 100-bin, 200-bin).
X = np.hstack(
    [curves.reshape(len(curves), -1) for curves in (betti_50, betti_100, betti_200)]
)
y = np.array(y)

### 5. hyperparam tuning & training the classifier

In [8]:
# Hold out 20% of the graphs; this test split must stay unseen until the
# final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Candidate hyperparameters sampled by the randomized search.
param_grid = {
    "hidden_layer_sizes": [(100,), (150,)],
    "activation": ["relu", "tanh"],
    "solver": ["adam"],
    "learning_rate_init": [0.01, 0.05],
    "max_iter": [500]
}

clf = MLPClassifier(random_state=42)
search = RandomizedSearchCV(
    clf, 
    param_distributions = param_grid,
    n_iter = 5,
    cv = 5,
    scoring = "accuracy",
    random_state = 42,
    n_jobs = -1
)

# Tune on the training split only. Fitting the search on all of (X, y) would
# let the held-out test graphs influence the selected hyperparameters and
# make the later test accuracy optimistically biased.
search.fit(X_train, y_train)
print("Best params:\n", search.best_params_)

Best params:
 {'solver': 'adam', 'max_iter': 500, 'learning_rate_init': 0.01, 'hidden_layer_sizes': (150,), 'activation': 'relu'}


In [9]:
# Final model: an MLP configured with the hyperparameters selected by the
# randomized search (taken from `search.best_params_` so that they cannot
# drift from what the search actually found), with a fixed seed.
clf = MLPClassifier(random_state=42, **search.best_params_)
clf.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = clf.predict(X_test)

# Additional 10-fold cross-validated accuracy over the whole dataset.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')

# The classifier is an MLP, so label the metric accordingly.
print("MLP test accuracy: ", accuracy_score(y_test, y_pred))
print("Mean cross validation score: ", scores.mean())

XGB accuracy:  0.8947368421052632
Mean cross validation score:  0.8453216374269005
