In [None]:
import glob
import pandas as pd
import ast
import os
import networkx as nx
import numpy as np
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import scipy.sparse as sp

In [None]:
!pip install xgboost

In [None]:
# Read the graph files: 'Baseline/secondcistern' is labelled 0 and
# 'Controls/secondcistern' is labelled 1.
# glob returns paths in an arbitrary, filesystem-dependent order; sorting fixes
# the sample order so the seeded splits in later cells are reproducible.

all_baseline_files = sorted(glob.glob(os.path.join('Baseline/secondcistern', "*.gml")))
label_baseline = len(all_baseline_files) * [0]

all_followup_files = sorted(glob.glob(os.path.join('Controls/secondcistern', "*.gml")))
label_followup = len(all_followup_files) * [1]

all_files = all_baseline_files + all_followup_files
all_labels = label_baseline + label_followup

# load graphs (one graph per file, in the same order as all_files / all_labels)

all_graphs = [nx.read_gml(graph) for graph in all_files]


In [None]:
# def normalize_adj(adj):
#     """ Symmetrically normalize adjacency matrix."""
#     """ Copy from https://github.com/tkipf/gcn """
#     adj = sp.coo_matrix(adj)
#     rowsum = np.array(adj.sum(1))
#     d_inv_sqrt = np.power(rowsum, -0.5).flatten()
#     d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
#     d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
#     return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

In [None]:

def extract_features(graph, max_size):
    """
    Build a fixed-length spectral feature vector from a graph's adjacency matrix.

    The adjacency matrix is eigendecomposed, the eigenvectors are ordered by
    decreasing absolute eigenvalue, and the absolute values of the first
    ``max_size`` components of the first ``max_size`` eigenvectors are
    returned, flattened row by row.

    Parameters
    ----------
    graph : networkx graph
        Graph whose adjacency matrix is obtained with ``nx.to_numpy_array``.
    max_size : int
        Number of leading eigenvectors, and of leading components per
        eigenvector, to keep.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``max_size * max_size``. When the graph has fewer
        than ``max_size`` nodes the missing entries are zero, so every graph
        yields a vector of the same length and the vectors can be stacked.
    """

    adj_matrix = nx.to_numpy_array(graph)
    # adj_matrix = normalize_adj(adj_matrix)

    # NOTE(review): eigh treats its input as symmetric and by default reads only
    # the lower triangle -- confirm the graphs are undirected.
    eigen_values, eigen_vectors = eigh(adj_matrix)

    # order the eigenvectors (columns) by |eigenvalue|, largest first
    idx = np.abs(eigen_values).argsort()[::-1]
    eigen_vectors = np.abs(eigen_vectors[:, idx])

    # keep the top-left max_size x max_size block, zero-padded for small graphs
    n_keep = min(max_size, eigen_vectors.shape[0])
    features = np.zeros((max_size, max_size), dtype=eigen_vectors.dtype)
    features[:n_keep, :n_keep] = eigen_vectors[:n_keep, :n_keep]

    return features.flatten()

# One feature vector per graph (top 5 eigenvectors, 5 components each).
ev = list(map(lambda g: extract_features(g, 5), all_graphs))
X, y = np.array(ev), np.array(all_labels)  # y holds the 0/1 class label of each graph

# Stratified 80/20 hold-out split. The file names are split together with X and y
# so each prediction can be traced back to its source file.
X_train, X_test, y_train, y_test, filenames_train, filenames_test = train_test_split(
    X,
    y,
    all_files,
    test_size=0.2,
    random_state=12,
    shuffle=True,
    stratify=y,
)

In [None]:
# XGBoost classifier evaluated on the hold-out split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# hard class predictions, used for accuracy / F1 / confusion matrix
y_pred = model.predict(X_test)
# predicted probability of class 1, used as the ranking score for ROC AUC
y_score = model.predict_proba(X_test)[:, 1]

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# roc_auc_score takes (y_true, y_score) in that order
roc = roc_auc_score(y_test, y_score)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("F1: %.2f%%" % (f1 * 100.0))

# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# ROC AUC on the hold-out set
print("ROC: {:.2f}%".format(roc * 100))

# Pair filenames in the test set with their predicted and actual labels
results = pd.DataFrame({
    'Filename': filenames_test,
    'Actual Label': y_test,
    'Predicted Label': y_pred
})


# Show files where the model predicted the wrong label
print(results[results['Actual Label'] != results['Predicted Label']])

In [None]:
# Random Forest classifier evaluated on the hold-out split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# fit model on training data; the forest is seeded so repeated runs give the same result
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("F1: %.2f%%" % (f1 * 100.0))

# Pair filenames in the test set with their predicted and actual labels
results = pd.DataFrame({
    'Filename': filenames_test,
    'Actual Label': y_test,
    'Predicted Label': y_pred
})

# Show files where the model predicted the wrong label
print(results[results['Actual Label'] != results['Predicted Label']])


In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd

# 5-fold cross-validation of a Random Forest on the full feature matrix.
X = np.array(ev)
y = np.array(all_labels)

# Number of folds for cross-validation
n_folds = 5

# Set up K-Fold cross-validation
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize lists to store results
accuracies = []
f1_scores = []
conf_matrices = []

# Perform 5-fold cross-validation
# NOTE: the loop rebinds X_train / X_test / y_train / y_test; after this cell they
# hold the last fold, not the hold-out split created earlier.
for train_index, test_index in kf.split(X):
    # Split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and fit the model (seeded so the reported averages are reproducible)
    #model = XGBClassifier()
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and store metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    # labels=[0, 1] keeps every per-fold matrix 2x2, even when a fold contains a
    # single class, so the matrices can be summed below
    conf_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# Calculate average metrics
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)

# Output average results
print("Average Accuracy: {:.2f}%".format(avg_accuracy * 100))
print("Average F1 Score: {:.2f}%".format(avg_f1_score * 100))

# Aggregate confusion matrices
total_conf_matrix = np.sum(conf_matrices, axis=0)

# Output the aggregated confusion matrix
print("Aggregated Confusion Matrix:")
print(total_conf_matrix)


In [None]:
# SVM Model: grid search on the training part of the stratified hold-out split.
# The split is recreated here (same settings as the first hold-out split) because
# the cross-validation loop in the previous cell rebinds X_train / X_test /
# y_train / y_test to its last fold.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12, shuffle=True, stratify=y)

param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'kernel': ['linear', 'rbf', 'poly']  # Type of the kernel
}

# Initialize the SVM classifier
svm_model = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='f1')

# Perform grid search
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# GridSearchCV (refit=True by default) has already refitted the best parameter
# combination on the whole training set, so that estimator is reused directly.
best_svm_model = grid_search.best_estimator_

# Predict the labels of the test set
y_pred = best_svm_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
import numpy as np
import pandas as pd

# 5-fold cross-validation of an SVM with fixed hyper-parameters.
X = np.array(ev)
y = np.array(all_labels)

# Number of folds for cross-validation
n_folds = 5

# Set up K-Fold cross-validation
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize lists to store results
accuracies = []
f1_scores = []
conf_matrices = []
roc = []

# Perform 5-fold cross-validation
for train_index, test_index in kf.split(X):
    # Split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and fit the model
    model = SVC(C=1, gamma='scale', kernel='poly')
    model.fit(X_train, y_train)

    # Hard predictions for accuracy / F1 / confusion matrix
    y_pred = model.predict(X_test)
    # Signed distance to the decision boundary: a continuous score, which is what
    # ROC AUC needs (hard 0/1 predictions collapse the curve to a single point)
    y_score = model.decision_function(X_test)

    # Calculate and store metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    # labels=[0, 1] keeps every per-fold matrix 2x2 so they can be summed below
    conf_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))
    roc.append(roc_auc_score(y_test, y_score))

# Calculate average metrics
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)
avg_roc = np.mean(roc)

# Output average results
print("Average Accuracy: {:.2f}%".format(avg_accuracy * 100))
print("Average F1 Score: {:.2f}%".format(avg_f1_score * 100))
print("Average ROC: {:.2f}%".format(avg_roc * 100))

# Aggregate confusion matrices
total_conf_matrix = np.sum(conf_matrices, axis=0)

# Output the aggregated confusion matrix
print("Aggregated Confusion Matrix:")
print(total_conf_matrix)


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
import optuna
from optuna.samplers import TPESampler
import lightgbm as lgb

# 5-fold cross-validation of a LightGBM classifier with fixed hyper-parameters.

# Number of folds for cross-validation
n_folds = 5

# Set up K-Fold cross-validation
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

X = np.array(ev)
y = np.array(all_labels)

# Initialize lists to store results
accuracies = []
f1_scores = []
conf_matrices = []
roc = []

# Perform 5-fold cross-validation
for train_index, test_index in kf.split(X):
    # Split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and fit the model.
    # 'binary_error' (1 - accuracy) is LightGBM's metric name for classification
    # error; 'accuracy ' (with a trailing space) is not a LightGBM metric name.
    model = lgb.LGBMClassifier(learning_rate=0.01, n_estimators=1000, num_leaves=31,
                               objective='binary', metric='binary_error')
    model.fit(X_train, y_train)

    # Hard predictions for accuracy / F1 / confusion matrix
    y_pred = model.predict(X_test)
    # Class-1 probabilities: the continuous score ROC AUC is defined on
    y_score = model.predict_proba(X_test)[:, 1]

    # Calculate and store metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    # labels=[0, 1] keeps every per-fold matrix 2x2 so they can be summed below
    conf_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))
    roc.append(roc_auc_score(y_test, y_score))

# Calculate average metrics
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)
avg_roc = np.mean(roc)

# Output average results
print("Average Accuracy: {:.2f}%".format(avg_accuracy * 100))
print("Average F1 Score: {:.2f}%".format(avg_f1_score * 100))
print("Average ROC: {:.2f}%".format(avg_roc * 100))

# Aggregate confusion matrices
total_conf_matrix = np.sum(conf_matrices, axis=0)

# Output the aggregated confusion matrix
print("Aggregated Confusion Matrix:")
print(total_conf_matrix)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Fresh 80/20 hold-out split for hyper-parameter tuning and the final model.
X = np.array(ev)
y = np.array(all_labels)

# stratify=y keeps the class ratio equal in train and test, as in the first hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# define objective function

def objective(trial, data=X_train, target=np.ravel(y_train)):
    """
    Optuna objective: validation ROC AUC of a LightGBM classifier.

    ``data`` / ``target`` are split 80/20 into a fitting part and a validation
    part. A classifier built from the hyper-parameters sampled through ``trial``
    is fitted with early stopping on the validation part, and the ROC AUC of its
    class-1 probabilities on that validation part is returned.

    Parameters
    ----------
    trial : optuna trial
        Used to sample reg_alpha, reg_lambda, num_leaves, learning_rate,
        colsample_bytree and max_depth.
    data, target : array-like
        Training features and labels. The defaults are bound to ``X_train`` /
        ``y_train`` at the moment this function is defined.

    Returns
    -------
    float
        ROC AUC on the validation part (the study maximises it).
    """
    # stratify so both classes are present in the validation part
    train_X, test_X, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42, stratify=target)

    hyperparams = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'n_estimators': 500,
        'seed': 2023,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 14),

    }

    model = lgb.LGBMClassifier(**hyperparams)
    # Keyword arguments make the intent explicit: passed positionally, the third
    # value (0.005) was bound to `verbose`, not `min_delta`.
    # NOTE(review): `min_delta` needs a LightGBM release whose early_stopping
    # callback accepts it -- confirm the installed version.
    model.fit(
        train_X, train_y,
        eval_set=[(test_X, test_y)],
        callbacks=[lgb.early_stopping(stopping_rounds=10, first_metric_only=False,
                                      verbose=False, min_delta=0.005)],
    )

    # ROC AUC is computed from class-1 probabilities, not from hard 0/1 predictions
    scores = model.predict_proba(test_X)[:, 1]

    auc = roc_auc_score(test_y, scores)

    return auc

# Run the hyper-parameter search with a seeded TPE sampler.
sample = TPESampler(seed=2023)
study = optuna.create_study(direction='maximize', sampler=sample)
study.optimize(objective, n_trials=500)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
trial = study.best_trial
# The objective returns a ROC AUC, so the value is labelled as such
print('Best validation ROC AUC: {}'.format(trial.value))

In [None]:
# train model with best hyperparameters

# copy the parameters so the best trial's own dict is not modified
lgb_params = dict(trial.params)
lgb_params['metric'] = 'auc'
lgb_params['random_state'] = 2023
lgb_params['n_estimators'] = 1000

m = lgb.LGBMClassifier(**lgb_params)
# eval_set is passed without an early-stopping callback
m.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# hard predictions for the confusion matrix and classification report
preds = m.predict(X_test)
# class-1 probabilities: the continuous score ROC AUC is defined on
probs = m.predict_proba(X_test)[:, 1]

print('ROC AUC score: {}'.format(roc_auc_score(y_test, probs)))
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))

In [None]:
# Final report for the tuned LightGBM model.
# ROC AUC is computed from the predicted class-1 probabilities; the confusion
# matrix and classification report use the hard predictions.
print('ROC AUC score: {}'.format(roc_auc_score(y_test, m.predict_proba(X_test)[:, 1])))
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))

print('Best trial:', study.best_trial.params)
trial = study.best_trial
# the study's objective value is a ROC AUC, not an accuracy
print('Best validation ROC AUC: {}'.format(trial.value))