In [33]:
from torch_geometric.datasets import MoleculeNet
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINEConv, global_mean_pool
from torch.nn import Sequential, Linear, ReLU
import torch.nn as nn

import numpy as np


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
import time

import torch
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from torch.utils.data import TensorDataset
from sklearn import metrics

device = 'mps' if torch.mps.is_available() else 'cpu'

import warnings
warnings.filterwarnings('ignore')

from model_cIGNR import *
from layers import *
from siren_pytorch import *

# warnings.filterwarnings(action='once')

In [34]:
dataset = MoleculeNet(root='data/', name='BBBP')
train_idx, test_idx = train_test_split(range(len(dataset)), test_size=0.2, random_state=0)
train_set = [dataset[i] for i in train_idx]
test_set = [dataset[i] for i in test_idx]

train_loader = DataLoader(train_set, batch_size=10)
test_loader = DataLoader(test_set, batch_size=10)

atom_counts = [data.x.size(0) for data in dataset]

# Basic stats
num_molecules = len(atom_counts)
min_atoms = min(atom_counts)
max_atoms = max(atom_counts)

n_card = max_atoms

In [35]:
mlp_dim_hidden= [48,36,24]
mlp_num_layer = len(mlp_dim_hidden)
mlp_act = 'sine'
emb_dim = 2
gnn_num_layer = 3
flag_emb = 1
gnn_type = 'gin'

latent_dim = 4
gnn_layers = [2, 2, 2, latent_dim]


snet = SirenNet(
    dim_in = 2, # input [x,y] coordinate
    dim_hidden = mlp_dim_hidden,
    dim_out = 1, # output graphon (edge) probability 
    num_layers = mlp_num_layer, # f_theta number of layers
    final_activation = 'sigmoid',
    w0_initial = 30.,
    activation = mlp_act)

model = cIGNR(net=snet, input_card=n_card, emb_dim=emb_dim, latent_dim=latent_dim, num_layer=gnn_num_layer, device=device, flag_emb=flag_emb, gnn_type= gnn_type, gnn_layers=gnn_layers)

p = "/Users/berfininal/Documents/ML-proteins/implicit_graphon/IGNR/c-IGNR/Result/checkpoints/_checkpoint_dataset_bbbp_gin_dim_{latent_dim}.pt"
path = f"/Users/berfininal/Documents/ML-proteins/implicit_graphon/IGNR/c-IGNR/Result/checkpoints/_checkpoint_dataset_bbbp_gin_dim_4.pt"
checkpoint = torch.load(path, map_location = 'cpu', weights_only = False)

model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
model = model.to(torch.device(device))

In [36]:

def encode_dataset(loader):
    embeddings, labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            x = model.encode(batch.x, batch.edge_index, batch.batch)
            embeddings.append(x.cpu())
            labels.extend(batch.y.cpu().numpy())
    return torch.cat(embeddings).numpy(), labels

In [37]:
x_train, y_train = encode_dataset(train_loader)
x_test, y_test = encode_dataset(test_loader)


#### SVM

In [38]:
svm = SVC(kernel='rbf')
svm.fit(x_train, y_train)
            
acc = accuracy_score(y_test, svm.predict(x_test))

print(acc)

0.7794117647058824


In [42]:
def svm_cls(x_train, y_train):
    
    param_grid =  [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000] },{'kernel': ['linear'], 'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}]              

    t0 = time.time()
    clf = GridSearchCV(SVC(), param_grid, cv=10,
                        scoring='accuracy')

    clf.fit(x_train, y_train)



    t = time.time() - t0
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print('Training accuracy')
    print(clf.best_score_)
    print(clf.best_estimator_)
    print()
    print()
    print('****Results****')
    svm_pred=clf.predict(x_train)
    print("=" * 52)
    print("time cost: {}".format(t))
    print()
    print("confusion matrix\n", metrics.confusion_matrix(y_train, svm_pred))
    print()
    print("\t\taccuracy: {}".format(metrics.accuracy_score(y_train, svm_pred)))
    print("\t\troc_auc_score: {}".format(metrics.roc_auc_score(y_train, svm_pred)))
    print("\t\tcohen_kappa_score: {}".format(metrics.cohen_kappa_score(y_train, svm_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(metrics.classification_report(y_train, svm_pred)) 

    test_acc = np.round(accuracy_score(y_test, clf.best_estimator_.predict(x_test)), 3)
    print(f"Test accuarcy : {test_acc}")

    return clf, svm_pred, y_test, x_train, y_train


In [43]:
clf, svm_pred, y_test, x_train, y_train = svm_cls(x_train, y_train)

Best parameters set found on development set:

{'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'}

Training accuracy
0.7614955858147539
SVC(C=0.001, gamma=0.01)


****Results****
time cost: 19.845431804656982

confusion matrix
 [[   0  389]
 [   0 1242]]

		accuracy: 0.7614960147148988
		roc_auc_score: 0.5
		cohen_kappa_score: 0.0

		classification report
----------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       389
         1.0       0.76      1.00      0.86      1242

    accuracy                           0.76      1631
   macro avg       0.38      0.50      0.43      1631
weighted avg       0.58      0.76      0.66      1631

Test accuarcy : 0.779


In [52]:

from torcheval.metrics.functional import binary_auroc
train_auroc = binary_auroc(torch.tensor(svm_pred), torch.tensor(y_train)[:,0])
print(f"train auroc : {train_auroc}")
print(f"Test auroc : {binary_auroc(torch.tensor(clf.best_estimator_.predict(x_test)), torch.tensor(y_test)[:,0])} ")

train auroc : 0.5
Test auroc : 0.5 


#### MLP 

In [53]:
class MLP(nn.Module):
    def __init__(self, d_in, hidden = [256, 128], p = 0.3):
        super().__init__()

        layers = []
        dims = [d_in] + hidden

        for i in range(len(dims)-1):
            layers += [nn.Linear(dims[i], dims[i+1]), nn.ReLU(), nn.Dropout(p)]
        
        layers += [nn.Linear(dims[-1], 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(-1)
    

In [54]:
Xt = torch.tensor(x_train, dtype=torch.float32)
Xs = torch.tensor(x_test,  dtype=torch.float32)

y_train = np.asarray(y_train)
y_test  = np.asarray(y_test)
num_classes = len(np.unique(y_train))

yt = torch.tensor(y_train, dtype=torch.float32)
ys = torch.tensor(y_test,  dtype=torch.float32)


batch_size = 10
train_loader = DataLoader(TensorDataset(Xt, yt), batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(TensorDataset(Xs, ys), batch_size=batch_size, shuffle=False)

In [55]:
def train():

    model = MLP(d_in = 2, hidden=[8], p = 0.3).to(device)
    criterion = nn.BCEWithLogitsLoss()
    criterion_tst = nn.BCEWithLogitsLoss()

    opt = torch.optim.Adam(model.parameters(), lr = 1e-3, weight_decay=1e-4)
    loss_total = []
    acc_total = []

    for epoch in range(15):
        model.train()
        acc = 0
        total = 0
        for batch in train_loader:
            x, y = batch
            x = x.to(device)
            y = y.to(device)

            logits = model(x)
            loss = criterion(logits.view(-1,1), y)

            pred = (torch.sigmoid(logits) >= 0.5).float()
            acc += (pred == y).sum().item()

            opt.zero_grad()
            loss.backward()
            opt.step()

            total +=loss.item() 
        
        loss_test, acc_test = evaluate(model, test_loader, criterion=criterion_tst, device = device)
        print(f"Test loss : {loss_test}")
        print(f"Test acc : {acc_test}")


        acc_total.append(float(np.round(acc/len(train_loader),3)))

        loss_total.append(float(np.round(total/len(train_loader),3)))

    print(f"Accuracy : {acc_total}")
    print(f"Loss total : {loss_total}")

In [56]:

def evaluate(model, loader, criterion, device):
    model.eval()
    total=0
    correct=0 
    n=0
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        logits = model(X)
        loss = criterion(logits.view(-1,1), y)

        pred = (torch.sigmoid(logits) >= 0.5).float()
        correct += (pred == y).sum().item()
        
        total += loss.item() 
    return total/len(test_loader), correct/len(test_loader)

In [57]:

model = MLP(d_in = 4, hidden=[8,16], p = 0.3).to(device)
criterion = nn.BCEWithLogitsLoss()
criterion_tst = nn.BCEWithLogitsLoss()

opt = torch.optim.Adam(model.parameters(), lr = 1e-3, weight_decay=1e-4)
loss_total = []
acc_total = []

for epoch in range(15):
    model.train()
    acc = 0
    total = 0
    for batch in train_loader:
        x, y = batch
        x = x.to(device)
        y = y.to(device)

        logits = model(x)
        loss = criterion(logits.view(-1,1), y)

        pred = (torch.sigmoid(logits) >= 0.5).float()
        acc += (pred == y).sum().item()

        opt.zero_grad()
        loss.backward()
        opt.step()

        total +=loss.item() 
    
    loss_test, acc_test = evaluate(model, test_loader, criterion=criterion_tst, device = device)
    print(f"Test loss : {loss_test}")
    print(f"Test acc : {acc_test}")


    acc_total.append(float(np.round(acc/len(train_loader),3)))

    loss_total.append(float(np.round(total/len(train_loader),3)))

print(f"Accuracy : {acc_total}")
print(f"Loss total : {loss_total}")

Test loss : 0.5616167972727519
Test acc : 77.21951219512195
Test loss : 0.5313384656499072
Test acc : 77.21951219512195
Test loss : 0.5250883276869611
Test acc : 77.21951219512195
Test loss : 0.5181652547382727
Test acc : 77.21951219512195
Test loss : 0.5161681378759989
Test acc : 77.21951219512195
Test loss : 0.5150986528251229
Test acc : 77.21951219512195
Test loss : 0.51423025058537
Test acc : 77.21951219512195
Test loss : 0.5136502461462487
Test acc : 77.21951219512195
Test loss : 0.5121363488639273
Test acc : 77.21951219512195
Test loss : 0.515874486870882
Test acc : 77.21951219512195
Test loss : 0.5126649929982859
Test acc : 77.21951219512195
Test loss : 0.5123881163393579
Test acc : 77.21951219512195
Test loss : 0.5112451442131182
Test acc : 77.21951219512195
Test loss : 0.5113419832252875
Test acc : 77.21951219512195
Test loss : 0.5107574357492167
Test acc : 77.21951219512195
Accuracy : [65.53, 75.695, 75.732, 75.677, 75.677, 75.677, 75.732, 75.732, 75.732, 75.677, 75.677, 75.6