In [46]:
import fire
from visdom import Visdom
import pickle
import sys, os
import umap, numba
import random
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os,glob, pandas as pd
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import copy
from collections import Counter
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import GCNConv, GATConv, DeepGraphInfomax, SAGEConv
from torch_geometric.nn import DenseGraphConv
from torch_geometric.utils import to_dense_batch, to_dense_adj, dense_to_sparse
from torch_geometric.nn import GINEConv
from torch_geometric.utils import dropout_adj
from torch_geometric.nn import APPNP
from torch_cluster import knn_graph
from torch_geometric.data import Data 
from torch_geometric.utils import train_test_split_edges
from torch_geometric.utils.convert import to_networkx
from torch_geometric.data import InMemoryDataset,DataLoader
from sklearn.utils.class_weight import compute_class_weight

In [53]:
EPS = 1e-15

class GCNNet(torch.nn.Module):
    def __init__(self, inp_dim, out_dim, hidden_topology=[32,64,128,128], p=0.5, p2=0.1, drop_each=True):
        super(GCNNet, self).__init__()
        self.out_dim=out_dim
        self.convs = nn.ModuleList([GATConv(inp_dim, hidden_topology[0])]+[GATConv(hidden_topology[i],hidden_topology[i+1]) for i in range(len(hidden_topology[:-1]))])
        self.drop_edge = lambda edge_index: dropout_adj(edge_index,p=p2)[0]
        self.dropout = nn.Dropout(p)
        self.fc = nn.Linear(hidden_topology[-1], out_dim)
        self.drop_each=drop_each

    def forward(self, x, edge_index, edge_attr=None):
        for conv in self.convs:
            if self.drop_each and self.training: edge_index=self.drop_edge(edge_index)
            x=conv(x, edge_index, edge_attr)
            x = F.relu(x)
        if self.training:
            x = self.dropout(x)
        x = self.fc(x)
        return x
    
class GCNFeatures(torch.nn.Module):
    def __init__(self, gcn, bayes=False):
        super(GCNFeatures, self).__init__()
        self.gcn=gcn
        self.drop_each=bayes
    
    def forward(self, x, edge_index, edge_attr=None):
        for conv in self.gcn.convs:
            if self.drop_each: edge_index=self.gcn.drop_edge(edge_index)
            x = F.relu(conv(x, edge_index, edge_attr))
        if self.drop_each:
            x = self.gcn.dropout(x)
        y = F.softmax(self.gcn.fc(x))
        return x,y
    
def fit_model(graph_data='',
              use_weights=False,
              use_model=None,
              n_batches_backward=1,
              f1_metric='weighted',
              n_epochs=1500,
              out_dir="/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/gnn/",
              lr=1e-2,
              eta_min=1e-4,
              T_max=20,
              wd=0,
              hidden_topology=[32,64,128,128],
              p=0.5,
              p2=0.3,
              burnin=400,
              warmup=100,
              gpu_id=0,
              batch_size=1
             ):
    print(gpu_id); torch.cuda.set_device(gpu_id)
    datasets=pickle.load(open(graph_data,'rb'))
    train_data = [dataset for key in datasets for dataset in datasets[key]['train']]
    
    # dataset splits
    train_dataset, test_dataset= train_test_split(train_data, random_state=42)
    train_dataset, val_dataset= train_test_split(train_dataset, random_state=42)
    # train_dataset=random.sample(train_data,40)  
    # val_data = [dataset for key in datasets for dataset in datasets[key]['val']]
    # val_dataset=random.sample(val_data,15)
    print(len(train_dataset), 'training graphs,', len(val_dataset), 'validation graphs')

    # graph sizes
    print("training graph sizes:", end='')
    for x in train_dataset:
        print(x.x.shape[0], end=", ")
    print("\nValidation graph sizes:", end=" ")
    for x in val_dataset:
        print(x.x.shape[0], end=", ")
 
    y_train=np.hstack([graph.y.numpy() for graph in train_dataset])

    # weights
    if use_weights: 
        weights=torch.tensor(compute_class_weight('balanced',classes=np.unique(y_train),y=y_train))
    else: 
        weights=None 
       
    # load model
    model=GCNNet(train_dataset[0].x.shape[1],len(np.unique(y_train)),hidden_topology=hidden_topology,p=p,p2=p2)
    model=model.cuda()

    # load optimizer
    optimizer = torch.optim.Adam(model.parameters(),lr=lr,weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=eta_min, last_epoch=-1)
    criterion= torch.nn.CrossEntropyLoss(weight=torch.tensor(weights).float() if use_weights else None)
    criterion= criterion.cuda()

    # initialize val saving
    save_mod=False
    past_performance=[0]

    # dataloaders
    dataloaders={}

    dataloaders['train']=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
    dataloaders['val']=DataLoader(val_dataset,shuffle=True)
    dataloaders['warmup']=DataLoader(train_dataset,shuffle=False)
    train_loader=dataloaders['warmup']

    #training
    n_total_batches=0
    train_val_f1=[]
    for epoch in range(n_epochs):
        Y,Y_Pred=[],[]
        for i,data in enumerate(train_loader):
            n_total_batches+=1
            model.train(True)
            x=data.x.cuda()
            edge_index=data.edge_index.cuda()
            y=data.y.cuda()
            y_out=model(x,edge_index)
            loss = criterion(y_out, y) / n_batches_backward
            loss.backward()
            if n_total_batches%n_batches_backward==0 or (i==len(train_loader.dataset)-1):
                optimizer.step()
                optimizer.zero_grad()
            Y_Pred.append(F.softmax(y_out, dim=1).argmax(1).detach().cpu().numpy().flatten())
            Y.append(y.detach().cpu().numpy().flatten())
            del x, edge_index, loss, y_out
            if epoch <=warmup:
                break 
       
        if epoch == warmup:
            train_loader=dataloaders['train']
        if epoch>=burnin:
            save_mod=True
        
        train_f1=f1_score(np.hstack(Y),np.hstack(Y_Pred),average=f1_metric)
        scheduler.step()
        Y, Y_Pred, Y_Prob=[],[],[]
        
        for i,data in enumerate(dataloaders['val']):
            model.train(False)
            x=data.x.cuda()
            edge_index=data.edge_index.cuda()
            y=data.y.cuda()
            
            y_out=model(x,edge_index)
            loss = criterion(y_out, y) 
            y_prob=F.softmax(y_out, dim=1).detach().cpu().numpy()
            y_pred=y_prob.argmax(1).flatten()
            y_true=y.detach().cpu().numpy().flatten()
            Y_Pred.append(y_pred)
            Y_Prob.append(y_prob)
            Y.append(y_true)
            #if vis_every and epoch%vis_every==0 and not i:
               # vis.scatter(data.pos.numpy(),opts=dict(markercolor=(y_pred*255).astype(int),webgl=False,markerborderwidth=0,markersize=5),win="pred")
               # vis.scatter(data.pos.numpy(),opts=dict(markercolor=y_true*255,webgl=False,markerborderwidth=0,markersize=5),win="true")
            del x, edge_index, loss, y_out
        val_f1=f1_score(np.hstack(Y),np.hstack(Y_Pred),average=f1_metric)
        val_roc=roc_auc_score(np.hstack(Y), np.vstack(Y_Prob)[:,1], average='macro')#multi_class="ovr",average='macro')
        
        if save_mod and val_roc>=max(past_performance):
            best_model_dict=copy.deepcopy(model.state_dict())
            past_performance.append(val_roc)
        
        print(epoch,train_f1,val_f1,val_roc, flush=True)
        train_val_f1.append((train_f1,val_f1, val_roc))

    model.load_state_dict(best_model_dict)
    torch.save(model.state_dict(),os.path.join(out_dir,f"model.pth"))
    torch.save(train_val_f1,os.path.join(out_dir,f"f1.log.pth"))
    return Y, Y_Pred

In [54]:
y, y_pred= fit_model(graph_data="/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/graph_dataset/graph_dataset.pkl",
                     use_weights=True,
                     n_epochs=800,
                     out_dir='/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/gnn/',
                     lr=1e-4,
                     batch_size=1,
                     hidden_topology=[32,64,64,128],
                     burnin=0,
                     gpu_id=0)

0
131 training graphs, 44 validation graphs
training graph sizes:5719, 5563, 3966, 815, 4659, 2825, 3510, 10168, 256, 2561, 6168, 3216, 2462, 3117, 459, 6239, 8481, 3270, 2825, 2752, 2096, 6622, 6892, 1275, 6236, 2800, 2314, 5083, 7126, 668, 8312, 6222, 2518, 1230, 5075, 20589, 7200, 358, 2955, 2440, 5137, 5176, 826, 844, 22341, 10205, 2647, 2806, 842, 2806, 5831, 2311, 8925, 7890, 8318, 1708, 3460, 4841, 6896, 23840, 5494, 1299, 2059, 6950, 1926, 3291, 18384, 2529, 842, 6353, 1787, 2536, 3424, 7420, 7762, 1686, 2642, 2598, 1665, 5952, 15681, 8040, 1663, 16135, 7910, 12102, 5921, 5019, 6056, 5059, 902, 2955, 3285, 4607, 3797, 2217, 14939, 7163, 2774, 2572, 506, 4642, 4557, 256, 2678, 2435, 4862, 5344, 5193, 8943, 817, 13521, 3746, 10333, 647, 10733, 4710, 212, 4710, 5494, 4557, 8385, 6416, 902, 6601, 2460, 5649, 3544, 6098, 7523, 3236, 
Validation graph sizes: 2793, 1805, 2198, 1771, 2110, 668, 2053, 703, 2290, 795, 2608, 871, 5101, 15269, 3282, 2706, 3522, 3631, 703, 2365, 10277, 844,

  criterion= torch.nn.CrossEntropyLoss(weight=torch.tensor(weights).float() if use_weights else None)


0 0.7027886634106104 0.686628182897969 0.4774627859582858
1 0.7727283732604792 0.686628182897969 0.4797697900966827
2 0.8057986626734552 0.686628182897969 0.496874962770547
3 0.8052260364608892 0.686628182897969 0.500202471980304
4 0.8120064695698704 0.686628182897969 0.5004210444909397
5 0.8128752636947686 0.686628182897969 0.5022977650572915
6 0.8314341391071112 0.686628182897969 0.506902762024198
7 0.8506051132477062 0.686628182897969 0.5106076731598759
8 0.8592732326733107 0.686628182897969 0.5134512302361863
9 0.8564639205819526 0.686628182897969 0.5139197386177341
10 0.8599100111486991 0.686628182897969 0.513070775617314
11 0.8629812512360765 0.686628182897969 0.5154179944527795
12 0.8632938466718697 0.686628182897969 0.5169797191057953
13 0.8652993900140943 0.686628182897969 0.5219232367915752
14 0.8614838906200134 0.686628182897969 0.5324840567517884
15 0.8548354566325328 0.686628182897969 0.5300558581385584
16 0.8493005258014781 0.686628182897969 0.5293159074077883
17 0.850411

KeyboardInterrupt: 

In [40]:
#original approach

datasets=sophie_data
train_data = [dataset for key in datasets for dataset in datasets[key]['train']]
train_dataset=random.sample(train_data,40)
val_data = [dataset for key in datasets for dataset in datasets[key]['val']]
val_dataset=random.sample(val_data,15)
print(len(train_dataset), 'training graphs,', len(val_dataset), 'validation graphs')

#dataloaders


dataloaders['train']=DataLoader(train_dataset,batch_size=1,shuffle=True)
dataloaders['val']=DataLoader(val_dataset,shuffle=True)
dataloaders['warmup']=DataLoader(train_dataset,shuffle=False)
train_loader=dataloaders['warmup']

AttributeError: 'list' object has no attribute 'sample'