In [1]:
import fire
from visdom import Visdom
import pickle
import sys, os
import umap, numba
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os,glob, pandas as pd
from sklearn.metrics import f1_score
import copy
from collections import Counter
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import GCNConv, GATConv, DeepGraphInfomax, SAGEConv
from torch_geometric.nn import DenseGraphConv
from torch_geometric.utils import to_dense_batch, to_dense_adj, dense_to_sparse
from torch_geometric.nn import GINEConv
from torch_geometric.utils import dropout_adj
from torch_geometric.nn import APPNP
from torch_cluster import knn_graph
from torch_geometric.data import Data 
from torch_geometric.utils import train_test_split_edges
from torch_geometric.utils.convert import to_networkx
from torch_geometric.data import InMemoryDataset,DataLoader
from sklearn.utils.class_weight import compute_class_weight

2022-12-12 18:08:01.658173: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [33]:
# Tiny positive constant. NOTE(review): nothing in the visible cells references
# EPS; presumably a numerical-stability epsilon kept from earlier code — confirm
# whether it is still needed.
EPS = 1e-15

class GCNNet(torch.nn.Module):
    """Stack of GATConv layers followed by a linear classification head.

    Despite the name, every graph layer is a ``GATConv``. One layer is created
    per entry of ``hidden_topology``; the first maps ``inp_dim`` to
    ``hidden_topology[0]`` and each later one maps consecutive entries.

    Args:
        inp_dim: Size of the input node features.
        out_dim: Number of outputs produced by the final linear layer.
        hidden_topology: Output width of each graph layer, in order.
        p: Probability used by the feature ``nn.Dropout`` before the head.
        p2: Probability passed to ``dropout_adj`` when edges are dropped.
        drop_each: If true, edges are re-dropped before every graph layer
            while the module is in training mode.
    """

    def __init__(self, inp_dim, out_dim, hidden_topology=[32,64,128,128], p=0.5, p2=0.1, drop_each=True):
        super().__init__()
        self.out_dim = out_dim

        # Consecutive (in, out) widths: inp_dim -> h[0] -> h[1] -> ... -> h[-1].
        first_layer = GATConv(inp_dim, hidden_topology[0])
        widths = list(hidden_topology)
        later_layers = [GATConv(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        self.convs = nn.ModuleList([first_layer] + later_layers)

        def _drop_edges(edge_index):
            # dropout_adj returns (edge_index, edge_attr); only the index is kept.
            return dropout_adj(edge_index, p=p2)[0]

        # Kept as a plain callable attribute: GCNFeatures calls gcn.drop_edge directly.
        self.drop_edge = _drop_edges
        self.dropout = nn.Dropout(p)
        self.fc = nn.Linear(hidden_topology[-1], out_dim)
        self.drop_each = drop_each

    def forward(self, x, edge_index, edge_attr=None):
        """Return the head's raw outputs (no softmax) for node features ``x``."""
        resample_edges = self.drop_each and self.training
        for layer in self.convs:
            if resample_edges:
                # Edge dropout is cumulative: each layer sees a subset of the
                # edges that survived for the previous layer.
                edge_index = self.drop_edge(edge_index)
            x = F.relu(layer(x, edge_index, edge_attr))
        if self.training:
            x = self.dropout(x)
        return self.fc(x)
    
class GCNFeatures(torch.nn.Module):
    """Expose the hidden features and class probabilities of a wrapped network.

    The wrapped ``gcn`` must provide ``convs`` (iterable of graph layers),
    ``drop_edge`` (callable on an edge index), ``dropout`` and ``fc``; these
    are the attributes ``GCNNet`` defines.

    Args:
        gcn: The network whose layers are reused.
        bayes: If true, edges are dropped before every graph layer and
            ``gcn.dropout`` is applied to the features on every call.
            NOTE(review): ``gcn.dropout`` is an ``nn.Dropout``, so it only
            drops values while that module is in training mode — confirm this
            matches the intended sampling behaviour.
    """

    def __init__(self, gcn, bayes=False):
        super().__init__()
        self.gcn=gcn
        self.drop_each=bayes

    def forward(self, x, edge_index, edge_attr=None):
        """Return ``(features, probabilities)``.

        ``features`` is the activation fed to ``gcn.fc``; ``probabilities`` is
        the softmax of ``gcn.fc(features)`` over the last (class) axis.
        """
        for conv in self.gcn.convs:
            if self.drop_each: edge_index=self.gcn.drop_edge(edge_index)
            x = F.relu(conv(x, edge_index, edge_attr))
        if self.drop_each:
            x = self.gcn.dropout(x)
        # Explicit axis: calling softmax without `dim` is deprecated and warns.
        # fc is a Linear layer, so the class scores live on the last axis.
        y = F.softmax(self.gcn.fc(x), dim=-1)
        return x,y

def fit_model(graph_data='',
                use_weights=False,
                use_model=None,
                n_batches_backward=1,
                f1_metric='weighted',
                n_epochs=1500,
                out_dir="/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/gnn/",
                lr=1e-2,
                eta_min=1e-4,
                T_max=20,
                wd=0,
                hidden_topology=[32,64,128,128],
                p=0.5,
                p2=0.3,
                burnin=400,
                warmup=100,
                gpu_id=0,
                batch_size=1
                ):
    """Train a ``GCNNet`` node classifier on pickled graphs and save the best weights.

    The pickle at ``graph_data`` is iterated by key; each value is indexed with
    ``'train'`` and ``'val'`` to collect the training and validation graphs.
    Every graph must expose ``x``, ``edge_index`` and ``y``. Requires CUDA.

    Args:
        graph_data: Path of the pickled dataset.
        use_weights: If true, weight the cross-entropy loss with sklearn's
            'balanced' class weights computed from the training labels.
        use_model: Unused; kept for interface compatibility.
        n_batches_backward: Gradient-accumulation factor. The loss is divided
            by it and the optimizer steps every that many batches (and on the
            last batch of an epoch).
        f1_metric: ``average`` argument forwarded to ``f1_score``.
        n_epochs: Number of epochs.
        out_dir: Directory receiving ``model.pth`` and ``f1.log.pth``; created
            if it does not exist.
        lr, wd: Adam learning rate and weight decay.
        eta_min, T_max: ``CosineAnnealingLR`` settings.
        hidden_topology, p, p2: Forwarded to ``GCNNet``.
        burnin: First epoch from which the best-validation checkpoint is tracked.
        warmup: Through epoch ``warmup`` (inclusive) only the first batch of an
            unshuffled, batch-size-1 loader is used per epoch; afterwards the
            shuffled loader with ``batch_size`` is used in full.
        gpu_id: CUDA device index.
        batch_size: Batch size of the post-warmup training loader.

    Returns:
        ``(Y, Y_Pred)``: lists of per-batch label / prediction arrays from the
        validation pass of the last epoch (empty lists if ``n_epochs`` is 0).
    """
    print(gpu_id); torch.cuda.set_device(gpu_id)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(graph_data,'rb') as f:
        datasets=pickle.load(f)
    train_dataset = [dataset for key in datasets for dataset in datasets[key]['train']]
    val_dataset = [dataset for key in datasets for dataset in datasets[key]['val']]
    print(len(train_dataset), 'training graphs,', len(val_dataset), 'validation graphs')

    # graph.x must be array-like with a .shape here; a plain list raises AttributeError.
    print("training graph sizes:", end='')
    for graph in train_dataset:
        print(graph.x.shape[0], end=", ")
    print("\nValidation graph sizes:", end=" ")
    for graph in val_dataset:
        print(graph.x.shape[0], end=", ")

    # np.hstack needs a real sequence; passing a generator is deprecated/rejected.
    y_train=np.hstack([graph.y.numpy() for graph in train_dataset])
    if use_weights:
        weights=compute_class_weight('balanced',classes=np.unique(y_train),y=y_train)
    else:
        weights=None

    # load model
    model=GCNNet(train_dataset[0].x.shape[1],len(np.unique(y_train)),hidden_topology=hidden_topology,p=p,p2=p2)
    model=model.cuda()

    # load optimizer
    optimizer = torch.optim.Adam(model.parameters(),lr=lr,weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=eta_min, last_epoch=-1)
    criterion=nn.CrossEntropyLoss(weight=torch.tensor(weights).float() if use_weights else None)
    criterion=criterion.cuda()

    # initialize val saving
    save_mod=False
    best_val_f1=0
    best_model_dict=None  # stays None if no epoch reaches `burnin`

    # dataloaders
    dataloaders={}

    dataloaders['train']=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
    dataloaders['val']=DataLoader(val_dataset,shuffle=True)
    dataloaders['warmup']=DataLoader(train_dataset,shuffle=False)
    train_loader=dataloaders['warmup']

    n_total_batches=0
    train_val_f1=[]
    Y,Y_Pred=[],[]  # defined up front so the return works when n_epochs == 0
    for epoch in range(n_epochs):
        # ---- training pass ----
        Y,Y_Pred=[],[]
        model.train(True)
        # Index of the final *batch*; the dataset length only matches when batch_size == 1.
        last_batch=len(train_loader)-1
        for i,data in enumerate(train_loader):
            n_total_batches+=1
            x=data.x.cuda()
            edge_index=data.edge_index.cuda()
            y=data.y.cuda()
            y_out=model(x,edge_index)
            loss = criterion(y_out, y) / n_batches_backward
            loss.backward()
            if n_total_batches%n_batches_backward==0 or i==last_batch:
                optimizer.step()
                optimizer.zero_grad()
            Y_Pred.append(F.softmax(y_out,dim=1).argmax(1).detach().cpu().numpy().flatten())
            Y.append(y.detach().cpu().numpy().flatten())
            del x, edge_index, loss, y_out
            if epoch <=warmup:
                # Warm-up epochs train on a single graph only.
                break
        if epoch == warmup:
            train_loader=dataloaders['train']
        if epoch>=burnin:
            save_mod=True
        train_f1=f1_score(np.hstack(Y),np.hstack(Y_Pred),average=f1_metric)
        scheduler.step()

        # ---- validation pass ----
        Y,Y_Pred=[],[]
        model.train(False)
        with torch.no_grad():  # no gradients needed; avoids holding autograd graphs on the GPU
            for data in dataloaders['val']:
                x=data.x.cuda()
                edge_index=data.edge_index.cuda()
                y=data.y.cuda()
                y_out=model(x,edge_index)
                y_prob=F.softmax(y_out,dim=1).cpu().numpy()
                Y_Pred.append(y_prob.argmax(1).flatten())
                Y.append(y.cpu().numpy().flatten())
                del x, edge_index, y_out
        val_f1=f1_score(np.hstack(Y),np.hstack(Y_Pred),average=f1_metric)
        # `>=` keeps the most recent checkpoint among equally good epochs.
        if save_mod and val_f1>=best_val_f1:
            best_model_dict=copy.deepcopy(model.state_dict())
            best_val_f1=val_f1
        print(epoch,train_f1,val_f1,flush=True)
        train_val_f1.append((train_f1,val_f1))

    if best_model_dict is not None:
        model.load_state_dict(best_model_dict)
    else:
        # Previously a NameError: no epoch reached `burnin`, so nothing was checkpointed.
        print("no checkpoint selected (n_epochs <= burnin); saving final weights",flush=True)
    os.makedirs(out_dir,exist_ok=True)
    torch.save(model.state_dict(),os.path.join(out_dir,"model.pth"))
    torch.save(train_val_f1,os.path.join(out_dir,"f1.log.pth"))
    return Y, Y_Pred

In [34]:
# Train on the pickled graph dataset with a class-weighted loss.
# burnin=0 enables best-checkpoint tracking from the first epoch; lr and
# n_epochs override the function defaults (1e-2 and 1500).
# The returned pair holds the labels / predictions of the last validation pass.
y, y_pred = fit_model("/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/graph_dataset/graph_dataset.pkl", 
                      use_weights=True, 
                      lr=1e-3, 
                      out_dir='/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/gnn/gnn_model', 
                      n_epochs=1000, 
                      gpu_id=0, 
                      burnin=0)




0
234 training graphs, 90 validation graphs
training graph sizes:

AttributeError: 'list' object has no attribute 'shape'

In [11]:
import pandas as pd
# Load the dataset in this cell: `data` is otherwise only defined by a cell
# further down the notebook, so this cell raised NameError on a fresh
# top-to-bottom run. pd.read_pickle unpickles arbitrary objects — trusted files only.
data=pd.read_pickle("/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/graph_dataset/graph_dataset.pkl")
# One column per key of the pickled mapping; the rows are its 'train' / 'val' entries.
df=pd.DataFrame(data)

In [9]:
# Read the pickled graph dataset. pd.read_pickle unpickles arbitrary Python
# objects, so only use it on files from a trusted source.
data=pd.read_pickle("/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/graph_dataset/graph_dataset.pkl")

In [12]:
# Display the frame built with pd.DataFrame(data) for a quick structural check.
df

Unnamed: 0,109_A1c_ASAP_tumor_map,10_A1a_ASAP_tumor_map,10_A1b_ASAP_tumor_map,10_A2b_ASAP_tumor_map,110_A2b_ASAP_tumor_map,112_a_ASAP_tumor_map,112_b_ASAP_tumor_map,123_A1a_ASAP_tumor_map,12_A1c_ASAP_tumor_map,14_A1b_ASAP_tumor_map,...,370_A1b_ASAP_tumor_map,370_A2a_ASAP_tumor_map,370_A2b_ASAP_tumor_map,37_A2d_ASAP_tumor_map,61_A1a_ASAP_tumor_map,61_B1a_ASAP_tumor_map,70_A2b_ASAP_tumor_map,7_A1c_ASAP_tumor_map,7_A1d_ASAP_tumor_map,7_A1e_ASAP_tumor_map
train,"[[(x, [20589, 2048]), (edge_index, [2, 102219]...","[[(x, [5563, 2048]), (edge_index, [2, 27309]),...","[[(x, [5719, 2048]), (edge_index, [2, 28113]),...","[[(x, [6098, 2048]), (edge_index, [2, 29956]),...","[[(x, [4220, 2048]), (edge_index, [2, 20574]),...","[[(x, [817, 2048]), (edge_index, [2, 3897]), (...","[[(x, [902, 2048]), (edge_index, [2, 4298]), (...","[[(x, [2626, 2048]), (edge_index, [2, 12492]),...","[[(x, [7126, 2048]), (edge_index, [2, 35066]),...","[[(x, [5176, 2048]), (edge_index, [2, 25114]),...",...,"[[(x, [3522, 2048]), (edge_index, [2, 17230]),...","[[(x, [3051, 2048]), (edge_index, [2, 14891]),...","[[(x, [3236, 2048]), (edge_index, [2, 15876]),...","[[(x, [7697, 2048]), (edge_index, [2, 36761]),...","[[(x, [1708, 2048]), (edge_index, [2, 8198]), ...","[[(x, [2145, 2048]), (edge_index, [2, 10425]),...","[[(x, [1686, 2048]), (edge_index, [2, 8148]), ...","[[(x, [6353, 2048]), (edge_index, [2, 31203]),...","[[(x, [5494, 2048]), (edge_index, [2, 26986]),...","[[(x, [4710, 2048]), (edge_index, [2, 23116]),..."
val,"[[(x, [20706, 2048]), (edge_index, [2, 102704]...","[[(x, [5804, 2048]), (edge_index, [2, 28532]),...","[[(x, [5642, 2048]), (edge_index, [2, 27702]),...","[[(x, [6085, 2048]), (edge_index, [2, 29929]),...","[[(x, [4472, 2048]), (edge_index, [2, 21880]),...","[[(x, [935, 2048]), (edge_index, [2, 4489]), (...","[[(x, [850, 2048]), (edge_index, [2, 4038]), (...","[[(x, [3280, 2048]), (edge_index, [2, 15970]),...","[[(x, [6814, 2048]), (edge_index, [2, 33326]),...","[[(x, [4577, 2048]), (edge_index, [2, 22143]),...",...,"[[(x, [3362, 2048]), (edge_index, [2, 16446]),...","[[(x, [3383, 2048]), (edge_index, [2, 16557]),...","[[(x, [3095, 2048]), (edge_index, [2, 15181]),...","[[(x, [7718, 2048]), (edge_index, [2, 36598]),...","[[(x, [2201, 2048]), (edge_index, [2, 10721]),...","[[(x, [2088, 2048]), (edge_index, [2, 10144]),...","[[(x, [1672, 2048]), (edge_index, [2, 8158]), ...","[[(x, [5921, 2048]), (edge_index, [2, 29101]),...","[[(x, [5516, 2048]), (edge_index, [2, 27058]),...","[[(x, [4639, 2048]), (edge_index, [2, 22765]),..."


In [3]:
# NOTE(review): pandas and torch are already imported in the notebook's first
# cell; these repeated imports are redundant there.
import pandas as pd
import torch
# Load one slide's saved object with torch.load (the file carries a .pkl suffix).
# NOTE(review): `data` is also the name other cells use for the graph dataset
# read via pd.read_pickle; a distinct name would avoid depending on cell order.
data=torch.load("/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Sophie_Chen/cnn_embeddings/109_A1c_ASAP_tumor_map.pkl")