In [1]:
from ordered_set import OrderedSet
from six.moves import cPickle as pickle 
from collections import defaultdict
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix

import numpy as np
import torch
import torch_geometric
import networkx as nx

In [2]:
mat = load_npz('/gpfs/data/rsingh47/jbigness/data/E116/hic_sparse_vcsqrt_oe_edge_v7.npz')
hms = np.load('/gpfs/data/rsingh47/jbigness/data/E116/np_hmods_norm_vcsqrt_oe_edge_v7.npy')
labs = np.load('/gpfs/data/rsingh47/jbigness/data/E116/np_nodes_lab_genes_vcsqrt_oe_edge_v7.npy')

In [3]:
mask = torch.tensor(labs[:,-1]).long()
extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)
G = torch_geometric.data.Data(edge_index = extract[0], 
                              edge_attr = extract[1], 
                              x = torch.tensor(hms[:mat.shape[0]]).float().reshape(-1, 1, 100, 5), 
                              y = torch.tensor(labs[:,-2]).long())

In [11]:
from torch_geometric.data import ClusterData, ClusterLoader

cluster_data = ClusterData(G, num_parts=10, recursive=False,
                           save_dir='/gpfs_home/spate116/singhlab/GCN_Integration/notebooks/JX')
train_loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True,
                             num_workers=6)

for d in train_loader:
    print(d)

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/gpfs_home/spate116/ml/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/gpfs_home/spate116/ml/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/gpfs_home/spate116/ml/lib/python3.7/site-packages/torch_geometric/data/cluster.py", line 148, in __collate__
    start = self.cluster_data.partptr[batch].tolist()
RuntimeError: CUDA error: initialization error


In [None]:
from torch_geometric.nn import SAGEConv, ChebConv, TAGConv, GATConv, ARMAConv
import torch.nn as nn
import torch.nn.functional as F

class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, hidden_size1, num_classes):
        super(GCN, self).__init__()
        self.conv1 = TAGConv(in_feats, hidden_size, K = 3)
        self.conv2 = TAGConv(hidden_size, hidden_size1)
        self.conv3 = TAGConv(hidden_size1, num_classes)
        x = 10
        self.encoder = nn.Sequential(
            nn.Conv2d(1, x, (3, 3)),
            nn.LeakyReLU(),
            nn.Dropout2d(),
            nn.Conv2d(x, 2*x, (3, 2)),
            nn.LeakyReLU(),
            nn.Dropout2d(),
            nn.Conv2d(2*x, 1, (3, 2)),
        )

    def forward(self, g, inputs):
        h = self.encoder(inputs).reshape(-1, 94)
        h = torch.tanh(h)
        h = F.dropout(h, training=self.training)
        h = self.conv1(h, g.edge_index)
        h = torch.tanh(h)
        h = F.dropout(h, training=self.training)
        h = self.conv2(h, g.edge_index)
        h = torch.tanh(h)
        h = F.dropout(h, training=self.training)
        h = self.conv3(h, g.edge_index)
        h = F.softmax(h, dim=1)
        return h

In [None]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve

device = torch.device('cuda')
def train_model(net, data_loader, epochs, learning_rate, train_mask, test_mask, mask):
    model = net.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    losses_train = []
    losses_test = []
    best_auc = -1

    pbar = tqdm(range(epochs))
    for epoch in pbar:
        logits = []
        y = []
        for d in data_loader:
            d = d.to(device)
            model.train()
            logits.append(model(d, d.x.float()))
            y.append(d.y)
        
        logits = torch.cat(logits, dim=0)[mask]
        y = torch.cat(y, dim = 0)
        
        loss = F.cross_entropy(logits[train_mask], y[train_mask])
        loss_test = F.cross_entropy(logits[test_mask], y[test_mask])
        losses_train.append(loss.item())
        losses_test.append(loss_test.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        model.eval()
        pred = list(map(lambda x: np.argmax(x, axis = 0), torch.exp(F.log_softmax(logits, 1)).cpu().detach().numpy()))
        auc = roc_auc_score(y[test_mask], [pred[i] for i in test_mask], average='weighted')
        best_auc = best_auc if best_auc > auc else auc

        pbar.set_description('Best Test AUC: %.4f | Train Loss: %.4f | Test Loss: %.4f' % (best_auc, loss.item(), loss_test.item()))

    return losses_train, losses_test, model, best_auc

In [None]:
import random
random.seed(30)
idx = list(range(G.y.shape[0]))
random.shuffle(idx)
train_mask = idx[:10000]
test_mask = idx[10000:]

In [None]:
net = GCN(94, 100, 50, 2)

In [None]:
losses_train, losses_test, model, best_auc = train_model(net, train_loader, 4000, 0.01, train_mask, test_mask, mask)