# Setup

In [None]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
!pip install torch-geometric
!pip install -q captum
!pip install sklearn

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils
from torch_geometric.utils import to_networkx

import time
from datetime import datetime

import numpy as np
import torch
import torch.optim as optim
import networkx as nx

from torch_geometric.data import DataLoader
from torch_geometric.data import Data

import torch_geometric.transforms as T

from torch.utils.tensorboard import SummaryWriter
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import random
import pickle

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# use encoding="bytes" to convert python2 binaries to python3
pickle_off = open("/content/drive/MyDrive/COGS185Final/train.cpkl","rb")
train_list, train_data = pickle.load(pickle_off, encoding="bytes")
pickle_off = open("/content/drive/MyDrive/COGS185Final/test.cpkl","rb")
test_list, test_data = pickle.load(pickle_off, encoding="bytes")

In [6]:
def transformDataset(dataset_in):
    datalist1 = []
    datalist2 = []
    for data in dataset_in:
        x1 = torch.tensor(data[b'l_vertex'], dtype=torch.float) # x1
        x2 = torch.tensor(data[b'r_vertex'], dtype=torch.float) # x2
        nd_dim1 = data[b'l_hood_indices'].shape[0] # node dimension
        nh_dim1 = data[b'l_hood_indices'].shape[1] # neighborhood dimension
        nd_dim2 = data[b'r_hood_indices'].shape[0] # node dimension
        nh_dim2 = data[b'r_hood_indices'].shape[1] # neighborhood dimension
        sid1 = np.arange(nd_dim1).reshape(nd_dim1,1) # sid is self index
        sid2 = np.arange(nd_dim2).reshape(nd_dim2,1) # sid is self index
        bsid1 = sid1.copy()
        bsid2 = sid2.copy()
        for i in range(nh_dim1-1):
            sid1 = np.hstack((sid1,bsid1))
        for i in range(nh_dim2-1):
            sid2 = np.hstack((sid2,bsid2))
        sid1 = sid1.reshape(nd_dim1,nh_dim1,1)
        sid2 = sid2.reshape(nd_dim2,nh_dim2,1)
        nid1 = data[b'l_hood_indices']
        nid2 = data[b'r_hood_indices']
        edge_id1 = np.stack([sid1,nid1], axis=2)
        edge_id1 = edge_id1.reshape(nd_dim1,nh_dim1,2)
        edge_id2 = np.stack([sid2,nid2], axis=2)
        edge_id2 = edge_id2.reshape(nd_dim2,nh_dim2,2)
        edge_index1 = edge_id1.reshape(nd_dim1*nh_dim1,2)
        edge_index1 = edge_index1.T
        edge_index1 = torch.tensor(edge_index1, dtype=torch.long) # edge_index1
        edge_index2 = edge_id2.reshape(nd_dim2*nh_dim2,2)
        edge_index2 = edge_index2.T
        edge_index2 = torch.tensor(edge_index2, dtype=torch.long) # edge_index2
        ef1 = data[b'l_edge'].shape[2]
        ef2 = data[b'r_edge'].shape[2]
        edge_attr1 = data[b'l_edge'].reshape(nd_dim1*nh_dim1,ef1)
        edge_attr2 = data[b'r_edge'].reshape(nd_dim2*nh_dim2,ef2)
        edge_attr1 = torch.tensor(edge_attr1, dtype=torch.float) # edge_attr1
        edge_attr2 = torch.tensor(edge_attr2, dtype=torch.float) # edge_attr2
        pair_label = data[b'label'] #residual pair connectivity label
        y = torch.tensor(pair_label, dtype=torch.float) # y

        # datalist.append(Data(x1=x1,x2=x2,y=y, edge_index1=edge_index1, edge_index2=edge_index2, edge_attr1=edge_attr1, edge_attr2=edge_attr2))
        datalist1.append(Data(x=x1,y=y, edge_index=edge_index1))
        datalist2.append(Data(x=x2,y=y, edge_index=edge_index2))
        
    return datalist1, datalist2

In [7]:
train_datalist1, train_datalist2 = transformDataset(train_data)
test_datalist1, test_datalist2 = transformDataset(test_data)

# Model

In [8]:
class PairGCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(PairGCN, self).__init__()
        self.convs = nn.ModuleList()
        self.convs.append(pyg_nn.GCNConv(input_dim, hidden_dim))
        for l in range(2): # Can adjust
            self.convs.append(pyg_nn.GCNConv(hidden_dim, hidden_dim))
        # Linear layer: [node1_dim*node2_dim, 70+70] --> [node1_dim*node2_dim, 2]; hidden_dim * 2 = 70 + 70
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim * 2), nn.Dropout(0.5), 
            nn.Linear(hidden_dim * 2, output_dim))
        self.num_layers = 2 # Can adjust
        self.dropout = 0.5

    def forward(self, data1, data2):
        data1 = data1.to(device)
        data2 = data2.to(device)
        x1, edge_index1, y = data1.x, data1.edge_index, data1.y
        x2, edge_index2 = data2.x, data2.edge_index

        # y is only used to extract the example pairs

        for i in range(self.num_layers):
            x1 = self.convs[i](x1, edge_index1)
            x2 = self.convs[i](x2, edge_index2) # The weights are shared
            emb1 = x1
            emb2 = x2
            x1 = F.relu(x1)
            x2 = F.relu(x2)
            x1 = F.dropout(x1, p=self.dropout, training=self.training)
            x2 = F.dropout(x2, p=self.dropout, training=self.training)

        # Merge Residues Example Pairs
        pairs = self.merge(x1, x2, y)

        # Linear Layer Classification
        out = self.classifier(pairs)

        return [emb1,emb2], F.log_softmax(out, dim=1)

    def merge(self, x1, x2, y):
        pair_list = y[:,:-1]
        left = pair_list[:,0].type(torch.long)
        right = pair_list[:,1].type(torch.long)
        return torch.hstack((x1[left],x2[right]))

    def loss(self, pred, label):
        label = label[:,-1]
        label = label.type(torch.long)
        label[label==-1]=0
        weight = torch.tensor([1,10], dtype=torch.float).to(device) # To compensate for the uneven distribution of labels
        return F.nll_loss(pred, label, weight)

In [9]:
def train(train_datalist1,train_datalist2, test_datalist1,test_datalist2, writer): # test_datalist added to check test scores
    train_loader1 = DataLoader(train_datalist1, batch_size=128, shuffle=False)
    train_loader2 = DataLoader(train_datalist2, batch_size=128, shuffle=False)
    test_loader1 = DataLoader(test_datalist1, batch_size=128, shuffle=False)
    test_loader2 = DataLoader(test_datalist2, batch_size=128, shuffle=False)

    # build model
    model = PairGCN(input_dim=70, hidden_dim=70, output_dim=2)
    model = model.to(device)
    opt = optim.Adam(model.parameters(), lr=0.01)
    
    # train
    for epoch in range(1000):
        total_loss = 0
        model.train()
        for (batch1, batch2) in zip(train_loader1, train_loader2):
            opt.zero_grad()
            embedding, pred = model(batch1, batch2)
            label = batch1.y # labels are the same for both batches
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch1.num_graphs
        total_loss /= len(train_loader1.dataset)
        writer.add_scalar("loss", total_loss, epoch)

        if epoch % 50 == 0:
            # test_acc = test(test_loader1, test_loader2, model)
            test_acc = test(test_loader1, test_loader2, model)
            print("Epoch {}. Loss: {:.4f}. Test AUC: {:.4f}".format(
                epoch, total_loss, test_acc))
            writer.add_scalar("test AUC", test_acc, epoch)

    return model

def test(loader1, loader2, model, is_validation=False):
    model.eval()

    correct = 0
    total = 0
    for (data1, data2) in zip(loader1, loader2):
        data1 = data1.to(device)
        data2 = data2.to(device)
        # indices = torch.randperm(len(pictures))[:10]
        with torch.no_grad():
            emb, pred = model(data1, data2)
            pred = pred.argmax(dim=1)
            label = data1.y
        label = label[:,-1]
        label = label.type(torch.long)
        label[label==-1]=0
        # print(pred.sum())
        # print(label.sum())
        # correct += pred.eq(label).sum()#.item()
        # total += len(label)
        label = label.detach().cpu().numpy()
        pred = pred.detach().cpu().numpy()
    
    # total = len(loader1.dataset)
    return roc_auc_score(label, pred)

# Train & Test

In [10]:
writer = SummaryWriter()

model = train(train_datalist1, train_datalist2, test_datalist1, test_datalist2, writer)

Epoch 0. Loss: 0.7014. Test AUC: 0.5000
Epoch 50. Loss: 0.6852. Test AUC: 0.4750
Epoch 100. Loss: 0.6804. Test AUC: 0.5004
Epoch 150. Loss: 0.6792. Test AUC: 0.5095
Epoch 200. Loss: 0.6779. Test AUC: 0.5015
Epoch 250. Loss: 0.6772. Test AUC: 0.5417
Epoch 300. Loss: 0.6801. Test AUC: 0.5006
Epoch 350. Loss: 0.6738. Test AUC: 0.5119
Epoch 400. Loss: 0.6714. Test AUC: 0.5355
Epoch 450. Loss: 0.6729. Test AUC: 0.5264
Epoch 500. Loss: 0.6753. Test AUC: 0.5356
Epoch 550. Loss: 0.6694. Test AUC: 0.5375
Epoch 600. Loss: 0.6712. Test AUC: 0.5439
Epoch 650. Loss: 0.6724. Test AUC: 0.5234
Epoch 700. Loss: 0.6701. Test AUC: 0.5216
Epoch 750. Loss: 0.6675. Test AUC: 0.5173
Epoch 800. Loss: 0.6723. Test AUC: 0.5191
Epoch 850. Loss: 0.6673. Test AUC: 0.5150
Epoch 900. Loss: 0.6733. Test AUC: 0.5078
Epoch 950. Loss: 0.6679. Test AUC: 0.5266


# Visualization

In [11]:
!tensorboard dev upload --logdir runs \
--name "My latest experiment" \
--description "Simple comparison of several hyperparameters"


***** TensorBoard Uploader *****

This will upload your TensorBoard logs to https://tensorboard.dev/ from
the following directory:

runs

This TensorBoard will be visible to everyone. Do not upload sensitive
data.

Your use of this service is subject to Google's Terms of Service
<https://policies.google.com/terms> and Privacy Policy
<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service
<https://tensorboard.dev/policy/terms/>.

This notice will not be shown again while you are logged into the uploader.
To log out, run `tensorboard dev auth revoke`.

Continue? (yes/NO) yes

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=373649185512-8v619h5kft38l4456nm2dj4ubeqsrvh6.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email&state=7f3oNB6EgAGLauFt73bZs7IB7oHXBw&prompt=consent&access_type=offline
Enter th