In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing as prep
from torch.utils.data import DataLoader
import time
import networkx as nx

In [2]:
!python -c "import ogb; print(ogb.__version__)"

1.2.3


In [3]:
from ogb.nodeproppred import NodePropPredDataset

# Load the ogbn-arxiv dataset. Later cells read d[0][0]['edge_index'],
# d[0][0]['node_feat'] and d[0][1] from this object.
# NOTE(review): `root` is a hard-coded absolute path; consider a configurable
# data directory so the notebook runs on other machines.
d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')

In [230]:
# Dataset selector compared against 'cora' and 'ogb' by the cells below.
data = 'ogb'

In [247]:
# Load the raw tables for the selected dataset:
#   d1 -> per-node table, d2 -> edge list (one row per edge, two columns).
if data == 'cora':
    d1 = pd.read_csv('cora.content', sep='\t', header=None)
    d2 = pd.read_csv('cora.cites', sep='\t', header=None)
elif data == 'ogb':
    # edge_index is transposed before wrapping so that each row is one edge
    # (assumes OGB stores it as (2, num_edges) -- TODO confirm).
    d2 = pd.DataFrame(d[0][0]['edge_index'].T)
    d1 = pd.DataFrame(d[0][0]['node_feat'])
else:
    # Fail here rather than with a NameError on d1/d2 several cells later.
    raise ValueError("Unknown dataset {!r}; expected 'cora' or 'ogb'".format(data))

'''d2 = d2.sort_values(0).iloc[:2000]
partial_idx = list(set(d2[0].unique()) | set(d2[1].unique()))
d1 = d1.iloc[partial_idx]'''

'd2 = d2.sort_values(0).iloc[:2000]\npartial_idx = list(set(d2[0].unique()) | set(d2[1].unique()))\nd1 = d1.iloc[partial_idx]'

In [249]:
# Adjacency matrix
#
# Fix the node order explicitly so that row/column i of A refers to the same
# node as row i of the feature matrix and of the labels. Without a nodelist,
# networkx orders nodes by first appearance in the edge list and leaves out
# nodes that have no edges at all, which silently misaligns A with X.
if data == 'cora':
    # Column 0 of d1 is used as the node id; the labelling cell sorts d1 by it.
    node_order = sorted(d1[0])
else:
    # d1 has one row per node, so its index is used as the node ids.
    node_order = list(d1.index)

G = nx.Graph()
G.add_nodes_from(node_order)      # keeps isolated nodes in the graph
G.add_edges_from(d2.values)

# NOTE(review): .toarray() materialises a dense N x N matrix; for the full
# ogbn-arxiv graph this is unlikely to fit in memory -- consider a subsample
# or a sparse representation.
A = nx.adjacency_matrix(G, nodelist=node_order).toarray()
d2 = pd.DataFrame(A)

In [250]:
# -- My Implementation ---
#d1 = (pd.read_csv('cora.content', sep ='\t', header=None))

# Label Encoder
# Maps raw class values to contiguous ids 0..C-1. `le.classes_` is used by the
# run_* functions to size the models' output layer, so it must be fitted on the
# real labels of whichever dataset is selected.
le = prep.LabelEncoder()

# Feature Matrix and Labels
if data == 'cora':
    # Column 0 is the node id and the last column is the class; sort by id so
    # rows are in ascending-id order, then strip both from the features.
    d1 = d1.sort_values(0).reset_index(drop=True)
    label_col = d1.columns[-1]
    raw_labels = d1[label_col]
    d1 = d1.drop(columns=[0, label_col])
else:
    # The ogb labels live in d[0][1], separate from the features, so every
    # column of d1 is a feature and none is dropped. ravel() turns the label
    # array into the 1-D vector that the loss and accuracy() expect.
    raw_labels = np.asarray(d[0][1]).ravel()

encoded = le.fit_transform(raw_labels)
labels = torch.as_tensor(encoded, dtype=torch.long)

X = np.array(d1)


# Create label distribution for LPA: one-hot rows, shape (num_nodes, num_classes)

labels_distr = np.zeros([len(encoded), len(le.classes_)])
labels_distr[np.arange(len(encoded)), encoded] = 1

In [253]:
# Make Train/Test Indexes
# Random 90/10 node split over the rows of d2. The seed is fixed so that
# Restart & Run All reproduces the same split, and test_idx is built in index
# order instead of arbitrary set order.
SPLIT_SEED = 0
train_idx = list(d2[0].sample(frac=.9, random_state=SPLIT_SEED).index)
_train_lookup = set(train_idx)
test_idx = [node for node in d2.index if node not in _train_lookup]

'''train_A = d2.loc[train_idx, train_idx]
train_X = d1.loc[train_idx]
train_Y = labels[train_idx]

test_A = d2.loc[test_idx, test_idx]
test_X = d1.loc[test_idx]
test_Y = labels[test_idx]'''

def accuracy(output, labels):
    """
    Fraction of rows of `output` whose arg-max along dim 1 equals `labels`.

    params:
        output - 2-D tensor of per-class scores, one row per sample
        labels - tensor of target class ids, one per row of `output`
    returns: 0-dim float64 tensor in [0, 1]
    """
    _, predicted = torch.max(output, dim=1)
    predicted = predicted.type_as(labels)
    n_correct = (predicted == labels).double().sum()
    return n_correct / len(labels)


In [254]:
# params:
# A is the adj matrix
# X is the feature matrix
class GCN_Layer(torch.nn.Module):
    """
    Simple GCN layer computing ``prep(A) @ prev_output @ weight``.

    `weight` is an (in_feats, out_feats) learnable parameter initialised from
    a standard normal distribution.
    """
    def __init__(self, in_feats, out_feats):
        super(GCN_Layer, self).__init__()
        self.in_feats = in_feats
        self.out_feats = out_feats
        self.weight = nn.Parameter(torch.Tensor(np.random.randn(in_feats, out_feats)))

    def forward(self, prev_output, A, prep_A=None):
        '''
        Propagation rule.

        params:
            prev_output - (N, in_feats) array or tensor of node representations
            A           - (N, N) adjacency matrix, array or tensor
            prep_A      - how to prepare A:
                          None   -> use A as given (unnormalized)
                          "norm" -> D^-1/2 (A + I) D^-1/2  (Kipf & Welling)
        returns: (N, out_feats) tensor
        raises: ValueError for any other value of prep_A
        '''
        # as_tensor converts arrays and hands back an already-matching tensor
        # unchanged, so the autograd history of a previous layer's output is kept.
        prev_output = torch.as_tensor(prev_output, dtype=self.weight.dtype,
                                      device=self.weight.device)
        A = torch.as_tensor(A, dtype=self.weight.dtype, device=self.weight.device)

        right_term = torch.mm(prev_output, self.weight)

        # Unnormalized
        if prep_A is None:
            return torch.mm(A, right_term)

        # Normalized with Kipf & Welling
        if prep_A == "norm":
            A_hat = A + torch.eye(A.shape[0], dtype=A.dtype, device=A.device)
            d_inv_sqrt = A_hat.sum(dim=1).pow(-0.5)
            # Scaling rows and columns by broadcasting equals D^-1/2 A_hat D^-1/2
            # without building a dense diagonal matrix or two extra N x N matmuls.
            A_norm = d_inv_sqrt.unsqueeze(1) * A_hat * d_inv_sqrt.unsqueeze(0)
            return torch.mm(A_norm, right_term)

        raise ValueError(
            "Unknown prep_A {!r}; expected None or 'norm'".format(prep_A))

class GCN(torch.nn.Module):
    """
    Simple GCN Model: two stacked GCN_Layer modules with a ReLU in between,
    returning per-node log-probabilities over the classes.
    """
    def __init__(self, nfeat, nhid, nclass):
        """
        params:
            nfeat  - input size of the first layer
            nhid   - output size of the first layer / input size of the second
            nclass - output size of the second layer
        """
        super().__init__()

        # GCN Layers
        self.gc1 = GCN_Layer(nfeat, nhid)
        self.gc2 = GCN_Layer(nhid, nclass)

    def forward(self, X, A, prep_A):
        """
        Run both layers on features X with adjacency A; prep_A is passed
        through to each layer unchanged.
        """
        hidden = F.relu(self.gc1(X, A, prep_A))
        scores = self.gc2(hidden, A, prep_A)
        return F.log_softmax(scores, dim=1)

In [255]:
# Aggregators
class Mean_Agg(torch.nn.Module):
    '''
    GraphSAGE Mean Aggregator

    For every node, averages the representations of its neighbours,
    concatenates that mean to the node's own representation and applies the
    linear map W (optionally followed by ReLU).
    '''
    def __init__(self):
        super(Mean_Agg, self).__init__()

    def forward(self, h, A, W, activation='relu'):
        '''
        params:
            h          - (N, F) node representations
            A          - (N, N) adjacency matrix, array or tensor of any
                         numeric dtype
            W          - (out, 2F) weight applied to [h || neighbour mean]
            activation - 'relu' applies ReLU; anything else leaves the
                         result linear
        returns: (N, out) tensor
        '''
        h = torch.as_tensor(h).float()
        # Cast A to h's dtype: matmul raises on mixed float/integer operands.
        A = torch.as_tensor(A, dtype=h.dtype, device=h.device)

        # Row j of neigh_sum is sum_i A[i, j] * h[i].
        neigh_sum = torch.matmul(A.T, h)
        # Divide by each node's own neighbour weight rather than the
        # whole-graph total; isolated nodes keep a zero vector instead of 0/0.
        degree = A.sum(dim=0)
        degree = torch.where(degree == 0, torch.ones_like(degree), degree)
        neigh_mean = neigh_sum / degree.unsqueeze(1)

        combined = torch.cat((h, neigh_mean), dim=1)
        out = torch.matmul(combined, W.T)

        if activation == 'relu':
            out = F.relu(out)

        # Shape is (N, out) for every activation setting.
        return out
    
class MaxPool_Agg(torch.nn.Module):
    '''
    GraphSAGE Pooling Aggregator

    Placeholder: the pooling logic has not been written yet.
    '''

    def __init__(self, in_feasts, out_feats):
        # nn.Module must be initialised before the instance can be registered
        # as a sub-module, called, or asked for its parameters.
        super(MaxPool_Agg, self).__init__()
        self.in_feats = in_feasts
        self.out_feats = out_feats

    def forward(self, x, neigh):
        # Fail loudly instead of returning None into the rest of the model.
        raise NotImplementedError("MaxPool_Agg.forward is not implemented yet")

In [256]:
# GraphSAGE Models and Layers

class GS_Layer(torch.nn.Module):
    '''
    GraphSAGE Layer

    Placeholder: forward has no logic yet and returns None.
    '''

    def __init__(self):
        super().__init__()

    def forward(self, X, steps, A):
        # X: batch of nodes
        # steps: steps from node for neighborhood
        # A: adjacency matrix to find nodes in neighborhood
        return None
        
     
# GraphSAGE
class GS(torch.nn.Module):
    def __init__(self, nfeat, nhid, nclass, agg='mean', num_samples=25, dropout=.5):
        """
        GraphSAGE Model

        params:
            nfeat       - number of input features per node
            nhid        - hidden size of the first GCN layer
            nclass      - number of output classes
            agg         - 'mean', 'maxpool', or a ready-made aggregator
                          callable taking (h, A, W, activation)
            num_samples - stored on the instance; not used by forward
            dropout     - accepted for interface compatibility; not used
        raises: ValueError if `agg` is an unrecognised string
        """
        super(GS, self).__init__()

        self.nfeat = nfeat
        self.nhid = nhid
        self.nclass = nclass
        self.num_samples = num_samples
        #self.num_layers = len(nhid) + 1

        # Aggregation weight: maps [self || aggregated neighbours] (2*nfeat)
        # back to nfeat so the aggregation step can be repeated K times.
        self.W = nn.Parameter(torch.randn(nfeat, 2*nfeat))

        if isinstance(agg, str):
            if agg == 'mean':
                self.agg = Mean_Agg()
            elif agg == 'maxpool':
                # MaxPool_Agg requires its two size arguments.
                self.agg = MaxPool_Agg(nfeat, nfeat)
            else:
                raise ValueError(
                    "Unknown aggregator {!r}; expected 'mean' or 'maxpool'".format(agg))
        else:
            # A non-string value is kept as the aggregator as-is.
            self.agg = agg

        self.gc1 = GCN_Layer(nfeat, nhid)
        self.gc2 = GCN_Layer(nhid, nclass)

    def forward(self, X, A, K=1, activation='relu', prep_A='norm'):
        """
        params:
            X          - (N, nfeat) node features, array or tensor
            A          - (N, N) adjacency matrix
            K          - number of aggregation rounds before the GCN layers
            activation - forwarded to the aggregator
            prep_A     - forwarded to both GCN layers
        returns: (N, nclass) tensor of log-probabilities
        """
        # shape of H = number of nodes x number of features
        h = torch.as_tensor(X)

        for _ in range(K):
            h = self.agg(h, A, self.W, activation)

        h = F.relu(self.gc1(h.float(), A, prep_A))
        scores = self.gc2(h, A, prep_A)

        # Classify from the second layer's class scores, not from the hidden
        # activations of the first layer.
        return F.log_softmax(scores, dim=1)

In [None]:
# GraphSAGE model
def run_GS(epochs=10, Lambda=10):
    """
    Train the GraphSAGE model full-batch for `epochs` epochs, then print the
    loss and accuracy on the test nodes.

    Reads the notebook globals X, A, labels, le, train_idx and test_idx.

    params:
        epochs - number of optimisation steps (one full-batch step per epoch)
        Lambda - not used by this model; kept so existing calls keep working
    """
    print('GraphSAGE')
    GS_model = GS(X.shape[1], 300, len(le.classes_))
    optimizer = torch.optim.SGD(GS_model.parameters(), lr=.1)
    criterion = torch.nn.CrossEntropyLoss()

    # Train and Test functions
    def train(epoch):
        t = time.time()
        GS_model.train()
        optimizer.zero_grad()
        output = GS_model(X, A)
        loss = criterion(output[train_idx], labels[train_idx])
        acc = accuracy(output[train_idx], labels[train_idx])
        loss.backward()
        optimizer.step()

        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss.item()),
              'acc_train: {:.4f}'.format(acc.item()),
              'time: {:.4f}s'.format(time.time() - t))

    def test():
        GS_model.eval()
        # Evaluation needs no gradients; skipping the autograd graph saves the
        # memory of the intermediate results.
        with torch.no_grad():
            output = GS_model(X, A)
            loss_test = criterion(output[test_idx], labels[test_idx])
            acc_test = accuracy(output[test_idx], labels[test_idx])
        print("Test set results:",
              "loss= {:.4f}".format(loss_test.item()),
              "accuracy= {:.4f}".format(acc_test.item()))

    t_total = time.time()
    for epoch in range(epochs):
        train(epoch)
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

    # Testing
    test()

run_GS(3)

GraphSAGE


In [None]:
# LPA-GCN

class LPA_GCN_Layer(torch.nn.Module):
    """
    One GCN-LPA layer: a learnable edge mask (initialised to A) is multiplied
    element-wise into the adjacency, the result is L1-normalised per row, and
    that matrix propagates both the projected features and the label
    distribution.
    """
    def __init__(self, in_feats, out_feats, A):
        super().__init__()
        self.in_feats, self.out_feats = in_feats, out_feats
        initial_weight = np.random.randn(in_feats, out_feats)
        self.weight = nn.Parameter(torch.Tensor(initial_weight))
        # Learnable mask, starts as a copy of the adjacency itself.
        self.mask_A = nn.Parameter(torch.Tensor(A).clone())

    def forward(self, X, A, Y):
        """
        params:
            X - node features, (N, in_feats)
            A - adjacency matrix, (N, N)
            Y - label distribution, (N, C)
        returns: (propagated features (N, out_feats), propagated labels (N, C))
        """
        feats, adj, label_dist = torch.Tensor(X), torch.Tensor(A), torch.Tensor(Y)

        # Hadamard A', then normalize D^-1 * A'
        masked_adj = adj * self.mask_A
        row_normalised = F.normalize(masked_adj, p=1, dim=1)

        projected = torch.mm(feats, self.weight)
        return torch.mm(row_normalised, projected), torch.mm(row_normalised, label_dist)
    
class GCN_LPA(torch.nn.Module):
    """
    Two stacked LPA_GCN_Layer modules. Features pass through a ReLU between
    the layers; the label distribution is fed from the first layer straight
    into the second.
    """
    def __init__(self, nfeat, nhid, nclass, A):
        super().__init__()

        self.gcn_lpa1 = LPA_GCN_Layer(nfeat, nhid, A)
        self.gcn_lpa2 = LPA_GCN_Layer(nhid, nclass, A)

    def forward(self, X, A, Y):
        """
        returns: (ReLU of the second layer's features,
                  ReLU of the second layer's propagated labels)
        """
        hidden, propagated = self.gcn_lpa1(X, A, Y)
        scores, propagated = self.gcn_lpa2(F.relu(hidden), A, propagated)

        # NOTE(review): ReLU is applied to the final outputs that go into the
        # loss -- confirm this is intended rather than raw scores.
        return F.relu(scores), F.relu(propagated)

In [None]:
def run_LPA_GCN(epochs=10, Lambda=10):
    """
    Train the GCN-LPA model full-batch for `epochs` epochs, then print the
    combined loss and the GCN accuracy on the test nodes.

    Reads the notebook globals X, A, labels, labels_distr, le, train_idx and
    test_idx.

    params:
        epochs - number of optimisation steps (one full-batch step per epoch)
        Lambda - weight of the label-propagation loss in the total loss
    """
    GCN_LPA_model = GCN_LPA(X.shape[1], 300, len(le.classes_), A)
    optimizer = torch.optim.SGD(GCN_LPA_model.parameters(), lr=.1)
    criterion = torch.nn.CrossEntropyLoss()

    # Train
    for epoch in range(epochs):
        t = time.time()
        GCN_LPA_model.train()
        optimizer.zero_grad()
        # NOTE(review): labels_distr holds the labels of every node, test
        # nodes included, so test labels take part in label propagation --
        # confirm whether they should be masked out.
        output, Y_hat = GCN_LPA_model(X, A, labels_distr)

        loss_gcn = criterion(output[train_idx], labels[train_idx])
        loss_lpa = criterion(Y_hat[train_idx], labels[train_idx])

        acc = accuracy(output[train_idx], labels[train_idx])
        loss_train = loss_gcn + Lambda * loss_lpa

        loss_train.backward()
        optimizer.step()

        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc.item()),
              'time: {:.4f}s'.format(time.time() - t))

    # Test
    GCN_LPA_model.eval()
    # Evaluation needs no gradients; skipping the autograd graph saves the
    # memory of the intermediate results.
    with torch.no_grad():
        output, Y_hat = GCN_LPA_model(X, A, labels_distr)

        loss_gcn = criterion(output[test_idx], labels[test_idx])
        loss_lpa = criterion(Y_hat[test_idx], labels[test_idx])
        acc_test = accuracy(output[test_idx], labels[test_idx])

        loss_test = loss_gcn + Lambda * loss_lpa

    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))

run_LPA_GCN(10)

In [None]:
# Train model
def run_GCN(epochs=10, prep_A=None):
    """
    Train the two-layer GCN full-batch for `epochs` epochs, then print the
    loss and accuracy on the test nodes.

    Reads the notebook globals X, A, labels, le, train_idx and test_idx.

    params:
        epochs - number of optimisation steps (one full-batch step per epoch)
        prep_A - adjacency preparation passed to the model on every call
                 (None or 'norm')
    """
    print('GCN')
    # Model and Optimizer

    # GCN takes in the number of input features, the hidden layer size, and
    # the number of classes
    model = GCN(X.shape[1], 300, len(le.classes_))

    optimizer = torch.optim.SGD(model.parameters(), lr=.1)
    criterion = torch.nn.CrossEntropyLoss()

    # Train and Test functions; both use the outer prep_A directly instead of
    # shadowing it with a parameter of their own.
    def train(epoch):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        output = model(X, A, prep_A)
        loss = criterion(output[train_idx], labels[train_idx])
        acc = accuracy(output[train_idx], labels[train_idx])
        loss.backward()
        optimizer.step()

        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss.item()),
              'acc_train: {:.4f}'.format(acc.item()),
              'time: {:.4f}s'.format(time.time() - t))

    def test():
        model.eval()
        # Evaluation needs no gradients; skipping the autograd graph saves the
        # memory of the intermediate results.
        with torch.no_grad():
            output = model(X, A, prep_A)
            loss_test = criterion(output[test_idx], labels[test_idx])
            acc_test = accuracy(output[test_idx], labels[test_idx])
        print("Test set results:",
              "loss= {:.4f}".format(loss_test.item()),
              "accuracy= {:.4f}".format(acc_test.item()))

    t_total = time.time()
    for epoch in range(epochs):
        train(epoch)
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

    # Testing
    test()

run_GCN(3, 'norm')