Task : Node Classification

Dataset : https://graphsandnetworks.com/the-cora-dataset/ 

Reference : https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py

In [81]:
import os 
import pandas as pd 
import wget
import numpy as np 
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Fetch the Cora archive over HTTP with the third-party `wget` package.
# The call is the last expression of the cell, so its return value is what
# the notebook displays (the recorded output below is 'cora.tgz').
url = 'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz'
wget.download(url)

'cora.tgz'

In [6]:
# Unpack the downloaded archive with the system `tar` (IPython shell escape).
# The recorded output lists cora/README, cora/cora.cites and cora/cora.content.
!tar -xvzf cora.tgz

cora/
cora/README
cora/cora.cites
cora/cora.content


In [None]:
# encoder = OneHotEncoder().fit(node_list[['subject']])
# sparse_mat = encoder.transform(node_list[['subject']])
# sparse_mat.toarray()

# label = sparse_mat.toarray()
# labels = torch.LongTensor(np.where(label)[1])

In [82]:
# Load the node table from cora.content: a paper id, 1433 feature columns and
# the subject label, tab-separated with no header row.
# Relies on `data_dir`, which is assigned in the data-preparation cell.
node_list = pd.read_csv(
    os.path.join(data_dir, 'cora.content'),
    sep='\t',
    header=None,
    names=['paper_id', *('feature_' + str(ii) for ii in range(1433)), 'subject'],
)


Unnamed: 0,paper_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_1424,feature_1425,feature_1426,feature_1427,feature_1428,feature_1429,feature_1430,feature_1431,feature_1432,subject
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Case_Based


In [2]:
# Load the citation edges and the node table, then re-index paper ids and
# subject labels to consecutive integers and make the edge list symmetric.
data_dir = 'cora'
edge_list = pd.read_csv(os.path.join(data_dir, 'cora.cites'), sep='\t', header=None, names=['target', 'source'])
node_list = pd.read_csv(os.path.join(data_dir, 'cora.content'), sep='\t', header=None, names=['paper_id']+['feature_'+str(ii) for ii in range(1433)]+['subject'])

# Raw paper id -> row position in cora.content; this position is the node
# index used for the adjacency matrix.
idx = np.array(node_list['paper_id'])
idx_map = {j: i for i, j in enumerate(idx)}

# Subject name -> integer class id, numbered in order of first appearance.
label_dict = {j: i for i, j in enumerate(node_list['subject'].unique())}

# NOTE(review): the 1433 feature columns are discarded here and no feature
# matrix is built in this cell — confirm features are extracted elsewhere.
# .copy() makes the two-column frame independent of the full table, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
node_list = node_list[['paper_id', 'subject']].copy()
node_list['paper_id'] = node_list['paper_id'].map(idx_map)
node_list['subject'] = node_list['subject'].map(label_dict)

edge_list['target'] = edge_list['target'].map(idx_map)
edge_list['source'] = edge_list['source'].map(idx_map)

# Reversed copy of every edge (column names swapped) to symmetrise the graph.
for_concat = edge_list.rename(columns={'target': 'source', 'source': 'target'})

edge_list = pd.concat([edge_list, for_concat], axis=0, ignore_index=True)
# If the file lists a pair in both directions (or repeats a line), the
# concatenation contains that (target, source) pair more than once. A scipy
# COO matrix sums duplicate entries, which would give such edges weight 2,
# so keep each directed pair exactly once.
edge_list = edge_list.drop_duplicates().reset_index(drop=True)

In [None]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse COO tensor.

    Args:
        sparse_mx: A scipy sparse matrix in any format; it is converted to
            COO layout and its values are cast to float32.

    Returns:
        A float32 torch sparse COO tensor with the same shape and the same
        stored entries as ``sparse_mx``.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # torch expects a (2, nnz) int64 index tensor: row indices, then columns.
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    # torch.sparse.FloatTensor is a deprecated legacy constructor;
    # torch.sparse_coo_tensor is the supported factory. The float32 dtype is
    # inherited from `values`.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

In [17]:
# Build the adjacency matrix from the edge list, add self-loops, and
# row-normalise it (each row is divided by its row sum) before converting the
# result to a torch sparse tensor.
adj = sp.coo_matrix(
    (np.ones(len(edge_list)), (edge_list['target'], edge_list['source'])),
    shape=(len(idx), len(idx)),
    dtype=np.float32,
)
adj = adj + sp.eye(adj.shape[0])

rowsum = np.array(adj.sum(1))
inv_rowsum = np.power(rowsum, -1).flatten()
# A zero row sum would yield inf; map it to 0 so that row stays all-zero.
inv_rowsum[np.isinf(inv_rowsum)] = 0.
inv = sp.diags(inv_rowsum)

adj = sparse_mx_to_torch_sparse_tensor(inv.dot(adj))

In [56]:
# Fixed train / validation / test node-index splits as LongTensors:
# train = [0, 140), val = [200, 500), test = [500, 1500).
idx_train, idx_val, idx_test = (
    torch.LongTensor(split)
    for split in (range(140), range(200, 500), range(500, 1500))
)



In [55]:
class GCN_Layer(nn.Module):
    """One graph-convolution step: propagate, project linearly, apply ReLU.

    Computes ``relu(Linear(A @ x))`` where ``A`` is the sparse propagation
    matrix passed to ``forward``.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.layer = nn.Linear(in_features, out_features)

    def forward(self, x, A):
        """Return the activated layer output.

        Args:
            x: Node-feature matrix; its last dimension must equal
                ``in_features``.
            A: Matrix multiplied onto ``x`` with ``torch.spmm`` before the
                linear projection.
        """
        propagated = torch.spmm(A, x)
        return F.relu(self.layer(propagated))

class GCN(nn.Module):
    """Stack of ``GCN_Layer`` modules ending in a softmax over classes.

    Args:
        input_dim: Size of each node's input feature vector.
        hidden_dims: Non-empty sequence of hidden layer widths; consecutive
            entries define the hidden-to-hidden layers.
        output_dim: Number of output classes.
        dropout_rate: Dropout probability applied after the input layer and
            after every hidden layer (active only in training mode).
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate

        self.input_layer = GCN_Layer(self.input_dim, self.hidden_dims[0])
        self.output_layer = GCN_Layer(self.hidden_dims[-1], self.output_dim)

        # One hidden layer per consecutive pair of widths in hidden_dims.
        self.layers = nn.ModuleList([
            GCN_Layer(d_in, d_out)
            for d_in, d_out in zip(self.hidden_dims[:-1], self.hidden_dims[1:])
        ])

    def forward(self, x, A):
        """Return per-node class probabilities.

        Args:
            x: Node-feature matrix with ``input_dim`` columns.
            A: Sparse propagation matrix handed to every layer.

        Returns:
            A tensor with ``output_dim`` columns whose rows sum to 1.
        """
        # Each GCN_Layer multiplies by A itself, so the layer receives the raw
        # features plus A. Passing a pre-multiplied tensor as the only argument
        # omitted the required `A` and raised TypeError.
        x = self.input_layer(x, A)
        x = F.dropout(x, p=self.dropout_rate, training=self.training)
        for layer in self.layers:
            x = layer(x, A)
            x = F.dropout(x, p=self.dropout_rate, training=self.training)
        # NOTE(review): GCN_Layer always applies ReLU, so the class scores are
        # clamped to be non-negative before the softmax — confirm this is intended.
        x = self.output_layer(x, A)
        # Explicit dim: the implicit choice is deprecated; dim=1 is the axis it
        # picked for 2-D input (softmax across classes for each node).
        return F.softmax(x, dim=1)
        