In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

import time
from datetime import datetime

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader

import torch_geometric.transforms as T

from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt



In [2]:
class GNNStack(nn.Module):
    """Three-layer message-passing network followed by a small MLP head.

    With ``task='node'`` the layers are ``GCNConv`` and one prediction is made
    per node. With ``task='graph'`` the layers are ``GINConv`` and the node
    embeddings are mean-pooled per graph before the head, giving one
    prediction per graph.

    Args:
        input_dim: size of the input node features.
        hidden_dim: width of every conv layer and of the head's hidden layer.
        output_dim: number of output classes.
        task: ``'node'`` or ``'graph'``.

    Raises:
        RuntimeError: if ``task`` is neither ``'node'`` nor ``'graph'``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, task='node'):
        super(GNNStack, self).__init__()
        # Validate first: build_conv_model() treats every non-'node' task as
        # 'graph', so an unknown task would otherwise build GIN layers and
        # only fail afterwards.
        if not (task == 'node' or task == 'graph'):
            raise RuntimeError('Unknown task.')
        self.task = task
        self.dropout = 0.25
        self.num_layers = 3

        # One input conv plus (num_layers - 1) hidden convs; a LayerNorm
        # follows every conv except the last one (see forward()).
        self.convs = nn.ModuleList()
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        self.lns = nn.ModuleList()
        for _ in range(self.num_layers - 1):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))
            self.lns.append(nn.LayerNorm(hidden_dim))

        # post-message-passing
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.Dropout(0.25),
            nn.Linear(hidden_dim, output_dim))

    def build_conv_model(self, input_dim, hidden_dim):
        """Return one conv layer: GCNConv for node tasks, GINConv otherwise."""
        # refer to pytorch geometric nn module for different implementation of GNNs.
        if self.task == 'node':
            return pyg_nn.GCNConv(input_dim, hidden_dim)
        else:
            return pyg_nn.GINConv(nn.Sequential(nn.Linear(input_dim, hidden_dim),
                                  nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)))

    def forward(self, data):
        """Run the stack on a (batched) graph.

        Args:
            data: object exposing ``x``, ``edge_index``, ``batch``,
                ``num_node_features`` and ``num_nodes``.

        Returns:
            A pair ``(emb, log_probs)``: ``emb`` is the output of the last
            conv layer before ReLU/dropout, ``log_probs`` the log-softmax
            class scores (per node, or per graph for ``task='graph'``).
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        if data.num_node_features == 0:
            # Featureless graphs get a constant feature, created on the same
            # device as the graph so the model also works off-CPU.
            x = torch.ones(data.num_nodes, 1, device=edge_index.device)

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            emb = x
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            if not i == self.num_layers - 1:
                x = self.lns[i](x)

        if self.task == 'graph':
            x = pyg_nn.global_mean_pool(x, batch)

        x = self.post_mp(x)

        return emb, F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Negative log-likelihood of ``label`` under log-probabilities ``pred``."""
        return F.nll_loss(pred, label)

In [3]:
class CustomConv(pyg_nn.MessagePassing):
    """Sum-aggregation conv layer with separate self and neighbour transforms.

    The output is ``lin_self(x)`` plus the "add"-aggregated messages, where
    each message is the ``lin``-transformed feature ``x_j`` gathered by
    ``propagate``. Self-loops are stripped from ``edge_index`` first, so a
    node's own features enter only through ``lin_self``.

    Args:
        in_channels: size of the input node features.
        out_channels: size of the output node features.
    """

    def __init__(self, in_channels, out_channels):
        super(CustomConv, self).__init__(aggr='add')  # "Add" aggregation.
        self.lin = nn.Linear(in_channels, out_channels)
        self.lin_self = nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply the layer.

        Args:
            x: node features, shape [N, in_channels].
            edge_index: edge list, shape [2, E].

        Returns:
            Updated node features, shape [N, out_channels].
        """
        # Remove self-loops: the self contribution is handled by lin_self.
        edge_index, _ = pyg_utils.remove_self_loops(edge_index)

        # Transform node features once for the self term and once for the
        # messages sent along edges.
        self_x = self.lin_self(x)
        neighbour_x = self.lin(x)

        return self_x + self.propagate(edge_index, size=(x.size(0), x.size(0)), x=neighbour_x)

    def message(self, x_i, x_j, edge_index, size):
        """Return the message for each edge: the transformed feature ``x_j``.

        ``x_j`` has shape [E, out_channels]. The other arguments are accepted
        so the signature stays unchanged, but they are not used. A symmetric
        degree normalisation used to be computed here and was never applied
        to the message; it was removed because it had no effect on the
        output.
        """
        return x_j

    def update(self, aggr_out):
        """Return the aggregated messages unchanged ([N, out_channels])."""
        return aggr_out

In [4]:
def train(dataset, task, writer, epochs=200, batch_size=64, hidden_dim=32,
          lr=0.01, train_fraction=0.8, eval_every=10):
    """Train a ``GNNStack`` on ``dataset`` and return the trained model.

    For ``task='graph'`` the first ``train_fraction`` of the dataset (in its
    existing order) is used for training and the rest for evaluation. For any
    other task the same loader serves both purposes; training then uses
    ``batch.train_mask`` and evaluation is delegated to ``test()`` with its
    default mask.

    Args:
        dataset: dataset exposing ``num_node_features`` and ``num_classes``.
        task: ``'node'`` or ``'graph'``; forwarded to ``GNNStack``.
        writer: summary writer; receives the scalars "loss" (every epoch) and
            "test accuracy" (every ``eval_every`` epochs).
        epochs: number of passes over the training loader.
        batch_size: batch size of both loaders.
        hidden_dim: hidden width passed to ``GNNStack``.
        lr: Adam learning rate.
        train_fraction: share of the dataset used for training (graph task).
        eval_every: evaluate and print every this many epochs.

    Returns:
        The trained ``GNNStack``.
    """
    if task == 'graph':
        split = int(len(dataset) * train_fraction)
        loader = DataLoader(dataset[:split], batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(dataset[split:], batch_size=batch_size, shuffle=True)
    else:
        test_loader = loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # build model; featureless datasets get a single constant input feature
    model = GNNStack(max(dataset.num_node_features, 1), hidden_dim, dataset.num_classes, task=task)
    opt = optim.Adam(model.parameters(), lr=lr)

    # train
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            embedding, pred = model(batch)
            label = batch.y
            if task == 'node':
                # only training nodes contribute to the loss
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            # weight the batch-mean loss by the number of graphs in the batch
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        writer.add_scalar("loss", total_loss, epoch)

        if epoch % eval_every == 0:
            test_acc = test(test_loader, model)
            print("Epoch {}. Loss: {:.4f}. Test accuracy: {:.4f}".format(
                epoch, total_loss, test_acc))
            writer.add_scalar("test accuracy", test_acc, epoch)

    return model


In [5]:
def test(loader, model, is_validation=False):
    model.eval()

    correct = 0
    for data in loader:
        with torch.no_grad():
            emb, pred = model(data)
            pred = pred.argmax(dim=1)
            label = data.y

        if model.task == 'node':
            mask = data.val_mask if is_validation else data.test_mask
            # node classification: only evaluate on nodes in test set
            pred = pred[mask]
            label = data.y[mask]

        correct += pred.eq(label).sum().item()

    if model.task == 'graph':
        total = len(loader.dataset)
    else:
        total = 0
        for data in loader.dataset:
            total += torch.sum(data.test_mask).item()
    return correct / total

In [31]:
import gzip,os
import pandas as pd
import torch
from torch_geometric.data import Data, Dataset, DataLoader

from torch_geometric.utils import one_hot

def read_gzipped_csv(file_path):
    """Read a gzip-compressed CSV file that has no header row.

    Args:
        file_path: path to the ``.csv.gz`` file.

    Returns:
        A DataFrame whose columns are numbered 0..k-1; the first line of the
        file is treated as data, not as column names.
    """
    # The context manager closes the gzip handle once parsing is done.
    with gzip.open(file_path) as handle:
        return pd.read_csv(handle, header=None)


class MyDataset(Dataset):
    """Graph dataset read from gzip-compressed CSV files found in ``root``.

    Files read: ``graph_labels``, ``num_nodes``, ``num_edges``,
    ``node_features``, ``edges`` and ``edge_features`` (each ``.csv.gz``).
    The first three are indexed per graph. The last three are sliced with
    running offsets derived from the per-graph counts, so the rows of graph
    ``i`` are expected to follow those of graphs ``0..i-1``. Graphs whose
    label is NaN are dropped.

    Args:
        root: directory containing the CSV files.
        transform: forwarded to the base ``Dataset``.
        pre_transform: forwarded to the base ``Dataset``.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super(MyDataset, self).__init__(root, transform, pre_transform)

        self.graph_labels = self._read_table(root, 'graph_labels.csv.gz')
        self.num_nodes = self._read_table(root, 'num_nodes.csv.gz')
        self.num_edges = self._read_table(root, 'num_edges.csv.gz')
        self.node_features = self._read_table(root, 'node_features.csv.gz')
        self.edges = self._read_table(root, 'edges.csv.gz')
        self.edge_features = self._read_table(root, 'edge_features.csv.gz')

        # Row offsets are computed over ALL graphs, before unlabeled ones are
        # dropped, because they index into the unfiltered feature/edge tables.
        self.node_features_start = self._exclusive_offsets(self.num_nodes)
        self.edge_features_start = self._exclusive_offsets(self.num_edges)

        # Keep only graphs that have a label.
        valid_indices = self.graph_labels.iloc[:, 0].notna()
        self.graph_labels = self.graph_labels[valid_indices]
        self.num_nodes = self.num_nodes[valid_indices]
        self.num_edges = self.num_edges[valid_indices]
        self.node_features_start = self.node_features_start[valid_indices]
        self.edge_features_start = self.edge_features_start[valid_indices]

    @staticmethod
    def _read_table(root, file_name):
        """Load one gzip-compressed CSV file from the dataset directory."""
        return read_gzipped_csv(os.path.join(root, file_name))

    @staticmethod
    def _exclusive_offsets(counts):
        """Return a one-column ('start') frame of running row offsets.

        Entry ``i`` is the sum of the first column of ``counts`` over rows
        ``0..i-1``, i.e. the first row belonging to graph ``i``.
        """
        sizes = counts.iloc[:, 0]
        return pd.DataFrame({'start': sizes.cumsum() - sizes})

    def len(self):
        """Number of labeled graphs."""
        return len(self.graph_labels)

    def get(self, idx):
        """Build the ``Data`` object of the ``idx``-th labeled graph.

        Returns:
            ``Data`` with float32 ``x`` and ``edge_attr``, a long
            ``edge_index`` of shape [2, num_edges] and a long label ``y`` of
            shape [1] (the stored label rounded to the nearest integer).
        """
        label = torch.tensor([self.graph_labels.iloc[idx, 0]], dtype=torch.float32).round().long()

        # Row range of this graph in the node feature table.
        node_start = int(self.node_features_start.iloc[idx, 0])
        node_stop = node_start + int(self.num_nodes.iloc[idx, 0])
        node_features = torch.tensor(self.node_features.iloc[node_start:node_stop, :].values, dtype=torch.float32)

        # Edges and edge features share the same row range.
        edge_start = int(self.edge_features_start.iloc[idx, 0])
        edge_stop = edge_start + int(self.num_edges.iloc[idx, 0])
        edge_features = torch.tensor(self.edge_features.iloc[edge_start:edge_stop, :].values, dtype=torch.float32)
        edges = torch.tensor(self.edges.iloc[edge_start:edge_stop, :].values, dtype=torch.long)

        # The edge table stores one edge per row; Data expects shape [2, E].
        return Data(x=node_features, edge_index=edges.t().contiguous(), edge_attr=edge_features, y=label)

In [32]:
# One TensorBoard log directory per run, named after the start time.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
writer = SummaryWriter("./log/" + run_stamp)

# Training split of dataset_2 (absolute path on the author's machine).
train_dataset_path = "/home/slowblow/sem7/col761/ass-git/A3/dataset/dataset_2/train"
dataset = MyDataset(root=train_dataset_path)

In [34]:
# Sanity check: print the edge list of the first graph only, then the
# number of classes found in the dataset.
for i, data in enumerate(dataset):
    print(data.edge_index)
    break

print(dataset.num_classes)


tensor([[0, 1, 1, 2, 1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 6, 8, 8, 9, 9, 3],
        [1, 0, 2, 1, 3, 1, 4, 3, 5, 4, 6, 5, 7, 6, 8, 6, 9, 8, 3, 9]])
2


In [35]:
# Train a graph-level classifier on the training dataset.
task = "graph"
model = train(dataset=dataset, task=task, writer=writer)



Epoch 0. Loss: 0.5033. Test accuracy: 0.8033
Epoch 10. Loss: 0.4854. Test accuracy: 0.8033
Epoch 20. Loss: 0.4849. Test accuracy: 0.8033
Epoch 30. Loss: 0.4850. Test accuracy: 0.8033
Epoch 40. Loss: 0.4845. Test accuracy: 0.8033
Epoch 50. Loss: 0.4853. Test accuracy: 0.8033
Epoch 60. Loss: 0.4844. Test accuracy: 0.8033
Epoch 70. Loss: 0.4852. Test accuracy: 0.8033
Epoch 80. Loss: 0.4848. Test accuracy: 0.8033
Epoch 90. Loss: 0.4854. Test accuracy: 0.8033
Epoch 100. Loss: 0.4847. Test accuracy: 0.8033
Epoch 110. Loss: 0.4849. Test accuracy: 0.8033
Epoch 120. Loss: 0.4848. Test accuracy: 0.8033
Epoch 130. Loss: 0.4847. Test accuracy: 0.8033
Epoch 140. Loss: 0.4847. Test accuracy: 0.8033
Epoch 150. Loss: 0.4846. Test accuracy: 0.8033
Epoch 160. Loss: 0.4851. Test accuracy: 0.8033
Epoch 170. Loss: 0.4852. Test accuracy: 0.8033
Epoch 180. Loss: 0.4849. Test accuracy: 0.8033
Epoch 190. Loss: 0.4849. Test accuracy: 0.8033


In [39]:
# Evaluate the trained model on the validation split.
test_path = "/home/slowblow/sem7/col761/ass-git/A3/dataset/dataset_2/valid"
test_dataset = MyDataset(root=test_path)

# Class balance of the split: graphs labeled 0 versus everything else.
count1 = 0
count0 = 0
for g in test_dataset:
    is_class0 = g.y.item() == 0
    count0 += int(is_class0)
    count1 += int(not is_class0)

print(count1, count0)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

test_acc = test(test_loader, model)
print(test_acc)

252 536




0.6802030456852792


0.6802030456852792
