## Cora Dataset

In [1]:
raw_dataset_path = "/Users/kashyappatel/datasets/cora"

In [2]:
content_path = raw_dataset_path + "/cora.content"
cite_path = raw_dataset_path + "/cora.cites"

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.nn import Linear
import torch.nn.functional as F
from types import SimpleNamespace

In [4]:
with open(content_path, "r") as fp:
    contents = fp.readlines()

In [5]:
with open(cite_path, "r") as fp:
    cites = fp.readlines()

In [6]:
contents = np.array([np.array(l.strip().split("\t")) for l in contents])
contents[0]

array(['31336', '0', '0', ..., '0', '0', 'Neural_Networks'], dtype='<U22')

In [7]:
contents.shape

(2708, 1435)

In [8]:
paper_list, feat_list, label_list = np.split(contents, [1,-1], axis=1)
(paper_list.shape, feat_list.shape, label_list.shape)

((2708, 1), (2708, 1433), (2708, 1))

In [9]:
paper_list, label_list = np.squeeze(paper_list), np.squeeze(label_list)
(paper_list.shape, label_list.shape)

((2708,), (2708,))

In [10]:
feat_Matrix = torch.Tensor(feat_list.astype(np.float32))

In [11]:
labels = list(set(label_list))
label_dict = dict([(key, val) for val, key in enumerate(labels)])

In [12]:
label_list = np.array([label_dict[i] for i in label_list])
label_list = torch.from_numpy(label_list)

In [13]:
num_features = len(feat_Matrix[0])
num_classes = len(labels)
(num_features, num_classes)

(1433, 7)

In [14]:
def accuracy(y_pred, y_true):
    return torch.sum(y_pred == y_true) / len(y_true)

In [15]:
class MLP(torch.nn.Module):
    
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.linear1 = Linear(dim_in, dim_h)
        self.linear2 = Linear(dim_h, dim_out)

    def forward(self, x):
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return F.log_softmax(x, dim=1)
    
    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if epoch % 20 == 0:
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}%')
    
    @torch.no_grad()
    def test(self, data):
        self.eval()
        out = self(data.x)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

In [16]:
mlp = MLP(num_features, 16, num_classes)
print(mlp)

MLP(
  (linear1): Linear(in_features=1433, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=7, bias=True)
)


In [17]:
num_samples = feat_Matrix.shape[0]
num_samples

2708

In [18]:
train_mask = torch.zeros(num_samples, dtype=torch.bool)
val_mask = torch.zeros(num_samples, dtype=torch.bool)
test_mask = torch.zeros(num_samples, dtype=torch.bool)

In [19]:
train_mask[:2000] = True
val_mask[2000:2350] = True
test_mask[2350:] = True

In [20]:
data = SimpleNamespace(
    x=feat_Matrix,
    y=label_list,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask
)

data

namespace(x=tensor([[0., 0., 0.,  ..., 0., 0., 0.],
                    [0., 0., 0.,  ..., 0., 0., 0.],
                    [0., 0., 0.,  ..., 0., 0., 0.],
                    ...,
                    [0., 0., 0.,  ..., 0., 0., 0.],
                    [0., 0., 0.,  ..., 0., 0., 0.],
                    [0., 0., 0.,  ..., 0., 0., 0.]]),
          y=tensor([0, 2, 6,  ..., 1, 5, 0]),
          train_mask=tensor([ True,  True,  True,  ..., False, False, False]),
          val_mask=tensor([False, False, False,  ..., False, False, False]),
          test_mask=tensor([False, False, False,  ...,  True,  True,  True]))

In [21]:
feat_Matrix.dtype, label_list.dtype

(torch.float32, torch.int64)

In [22]:
mlp.fit(data, epochs=50)

Epoch   0 | Train Loss: 2.003 | Train Acc: 10.25% | Val Loss: 2.02 | Val Acc: 8.86%
Epoch  20 | Train Loss: 0.503 | Train Acc: 91.20% | Val Loss: 1.01 | Val Acc: 66.00%
Epoch  40 | Train Loss: 0.107 | Train Acc: 99.05% | Val Loss: 0.85 | Val Acc: 69.14%


In [23]:
acc = mlp.test(data)
print(f'MLP test accuracy: {acc*100:.2f}%')

MLP test accuracy: 70.67%


## Facebook Dataset

In [24]:
from torch_geometric.datasets import FacebookPagePage

In [25]:
dataset = FacebookPagePage(root=".")
data = dataset[0]
data

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


Data(x=[22470, 128], edge_index=[2, 342004], y=[22470])

In [26]:
print(f'Dataset: {dataset}')
print('-----------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Dataset: FacebookPagePage()
-----------------------
Number of graphs: 1
Number of nodes: 22470
Number of features: 128
Number of classes: 4


In [27]:
print(f'\nGraph:')
print('------')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')


Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: True


In [28]:
data.train_mask = range(18000)
data.val_mask = range(18001, 20000)
data.test_mask = range(20001, 22470)

In [29]:
from torch_geometric.utils import to_dense_adj

In [30]:
adjacency = to_dense_adj(data.edge_index)[0]
adjacency += torch.eye(len(adjacency))
adjacency

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [31]:
mlp = MLP(dataset.num_features, 16, dataset.num_classes)
print(mlp)

MLP(
  (linear1): Linear(in_features=128, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=4, bias=True)
)


In [32]:
mlp.fit(data, epochs=100)

Epoch   0 | Train Loss: 1.388 | Train Acc: 25.68% | Val Loss: 1.39 | Val Acc: 24.66%
Epoch  20 | Train Loss: 0.656 | Train Acc: 73.98% | Val Loss: 0.67 | Val Acc: 73.09%
Epoch  40 | Train Loss: 0.575 | Train Acc: 76.94% | Val Loss: 0.62 | Val Acc: 74.69%
Epoch  60 | Train Loss: 0.547 | Train Acc: 78.19% | Val Loss: 0.60 | Val Acc: 75.64%
Epoch  80 | Train Loss: 0.530 | Train Acc: 78.82% | Val Loss: 0.59 | Val Acc: 75.44%
Epoch 100 | Train Loss: 0.517 | Train Acc: 79.56% | Val Loss: 0.59 | Val Acc: 75.94%


In [33]:
acc = mlp.test(data)
print(f'\nMLP test accuracy: {acc*100:.2f}%\n')


MLP test accuracy: 74.93%

