In [1091]:
import pandas as pd 
import numpy as np

In [1300]:
# Read the per-page NER table from disk
ner_csv_path = '../data/inputs/NER2.csv'
df = pd.read_csv(ner_csv_path)

In [None]:
# Report how many rows were loaded, then display the frame
n_rows = len(df)
print('NER', n_rows)
df

In [None]:
# Give the unnamed leading CSV column an explicit name
df = df.rename(columns={'Unnamed: 0': 'index'})
df

In [None]:
# Zero-fill missing cells (pages with no relevant named entities), then
# zero out any anomalous cells that are empty or whitespace-only strings
blank_cell_pattern = r'^\s*$'
df = df.fillna(0).replace(blank_cell_pattern, 0, regex=True)
df

In [None]:
# Largest number of rows sharing a single url; 1 means no duplicate nodes
url_counts = df["url"].value_counts()
max(url_counts)

In [1305]:
# Changing the index
# NOTE(review): set_index returns a new frame by default and its result is
# not assigned here, so `df` still carries 'index' as an ordinary column.
# That column then ends up as the first feature column of the array built
# from `df` (see the 0, 1, 2, ... values in the printed array below).
# Assigning the result would shrink the feature width by one, so the
# hard-coded input size of the first GCNConv layer would have to change too.
df.set_index('index')
# Drop the non-numeric url column so only numeric columns remain
df = df.drop(columns = ['url'])

In [1306]:
# Convert the frame to a 64-bit integer numpy array.
# np.int64 replaces np.compat.long: the np.compat namespace was a Python 2
# compatibility shim and is no longer available from NumPy 2.0 onwards.
x = df.to_numpy(dtype=np.int64)

In [1307]:
# Show the array dimensions followed by a (truncated) view of its contents
print(x.shape, x, sep="\n")

(10614, 1556)
[[    0     0     0 ...     0     0     0]
 [    1     0     0 ...     0     0     0]
 [    2     0     0 ...     0     0     0]
 ...
 [10611     0     0 ...     0     0     0]
 [10612     0     0 ...     0     0     0]
 [10613     0     0 ...     0     0     0]]


In [1308]:
# Load the node classification labels and preview the first rows
labels_csv_path = "../data/inputs/pages_ranked_with_data_labelled.csv"
df_labels = pd.read_csv(labels_csv_path)
df_labels.head(5)

Unnamed: 0,index,url,label
0,0,https://www.gov.uk/view-prove-immigration-status,0
1,1,https://www.gov.uk/prove-right-to-work,1
2,2,https://www.gov.uk/browse/working,1
3,3,https://www.gov.uk/find-a-job,1
4,4,https://www.gov.uk/check-state-pension,0


In [None]:
# Need to have labels for each node (economic recovery or not economic recovery)
# In the form:
#
# || page_link   || relevant (y/n) ||
# || ------------||----------------||
# || page_link_0 ||        1       ||
# || page_link_1 ||        0       ||
# || page_link_2 ||        1       ||
# ...

# Make sure we have no duplicate nodes: a value of 1 means every url is unique.
# Printed explicitly because a bare expression in the middle of a cell is
# evaluated and then discarded without being displayed.
print('Max rows per url:', max(df_labels["url"].value_counts()))

# Remove irrelevant columns
df_labels = df_labels.drop(columns=["url", "index"])

# Number of label rows remaining
print('Label rows:', df_labels.shape[0])

# Labels as a plain Python list; the tensor conversion happens in a later cell
y = df_labels["label"].tolist()
print(y[0:5])

In [None]:
# Load the table used to derive the train/val/test index masks
masks_csv_path = "../data/inputs/test_masks.csv"
tm = pd.read_csv(masks_csv_path)
tm.head(5)

In [1311]:
# Cut the row index of `tm` into three consecutive ranges, stored as lists
tm_row_labels = tm.index
train_mask = tm_row_labels[:100].tolist()
val_mask = tm_row_labels[100:224].tolist()
test_mask = tm_row_labels[224:].tolist()

In [1312]:
# Sanity check: the mask built by .tolist() is a plain Python list
type(val_mask)

list

In [None]:
# Load the edge list and display it
edge_csv_path = "../edge_list.csv"
df_edges = pd.read_csv(edge_csv_path)
df_edges

In [1314]:
'''
Some pages were removed from the list prior to NER work
due to the page no longer existing or there being < 20 
characters in the page's text field.

It is now necessary to remove these nodes from the edge dataset,
as this was obtained from the N2V approach.
'''

# Creating a list of pages to be removed
# These path strings are matched against the "source" and "target" columns
# of the edge frame in the following cells; rows that match are dropped.
pages_to_remove = [
    "/",
    "/search/all",
    "/find-covid-19-lateral-flow-test-site",
    "/guidance/coronavirus-covid-19-getting-tested",
    "/register-coronavirus-antibody-test",
    "/entering-staying-uk/foreign-nationals-working-in-uk",
    "/business-finance-support/business-cash-advance-uk",
    "/government/publications/applying-to-the-register-of-apprenticeship-training-providers-roatp",
    "/guidance/esfa-business-operations-help-and-support",
    "/business-finance-support/business-growth-calderdale",
    "/business-finance-support/low-carbon-workspaces-buckinghamshire",
    "/log-test-site-covid19-results",
    "/guidance/apprenticeships-resources-for-teachers-and-advisers",
    "/business-finance-support/south-east-creatives-seccads",
    "/business-finance-support/construction-industry-training-board-citb-grants-scheme-england",
    "/government/publications/turkey-list-of-lawyers/list-of-lawyers-in-ankara-and-gaziantep",
    "/business-finance-support/agri-tech-cornwall-cornwall-and-the-isles-of-scilly"
]

In [1315]:
# Keep only edges whose source page is not in the removal list
source_is_removed = df_edges["source"].isin(pages_to_remove)
df_edges_1 = df_edges[~source_is_removed]

In [1316]:
# Keep only edges whose target page is not in the removal list
target_is_removed = df_edges_1["target"].isin(pages_to_remove)
df_edges_2 = df_edges_1[~target_is_removed]

In [1317]:
# Discard every edge row that has a missing value in any column
df_edges_3 = df_edges_2.dropna(axis=0, how="any")

In [1318]:
# Now need to extract the edges.
# Target layout:
#
# [[0, 0, 0, 0, 0 , 0 , 0 , ...],
#  [1, 5, 7, 9, 11, 14, 16, ...]]
#
# Where this represents links existing between page 0 and 1, 5, 7, 9, 11, 14, 16...

# Variant that keeps the edge weights (3 rows); to be used later once
# everything is working:
# df_edges_3 = df_edges_3.drop(columns=["index", "source", "target"])

# Variant without edge weights (2 rows)
columns_to_discard = ["index", "source", "target", "edgeWeight"]
df_edges_3 = df_edges_3.drop(columns=columns_to_discard)

# Number of edges left
len(df_edges_3)

71781

In [None]:
# Transpose so that each edge becomes a column, then cast to 64-bit integers.
# np.int64 replaces np.compat.long: the np.compat namespace was a Python 2
# compatibility shim and is no longer available from NumPy 2.0 onwards.
df_edges = df_edges_3.transpose()
df_edges = df_edges.astype(np.int64)
df_edges

In [None]:
# Convert the transposed edge frame to a 64-bit integer numpy array.
# np.int64 replaces np.compat.long: the np.compat namespace was a Python 2
# compatibility shim and is no longer available from NumPy 2.0 onwards.
z = df_edges.to_numpy(dtype=np.int64)
print(z.shape)
print(z[0:5])

# Creating pytorch-geometric dataset

In [1321]:
import torch
import torch_geometric
from torch_geometric.data import Data

In [1322]:
# Binary node classification: label 1 = relevant, label 0 = not relevant,
# which gives two classes in total
num_classes = 2

In [1323]:
# Converting np arrays (and the label list) to tensors
x = torch.Tensor(x)
y = torch.Tensor(y)
edge_index = torch.Tensor(z)
# train_mask = torch.Tensor(train_mask)
# val_mask = torch.Tensor(val_mask)
# test_mask = torch.Tensor(test_mask)
# torch.Tensor(<int>) treats the integer as a size and allocates an
# uninitialised tensor of that length, so torch.Tensor(2) would hold two
# arbitrary values rather than the number 2. torch.tensor([...]) stores the
# value itself.
num_classes = torch.tensor([num_classes])

In [1324]:
# Cast every tensor from float to 64-bit integer
# (the original note marks this step as the one to enable for the GCN)
x = x.long()
y = y.long()
edge_index = edge_index.long()
num_classes = num_classes.long()

In [1325]:
# test_mask.type()

In [1326]:
# Assemble the pytorch-geometric graph object from the prepared tensors.
# No split masks are passed here.
graph_fields = dict(x=x, edge_index=edge_index, y=y, num_classes=num_classes)
dataset = Data(**graph_fields)

In [1327]:
# Inspect the stored node features; only the last expression of the cell
# (the type) is displayed
dataset.x
type(dataset.x)

torch.Tensor

In [1328]:
# Make sure every tensor stored on the graph object is int64
dataset.x = dataset.x.to(torch.long)
dataset.y = dataset.y.to(torch.long)
dataset.edge_index = dataset.edge_index.to(torch.long)
dataset.num_classes = dataset.num_classes.to(torch.long)

In [1329]:
# Confirm the edge index is stored as 64-bit integers
dataset.edge_index.dtype

torch.int64

In [1330]:
from sklearn.model_selection import train_test_split

In [1331]:
# Contiguous (unshuffled) split over node positions: the leading 2.117% of
# nodes form the training set and the remainder the test set.
# Only the .index of each half is used when the masks are built, so the split
# runs over a plain integer range instead of a Series holding one tensor
# object per node taken from the first feature column.
X_train, X_test = train_test_split(pd.Series(np.arange(x.shape[0])), train_size=0.02117, shuffle=False)

In [None]:
# Display the training half of the split
X_train

In [1333]:
# Boolean node masks, sized from the graph itself rather than a hard-coded
# node count so they stay correct if the input data changes
n_nodes = dataset.num_nodes
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask = torch.zeros(n_nodes, dtype=torch.bool)
# Flag the positions that fell into each half of the split
train_mask[X_train.index] = True
test_mask[X_test.index] = True
# Attach the masks to the graph object
dataset['train_mask'] = train_mask
dataset['test_mask'] = test_mask

In [None]:
# Length of the training mask stored on the graph object
len(dataset.train_mask)

In [None]:
# Display the label tensor stored on the graph object
dataset.y

In [None]:
# Gather some statistics about the graph.
# Node and edge counts as reported by the graph object
print(f'Number of nodes: {dataset.num_nodes}')
print(f'Number of edges: {dataset.num_edges}')
# Ratio of edge count to node count, shown to two decimals
print(f'Average node degree: {dataset.num_edges / dataset.num_nodes:.2f}')
# Summing the boolean mask counts the nodes flagged for training
print(f'Number of training nodes: {dataset.train_mask.sum()}')
# Share of all nodes that are flagged for training
print(f'Training node label rate: {int(dataset.train_mask.sum()) / dataset.num_nodes:.2f}')
# Structural checks provided by the graph object
print(f'Has isolated nodes: {dataset.has_isolated_nodes()}')
print(f'Has self-loops: {dataset.has_self_loops()}')
print(f'Is undirected: {dataset.is_undirected()}')

# Building out the GNN

https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html

In [1337]:
import torch.nn.functional as F 
from torch_geometric.nn import GCNConv 

In [1338]:
# from torch_geometric.datasets import Planetoid

# dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [1339]:
# dataset[0]

In [1340]:
# dataset = data

In [1341]:
# Creating GCN
from numpy import dtype


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Width of each node's feature vector. Defaults to 1556,
            the value previously hard-coded for this notebook's data.
        hidden_channels: Output width of the first convolution. Defaults to 16.
        out_channels: Number of scores produced per node. Defaults to 2.
    """

    def __init__(self, in_channels=1556, hidden_channels=16, out_channels=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Compute per-node log-probabilities.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
                tensors.

        Returns:
            Tensor of log-softmax scores, normalised over dim 1.
        """
        # Cast the features to the dtype of the layer parameters so the model
        # works both as created and after a call such as model.double()
        param_dtype = next(self.parameters()).dtype
        x = data.x.to(param_dtype)
        edge_index = data.edge_index.to(torch.int64)

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [1349]:
# Full-batch training on the CPU
device = torch.device('cpu')
d = dataset.to(device)
model = GCN().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-5)

# Use float64 parameters and switch to training mode
model = model.double()
model.train()
for epoch in range(201):
    optimizer.zero_grad()
    out = model(d)
    # The loss is computed on the training nodes only
    train_rows = d.train_mask
    loss = F.nll_loss(out[train_rows], d.y[train_rows])
    loss.backward()
    print(f'Epoch: {epoch:03d}, Loss: {loss.item():.4f}')
    optimizer.step()

Epoch: 000, Loss: 200.0054
Epoch: 001, Loss: 105.5870
Epoch: 002, Loss: 10.0756
Epoch: 003, Loss: 21.5778
Epoch: 004, Loss: 39.8721
Epoch: 005, Loss: 22.8183
Epoch: 006, Loss: 13.5297
Epoch: 007, Loss: 12.5046
Epoch: 008, Loss: 20.8097
Epoch: 009, Loss: 11.1051
Epoch: 010, Loss: 15.0373
Epoch: 011, Loss: 12.8902
Epoch: 012, Loss: 11.4885
Epoch: 013, Loss: 12.9546
Epoch: 014, Loss: 12.7748
Epoch: 015, Loss: 14.5309
Epoch: 016, Loss: 11.9609
Epoch: 017, Loss: 16.5935
Epoch: 018, Loss: 9.0783
Epoch: 019, Loss: 15.0847
Epoch: 020, Loss: 13.3456
Epoch: 021, Loss: 12.7321
Epoch: 022, Loss: 15.1803
Epoch: 023, Loss: 11.0215
Epoch: 024, Loss: 12.2430
Epoch: 025, Loss: 13.6810
Epoch: 026, Loss: 9.0897
Epoch: 027, Loss: 7.6513
Epoch: 028, Loss: 9.7775
Epoch: 029, Loss: 9.6132
Epoch: 030, Loss: 9.5323
Epoch: 031, Loss: 7.2759
Epoch: 032, Loss: 8.5851
Epoch: 033, Loss: 9.5027
Epoch: 034, Loss: 8.6761
Epoch: 035, Loss: 6.7980
Epoch: 036, Loss: 9.5635
Epoch: 037, Loss: 8.0647
Epoch: 038, Loss: 6.820

In [None]:
# Evaluate on the test nodes. eval() switches dropout off; torch.no_grad()
# skips building the autograd graph, which is not needed for inference.
model.eval()
with torch.no_grad():
    pred = model(dataset).argmax(dim=1)
# Count the test nodes whose predicted class matches the label
correct = (pred[dataset.test_mask] == dataset.y[dataset.test_mask]).sum()
acc = int(correct) / int(dataset.test_mask.sum())
print(f'Accuracy: {acc:.4f}')