In [649]:
import pandas as pd 
import numpy as np

In [727]:
# Read the NER input table from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/inputs/NER2.csv')

In [None]:
# Report the number of rows that were loaded, then display the frame
print('NER', len(df))
df

In [None]:
# Give the unnamed leading CSV column the explicit name 'index'
df = df.rename(columns={'Unnamed: 0': 'index'})
df

In [None]:
# Null values (pages with no relevant named entities) become 0, and so do
# anomalous entries that are empty or whitespace-only strings
df = df.fillna(0).replace(r'^\s*$', 0, regex=True)
df

In [None]:
# Largest number of times any single url occurs; a value of 1 means no
# node is duplicated
max(df.loc[:, "url"].value_counts())

In [732]:
# Changing the index
# set_index returns a new frame rather than modifying df, so the result has to
# be assigned; otherwise the 'index' column stays in the frame and ends up as a
# column of the feature matrix built from df afterwards.
# The 'url' column is dropped as well so it is not part of that matrix.
df = df.set_index('index').drop(columns=['url'])

In [733]:
# Convert to numpy
# np.compat.long is a deprecated Python-2 compatibility alias for int; an
# explicit 64-bit integer dtype does not depend on the numpy.compat module.
x = df.to_numpy(dtype=np.int64)

In [None]:
# Show the dimensions of the feature matrix followed by its contents
print(x.shape, x, sep='\n')

In [None]:
# Read the node classification labels and preview the first rows
df_labels = pd.read_csv(filepath_or_buffer="../data/inputs/pages_ranked_with_data_labelled.csv")
# df_labels = df_labels[["page path", "label"]]
df_labels.head(n=5)

In [None]:
# Need to have labels for each node (economic recovery or not economic recovery)
'''
In the form:

|| page_link   || relevant (y/n) ||
|| ------------||----------------||
|| page_link_0 ||        1       ||
|| page_link_1 ||        0       ||
|| page_link_2 ||        1       ||
...
'''
# Make sure we have no duplicate nodes.
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the check would otherwise be invisible.
print('max url count', max(df_labels["url"].value_counts()))

# Remove irrelevant columns
df_labels = df_labels.drop(columns=["url", "index"])

# Number of labelled rows (printed for the same reason as above)
print('labelled rows', df_labels.shape[0])

# To numpy again, as explicit 64-bit integers (np.compat.long is a deprecated
# alias for int).
# NOTE(review): every column remaining after the drop becomes a column of y;
# confirm that only the label column is left, since the loss and accuracy
# code further down index y per node.
y = df_labels.to_numpy(dtype=np.int64)
print(y.shape)
print(y[0:5])

In [None]:
# Read the table that the train/val/test masks are sliced from, and preview it
tm = pd.read_csv(filepath_or_buffer="../data/inputs/test_masks.csv")
tm.head(n=5)

In [738]:
# Moving masks into numpy
# Rows 0-99 of tm are used for training, rows 100-223 for validation and the
# remaining rows for testing. .iloc makes the positional row slicing explicit.
# np.int64 replaces the deprecated np.compat.long alias.
# NOTE(review): presumably tm holds node indices rather than 0/1 flags, since
# each mask is a slice of rows - confirm against how the masks are used to
# index the model output.
train_mask = tm.iloc[0:100].to_numpy(dtype=np.int64)
val_mask = tm.iloc[100:224].to_numpy(dtype=np.int64)
test_mask = tm.iloc[224:].to_numpy(dtype=np.int64)

In [None]:
# Read the edge list into a DataFrame and display it
df_edges = pd.read_csv(filepath_or_buffer="../edge_list.csv")
df_edges

In [740]:
'''
Some pages were removed from the list prior to NER work
due to the page no longer existing or there being < 20 
characters in the page's text field.

It is now necessary to remove these nodes from the edge dataset,
as this was attained from N2V approach.
'''

# Creating a list of pages to be removed
# Each entry is a page path; the list is matched against the "source" and
# "target" columns of the edge table with isin() in the cells that follow.
pages_to_remove = [
    "/",
    "/search/all",
    "/find-covid-19-lateral-flow-test-site",
    "/guidance/coronavirus-covid-19-getting-tested",
    "/register-coronavirus-antibody-test",
    "/entering-staying-uk/foreign-nationals-working-in-uk",
    "/business-finance-support/business-cash-advance-uk",
    "/government/publications/applying-to-the-register-of-apprenticeship-training-providers-roatp",
    "/guidance/esfa-business-operations-help-and-support",
    "/business-finance-support/business-growth-calderdale",
    "/business-finance-support/low-carbon-workspaces-buckinghamshire",
    "/log-test-site-covid19-results",
    "/guidance/apprenticeships-resources-for-teachers-and-advisers",
    "/business-finance-support/south-east-creatives-seccads",
    "/business-finance-support/construction-industry-training-board-citb-grants-scheme-england",
    "/government/publications/turkey-list-of-lawyers/list-of-lawyers-in-ankara-and-gaziantep",
    "/business-finance-support/agri-tech-cornwall-cornwall-and-the-isles-of-scilly"
]

In [741]:
# Keep only the edges whose source page is not in the removal list
df_edges_1 = df_edges.loc[~df_edges["source"].isin(pages_to_remove)]

In [742]:
# Keep only the edges whose target page is not in the removal list
df_edges_2 = df_edges_1.loc[~df_edges_1["target"].isin(pages_to_remove)]

In [743]:
# Discard every row that contains at least one null value
df_edges_3 = df_edges_2.dropna(axis=0, how="any")

In [None]:
# Now need to extract the edges
''' 
In the form:

[[0, 0, 0, 0, 0 , 0 , 0 , ...],
 [1, 5, 7, 9, 11, 14, 16, ...]]

Where this represents links existing between page 0 and 1, 5, 7, 9, 11, 14, 16...
'''

# gives df with edge weights (3 rows)
# want to use this later when get all working
# df_edges_3 = df_edges_3.drop(columns=["index", "source", "target"])

# gives df without edge weights (2 rows)
df_edges_3 = df_edges_3.drop(["index", "source", "target", "edgeWeight"], axis="columns")


# Number of edges that remain
len(df_edges_3)

In [None]:
# Transposing and changing type
# After the transpose each remaining column of df_edges_3 becomes one row.
# np.int64 replaces the deprecated np.compat.long alias.
df_edges = df_edges_3.transpose().astype(np.int64)
df_edges

In [None]:
# To numpy again, as explicit 64-bit integers (np.compat.long is a deprecated
# alias for int)
z = df_edges.to_numpy(dtype=np.int64)
print(z.shape)
print(z[0:5])

# Creating pytorch-geometric dataset

In [749]:
import torch
import torch_geometric
from torch_geometric.data import Data

In [750]:
# Defining the number of classes
# In this case 1 = relevant, 0 = not relevant
# Therefore, 2 classes
# (this value is later used as the output size of the second GCN layer)
num_classes = 2

In [751]:
# Converting np arrays to tensors
# Node features are floating point.
x = torch.as_tensor(x, dtype=torch.float)
# Labels, edge indices and masks are integer data. Converting them directly to
# long avoids a float32 round trip, which cannot represent every large integer
# exactly.
y = torch.as_tensor(y, dtype=torch.long)
edge_index = torch.as_tensor(z, dtype=torch.long)
train_mask = torch.as_tensor(train_mask, dtype=torch.long)
val_mask = torch.as_tensor(val_mask, dtype=torch.long)
test_mask = torch.as_tensor(test_mask, dtype=torch.long)
# torch.Tensor(2) would allocate an uninitialised tensor of LENGTH 2;
# torch.tensor(2) creates a scalar tensor holding the VALUE 2.
num_classes = torch.tensor(num_classes)

In [754]:
# Setting the tensor types
# Node features must stay floating point: GCNConv multiplies them with
# floating-point weights, which fails for long (integer) input.
x = x.type(torch.FloatTensor)
# Labels, edge indices, masks and the class count are integer-valued.
y = y.type(torch.LongTensor)
edge_index = edge_index.type(torch.LongTensor)
train_mask = train_mask.type(torch.LongTensor)
val_mask = val_mask.type(torch.LongTensor)
test_mask = test_mask.type(torch.LongTensor)
num_classes = num_classes.type(torch.LongTensor)

In [None]:
# Display the tensor type string of the test mask to confirm the cast above
test_mask.type()

In [782]:
# Bundle node features, edges, labels, split masks and the class count into
# one pytorch-geometric Data object
dataset = Data(
    x=x,
    edge_index=edge_index,
    y=y,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
    num_classes=num_classes,
)

In [None]:
# Display the Data object's summary representation
dataset

# Building out the GNN

https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html

In [784]:
import torch.nn.functional as F 
from torch_geometric.nn import GCNConv 

In [785]:
# from torch_geometric.datasets import Planetoid

# dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [786]:
# dataset[0]

In [787]:
# dataset = data

In [791]:
# Creating GCN
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing per-node log-probabilities.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            ``dataset.num_node_features`` of the notebook-level ``dataset``.
        hidden_channels: Output size of the first convolution (default 16).
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook-level dataset so that GCN() keeps working.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        # Layer sizes must be plain Python ints; num_classes is stored on the
        # dataset as a tensor, so coerce explicitly.
        self.conv1 = GCNConv(int(in_channels), int(hidden_channels))
        self.conv2 = GCNConv(int(hidden_channels), int(out_channels))

    def forward(self, data):
        """Run both convolutions over ``data.x`` / ``data.edge_index``.

        Returns the log-softmax over dimension 1 of the second layer's output.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [None]:
# Place the model and the graph on the CPU
device = torch.device('cpu')
model = GCN()
model = model.to(device)
data = dataset.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)

# Full-batch training for 200 epochs: each step runs the whole graph through
# the model and takes the loss on the entries selected by train_mask
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(
        out[data.train_mask],
        data.y[data.train_mask],
    )
    loss.backward()
    optimizer.step()

In [None]:
# Evaluate on the test split
model.eval()
# No gradients are needed for inference.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
test_pred = pred[data.test_mask]
correct = (test_pred == data.y[data.test_mask]).sum()
# Divide by the number of predictions actually evaluated. test_mask.sum() only
# equals that count for a boolean mask; for a long (index) mask it would add up
# the index values instead.
n_test = test_pred.numel()
acc = int(correct) / n_test if n_test else float('nan')
print(f'Accuracy: {acc:.4f}')