In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
# Download the WebKB archive through Keras' download helper; extract=True asks
# the helper to unpack the archive after downloading it.
zip_file = keras.utils.get_file(
    fname="webkb.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/WebKB.tgz",
    extract=True,
)
# All data files below are read from a "webkb" directory located next to the
# downloaded archive.
# NOTE(review): assumes the archive unpacks into a folder with exactly this name
# (letter case matters on case-sensitive file systems) — confirm.
data_dir = os.path.join(os.path.dirname(zip_file), "webkb")

In [None]:
def _read_citations(university):
    """Read the raw link list of one university.

    Args:
        university: File stem of the ``<university>.cites`` file inside ``data_dir``.

    Returns:
        A DataFrame with a single string column ``urls`` holding one raw line
        of the file per row (the line is split into its two endpoints later).
    """
    return pd.read_csv(
        os.path.join(data_dir, f"{university}.cites"),
        sep="\t",
        header=None,
        names=["urls"],
    )


citations_cornell = _read_citations("cornell")
citations_texas = _read_citations("texas")
citations_washington = _read_citations("washington")
citations_wisconsin = _read_citations("wisconsin")

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the four frames
# in the same order and renumbers the index from 0.
citations = pd.concat(
    [citations_cornell, citations_texas, citations_washington, citations_wisconsin],
    ignore_index=True,
)

In [None]:
# Split every raw line on its first space only: the part before it becomes
# "source", the remainder becomes "target".
# The guard keeps the cell re-runnable: after the first run the "urls" column is
# gone and `citations` already has the source/target layout.
if "urls" in citations.columns:
    citations = (
        citations["urls"]
        .str.split(" ", n=1, expand=True)
        .rename(columns={0: "source", 1: "target"})
    )

In [None]:
# Column layout used for every *.content file: the page URL, 1703 term columns
# and finally the class label.
column_names = ["url"] + [f"term_{idx}" for idx in range(1703)] + ["subject"]


def _read_content(university):
    """Read the page table of one university.

    Args:
        university: File stem of the ``<university>.content`` file inside ``data_dir``.

    Returns:
        A DataFrame whose columns are ``column_names``.
    """
    return pd.read_csv(
        os.path.join(data_dir, f"{university}.content"),
        sep="\t",
        header=None,
        names=column_names,
    )


urls_cornell = _read_content("cornell")
urls_texas = _read_content("texas")
urls_washington = _read_content("washington")
urls_wisconsin = _read_content("wisconsin")

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the four frames
# in the same order and renumbers the index from 0.
urls = pd.concat(
    [urls_cornell, urls_texas, urls_washington, urls_wisconsin],
    ignore_index=True,
)

In [None]:
# Build integer codes: class labels and page URLs are each numbered by their
# position in the sorted list of distinct values.
class_values = sorted(urls["subject"].unique())
class_idx = {label: code for code, label in enumerate(class_values)}
url_idx = {
    address: code for code, address in enumerate(sorted(urls["url"].unique()))
}

# Replace the string values by their codes. A URL or label missing from the
# lookup tables raises KeyError.
urls["url"] = urls["url"].apply(lambda address: url_idx[address])
for endpoint in ("source", "target"):
    citations[endpoint] = citations[endpoint].apply(lambda address: url_idx[address])
urls["subject"] = urls["subject"].apply(lambda label: class_idx[label])

In [None]:
# Fixed seed so that the split, and every metric computed from it, can be
# reproduced after a kernel restart.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

train_parts, test_parts = [], []

for _, group_data in urls.groupby("subject"):
    # Per class: each row goes to the training set with probability ~0.8,
    # all remaining rows of the class go to the test set.
    random_selection = split_rng.random(len(group_data.index)) <= 0.8
    train_parts.append(group_data[random_selection])
    test_parts.append(group_data[~random_selection])

# Concatenate the per-class pieces and shuffle the rows (sample(frac=1)).
train_data = pd.concat(train_parts).sample(frac=1, random_state=SPLIT_SEED)
test_data = pd.concat(test_parts).sample(frac=1, random_state=SPLIT_SEED)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (711, 1705)
Test data shape: (166, 1705)


In [None]:
# Inputs are the page id ("url", kept as the first column so it can be looked
# up by position) followed by the 1703 term columns; the target is "subject".
feature_cols = ["url", *(f"term_{idx}" for idx in range(1703))]

train_data_X, train_data_y = train_data[feature_cols], train_data["subject"]
test_data_X, test_data_y = test_data[feature_cols], test_data["subject"]

In [None]:
# to decide the assigned label:
# keep all links between the papers
# to predict the value for a node: use all links of which we have a node label

# for all nodes in the test set:
#   find all incoming and outgoing links that are in the train set
#   decide the most common label and assign it

In [None]:
def flatten_list(y):
    """Recursively flatten nested lists into one flat list; a non-list becomes [y]."""
    if type(y) is list:
        return [x for a in y for x in flatten_list(a)]
    return [y]


# Loop-invariant lookups, built once instead of once per link:
#  - the ids of all training nodes (first column of train_data_X), as a set;
#  - node id -> label, taken from the last column of `urls`, keeping the first
#    row for each id.
train_nodes = set(train_data_X.iloc[:, 0].tolist())
first_rows = urls.drop_duplicates(subset="url")
label_of = dict(zip(first_rows["url"], first_rows.iloc[:, -1]))

test_nodes = test_data_X.iloc[:, 0].tolist()
y_pred = np.empty((len(test_nodes)))

for i, pred_node in enumerate(test_nodes):
    # Neighbours in both directions: pages this node links to, then pages
    # linking to it.
    all_links = (
        citations.loc[citations["source"] == pred_node, "target"].tolist()
        + citations.loc[citations["target"] == pred_node, "source"].tolist()
    )
    # Only neighbours from the training set contribute a (known) label.
    trgt_labels = np.array([label_of[trgt] for trgt in all_links if trgt in train_nodes])
    if len(trgt_labels) == 0:
        # Placeholder 10 marks "no labelled neighbour"; a later cell looks for it.
        y_pred[i] = 10
    else:
        # Majority vote; on a tie argmax returns the smallest label.
        y_pred[i] = np.argmax(np.bincount(trgt_labels.astype(int)))

In [None]:
# there are some nodes that could not be predicted, because none of their links lead to a train node
# these are the nodes that were given the placeholder label 10
# now do the prediction for these nodes based on the new information

In [None]:
# Row position of every test node inside y_pred, so a neighbour's prediction can
# be looked up by node id (not by its position in the link list).
test_position = {
    node: position for position, node in enumerate(test_data_X.iloc[:, 0].tolist())
}

for i in range(0, len(test_data_X)):
    # Only revisit the nodes that still carry the placeholder 10.
    if y_pred[i] != 10:
        continue
    pred_node = test_data_X.iloc[i, 0]
    # Neighbours in both directions: pages this node links to, then pages
    # linking to it.
    all_links = (
        citations.loc[citations["source"] == pred_node, "target"].tolist()
        + citations.loc[citations["target"] == pred_node, "source"].tolist()
    )
    trgt_labels = []
    for trgt in all_links:
        position = test_position.get(trgt)
        # Skip neighbours that are not test nodes and neighbours that are
        # themselves still unpredicted: the placeholder must not be a vote.
        if position is not None and y_pred[position] != 10:
            trgt_labels.append(y_pred[position])
    if trgt_labels:
        # Majority vote over the neighbours' predictions. y_pred is updated in
        # place, so nodes visited later can use labels assigned in this pass.
        y_pred[i] = np.argmax(np.bincount(np.array(trgt_labels).astype(int)))
    # Otherwise the node keeps the placeholder 10.

In [None]:
# Number of test nodes whose predicted label equals the true label.
correct = int(np.count_nonzero(np.asarray(test_data_y) == y_pred))

In [None]:
# True labels as an array, converted once for both metrics.
y_true = np.array(test_data_y)

# NOTE(review): test nodes still carrying the placeholder 10 appear to be treated
# as an extra class by the confusion matrix and the macro F1 — confirm intent.
print("Accuracy: ", correct / len(y_pred))
print(" ")
print('Confusion matrix: ')
print(confusion_matrix(y_true, y_pred))
print(" ")
print("F1 score: ", f1_score(y_true, y_pred, average = 'macro'))

Accuracy:  0.12650602409638553
 
Confucsion matrix: 
[[14 11  2  0 13]
 [13  1  4  0  9]
 [ 6  7  2  2  3]
 [ 7  1  1  1  0]
 [52  9  5  0  3]]
 
F1 score:  0.11623687612165479
