In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
# Fetch the Cora archive through the Keras download helper and ask it to
# unpack the archive (extract=True).
zip_file = keras.utils.get_file(
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
    fname="cora.tgz",
    extract=True,
)
# The dataset files are read from a "cora" folder that sits next to the
# path returned by get_file.
data_dir = os.path.join(os.path.split(zip_file)[0], "cora")

Downloading data from https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz


In [None]:
# Read the tab-separated citation list. The file has no header row, so the
# two columns are named explicitly: "target" first, "source" second.
citations = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    names=["target", "source"],
    header=None,
    sep="\t",
)
print("Citations shape:", citations.shape)
# Last expression of the cell: displays the type of the loaded object.
type(citations)

Citations shape: (5429, 2)


pandas.core.frame.DataFrame

In [None]:
# Column layout used for cora.content: the paper id, 1433 term columns
# (term_0 .. term_1432) and finally the subject label.
column_names = ["paper_id", *(f"term_{idx}" for idx in range(1433)), "subject"]
# The file is tab-separated and has no header row.
papers = pd.read_csv(
    os.path.join(data_dir, "cora.content"),
    names=column_names,
    header=None,
    sep="\t",
)
print("Papers shape:", papers.shape)

Papers shape: (2708, 1435)


In [None]:
# Give every subject and every paper id a dense, zero-based integer index,
# assigned in sorted order.
class_values = sorted(papers["subject"].unique())
class_idx = {subject: position for position, subject in enumerate(class_values)}
paper_idx = {
    paper: position
    for position, paper in enumerate(sorted(papers["paper_id"].unique()))
}

# Rewrite the raw ids in both tables with their dense indices. The lookups
# use plain dict indexing, so an id that is missing from the mapping raises
# a KeyError.
papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
for endpoint in ("source", "target"):
    citations[endpoint] = citations[endpoint].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

In [None]:
# Fixed seed so that the split, and therefore every metric reported further
# down, is identical after "Restart Kernel -> Run All".
SPLIT_SEED = 42
# Approximate share of each subject that goes into the training set.
TRAIN_FRACTION = 0.8
split_rng = np.random.RandomState(SPLIT_SEED)

train_parts, test_parts = [], []

# Split subject by subject so both sets keep roughly the same class balance.
for _, group_data in papers.groupby("subject"):
    # One uniform draw per paper: draws <= TRAIN_FRACTION go to the train set,
    # the rest to the test set.
    random_selection = split_rng.rand(len(group_data.index)) <= TRAIN_FRACTION
    train_parts.append(group_data[random_selection])
    test_parts.append(group_data[~random_selection])

# Shuffle the rows with the same seeded generator so the row order is
# reproducible as well.
train_data = pd.concat(train_parts).sample(frac=1, random_state=split_rng)
test_data = pd.concat(test_parts).sample(frac=1, random_state=split_rng)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (2163, 1435)
Test data shape: (545, 1435)


In [None]:
# Feature columns: the paper id followed by the 1433 term columns.
# The paper id stays in first position because later cells read it back
# with .iloc[i, 0].
feature_cols = ["paper_id", *(f"term_{idx}" for idx in range(1433))]

# Features and labels of the training papers.
train_data_X = train_data.loc[:, feature_cols]
train_data_y = train_data["subject"]

# Features and labels of the test papers.
test_data_X = test_data.loc[:, feature_cols]
test_data_y = test_data["subject"]

In [None]:
# to decide the assigned label:
# keep all links between the papers
# to predict the label of a node: use all of its links to nodes whose label is known

# for all nodes in the test set:
#   find all incoming and outgoing links that are in the train set
#   decide the most common label and assign it

In [None]:
# Recursively flattens arbitrarily nested lists into a single flat list.
# Not needed by the loop below any more, but kept because a later cell calls it.
flatten_list = lambda y:[x for a in y for x in flatten_list(a)] if type(y) is list else [y]

# Built once, outside the loops: paper id -> label for every training paper.
# Membership in this dict answers "is this neighbour in the train set?" and
# the value is the label to vote with, so nothing is rescanned per link.
train_label_of = dict(zip(train_data_X.iloc[:, 0].tolist(), train_data_y.tolist()))

# One prediction slot per test paper; -1 marks "could not be predicted yet".
y_pred = np.empty((len(test_data_X)))

for i in range(len(test_data_X)):
  # select the node to predict and collect every paper linked to it,
  # in either direction of the citation
  pred_node = test_data_X.iloc[i, 0]
  all_links = (
      citations.loc[citations["source"] == pred_node, "target"].tolist()
      + citations.loc[citations["target"] == pred_node, "source"].tolist()
  )
  # keep only the labels of neighbours that belong to the train set
  trgt_labels = [train_label_of[trgt] for trgt in all_links if trgt in train_label_of]
  if not trgt_labels:
    # no labelled neighbour: leave the node for the second pass
    y_pred[i] = -1
  else:
    # majority vote; np.argmax returns the smallest label on a tie
    y_pred[i] = np.argmax(np.bincount(np.array(trgt_labels).astype(int)))

In [None]:
# there are some nodes that could not be predicted, because they are only linked to other test nodes
# these are the nodes with label -1
# now predict these nodes from the labels already predicted for their test-set neighbours

In [None]:
# Map every test paper id to its position in y_pred, so a neighbour's
# prediction is looked up by paper id and not by an unrelated loop counter.
test_ids = test_data_X.iloc[:, 0].tolist()
test_position = {paper_id: position for position, paper_id in enumerate(test_ids)}

for i in range(len(test_ids)):
  # only the nodes that the first pass could not label
  if y_pred[i] == -1:
    pred_node = test_ids[i]
    # every paper linked to this node, in either direction of the citation
    all_links = (
        citations.loc[citations["source"] == pred_node, "target"].tolist()
        + citations.loc[citations["target"] == pred_node, "source"].tolist()
    )
    trgt_labels = []
    for trgt in all_links:
      position = test_position.get(trgt)
      # vote only with test-set neighbours that already have a prediction;
      # -1 entries are skipped because np.bincount rejects negative values
      if position is not None and y_pred[position] != -1:
        trgt_labels.append(y_pred[position])
    if trgt_labels:
      # majority vote; y_pred is updated in place, so nodes handled later
      # in this loop can already use this new prediction
      y_pred[i] = np.argmax(np.bincount(np.array(trgt_labels).astype(int)))
    # otherwise the node stays at -1: none of its neighbours has a label yet

In [None]:
# Number of test papers whose predicted label equals the true label,
# computed with one element-wise comparison.
correct = int(np.sum(np.array(test_data_y) == y_pred))

In [None]:
# True labels as a plain array, converted once and reused by both metrics.
y_true = np.array(test_data_y)

# Share of test papers whose prediction matches the true label.
print("Accuracy: ", correct / len(y_pred))
print(" ")
print('Confusion matrix: ')
print(confusion_matrix(y_true, y_pred))
print(" ")
# Macro average: the unweighted mean of the per-class F1 scores.
print("F1 score: ", f1_score(y_true, y_pred, average = 'macro'))

Accuracy:  0.8440366972477065
 
Confucsion matrix: 
[[ 46   2   4   1   0   0   1]
 [  4  79   2   1   0   0   0]
 [  0   6 143  10   1   0   6]
 [  2   1   6  78   2   0   4]
 [  1   2   2   0  26   0   2]
 [  5   1   1   1   0  38   2]
 [  5   1   5   3   1   0  50]]
 
F1 score:  0.8365484673439608
