In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
zip_file = keras.utils.get_file(
    fname="citeseer.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz",
    extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "citeseer")

Downloading data from https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz


In [3]:
citations = pd.read_csv(
    os.path.join(data_dir, "citeseer.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
print("Citations shape:", citations.shape)
citations['source'] = citations['source'].astype(str)
citations['target'] = citations['target'].astype(str)

Citations shape: (4732, 2)


In [4]:
column_names = ["paper_id"] + [f"term_{idx}" for idx in range(3703)] + ["subject"]
papers = pd.read_csv(
    os.path.join(data_dir, "citeseer.content"), sep = "\t", header = None, names=column_names)
papers['paper_id'] = papers['paper_id'].astype(str)

print("Papers shape:", papers.shape)

Papers shape: (3312, 3705)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
all_citations = citations.stack().unique().astype(str)

union = pd.Series(np.union1d(all_citations, papers['paper_id']))
intersect = pd.Series(np.intersect1d(all_citations, papers['paper_id'])) 
notcommonseries = union[~union.isin(intersect)]

citations = citations[~citations['source'].isin(notcommonseries)]
citations = citations[~citations['target'].isin(notcommonseries)]

In [6]:
class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

In [7]:
train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 80% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.8
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (2629, 3705)
Test data shape: (683, 3705)


In [8]:
feature_cols = ["paper_id"] + [f"term_{idx}" for idx in range(3703)]
train_data_X = train_data[feature_cols]
train_data_y = train_data['subject']

test_data_X = test_data[feature_cols]
test_data_y = test_data['subject']

In [None]:
# to decide the assigned label:
# keep al links between the papers
# to predict the value for a node: use all links of which we have a node label

# for all nodes in the test set:
#   find all incoming and outgoing links that are in the train set
#   decide the most common label and assign it

In [9]:
flatten_list = lambda y:[x for a in y for x in flatten_list(a)] if type(y) is list else [y]
y_pred = np.empty((len(test_data_X)))

for i in range(0, len(test_data_X)):
  pred_node = test_data_X.iloc[i,0]
  all_links = flatten_list([(citations[citations['source'] == pred_node].iloc[:,0]).tolist(), (citations[citations['target'] == pred_node].iloc[:,1]).tolist()])
  trgt_labels = []
  for j in range(0, len(all_links)):
    trgt = all_links[j]
    if trgt in train_data_X.iloc[:,0].tolist():
      trgt_labels.append(papers[papers['paper_id'] == trgt].iloc[0, -1])
  trgt_labels = np.array(trgt_labels)
  if len(trgt_labels) == 0:
    y_pred[i] = 10
  else:
    y_pred[i] = np.argmax(np.bincount(trgt_labels.astype(int)))

In [None]:
# there are some nodes that could not be predicted, because they only contained links with other train nodes
# these are the nodes with label -1
# now do the prediction for these nodes based on the new information

In [10]:
for i in range(0, len(test_data_X)):
  if y_pred[i] == 10:
    pred_node = test_data_X.iloc[i,0]
    all_links = flatten_list([(citations[citations['source'] == pred_node].iloc[:,0]).tolist(), (citations[citations['target'] == pred_node].iloc[:,1]).tolist()])
    trgt_labels = []
    for j in range(0, len(all_links)):
      trgt = all_links[j]
      trgt_labels.append(y_pred[j])
    trgt_labels = np.array(trgt_labels)
    if len(trgt_labels) == 0:
      y_pred[i] = 10
    else:
      y_pred[i] = np.argmax(np.bincount(trgt_labels.astype(int)))

In [11]:
correct = 0
for i in range(0, len(y_pred)):
  if np.array(test_data_y)[i] == y_pred[i]:
    correct += 1

0.6925329428989752


In [12]:
print("Accuracy: ", correct / len(y_pred))
print(" ")
print('Confucsion matrix: ')
print(confusion_matrix(np.array(test_data_y) , y_pred))
print(" ")
print("F1 score: ", f1_score(np.array(test_data_y) , y_pred, average = 'macro'))

Accuracy:  0.6925329428989752
 
Confucsion matrix: 
[[ 24   5   4   1   4   8]
 [ 14  93   2   1   3   2]
 [ 25   2 107   5  13   4]
 [ 15   6   2  81   8   0]
 [ 14   5   9   3 105   7]
 [ 30   3   6   0   9  63]]
 
F1 score:  0.6712031027115154
