In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
zip_file = keras.utils.get_file(
    fname="citeseer.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz",
    extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "citeseer")

Downloading data from https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz


In [3]:
citations = pd.read_csv(
    os.path.join(data_dir, "citeseer.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
print("Citations shape:", citations.shape)
citations['source'] = citations['source'].astype(str)
citations['target'] = citations['target'].astype(str)

Citations shape: (4732, 2)


In [4]:
column_names = ["paper_id"] + [f"term_{idx}" for idx in range(3703)] + ["subject"]
papers = pd.read_csv(
    os.path.join(data_dir, "citeseer.content"), sep = "\t", header = None, names=column_names)
papers['paper_id'] = papers['paper_id'].astype(str)

print("Papers shape:", papers.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Papers shape: (3312, 3705)


In [5]:
all_citations = citations.stack().unique().astype(str)

union = pd.Series(np.union1d(all_citations, papers['paper_id']))
intersect = pd.Series(np.intersect1d(all_citations, papers['paper_id'])) 
notcommonseries = union[~union.isin(intersect)]

citations = citations[~citations['source'].isin(notcommonseries)]
citations = citations[~citations['target'].isin(notcommonseries)]

In [6]:
class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

In [30]:
train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 80% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.8
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (2618, 3705)
Test data shape: (694, 3705)


In [31]:
feature_cols = [f"term_{idx}" for idx in range(3703)]
train_data_X = train_data[feature_cols]
train_data_y = train_data['subject']

test_data_X = test_data[feature_cols]
test_data_y = test_data['subject']

In [34]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
clf = clf.fit(train_data_X, train_data_y)
y_pred = clf.predict(test_data_X)

In [36]:
print("Accuracy:",metrics.accuracy_score(test_data_y, y_pred))
print(" ")
print("Confusion matrix: ")
print(confusion_matrix(test_data_y , y_pred))
print(" ")
print("F1 score: ", f1_score(test_data_y , y_pred, average = 'macro'))

Accuracy: 0.5994236311239193
 
Confusion matrix: 
[[  8   5   9  11   5  12]
 [  7  98   7  10  10   9]
 [  6   4 100   8  23  14]
 [  4  11   7  67  14  10]
 [  4   6  20   6  75  11]
 [  5   6  13  11  10  68]]
 
F1 score:  0.550453472541485
