In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
# Download the WebKB archive through Keras' get_file helper, asking it to extract the contents.
archive_url = "https://linqs-data.soe.ucsc.edu/public/lbc/WebKB.tgz"
zip_file = keras.utils.get_file(fname="webkb.tgz", origin=archive_url, extract=True)

# The per-university files are read from a "webkb" folder located next to the returned path.
# NOTE(review): the extracted folder name/case depends on the archive layout and Keras version — confirm.
download_root = os.path.dirname(zip_file)
data_dir = os.path.join(download_root, "webkb")

Downloading data from https://linqs-data.soe.ucsc.edu/public/lbc/WebKB.tgz


In [3]:
def _read_cites(university):
    """Load ``<university>.cites`` from ``data_dir`` into a one-column DataFrame.

    The file is parsed with a tab separator and no header row, and its single
    column is named "urls". Presumably the lines contain no tabs, so each line
    lands whole in that column (it is split on spaces afterwards) — verify
    against the raw files.
    """
    return pd.read_csv(
        os.path.join(data_dir, f"{university}.cites"),
        sep="\t",
        header=None,
        names=["urls"],
    )


# One raw link table per university; the individual frames are kept under their
# original names in case they are inspected elsewhere.
citations_cornell = _read_cites("cornell")
citations_texas = _read_cites("texas")
citations_washington = _read_cites("washington")
citations_wisconsin = _read_cites("wisconsin")

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the
# same order (cornell, texas, washington, wisconsin) with a fresh 0..n-1 index.
citations = pd.concat(
    [citations_cornell, citations_texas, citations_washington, citations_wisconsin],
    ignore_index=True,
)

In [4]:
# Split every raw "urls" entry on its first space into two columns and name them
# "source" and "target". Built as a single expression (no discarded intermediate
# split, no in-place rename) so the cell has exactly one effect: rebinding `citations`.
citations = (
    citations["urls"]
    .str.split(" ", n=1, expand=True)
    .rename(columns={0: "source", 1: "target"})
)

In [5]:
# Column layout applied to every .content file: a page identifier, 1703 term
# columns (term_0 .. term_1702), and the subject label.
column_names = ["url"] + [f"term_{idx}" for idx in range(1703)] + ["subject"]


def _read_content(university):
    """Load ``<university>.content`` from ``data_dir`` as a tab-separated table.

    The file has no header row; columns are labelled with ``column_names``.
    """
    return pd.read_csv(
        os.path.join(data_dir, f"{university}.content"),
        sep="\t",
        header=None,
        names=column_names,
    )


# One page table per university; the individual frames are kept under their
# original names in case they are inspected elsewhere.
urls_cornell = _read_content("cornell")
urls_texas = _read_content("texas")
urls_washington = _read_content("washington")
urls_wisconsin = _read_content("wisconsin")

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the
# same order (cornell, texas, washington, wisconsin) with a fresh 0..n-1 index.
urls = pd.concat(
    [urls_cornell, urls_texas, urls_washington, urls_wisconsin],
    ignore_index=True,
)

In [6]:
# Build integer encodings: subjects and page URLs are each numbered by their
# position in sorted order.
class_values = sorted(urls["subject"].unique())
class_idx = {label: position for position, label in enumerate(class_values)}

unique_urls = sorted(urls["url"].unique())
url_idx = {address: position for position, address in enumerate(unique_urls)}

# Replace the string identifiers with their integer codes. A URL in `citations`
# that is missing from `urls` raises KeyError here.
# NOTE: this overwrites the columns in place, so the cell cannot be re-run
# without first re-running the loading cells.
urls["url"] = urls["url"].apply(lambda address: url_idx[address])
for endpoint in ("source", "target"):
    citations[endpoint] = citations[endpoint].apply(lambda address: url_idx[address])
urls["subject"] = urls["subject"].apply(lambda label: class_idx[label])

In [7]:
# Fixed seed so the train/test split (and every metric computed from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42
# Probability that any given row is assigned to the training set.
TRAIN_FRACTION = 0.5

rng = np.random.default_rng(SPLIT_SEED)
train_parts, test_parts = [], []

# Split within each subject so both sets draw from every class.
for _, group_data in urls.groupby("subject"):
    # Each row independently goes to train with probability TRAIN_FRACTION,
    # so the split sizes are only approximately TRAIN_FRACTION of each group.
    in_train = rng.random(len(group_data)) <= TRAIN_FRACTION
    train_parts.append(group_data[in_train])
    test_parts.append(group_data[~in_train])

# Recombine the per-subject pieces and shuffle the rows deterministically.
train_data = pd.concat(train_parts).sample(frac=1, random_state=SPLIT_SEED)
test_data = pd.concat(test_parts).sample(frac=1, random_state=SPLIT_SEED)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (468, 1705)
Test data shape: (409, 1705)


In [8]:
# Model inputs are the 1703 term_* columns; the target is the encoded "subject" column.
feature_cols = ["term_" + str(idx) for idx in range(1703)]

train_data_X, train_data_y = train_data[feature_cols], train_data["subject"]
test_data_X, test_data_y = test_data[feature_cols], test_data["subject"]

In [9]:
# Fit a decision tree on the training features and predict the test subjects.
# random_state is fixed so the fitted tree is the same on every run; without it
# scikit-learn's internal feature shuffling can yield a different tree each time.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(train_data_X, train_data_y)  # fit() returns the estimator itself, so no rebinding is needed
y_pred = clf.predict(test_data_X)

In [11]:
# Score the test-set predictions first, then report the results.
accuracy = metrics.accuracy_score(test_data_y, y_pred)
conf_mat = confusion_matrix(test_data_y, y_pred)
# Macro averaging: per-class F1 scores are averaged with equal weight per class.
macro_f1 = f1_score(test_data_y, y_pred, average="macro")

print("Accuracy:", accuracy)
print(" ")
print("Confusion matrix: ")
print(conf_mat)
print(" ")
print("F1 score: ", macro_f1)

Accuracy: 0.6894865525672371
 
Confusion matrix: 
[[ 87   3   4   1   9]
 [  0  32   0   2  14]
 [  3   2   9   6  20]
 [  1   1   3   3   9]
 [ 20  13  12   4 151]]
 
F1 score:  0.5303341377645248
