# Hierarchical Clustering of Research Papers

This notebook demonstrates how to perform hierarchical clustering of research papers based on their abstracts. The research papers are clustered based on their titles and abstracts using hierarchical clustering. The clustering is then compared with the ground truth labels to evaluate the performance of the clustering algorithm.

In [19]:
import os

base_dir = os.path.join('..', '..')
data_dir = os.path.join(base_dir, 'data')

orkg_file = os.path.join(data_dir, 'orkg', 'orkg_data.csv')

evaluation_dir = os.path.join(base_dir, 'reports', 'scincl_clustering')
os.makedirs(evaluation_dir, exist_ok=True)

sample_df_file = os.path.join(evaluation_dir, 'sample_df.csv')
labels_file = os.path.join(evaluation_dir, 'labels.json')
evaluation_file = os.path.join(evaluation_dir, 'evaluation.csv')

In [20]:
import torch
from transformers import AutoTokenizer, AutoModel

# Torch device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('malteos/scincl', num_workers=os.cpu_count())
model = AutoModel.from_pretrained('malteos/scincl').to(device)

In [21]:
import pandas as pd
import numpy as np

# Create a random sample of 5 rows
df = pd.read_csv(orkg_file)
df["doi"] = df.doi.apply(eval).apply(np.array)  # convert string to array
df["subfields"] = df.subfields.apply(eval).apply(np.array)  # convert string to array
df = df.fillna('')

# Remove rows where the title is less than 5 characters
df = df[df['title'].str.len() > 35]
df

Unnamed: 0,id,title,doi,research field,subfields,abstract
0,R209491,Knowledge management framework for monitoring ...,[10.1109/EESMS.2015.7175848],Computer Engineering,"[Robotics, Digital Circuits, Data Storage Syst...",In the last decades scarcity of resources and ...
2,R504922,A Strong Baseline for Fashion Retrieval with P...,[],Computer Sciences,"[Security and Dependability, Computer Architec...",
8,R422258,End-to-End Human Pose and Mesh Reconstruction ...,[],Computer Sciences,"[Security and Dependability, Computer Architec...",
9,R518734,IPOD: Intensive Point-based Object Detector fo...,[],Computer Sciences,"[Security and Dependability, Computer Architec...",
13,R449385,SqueezeSeg: Convolutional Neural Nets with Rec...,[],Computer Sciences,"[Security and Dependability, Computer Architec...",
...,...,...,...,...,...,...
26735,R597298,Dual Temperature Helps Contrastive Learning Wi...,[],Computer and Systems Architecture,[],
26739,R160334,Product recovery decisions within the context ...,[10.1016/j.jengtecman.2013.11.002],"Information Systems, Process and Knowledge Man...",[],
26741,R521270,Comprehensive Attention Self-Distillation for ...,[],Computer Sciences,"[Security and Dependability, Computer Architec...",
26742,R554286,How Do Graph Networks Generalize to Large and ...,[],Computer Sciences,"[Security and Dependability, Computer Architec...",


In [22]:
df['text'] = df['title']

for inx, row in df.iterrows():
    if row['abstract'] != '':
        df['text'][inx] += tokenizer.sep_token + row['abstract']

In [23]:
sample_df = df.sample(n=200, random_state=420)

In [24]:
import json

label_unique = sample_df['research field'].unique()
label_unique.sort()

label_dict = {
    label: i for i, label in enumerate(label_unique)
}

reverse_label_dict = {
    v: k for k, v in label_dict.items()
}

# Map the labels to integers
sample_df['label'] = sample_df['research field'].map(label_dict)

# Save the sample dataframe
sample_df.to_csv(sample_df_file, index=False)

# Save the labels
with open(labels_file, 'w') as file:
    file.write(json.dumps(label_dict, indent=2))

In [25]:
labels = sample_df['label'].unique()
print("Number of labels:", len(labels))
print("Labels:", labels)

Number of labels: 25
Labels: [ 3 19  8 21 14 20  0  6 17 13 11  7  9  4  2 23 18 10  1  5 12 15 24 16
 22]


In [26]:
# Tokenize the 'text' column
tokenized_inputs = tokenizer(
    sample_df['text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=512,
)

# Convert tokenized inputs to tensors
input_ids = tokenized_inputs['input_ids'].to(device)
attention_mask = tokenized_inputs['attention_mask'].to(device)

In [27]:
from tqdm import tqdm

batch_size = 8  # Specify the batch size

# Split input tensors into batches
input_ids_batches = input_ids.split(batch_size)
attention_mask_batches = attention_mask.split(batch_size)

embeddings = []  # List to store the embeddings

# Create a progress bar
progress_bar = tqdm(total=len(input_ids_batches), desc='Embeddings', unit='batch')

# Iterate over each batch
for input_ids_batch, attention_mask_batch in zip(input_ids_batches, attention_mask_batches):
    # Move the batch to the appropriate device if using GPU
    input_ids_batch = input_ids_batch.to(device)
    attention_mask_batch = attention_mask_batch.to(device)

    # Obtain embeddings for the batch
    batch_embeddings = model(
        input_ids=input_ids_batch,
        attention_mask=attention_mask_batch
    )

    # Append batch embeddings to the list
    embeddings.append(batch_embeddings[0].detach().cpu())

    # Delete batch embeddings to avoid out of memory leaks
    del batch_embeddings
    del input_ids_batch
    del attention_mask_batch

    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

Embeddings: 100%|██████████| 25/25 [00:52<00:00,  2.10s/batch]


In [28]:
embeddings_np = np.concatenate(embeddings, axis=0)
del embeddings

# Reshape the embeddings if necessary
if embeddings_np.ndim > 1:
    embeddings_np = embeddings_np.reshape(embeddings_np.shape[0], -1)

In [53]:
# Split dataframes into train and test sets
from sklearn.model_selection import train_test_split

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    embeddings_np, sample_df['label'], test_size=0.2, random_state=42
)

## AgglomerativeClustering

In [54]:
evaluation_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])

In [55]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

hierarchical_cluster = AgglomerativeClustering(
    n_clusters=len(labels),
    metric='euclidean',
    linkage='ward'
)

_ = hierarchical_cluster.fit_predict(
    X_train,
    y=y_train
)

hierarchical_pred = hierarchical_cluster.fit_predict(X_test, y=y_test)


hierarchical_accuracy = accuracy_score(y_test, hierarchical_pred)
hierarchical_precision = precision_score(y_test, hierarchical_pred, average="micro")
hierarchical_recall = recall_score(y_test, hierarchical_pred, average="micro")
hierarchical_f1 = f1_score(y_test, hierarchical_pred, average="micro")

evaluation_df.loc[len(evaluation_df)] = ['Hierarchical Clustering',
                                         hierarchical_accuracy,
                                         hierarchical_precision,
                                         hierarchical_recall,
                                         hierarchical_f1]

## KMeans

In [56]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=len(sample_df["label"].unique()),
    random_state=42,
    max_iter=100,
)

_ = kmeans.fit_predict(X_train, y=y_train)
clusters = kmeans.labels_

# Calculate Adjusted Rand Index (ARI) to compare the clustering with ground truth
kmeans_pred = kmeans.predict(X_test)

kmeans_accuracy = accuracy_score(y_test, kmeans_pred)
kmeans_precision = precision_score(y_test, kmeans_pred, average="micro")
kmeans_recall = recall_score(y_test, kmeans_pred, average="micro")
kmeans_f1 = f1_score(y_test, kmeans_pred, average="micro")

evaluation_df.loc[len(evaluation_df)] = ['KMeans',
                                         kmeans_accuracy,
                                         kmeans_precision,
                                         kmeans_recall,
                                         kmeans_f1]

  super()._check_params_vs_input(X, default_n_init=10)


## RandomForestClassifier

In [57]:
from sklearn.ensemble import RandomForestClassifier

# train random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

forest_pred = clf.predict(X_test)

forest_accuracy = accuracy_score(y_test, forest_pred)
forest_precision = precision_score(y_test, forest_pred, average="micro")
forest_recall = recall_score(y_test, forest_pred, average="micro")
forest_f1 = f1_score(y_test, forest_pred, average="micro")

evaluation_df.loc[len(evaluation_df)] = ['Random Forest',
                                         forest_accuracy,
                                         forest_precision,
                                         forest_recall,
                                         forest_f1]

In [58]:
evaluation_df.to_csv(evaluation_file, index=False)

In [59]:
evaluation_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Hierarchical Clustering,0.025,0.025,0.025,0.025
1,KMeans,0.05,0.05,0.05,0.05
2,Random Forest,0.65,0.65,0.65,0.65
