# Train Multi-Label Research Field Classifier

This notebook trains a multi-label classifier that predicts the research fields of a paper from its title and abstract. The training data comes from the Open Research Knowledge Graph (ORKG) dataset.

In [1]:
import os

# All paths are relative to the repository root, two levels above this notebook.
base_dir = os.path.join('..', '..')
data_dir = os.path.join(base_dir, 'data')
orkg_file = os.path.join(data_dir, 'orkg', 'orkg_data.csv')

#
# Model
#

# Directory the fine-tuned classifier is saved to
model_dir = os.path.join(base_dir, 'models', 'scincl_multi_label_classifier')
os.makedirs(model_dir, exist_ok=True)

#
# Data
#

# CSV files the train/test split is written to. Each variable points at the
# file of the same name (they used to be swapped, so the training split ended
# up in `multi_label_test.csv` and vice versa).
train_file = os.path.join(data_dir, 'scincl_classifier', 'multi_label_train.csv')
test_file = os.path.join(data_dir, 'scincl_classifier', 'multi_label_test.csv')
os.makedirs(os.path.dirname(train_file), exist_ok=True)

#
# Reports
#

evaluation_dir = os.path.join(base_dir, 'reports', 'scincl_classifier')
os.makedirs(evaluation_dir, exist_ok=True)

# Per-paper predictions and the aggregated metrics report
predictions_file = os.path.join(evaluation_dir, 'multi_label_predictions.csv')
evaluation_file = os.path.join(evaluation_dir, 'multi_label_evaluation.json')

In [None]:
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import json


from typing import Any


def print_json(tag: str, data: Any) -> None:
    """Pretty-print JSON-serializable data to the console.

    Args:
        tag: Text printed in front of the JSON document.
        data: Any value accepted by ``json.dumps``; it is rendered with a
            two-space indent and alphabetically sorted keys.
    """
    print(tag, json.dumps(data, indent=2, sort_keys=True))

### Build Taxonomy

In [None]:
import pandas as pd
import numpy as np

# Load the ORKG export. `doi` and `subfields` are stored as stringified Python
# collections, so they are parsed back into lists here.
# NOTE(review): eval() executes arbitrary expressions found in the CSV, so only
# run this on a trusted file. ast.literal_eval would be a safer parser if the
# cells are plain literals - TODO confirm against the data.
df = pd.read_csv(orkg_file)
df["doi"] = df.doi.apply(eval).apply(list)  # convert string to list
df["subfields"] = df.subfields.apply(eval).apply(list)  # convert string to list
df = df.fillna('')

# Keep only rows whose title is longer than 35 characters. The explicit copy
# makes `df` an independent frame, so the columns added to it later are not
# written into a slice of the original frame (SettingWithCopyWarning).
df = df[df['title'].str.len() > 35].copy()

In [None]:
# Every distinct subfield that occurs in the data, in alphabetical order
labels = sorted({label for subfields in df['subfields'] for label in subfields})

# Label name -> position of that label in the label vector
label_dict = {label: position for position, label in enumerate(labels)}

# Position in the label vector -> label name
reverse_label_dict = dict(enumerate(labels))

In [None]:
# Encode the subfields of every row as a 0/1 vector with one entry per known label
label_vectors = []

for subfields in df['subfields']:
    label_vector = np.zeros(len(labels), dtype=int)
    positions = [label_dict[label] for label in subfields]
    label_vector[positions] = 1
    label_vectors.append(label_vector)

df['label_vector'] = label_vectors

### Create Dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the `malteos/scincl` checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='malteos/scincl',
)

In [None]:
# Model input: the title, followed by the tokenizer's separator token and the
# abstract whenever an abstract is available.
# Built column-wise on purpose: the row-wise chained assignment
# `df['text'][inx] += ...` writes through a temporary Series, which raises
# SettingWithCopyWarning and is not guaranteed to update `df`.
has_abstract = df['abstract'] != ''
df['text'] = df['title'].mask(
    has_abstract,
    df['title'] + tokenizer.sep_token + df['abstract'],
)

In [None]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (fixed seed) and store both splits as CSV
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

for split_frame, split_path in ((df_train, train_file), (df_test, test_file)):
    split_frame.to_csv(split_path, index=False)

In [None]:
from datasets import Dataset
from datasets import DatasetDict

# Wrap both pandas splits as `datasets.Dataset` objects under their split names
dd = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (("train", df_train), ("test", df_test))
})

#### Tokenize the text in the dataset

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    The tokenizer is asked to truncate or pad every sequence to 512 tokens
    (``padding="max_length"``), so the attention masks it returns already
    have the same length as the input ids and need no extra padding.

    Args:
        examples: Batch of dataset rows; only ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

#### Preprocessing

In [None]:
# Columns that are not needed once the text has been tokenized
columns_to_drop = [
    "id",
    "title",
    "abstract",
    "doi",
    "research field",
    "subfields",
    "__index_level_0__",
]

# Tokenize both splits in batches, then drop the unused columns
tokenized_datasets = dd.map(tokenize_function, batched=True).remove_columns(columns_to_drop)

# Return PyTorch tensors instead of Python lists when the dataset is indexed
tokenized_datasets.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

# Batch both splits in groups of 20 examples; only the training data is shuffled
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=20,
    shuffle=True,
)
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=20,
)

In [None]:
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

# Load the pretrained checkpoint with a classification head that has one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    "malteos/scincl",
    num_labels=len(label_dict),
)
model = model.to(device)

# Training length: three passes over the training batches
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

# AdamW optimizer driven by a linear learning-rate schedule without warm-up
optimizer = AdamW(model.parameters(), lr=1e-5)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

#### Training loop

In [None]:
# Put the model into training mode before running the training loop
model.train()

In [None]:
from tqdm.auto import tqdm
import torch

# Binary cross-entropy on the raw logits, applied to every label independently
criterion = torch.nn.BCEWithLogitsLoss()

with tqdm(total=num_training_steps) as progress_bar:
    for _ in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor to the target device; the raw text column is not a model input
            inputs = {name: value.to(device) for name, value in batch.items() if name != 'text'}

            # Separate the targets from the model inputs; the loss expects float targets
            targets = inputs.pop('label_vector').float()

            logits = model(**inputs).logits
            loss = criterion(logits, targets)
            loss.backward()

            # Apply the update, advance the schedule and clear the gradients for the next batch
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [None]:
# Persist the trained model to `model_dir`
model.save_pretrained(model_dir)

### Evaluation

In [None]:
# Put the model into evaluation mode before predicting on the test set
model.eval()

In [None]:
# Raw logits for every test example, collected batch by batch
predictions = []

with tqdm(total=len(test_dataloader)) as progress_bar:
    for batch in test_dataloader:
        # Move every tensor to the target device; the raw text column is not a model input
        inputs = {name: value.to(device) for name, value in batch.items() if name != 'text'}

        # The targets are not passed to the model during inference
        inputs.pop('label_vector')

        with torch.no_grad():
            logits = model(**inputs).logits

        predictions.extend(logits.cpu().numpy())
        progress_bar.update(1)

In [None]:
# Binarize the logits: strictly positive values become 1, everything else 0
predictions = (np.array(predictions) > 0).astype(int)

# Store the binary prediction vectors next to the test rows
df_test['predictions_vector'] = predictions.tolist()

In [None]:
y_true = np.array(df_test['label_vector'].tolist())
y_pred = np.array(predictions)

# Confusion-matrix counts pooled over every (paper, label) entry, i.e. the
# metrics below are micro-averaged, not computed separately per label.
tn = np.sum(np.logical_and(y_true == 0, y_pred == 0))
tp = np.sum(np.logical_and(y_true == 1, y_pred == 1))
fp = np.sum(np.logical_and(y_true == 0, y_pred == 1))
fn = np.sum(np.logical_and(y_true == 1, y_pred == 0))

# Accuracy, precision, recall and F1 from the pooled counts.
# A zero denominator (for example, no positive prediction at all) would give
# NaN, which json.dump writes as the non-standard token `NaN`; report 0.0 in
# that case instead. Plain floats keep the report JSON-serializable.
total = tp + tn + fp + fn
accuracy = float((tp + tn) / total) if total else 0.0
precision = float(tp / (tp + fp)) if (tp + fp) else 0.0
recall = float(tp / (tp + fn)) if (tp + fn) else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

evaluation_report = {
    "tn": tn.item(),
    "tp": tp.item(),
    "fp": fp.item(),
    "fn": fn.item(),
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1
}

print_json("Evaluation report:", evaluation_report)

with open(evaluation_file, 'w') as file:
    json.dump(evaluation_report, file, indent=2, sort_keys=True)

In [None]:
# Calculate the accuracy, precision, recall and F1 score for multi-label classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy as computed by scikit-learn, plus micro-averaged precision, recall and F1
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)

for caption, value in (
    ("Accuracy:  ", accuracy),
    ("Precision: ", precision),
    ("Recall:    ", recall),
    ("F1:        ", f1),
):
    print(caption, value)

In [None]:
# Translate every binary prediction vector into its label names, sorted alphabetically
df_test['predictions'] = df_test['predictions_vector'].apply(
    lambda vector: sorted(labels[position] for position, flag in enumerate(vector) if flag == 1)
)

# Put the true subfields into the same alphabetical order for side-by-side comparison
df_test['subfields'] = df_test['subfields'].apply(sorted)

df_test[["doi", "title", "subfields", "predictions"]]

In [None]:
# Write the test rows, including the prediction columns added above, to the predictions report (index column omitted)
df_test.to_csv(predictions_file, index=False)

In [None]:
# Show random example from the test set with the predicted labels and the true labels for comparison
import random

# Never ask for more rows than the test set holds (random.sample would raise)
sample_size = min(10, len(df_test))

for inx, row in random.sample(list(df_test.iterrows()), sample_size):
    print("Title:       ", row['title'])
    print("True labels: ", row['subfields'])
    print("Predictions: ", row['predictions'])

    # Row-specific names: the dataset-level `y_true`, `y_pred` and metric
    # variables of the evaluation cells must not be overwritten here.
    row_true = np.array(row['label_vector'])
    row_pred = np.array(row['predictions_vector'])

    print("True label vector:      ", row_true)
    print("Predicted label vector: ", row_pred)

    # Confusion-matrix counts over the labels of this single paper
    row_tn = np.sum(np.logical_and(row_true == 0, row_pred == 0))
    row_fp = np.sum(np.logical_and(row_true == 0, row_pred == 1))
    row_fn = np.sum(np.logical_and(row_true == 1, row_pred == 0))
    row_tp = np.sum(np.logical_and(row_true == 1, row_pred == 1))

    print("TP: ", row_tp)
    print("TN: ", row_tn)
    print("FP: ", row_fp)
    print("FN: ", row_fn)

    # Accuracy, precision, recall and F1 for this paper. A single paper often
    # has no predicted (or no true) labels, so zero denominators are reported
    # as 0.0 instead of producing NaN and a RuntimeWarning.
    row_total = row_tp + row_tn + row_fp + row_fn
    row_accuracy = float((row_tp + row_tn) / row_total) if row_total else 0.0
    row_precision = float(row_tp / (row_tp + row_fp)) if (row_tp + row_fp) else 0.0
    row_recall = float(row_tp / (row_tp + row_fn)) if (row_tp + row_fn) else 0.0
    if row_precision + row_recall:
        row_f1 = 2 * row_precision * row_recall / (row_precision + row_recall)
    else:
        row_f1 = 0.0

    print("Accuracy:  ", row_accuracy)
    print("Precision: ", row_precision)
    print("Recall:    ", row_recall)
    print("F1:        ", row_f1)

    print()