# Train Research Field Classifier ([Source](https://gitlab.com/TIBHannover/orkg/nlp/experiments/orkg-research-fields-classifier/-/blob/master/notebooks/train_eval_notebook.ipynb))

This notebook trains a model to classify research fields of research papers. The model is trained on the ORKG dataset and is based on the [SciNCL model](https://huggingface.co/malteos/scincl).

In [None]:
import os

# All paths are built relative to a base directory two levels above the
# current working directory.
base_dir = os.path.join('..', '..')
data_dir = os.path.join(base_dir, 'data')
orkg_file = os.path.join(data_dir, 'orkg', 'orkg_data.csv')

#
# Model
#

# Directory the fine-tuned classifier checkpoint is written to.
model_dir = os.path.join(base_dir, 'models', 'scincl_single_label_classifier')
os.makedirs(model_dir, exist_ok=True)

#
# Data
#

# CSV files the train/test split is written to. The file names must match the
# variable names: previously the train split was saved as "..._test.csv" and
# the test split as "..._train.csv".
split_dir = os.path.join(data_dir, 'scincl_classifier')
train_file = os.path.join(split_dir, 'single_label_train.csv')
test_file = os.path.join(split_dir, 'single_label_test.csv')
os.makedirs(split_dir, exist_ok=True)

#
# Reports
#

# Per-sample predictions (CSV) and aggregate metrics (JSON) go here.
evaluation_dir = os.path.join(base_dir, 'reports', 'orkg', 'scincl_classifier')
os.makedirs(evaluation_dir, exist_ok=True)

predictions_file = os.path.join(evaluation_dir, 'single_label_evaluation.csv')
evaluation_file = os.path.join(evaluation_dir, 'single_label_evaluation.json')

In [None]:
import torch

# Run on CUDA when torch reports it as available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Get the dataset

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the 'malteos/scincl' checkpoint; its sep_token is used
# when joining title and abstract, and it tokenizes the text before training.
tokenizer = AutoTokenizer.from_pretrained('malteos/scincl')

In [None]:
import pandas as pd

df = pd.read_csv(orkg_file)

# Rename "research field" to "label"
df = df.rename(columns={"research field": "label"})

# Take an explicit copy so the column assignments below operate on an
# independent frame (avoids pandas' SettingWithCopyWarning).
df = df[["title", "abstract", "label"]].copy()

# Missing abstracts become empty strings.
df["abstract"] = df["abstract"].fillna("")

# Model input is "<title><sep_token><abstract>", built column-wise instead of
# row by row. A missing title contributes an empty string rather than the
# literal text "nan".
df["text"] = (
    df["title"].fillna("").astype(str)
    + tokenizer.sep_token
    + df["abstract"].astype(str)
)

In [None]:
# Split the data into train and test
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same across runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Persist both splits without the DataFrame index.
for split_frame, split_path in ((train_df, train_file), (test_df, test_file)):
    split_frame.to_csv(split_path, index=False)

In [None]:
# Mapping of labels to integers
# The labels are sorted so every run assigns the same id to the same label;
# iterating a set of strings gives an order that can differ between
# interpreter runs. key=str keeps the sort well-defined even if a
# non-string value (e.g. NaN) is present.
labels = sorted(df["label"].unique(), key=str)
label_dict = {label: i for i, label in enumerate(labels)}
# Inverse mapping, used to turn predicted ids back into label names.
reverse_label_dict = {v: k for k, v in label_dict.items()}

In [None]:
from datasets import Dataset, DatasetDict

def _encode_labels(frame):
    """Return a copy of `frame` reduced to text/label, with labels as integer ids.

    Working on an explicit copy avoids assigning into a column slice of the
    original split (pandas' SettingWithCopyWarning).
    """
    encoded = frame[["text", "label"]].copy()
    encoded["label"] = [label_dict[label] for label in encoded["label"]]
    return encoded


# Both splits keep only the model input and the integer label; test_df with
# integer labels is reused later when decoding the predictions.
train_df = _encode_labels(train_df)
test_df = _encode_labels(test_df)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dd = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

#### Tokenize the text in the dataset

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: batch mapping with a "text" entry holding the input strings.
        max_length: length every sequence is padded or truncated to.

    Returns:
        The tokenizer output for the batch. Because padding="max_length" is
        used, the attention masks already have `max_length` entries, so no
        manual padding of the masks is needed (the previous manual step
        appended a zero-length tensor and, through dtype promotion, turned
        the integer masks into float tensors).
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Apply tokenize_function to the train and test splits, passing examples in batches
tokenized_datasets = dd.map(tokenize_function, batched=True)

In [None]:
# Remove __index_level_0__ column
# (presumably the DataFrame index carried over by Dataset.from_pandas; it is not a model input)
tokenized_datasets = tokenized_datasets.remove_columns("__index_level_0__")
tokenized_datasets

#### Preprocessing

In [None]:
# remove unnecessary columns from dataset:
# the raw "text" column is dropped now that the tokenized columns exist
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
# rename the "label" column to "labels", the argument name the model expects,
# so each batch can be passed to the model as keyword arguments
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [None]:
# set the format of the dataset to return PyTorch tensors instead of lists
tokenized_datasets.set_format("torch")

In [None]:
tokenized_datasets

### Training with PyTorch

In [None]:
from torch.utils.data import DataLoader

# Wrap both splits in DataLoaders that yield mini-batches.
# Only the training split is shuffled; the test split keeps its order.
batch_size = 30
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)

In [None]:
from transformers import AutoModelForSequenceClassification

# get the model
# One output class per known label. The id <-> label-name mappings are stored
# in the model config so a checkpoint saved from this model still knows which
# class id corresponds to which label (names are stringified so the config
# can be serialized).
model = AutoModelForSequenceClassification.from_pretrained(
    "malteos/scincl",
    num_labels=len(label_dict),
    id2label={idx: str(name) for name, idx in label_dict.items()},
    label2id={str(name): idx for name, idx in label_dict.items()},
).to(device)

In [None]:
from transformers import get_scheduler
from torch.optim import AdamW

# Training schedule: one scheduler step per batch over all epochs
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# AdamW over all model parameters, driven by the "linear" scheduler
# with zero warm-up steps
optimizer = AdamW(model.parameters(), lr=1e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

#### Training loop

In [None]:
# Put the model into training mode before running the training loop
model.train()

In [None]:
from tqdm.auto import tqdm

# Switch to training mode here as well, so this cell trains correctly even
# when it is re-run after the model was put into evaluation mode.
model.train()

# The context manager closes the progress bar even if a step raises.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch to the selected device
            batch = {k: v.to(device) for k, v in batch.items()}

            # The batch contains "labels", so the model output carries the loss
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            # Update the weights, advance the schedule, then clear the
            # gradients for the next step
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [None]:
# Save model checkpoint
model.save_pretrained(model_dir)
# Store the tokenizer next to the weights so the checkpoint directory can be
# loaded for inference on its own.
tokenizer.save_pretrained(model_dir)

### Evaluation

In [None]:
# Put the model into evaluation mode before predicting on the test split
model.eval()

In [None]:
# Collect one predicted class id per test sample, in dataloader order
predictions = []
progress_bar = tqdm(range(len(test_dataloader)))

for batch in test_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(**batch)

    # The predicted class is the index of the largest logit in each row
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=1).tolist()
    predictions.extend(batch_predictions)
    progress_bar.update(1)

progress_bar.close()

In [None]:
# Translate integer class ids back into label names, for the ground truth
# stored in test_df as well as for the model predictions
gt_labels = [reverse_label_dict[class_id] for class_id in test_df["label"]]
predicted_labels = [reverse_label_dict[class_id] for class_id in predictions]

# New frame: test_df with readable labels plus a column of predicted labels
final_df = test_df.assign(label=gt_labels, predicted_labels=predicted_labels)
final_df.to_csv(predictions_file, index=False)

final_df

In [None]:
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Micro-averaged scores (kept under their original keys). For single-label
# predictions these coincide with accuracy, so macro- and weighted-averaged
# scores are reported as well to show per-class behaviour.
accuracy = accuracy_score(gt_labels, predicted_labels)
precision = precision_score(gt_labels, predicted_labels, average="micro")
recall = recall_score(gt_labels, predicted_labels, average="micro")
f1 = f1_score(gt_labels, predicted_labels, average="micro")

evaluation_results = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1
}

# zero_division=0 scores a class with no predicted (or no true) samples as 0
# instead of emitting a warning.
for averaging in ("macro", "weighted"):
    evaluation_results[f"precision_{averaging}"] = float(
        precision_score(gt_labels, predicted_labels, average=averaging, zero_division=0)
    )
    evaluation_results[f"recall_{averaging}"] = float(
        recall_score(gt_labels, predicted_labels, average=averaging, zero_division=0)
    )
    evaluation_results[f"f1_{averaging}"] = float(
        f1_score(gt_labels, predicted_labels, average=averaging, zero_division=0)
    )

# Write evaluation results to file
with open(evaluation_file, "w", encoding="utf-8") as file:
    json.dump(evaluation_results, file, indent=2)

evaluation_results