# **Kaggle Challenge: Research Project Multilabel Classification with Transformers**

*Master in Machine Learning for Health, UC3M - 2023~2024*

*Authors: Daniel Corrales, Jaime Fernández & Rafael Rodríguez*

## **1. Data Loading**

In [None]:
import pickle
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set()

%load_ext google.colab.data_table

In [None]:
# Install necessary packages
# NOTE(review): versions are not pinned, so a fresh run may install newer
# releases than the ones this notebook was written against — consider pinning.
%pip install transformers[torch]
%pip install datasets
%pip install accelerate -U

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
import torch
import transformers
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding, EarlyStoppingCallback, AutoConfig
from datasets import Dataset, DatasetDict
import accelerate

In [None]:
# Mount Google Drive (Colab only); the data files read below live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder (on the mounted Drive) holding the challenge files
path = '/content/drive/MyDrive/NLP/Kaggle'


def _load_pickle(relative_name):
    """Unpickle and return the object stored at `path + relative_name`."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path + relative_name, 'rb') as file:
        return pickle.load(file)


# Read files (same order as before: test/train corpus, test/train publications, train labels)
corpus_test = _load_pickle('/data_text_test.pickle')
corpus_train = _load_pickle('/data_text_train.pickle')
publis_test = _load_pickle('/publis_title_test.pickle')
publis_train = _load_pickle('/publis_title_train.pickle')
labels_train = _load_pickle('/data_v2_train.pickle')

Extract the training labels and the test-set project IDs

In [None]:
# Training labels as a float tensor (list of label vectors -> NumPy array -> tensor)
y = torch.tensor(np.array(labels_train['label'].to_list()), dtype=torch.float)

# Project ids of the test set, kept for the submission file
project_id = corpus_test['projectID'].to_list()

print(f"Samples in training set: {len(y)}")
print(f"Samples in test set: {corpus_test.shape[0]}")

Samples in training set: 11228
Samples in test set: 6231


Join all information within the corpus sets

In [None]:
def join_df(project_df, pub_df):
  """Attach to each project the list of titles of its publications.

  Args:
    project_df: DataFrame of projects with a `projectID` column.
    pub_df: DataFrame of publications with `projectID` and `title` columns.

  Returns:
    A new DataFrame with the columns of `project_df` plus a `publications`
    column holding the list of publication titles of each project. Projects
    without any publication get NaN there, because this is a left join.
  """
  # Name the aggregated column *before* merging. Renaming `title_x`/`title_y`
  # afterwards only works when `project_df` also has a `title` column (pandas
  # adds the suffixes only on a name clash); otherwise the publication lists
  # would silently stay under `title`.
  grouped_publications = (
      pub_df.groupby('projectID')['title']
      .agg(list)
      .rename('publications')
      .reset_index()
  )

  return pd.merge(project_df, grouped_publications, on='projectID', how='left')

In [None]:
# Attach the publication titles to the train and test project tables
train_df = join_df(corpus_train, publis_train)
test_df = join_df(corpus_test, publis_test)

Replace NaN values in the `publications` column with the placeholder text `No publications`

In [None]:
# Projects with no match in the publications table get NaN from the left join in
# join_df; replace it with a placeholder string so the column can be embedded in text
train_df['publications'] = train_df['publications'].fillna('No publications')
test_df['publications'] = test_df['publications'].fillna('No publications')

In [None]:
# Preview the first two rows of the joined training table
train_df.head(2)

Unnamed: 0,projectID,title,objective,publications
0,305282,A Multi-Stage Malaria Vaccine,A highly effective malaria vaccine is a major ...,[Plasmodium falciparum full life cycle and Pla...
1,318997,NEUREN - Neuroscience Research Exchange Networ...,"""The NEUREN project is based on an interdiscip...",[Prenatal Exposure to Paint Thinner Alters Pos...


In [None]:
# Preview the first two rows of the joined test table
test_df.head(2)

Unnamed: 0,projectID,title,objective,publications
0,101095619,Efficient and rapidly SCAlable EU-wide evidenc...,Pandemics have the potential to disrupt our da...,No publications
1,836869,Unique approach to improving neurological func...,The aim of this project is to develop a busine...,No publications


## **Create the Dataset**

Depending on which data we want to use for training the model, we have several options:
1. Only project title.
2. Only project objective.
3. Project title and objective.
4. Project title and publications.
5. Project objective and publications.
6. Project title, objective and publications.

In [None]:
def create_dataset(df, data_to_include, y=None, validation=True, validation_size=0.15, random_state=7):
  """Build the model input texts and optionally split off a validation set.

  Args:
    df: DataFrame holding the columns named in `data_to_include`.
    data_to_include: Column name, or list of column names, among `title`,
      `objective` and `publications` (order is irrelevant). A single column is
      returned as is; several columns are merged into one labelled string.
    y: Labels aligned with the rows of `df`. Required when `validation=True`.
    validation: If True, split texts and labels into train/validation parts.
    validation_size: Fraction of the samples held out for validation.
    random_state: Seed of the train/validation split.

  Returns:
    `(train_data, val_data, y_train, y_val)` when `validation=True`, otherwise
    the list of texts. `df` is never modified.

  Raises:
    ValueError: If `validation=True` but `y` is None, or if the requested
      combination of columns is not supported.
  """
  if validation and y is None:
    raise ValueError("`y` must be provided when `validation=True`.")

  # One template per supported combination of columns.
  # NOTE(review): the title+publications variant says "Publication:" (singular)
  # while the others say "Publications:"; kept as is so the generated texts
  # match the ones earlier checkpoints were trained on.
  templates = {
    frozenset(('title', 'objective', 'publications')):
      "Title: {title}. Objective: {objective} Publications: {publications}",
    frozenset(('title', 'objective')):
      "Title: {title}. Objective: {objective}",
    frozenset(('title', 'publications')):
      "Title: {title}. Publication: {publications}",
    frozenset(('objective', 'publications')):
      "Objective: {objective} Publications: {publications}",
  }

  # Accept a bare column name as well as a list of names
  fields = [data_to_include] if isinstance(data_to_include, str) else list(data_to_include)
  selected = frozenset(fields)

  if len(fields) == 1:
    # Single column: use its values untouched
    data = df[fields[0]].to_list()
  elif selected in templates:
    template = templates[selected]
    columns = sorted(selected)
    rows = zip(*(df[column] for column in columns))
    data = [template.format(**dict(zip(columns, row))) for row in rows]
  else:
    raise ValueError(
      f"Unsupported combination of columns: {fields!r}. Use one column, or a "
      "combination of 'title', 'objective' and 'publications'."
    )

  if validation:
    train_data, val_data, y_train, y_val = train_test_split(
      data, y, test_size=validation_size, random_state=random_state
    )
    return train_data, val_data, y_train, y_val

  return data

In [None]:
# Columns merged into the model input text
data_to_include = ['title', 'objective', 'publications']

# The labelled training data is split into train/validation parts;
# the unlabelled test set is kept whole (validation=False)
trainset, valset, y_train, y_val = create_dataset(train_df, data_to_include, y=y)
testset = create_dataset(test_df, data_to_include, validation=False)

print(f"Training samples: {len(trainset)}")
print(f"Validation samples: {len(valset)}")
print(f"Test samples: {len(testset)}")

Training samples: 9543
Validation samples: 1685
Test samples: 6231


Print an example to check correct functioning

In [None]:
# First training text, to inspect the generated format
trainset[0]

'Title: Algorithm for healthy eating habits. Objective: MENUTERRANEUS aims to improve the quality of life of European citizens, fighting a growing problem in Europe and very present in the policies of the World Health Organization, OECD and the European Commission. Specifically, Menuterraneus intends to help European families to plan their meals, using an algorithm that functions according to nutritional parameters, promoting healthy eating habits.The project is a pioneering solution in Europe, where the systems developed by the competitors are based on templates. In addition, many of them belong to food Companies or groups, so have and have a dependence on their brands when setting the menus.For the moment, a limited technical concept (a basic algorithm) has been developed and tested geographically: the Spanish market. The results have been so far positive, with 83,000 registered users, using a free application, and 3 market channels have been explored: discount coupons, publicity reg

Finally, we adapt the dataset to the expected `DatasetDict` format

In [None]:
# Wrap the texts (and the labels, where available) as `Dataset` objects
train_dataset = Dataset.from_dict({"text": trainset, "labels": y_train})
val_dataset = Dataset.from_dict({"text": valset, "labels": y_val})
test_dataset = Dataset.from_dict({"text": testset})  # the test set has no labels

# Organize datasets into a dictionary
research_dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Display the splits and their sizes
research_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 1685
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6231
    })
})

## **Fine Tuning**

Load the pre-trained model and tokenizer

In [None]:
# Load pre-trained model and tokenizer
num_labels = len(y[0])  # size of one label vector = number of categories
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding is left to the collator; tokenize_function below only truncates
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Multi-label setup: one output per category (labels were cast to float above)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type='multi_label_classification')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize dataset

In [None]:
def tokenize_function(sample):
  """Tokenize the `text` field of `sample` with the notebook's tokenizer, truncation enabled."""
  texts = sample["text"]
  return tokenizer(texts, truncation=True)

In [None]:
# Tokenize every split in batches
tokenized_research_dataset = research_dataset.map(tokenize_function, batched=True)
# Drop the raw text column, leaving only the tokenizer outputs (and labels)
tokenized_research_dataset = tokenized_research_dataset.remove_columns(["text"])

# Display the resulting features per split
tokenized_research_dataset

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/1685 [00:00<?, ? examples/s]

Map:   0%|          | 0/6231 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1685
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6231
    })
})

Define the `compute_metrics()` function and train the model

In [None]:
def compute_metrics(pred):
    """Compute the ROC AUC of the predictions against the labels.

    Args:
        pred: Object exposing `predictions` (raw logits) and `label_ids`.

    Returns:
        Dict with a single `mean_auc` entry holding the `roc_auc_score` of
        the sigmoid probabilities.
    """
    # Sigmoid rather than softmax: every label is scored independently
    scores = torch.sigmoid(torch.tensor(pred.predictions))
    targets = torch.tensor(pred.label_ids)

    return {"mean_auc": roc_auc_score(targets, scores)}

In [None]:
# Training configuration: evaluate, log and save a checkpoint once per epoch,
# and select the best model according to the `mean_auc` returned by compute_metrics.
# NOTE(review): `output_dir` ends in `-2`, while the prediction cell further down
# loads a checkpoint from the directory without that suffix — confirm which run is intended.
training_args = TrainingArguments(
    output_dir=path + f"/{model_name}-finetuned-research-2",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="mean_auc"
)

# Trainer over the tokenized train/validation splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_research_dataset['train'],
    eval_dataset=tokenized_research_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)]
)

# Run fine-tuning
trainer.train()

In [None]:
# Evaluate on the validation split passed to the trainer as `eval_dataset`
trainer.evaluate()

## **Testset Evaluation and *.csv Generation**

Load trained model and evaluate testset

In [None]:
# Load trained model
# NOTE(review): this directory has no `-2` suffix, unlike the `output_dir` used by the
# training cell, so the checkpoint comes from a different run — confirm it exists
# and is the intended one.
checkpoint = 3579
model_path = path + f"/{model_name}-finetuned-research/checkpoint-{checkpoint}"
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Define test trainer
test_trainer = Trainer(model, data_collator=data_collator, tokenizer=tokenizer)

# Make prediction
test_dataset = tokenized_research_dataset["test"]
y_pred_test = test_trainer.predict(test_dataset)[0]  # first element of the output, used below as logits

# Convert logits to probabilities
probs = torch.sigmoid(torch.tensor(y_pred_test))

print(probs.shape)

Convert probability predictions into *.csv file.

In [None]:
def create_submission_file(soft_outputs, projectID, path, num_categories=None):
  """Write the predicted probabilities to a submission CSV file.

  The file has a `projectID,cat_0,...,cat_{K-1}` header followed by one line
  per sample with its id and its probabilities.

  Args:
    soft_outputs: Iterable of per-sample probability sequences.
    projectID: Iterable of project ids, aligned with `soft_outputs`.
    path: Destination file path (overwritten if it already exists).
    num_categories: Number of `cat_i` header columns. Defaults to the width of
      the first sample, or to 34 when there are no samples.

  Raises:
    ValueError: If `soft_outputs` and `projectID` differ in length.
  """
  rows = list(soft_outputs)
  ids = list(projectID)

  # zip() would silently drop the unmatched tail; fail before touching the file
  if len(rows) != len(ids):
    raise ValueError(
      f"Got {len(rows)} prediction rows but {len(ids)} project ids."
    )

  # Keep the header consistent with the data instead of assuming 34 categories
  if num_categories is None:
    num_categories = len(rows[0]) if rows else 34

  header = ','.join(['projectID'] + [f'cat_{i}' for i in range(num_categories)])

  with open(path, 'w') as file:
    # Write header
    file.write(header + '\n')

    # Write data
    for project, sample in zip(ids, rows):
      fields = [f'{project}'] + [f'{proba}' for proba in sample]
      file.write(','.join(fields) + '\n')

  print("File created succesfully...")

In [None]:
# Concatenate IDproject with probs
probs = np.array(probs).astype(float)  # convert the probabilities to a float NumPy array
# project_id = [int(id) for id in project_id]

# Write csv file
# NOTE(review): the file name says "3epochs" while the training cell is configured
# for 5 epochs — confirm the name matches the checkpoint used.
# Assumes the `submissions` folder already exists under `path`.
create_submission_file(probs, project_id, path + '/submissions' + '/test_submission_BERT_3epochs.csv')

File created succesfully...
