We'll fine-tune a pre-trained RoBERTa model from HuggingFace to perform multi-output regression on this task's data. We'll use `RobertaPreTrainedModel` from the `transformers` library to do so.

In [1]:
import os, json, glob

import numpy as np
import pandas as pd
import torch
from transformers import (
    BertTokenizer, BertForSequenceClassification, 
    AdamW, BertConfig, Trainer, TrainingArguments,
    RobertaForSequenceClassification, RobertaTokenizer,
    RobertaPreTrainedModel, BertPreTrainedModel,
)
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from src import data, utils

In [6]:
# load the train data
train_data = data.load('train').filter(regex='^(subject_id|round|date|message|d_)')
display(train_data.head(1))
# concat messages by subject id
train_data = data.concat_messages(train_data)

# split into 15% of subject ids for validation 
# get the classes as the argmax of the label probabilities to use them for stratification
subj_classes = train_data.set_index('subject_id').filter(regex='^d_')\
    .apply(lambda x: x.argmax() if x[:-1].sum()<0.5 else x[:-1].argmax(), axis=1)\
        .replace(dict(enumerate(train_data.filter(regex='^d_').columns)))
tr_subj_ids, val_subj_ids = train_test_split(subj_classes.index, test_size=0.15, random_state=42, stratify=subj_classes.values)
# split the train data into train and validation sets
df_val = train_data[train_data['subject_id'].isin(val_subj_ids)]
df_train = train_data[train_data['subject_id'].isin(tr_subj_ids)]

# augment the train data by taking only the first half of the messages
half_messages_df_train = df_train.assign(
    message=lambda df: df['message'].apply(lambda x: ' | '.join(x.split(' | ')[:len(x.split(' | '))//2])),
    # num_messages=lambda df: df['message'].apply(lambda x: len(x.split(' | ')))
)
df_train = pd.concat([df_train, half_messages_df_train], axis=0).sort_values('subject_id').reset_index(drop=True)
print(f"train_df.shape (augmented): {df_train.shape}")
print(f"val_data.shape: {df_val.shape}")

Unnamed: 0,subject_id,round,date,message,d_suffer_in_favour,d_suffer_against,d_suffer_other,d_control
0,subject264,-1,2020-10-16 07:04:25,Alguien que quiera charlar ?,0.3,0.4,0.3,0.0


train_df.shape (augmented): (296, 8)
val_data.shape: (27, 8)


Load the pre-trained transformer:

In [7]:
# config
import transformers
from transformers import AutoConfig, AutoTokenizer, RobertaPreTrainedModel
from src.roberta_regressor import RobertaRegressor, multi_reg_loss, train, evaluate

model_name = "PlanTL-GOB-ES/roberta-base-bne"
model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
model = RobertaPreTrainedModel.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaRegressor.from_pretrained(model_name, num_outputs=4)

Some weights of the model checkpoint at hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es were not used when initializing RobertaPreTrainedModel: ['roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.6.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.dense.weight', 'roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.5.attention.output.LayerNorm.bias', 'roberta.encoder.layer.10.attention.output.dense.weight', 'roberta.encoder.layer.4.attention.self.key.weight', 'roberta.encoder.layer.7.attention.self.value.weight', 'roberta.encoder.layer.4.attention.self.query.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.7.output.dense.bias', 'roberta.encoder.layer.1.attention.output.dense.bias', 'roberta.encoder.layer.2.attention.self.query.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.8.output.LayerNorm.weight', 'roberta.encoder.layer.1

Prepare the data and datasets

In [29]:
from torch.utils.data import Dataset, DataLoader

class MentalRiskDataset(Dataset):

    def __init__(self, data: pd.DataFrame, tokenizer, max_len: int = 1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row.message
        label = torch.tensor(data_row.label)
        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
        if len(tokens) > self.max_len - 2:
            tokens = tokens[:self.max_len - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.max_len:
            tokens = tokens + ['[PAD]' for _ in range(self.max_len - len(tokens))] 
        else:
            tokens = tokens[:self.max_len-1] + ['[SEP]'] 
        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)).squeeze(0) 
        # pooling layer
        
        attention_mask = (input_ids != 0).long()
        return input_ids, attention_mask, label


In [77]:
from sklearn.model_selection import train_test_split
train_df = df_train.set_index('subject_id').assign(
    label=lambda df: df.filter(regex='^d_').values.tolist()
)[['message', 'label']]
val_df = df_val.set_index('subject_id').assign(
    label=lambda df: df.filter(regex='^d_').values.tolist()
)[['message', 'label']]

display(train_df.head(3))
display(val_df.head(3))
print(f"train_df.shape: {train_df.shape}")
print(f"val_df.shape: {val_df.shape}")

MAX_LEN = 512

train_dataset = MentalRiskDataset(train_df, tokenizer, max_len=MAX_LEN)
# test_dataset = MentalRiskDataset(test_df, tokenizer)
val_dataset = MentalRiskDataset(val_df.sample(frac=0.3), tokenizer, max_len=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

Unnamed: 0_level_0,message,label
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
subject104,"Sin duda alguna , mal | Algo general que puedo...","[0.4, 0.0, 0.3, 0.3]"
subject104,"Sin duda alguna , mal | Algo general que puedo...","[0.4, 0.0, 0.3, 0.3]"
subject108,Ocupado con la uni | Unmsm es universidad naci...,"[0.4, 0.1, 0.0, 0.5]"


Unnamed: 0_level_0,message,label
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
subject101,volvi me extrañaron signo de interrogación | y...,"[0.7, 0.1, 0.1, 0.1]"
subject106,Nunca me imagine terminar tan mal | Y necesita...,"[0.5, 0.5, 0.0, 0.0]"
subject11,"Y al día de hoy , cómo sigues después de todo ...","[0.0, 0.0, 0.0, 1.0]"


train_df.shape: (296, 2)
val_df.shape: (27, 2)


In [85]:
display(subj_classes[tr_subj_ids].value_counts())
display(subj_classes[val_subj_ids].value_counts())

d_control             69
d_suffer_against      37
d_suffer_in_favour    37
d_suffer_other         5
dtype: int64

d_control             12
d_suffer_against       7
d_suffer_in_favour     7
d_suffer_other         1
dtype: int64

### Train the model:

In [None]:
import wandb # to log the training progress

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="mentalriskes-roberta-finetuning",
    # track hyperparameters and run metadata
    config={
    "init_learning_rate": 3e-5,
    "architecture": "BERT",
    "dataset": "MENTALRISKES"}
)
def evaluate(model, criterion, dataloader, device):
    """
    To evaluate the model on the validation set
    """
    model.eval()
    mean_acc, mean_loss, count = 0, 0, 0

    with torch.no_grad():
        for input_ids, attention_mask, target in (dataloader):
            
            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
            output = model(input_ids, attention_mask)
            mean_loss += criterion(output.squeeze(), target.type_as(output).squeeze()).item()
            count += 1
            
    return mean_loss/count  

In [70]:
from torch.nn import CrossEntropyLoss
criterion = CrossEntropyLoss() #multi_reg_loss("cross_entropy",sum_diff_penalty=0.1)
optimizer = AdamW(params=model.parameters(), lr=3e-5)
device = "mps"

num_epochs = 20
unfreeze_at = 12 # epoch
unfreeze_percent = 0.1 # 0.1 = 10% of the (last) layers will be unfrozen at each epoch after unfreeze_at
unfrozen_count = 0
loss_per_epoch = []

num_training_steps=len(train_loader) * num_epochs
scheduler = transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
roberta_params = list(model.roberta.parameters())



In [None]:
from tqdm import trange

first_epoch = 0
epochs = num_epochs
log_every = 3
save_every = 10
model = model.to(device)
for epoch in trange(first_epoch, epochs, desc="Epoch"):
    model.train()
    train_loss = 0
    for i, (input_ids, attention_mask, target) in enumerate(iterable=train_loader):
        optimizer.zero_grad()  
        
        input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
        
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(output.squeeze(), target.type_as(output).squeeze())
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
    if unfreeze_at and epoch>=unfreeze_at:
      print(f"Unfreezing params at epoch {epoch}")
      # unfreeze the roberta parameters
      if unfrozen_count < len(roberta_params):
        unfrozen_count += int(len(roberta_params) * unfreeze_percent)
        for param in roberta_params[-unfrozen_count:]:
          param.requires_grad = True
    if (epoch%log_every == 0 or (epoch==epochs-1)):
      print(f"Training loss is {train_loss/len(train_loader)}")
      val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
      print("Epoch {} complete! Validation Loss : {}".format(epoch+1, val_loss))
      loss_dict = {
        "epoch":epoch,
        "train":train_loss/len(train_loader), 
        "validation":val_loss,
        "lr":optimizer.param_groups[0]['lr']
      }
      loss_per_epoch.append(loss_dict)
      wandb.log(loss_dict)

In [None]:
# plot 
import seaborn as sns
import matplotlib.pyplot as plt
loss_df = pd.DataFrame(loss_per_epoch).assign(epoch=lambda df: df.epoch + 1)
ax = loss_df.set_index('epoch').plot()
ax.grid(); ax.set(ylabel="Loss");
ax.set_xticks(loss_df.epoch.values)
plt.show()

In [None]:
# save the model
torch.save(model.state_dict(), "models/roberta-finetuned.pth")

In [97]:
import pickle
model = pickle.load(open("models/roberta-base-bne-finetuned-25.pkl", "rb"))

In [59]:
from typing import Tuple
# model = model.cpu()
device="cpu"

def normalize(x, weights:Tuple[float,float,float,float] = None):
    if weights is None:
        weights = (0.25,0.25,0.25,0.25)
    # normalize to [0,1]
    x = (x - x.min())/(x.max() - x.min())
    # normalize to [-1,1]
    x = (x - 0.5)*2
    # apply weights
    x = x * np.array(weights)
    x /= x.sum()
    return x

def predict(text:str, model, tokenizer, device) -> Tuple[float, float, float, float]:
    input_ids, attention_mask = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt').values()
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
    output = model(input_ids, attention_mask).squeeze()
    # free up memory
    del input_ids, attention_mask
    return output.detach().numpy()

def label_metrics(score_fun, y_true, y_pred):
    scores = []
    for i in range(len(label_names)):
        scores.append(score_fun(y_true[:,i],y_pred[:,i]))
    return scores

In [61]:
val_df_sampled = val_df.sample(60)
val_predictions = val_df_sampled.message.apply(lambda x: predict(x, model, tokenizer, device=device))
results_df = val_df_sampled.assign(
    predicted=val_predictions.apply(lambda x: x.round(2)).tolist()
)
results_df.head()

Unnamed: 0,message,label,predicted
86,Es estudiar algo o me echan de casa para cosa ...,"[0.5, 0.2, 0.2, 0.1]","[-0.7, 0.02, -1.52, 0.83]"
0,"Si es una ayuda momentánea , espero que con má...","[0.5, 0.2, 0.0, 0.3]","[-0.7, 0.02, -1.52, 0.83]"
99,Tu paes nose pueden Meter en tu vida sentiment...,"[0.8, 0.0, 0.1, 0.1]","[-0.7, 0.02, -1.52, 0.83]"
141,Estas muy linda cara feliz con ojos sonrientes...,"[0.1, 0.0, 0.1, 0.8]","[-0.7, 0.02, -1.52, 0.83]"
312,La verdad es que para mí es un castigo la comi...,"[0.1, 0.1, 0.2, 0.6]","[-0.7, 0.02, -1.52, 0.83]"


In [26]:
from sklearn.metrics import r2_score, mean_squared_error

r2 = r2_score(val_df_sampled.label.tolist(), val_predictions.apply(list).tolist())
rmse = mean_squared_error(val_df_sampled.label.tolist(), val_predictions.apply(list).tolist(), squared=False)
print(f"R2 score is {r2}")
print(f"RMSE score is {rmse}")

R2 score is -0.4116190829023336
RMSE score is 0.2525223318253405


In [47]:
from functools import partial
true = np.array(val_df_sampled.label.apply(list).tolist())
pred = np.array(val_predictions.apply(list).tolist())
rmse_metrics = label_metrics(
    partial(mean_squared_error, squared=False), 
    true, pred
)
rmse_metrics = dict(zip(label_names, rmse_metrics))
r2_metrics = dict(zip(label_names, label_metrics(r2_score, true, pred)))
metrics_df = pd.DataFrame([rmse_metrics, r2_metrics], index=['rmse', 'r2']).T.rename_axis('label', axis=1)
metrics_df


label,rmse,r2
suffer_in_favour,0.275965,0.128641
suffer_against,0.257571,-0.290715
suffer_other,0.126232,-1.655737
control,0.350322,0.171335


In [None]:
# inference example
text = "como puedo ayudarte?"
input_ids, attention_mask = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt').values()
out = model(input_ids, attention_mask).squeeze().detach().numpy()
out

array([0.16364938, 0.23866598, 0.04136196, 0.5285845 ], dtype=float32)