We'll fine-tune a pre-trained RoBERTa model from HuggingFace to perform multi-output regression on this task's data. We'll use `RobertaPreTrainedModel` from the `transformers` library to do so.

In [None]:
# Autoreload Modules
%load_ext autoreload
%autoreload 1

/content/drive/MyDrive/Master/NLP


In [None]:
!pip install transformers sentence-transformers accelerate datasets wandb

In [None]:
import numpy as np
import pandas as pd

from src import data
from sklearn.model_selection import train_test_split

In [None]:
MESSAGE_SEP = ' | '  # separator used between a subject's concatenated messages


def _first_half_of_messages(message: str, sep: str = MESSAGE_SEP) -> str:
    """Return the first half of the `sep`-joined messages, keeping at least one.

    A subject with a single message keeps that message; without the lower bound
    of one, the slice would be empty and the row would carry an empty text.
    """
    parts = message.split(sep)
    return sep.join(parts[:max(1, len(parts) // 2)])


# load the train and test data
train_data = data.load('train')
test_df = data.load('test')
# concat messages by subject id
train_data = data.concat_messages(train_data)
test_df = data.concat_messages(test_df)

# split into 15% of subject ids for validation
# get the classes as the argmax of the label probabilities to use them for stratification:
# if all `d_` columns but the last sum to < 0.5 the argmax is taken over every column,
# otherwise over all but the last one; positions are then mapped back to column names
subj_classes = train_data.set_index('subject_id').filter(regex='^d_')\
    .apply(lambda x: x.argmax() if x[:-1].sum()<0.5 else x[:-1].argmax(), axis=1)\
        .replace(dict(enumerate(train_data.filter(regex='^d_').columns)))
tr_subj_ids, val_subj_ids = train_test_split(subj_classes.index, test_size=0.15, random_state=42, stratify=subj_classes.values)
# split the train data into train and validation sets (by subject id, before augmenting,
# so no augmented copy of a validation subject ends up in the training set)
val_df = train_data[train_data['subject_id'].isin(val_subj_ids)]
train_df = train_data[train_data['subject_id'].isin(tr_subj_ids)]

# augment the train data by taking only the first half of the messages
half_messages_df_train = train_df.assign(
    message=lambda df: df['message'].apply(_first_half_of_messages),
)
train_df = pd.concat([train_df, half_messages_df_train], axis=0).sort_values('subject_id').reset_index(drop=True)
print(f"df_train.shape (augmented): {train_df.shape}")
print(f"df_train.shape (original): {train_data.shape}")
print(f"val_data.shape: {val_df.shape}")
print(f"test_df.shape: {test_df.shape}")

df_train.shape (augmented): (296, 12)
df_train.shape (original): (175, 12)
val_data.shape: (27, 12)
test_df.shape: (149, 12)


We augment the training data by adding, for each subject, an extra row that contains only the first half of that subject's messages. This simulates the early-risk-detection setting, where fewer messages are available.

In [None]:
# config
from transformers import AutoConfig, AutoTokenizer, RobertaPreTrainedModel, AdamW
from src.roberta_regressor import RobertaRegressor, multi_reg_loss, train, evaluate
import transformers
import numpy as np
import pandas as pd
import torch

# model_name = "PlanTL-GOB-ES/roberta-base-bne"
model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the checkpoint once, directly into the regression model. A previous
# `RobertaPreTrainedModel.from_pretrained(...)` call instantiated the abstract
# base class and was immediately overwritten, so it has been dropped.
# num_outputs=1 matches the single-value labels built from the `b_` column later on.
model = RobertaRegressor.from_pretrained(model_name, num_outputs=1)

In [None]:
# prepare the data
from torch.utils.data import Dataset, DataLoader

import pandas as pd

class MentalRiskDataset(Dataset):
    """Dataset yielding fixed-length token ids, an attention mask and the label of a row.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a `message` column (text) and a `label` column (list of numbers).
    tokenizer
        Object exposing `tokenize`, `convert_tokens_to_ids` and the attributes
        `cls_token`, `sep_token` and `pad_token` (e.g. a HuggingFace tokenizer).
    max_len : int
        Length, in tokens, of every returned sequence, special tokens included.

    Raises
    ------
    ValueError
        If `max_len` is too small to hold the two special tokens.
    """

    def __init__(self, data: pd.DataFrame, tokenizer, max_len: int = 1024):
        if max_len < 2:
            raise ValueError(f"max_len must be >= 2 to fit the special tokens, got {max_len}")
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        """Return `(input_ids, attention_mask, label)` for the row at position `index`.

        `input_ids` and `attention_mask` are 1-D tensors of length `max_len`;
        `label` is a float tensor of shape `(1, len(row.label))`.
        """
        data_row = self.data.iloc[index]
        text = data_row.message
        label = torch.tensor(data_row.label, dtype=torch.float)

        # keep room for the start and end tokens
        tokens = self.tokenizer.tokenize(text)[:self.max_len - 2]
        # use the tokenizer's own special tokens: hard-coded BERT-style strings
        # ('[CLS]', '[SEP]', '[PAD]') are not in a RoBERTa vocabulary
        tokens = [self.tokenizer.cls_token] + tokens + [self.tokenizer.sep_token]
        num_real_tokens = len(tokens)
        tokens = tokens + [self.tokenizer.pad_token] * (self.max_len - num_real_tokens)
        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # mask by position rather than by id value, so it does not depend on
        # which integer the tokenizer assigns to padding
        attention_mask = (torch.arange(self.max_len) < num_real_tokens).long()
        return input_ids, attention_mask, label.view(1, len(data_row.label))


In [None]:
from sklearn.model_selection import train_test_split
# keep only the text and the target built from the `b_*` column(s), indexed by subject
df_train = train_df.set_index('subject_id').assign(
    label=lambda df: df.filter(regex='^b_').values.tolist()
)[['message', 'label']]
df_val = val_df.set_index('subject_id').assign(
    label=lambda df: df.filter(regex='^b_').values.tolist()
)[['message', 'label']]

display(df_train.head(3))
display(df_val.head(3))
print(f"train_df.shape: {df_train.shape}")
print(f"val_df.shape: {df_val.shape}")

MAX_LEN = 512

train_dataset = MentalRiskDataset(df_train, tokenizer, max_len=MAX_LEN)
# test_dataset = MentalRiskDataset(test_df, tokenizer)
# Validation during training runs on a 30% subsample of the validation subjects.
# The sample is seeded so the monitored validation loss is comparable across runs.
val_dataset = MentalRiskDataset(df_val.sample(frac=0.3, random_state=42), tokenizer, max_len=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

Unnamed: 0_level_0,message,label
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
subject104,"Sin duda alguna , mal | Algo general que puedo...",[0.7]
subject104,"Sin duda alguna , mal | Algo general que puedo...",[0.7]
subject108,Ocupado con la uni | Unmsm es universidad naci...,[0.5]


Unnamed: 0_level_0,message,label
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
subject101,volvi me extrañaron signo de interrogación | y...,[0.9]
subject106,Nunca me imagine terminar tan mal | Y necesita...,[1.0]
subject11,"Y al día de hoy , cómo sigues después de todo ...",[0.0]


train_df.shape: (296, 2)
val_df.shape: (27, 2)


In [None]:
# sanity check: shape of the label tensor returned for the first training row
_, _, label = train_dataset[0]
label.shape

torch.Size([1, 1])

### Train the model:

In [None]:
import wandb # to log the training progress

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="mentalriskes-roberta-finetuning",
    # track hyperparameters and run metadata
    config={
        "init_learning_rate": 3e-5,
        # the fine-tuned checkpoint is a RoBERTa model, not BERT
        "architecture": "RoBERTa",
        "dataset": "MENTALRISKES",
    },
)
def evaluate(model, criterion, dataloader, device):
    """
    Compute the mean per-batch loss of `model` over `dataloader`.

    The model is put in eval mode and gradients are disabled while iterating.
    Note that this definition shadows the `evaluate` imported from
    `src.roberta_regressor` earlier in the notebook.

    Parameters
    ----------
    model : callable taking `(input_ids, attention_mask)` and returning a tensor
    criterion : loss callable taking `(output, target)` and returning a scalar tensor
    dataloader : iterable of `(input_ids, attention_mask, target)` batches
    device : device the batch tensors are moved to

    Returns
    -------
    float
        Sum of the batch losses divided by the number of batches, or NaN when
        the dataloader yields no batch.
    """
    model.eval()
    total_loss, num_batches = 0.0, 0

    with torch.no_grad():
        for input_ids, attention_mask, target in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            output = model(input_ids, attention_mask)
            # align the target with the prediction instead of assuming a (1, 1)
            # shape, so batch sizes and output counts other than 1 also work
            target = target.reshape(output.shape).type_as(output)
            total_loss += criterion(output, target).item()
            num_batches += 1

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

In [None]:
from torch.nn import CrossEntropyLoss, MSELoss
from torch.optim import AdamW
criterion = MSELoss() #multi_reg_loss("cross_entropy",sum_diff_penalty=0)
optimizer = AdamW(params=model.parameters(), lr=3e-5)
# use the GPU when one is visible, otherwise fall back to the CPU instead of failing
device = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs = 30
unfreeze_at = 15 # epoch
unfreeze_percent = 0.1 # 0.1 = 10% of the (last) layers will be unfrozen at each epoch after unfreeze_at
unfrozen_count = 0
loss_per_epoch = []

# the scheduler is stepped once per batch, so its horizon is counted in batches
num_training_steps=len(train_loader) * num_epochs
scheduler = transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    # linear warm-up lasts until two epochs before unfreezing starts
    num_warmup_steps=len(train_loader)*(unfreeze_at-2),
    # NOTE(review): the extra 10 steps presumably keep the learning rate from
    # reaching exactly zero on the last batch — confirm intent
    num_training_steps=num_training_steps+10
)
# parameters of the encoder, in definition order; the training loop unfreezes from the end
roberta_params = list(model.roberta.parameters())

In [None]:
# log the first five rows of the training frame to wandb as a table
sample_rows = train_df.head(5)
tab = wandb.Table(dataframe=sample_rows)
wandb.log({'data_sample': tab})

In [None]:
# for the final model the train set includes the validation
# train_dataset_final = MentalRiskDataset(pd.concat([df_train, df_val]),tokenizer, max_len=MAX_LEN)
# train_loader = DataLoader(train_dataset_final, batch_size=1, shuffle=True)

In [None]:
from tqdm import trange

first_epoch = 0
# keep in sync with the horizon the LR scheduler was built for
epochs = num_epochs
log_every = 3
save_every = 10
model = model.to(device)
for epoch in trange(first_epoch, epochs, desc="Epoch"):
    model.train()
    train_loss = 0
    for input_ids, attention_mask, target in train_loader:
        optimizer.zero_grad()

        input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # compare the whole batch, not only its first element, so the loop stays
        # correct if the batch size is raised above 1; for a batch of one this
        # is the same value as before
        loss = criterion(output, target.reshape(output.shape).type_as(output))
        loss.backward()
        optimizer.step()
        scheduler.step()  # stepped per batch
        train_loss += loss.item()

    # progressively unfreeze the encoder, starting from its last parameters
    # NOTE(review): nothing in this notebook sets requires_grad=False, so this only
    # has an effect if RobertaRegressor freezes the encoder itself — confirm
    if unfreeze_at and epoch >= unfreeze_at and unfrozen_count < len(roberta_params):
        # a step of at least 1: with a step of 0, `roberta_params[-0:]` would
        # select every parameter at once
        unfrozen_count += max(1, int(len(roberta_params) * unfreeze_percent))
        print(f"Unfreezing params at epoch {epoch}")
        for param in roberta_params[-unfrozen_count:]:
            param.requires_grad = True

    # log every `log_every` epochs and always on the last one
    if (epoch % log_every == 0 or (epoch == epochs - 1)):
        print(f"Training loss is {train_loss/len(train_loader)}")
        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print("Epoch {} complete! Validation Loss : {}".format(epoch+1, val_loss))
        loss_dict = {
            "epoch":epoch,
            "train":train_loss/len(train_loader),
            "validation":val_loss,
            "lr":optimizer.param_groups[0]['lr']
        }
        loss_per_epoch.append(loss_dict)
        wandb.log(loss_dict)

In [None]:
# plot
import matplotlib.pyplot as plt
# one row per logged epoch, with epochs shown 1-based
loss_df = pd.DataFrame(loss_per_epoch).assign(epoch=lambda df: df.epoch + 1).drop_duplicates(subset=['epoch'])
# NOTE(review): epoch 20 is excluded from the plot; the reason is not visible here — confirm
ax = loss_df.loc[loss_df.epoch!=20, ['epoch','train','validation']].set_index('epoch').plot()
# the criterion used for training and validation is MSELoss, so label the axis accordingly
ax.grid(); ax.set(ylabel="Loss (MSE)");
ax.set_xticks(loss_df[loss_df.epoch!=20].epoch.values)
# ax.vlines(unfreeze_at,0.85,1.25,'r','dashed', alpha=0.7, label="weight unfreeze")
plt.legend()
plt.show()

In [None]:
# save the model
# import pickle
# pickle.dump(model.cpu(), open(f'/content/drive/MyDrive/Master/NLP/roberta-base-bne-finetuned-simple-reg-final-{epoch+1}.pkl', "wb"))

In [None]:
from typing import Tuple
# model = model.cpu()
# use the GPU when one is visible, otherwise fall back to the CPU instead of failing
device = "cuda" if torch.cuda.is_available() else "cpu"

def predict(text:str, model, tokenizer, device) -> torch.Tensor:
    """Run `model` on a single text and return its squeezed, detached output.

    Parameters
    ----------
    text : text to score
    model : callable taking `(input_ids, attention_mask)` and returning a tensor
    tokenizer : object whose `encode_plus` returns a mapping with the keys
        `input_ids` and `attention_mask`
    device : device the inputs are moved to; the returned tensor stays on it

    Returns
    -------
    torch.Tensor
        The model output with all size-1 dimensions removed.
    """
    encoding = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt')
    # look the tensors up by key rather than relying on the order of `.values()`
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # predict in eval mode, then restore whatever mode the model was in
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        # no gradients are needed for inference; avoids building the autograd graph
        with torch.no_grad():
            output = model(input_ids, attention_mask).squeeze()
    finally:
        if was_training:
            model.train()
    return output.detach()

def label_metrics(score_fun, y_true, y_pred):
    """Apply `score_fun` to `(y_true, y_pred)` and return whatever it returns."""
    score = score_fun(y_true, y_pred)
    return score

In [None]:
# attach the target built from the `b_*` column(s) to the validation frame
val_df = val_df.assign(label=lambda df: df.filter(regex='^b_').values.tolist())
# use the GPU when one is visible, otherwise fall back to the CPU instead of failing
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
val_predictions = val_df.message.apply(lambda x: predict(x, model, tokenizer, device=device))
# The predictions were computed on `val_df`, so they must be attached to `val_df`:
# assigning them to `test_df` fails on the row count and would leave no `label`
# column to score against.
results_df = val_df.assign(
    predicted_raw=val_predictions.apply(lambda x: x.cpu().numpy().round(2)).tolist()
)
results_df.head(3)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
# score the predictions stored in `predicted_raw` (the only prediction column
# `results_df` has) against the labels
y_true = results_df.label.tolist()
y_pred = np.asarray(results_df.predicted_raw.tolist(), dtype=float)
r2 = r2_score(y_true, y_pred)
# take the square root explicitly: the `squared=False` argument is deprecated and
# removed in recent scikit-learn releases (identical result for a single output)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"R2 score is {r2}")
print(f"RMSE score is {rmse}")

R2 score is 0.049733651435731
RMSE score is 0.21255671787144495


In [None]:
# inference example with new messages
text = "como puedo ayudarte? | estas bien?"
# reuse the `predict` helper instead of repeating tokenization and the forward pass inline
out = predict(text, model, tokenizer, device).cpu().numpy()
out

array(-0.01911618, dtype=float32)