We'll fine-tune a pre-trained RoBERTa model from HuggingFace's transformers library to perform multi-output regression on this task's data. We'll leverage `BertForSequenceClassification` and `Trainer` from the `transformers` library to do so. We'll also use the `datasets` library to load the data and `sklearn` to evaluate the model. 


In [None]:
!pip install transformers sentence-transformers accelerate datasets

In [2]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset

!huggingface-cli login --token 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
task = 'd'
ds = load_dataset(f'nlpUc3mStudents/mental-risk-{task}')
# to pandas
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()
label_names = train_df.iloc[:,4:].columns.tolist()
# concat messages by subject id
train_by_subjectid = (
    train_df.groupby('subject_id')
    .agg({'message': lambda x: ' | '.join(x), **{col: 'first' for col in label_names}})
    .reset_index()
)
test_by_subjectid = (
    test_df.groupby('subject_id')
    .agg({'message': lambda x: ' | '.join(x), **{col: 'first' for col in label_names}})
    .reset_index()
)

data = pd.concat([train_by_subjectid, test_by_subjectid], axis=0)
dataset = Dataset.from_pandas(data)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful




  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# get the test data
import requests, os
url_endpoint = "http://s3-ceatic.ujaen.es:8036" 
token = "c90775da4a16e3d5ed1cda75fa9d53424dc7ba78" 
task_number = '2'
subtasks = 'task2a,task2b,task2c,task2d'.split(',')
setname = 'test'

test_dir = f"data/{setname}"
truth = test_dir+"/golden_truth"
os.makedirs(truth,exist_ok=True)

url = f'{url_endpoint}/task{task_number}/download_{setname}/{token}'
subtasks_template = '{endpoint}/{task}/download_{setname}/{token}'
for task in subtasks:
  subtask_url = subtasks_template.format(endpoint=url_endpoint, task=task, token=token, setname=setname)
  resp = requests.get(subtask_url)
  with open(f'{truth}/task{task_number}_gold_{task[-1]}.txt', 'wb') as f:
    f.write(resp.content)
  
zip_url = f'{url_endpoint}/task{task_number}/download_{setname}/{token}'
resp = requests.get(zip_url)
with open(f'{test_dir}/task_{task_number}.zip', 'wb') as f:
    f.write(resp.content)

In [4]:
!unzip data/test/task_2.zip -d data/test/

Archive:  data/test/task_2.zip
replace data/test/round_1.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [5]:
# load test data
import pandas as pd
import json, glob, os

test_dir = 'data/test'
truth_dir = 'golden_truth'
test_data_paths = glob.glob('data/test/*.json')
test_data = []
for path in test_data_paths:
    round_data = json.load(open(path))
    test_data = test_data + round_data

test_df = pd.DataFrame.from_records(test_data).rename(columns={'nick': 'subject_id'})

for task in ['a', 'b', 'c', 'd']:
    truth_path = os.path.join(test_dir, truth_dir, 'task2_gold_{}.txt'.format(task))
    truth = pd.read_csv(truth_path).rename(columns={'Subject': 'subject_id'}).rename(columns={'Subject': 'subject_id'})
    test_df = pd.merge(test_df, truth, on='subject_id')
    test_df = test_df.rename(columns={col: f'{task}_{col}' for col in truth.columns.drop('subject_id')})

test_df = test_df.groupby('subject_id').agg({
    'message': lambda x: ' | '.join(x),
    **{col: 'first' for col in test_df.columns.drop(['subject_id', 'message'])}
}).reset_index()
test_df

Unnamed: 0,subject_id,message,id_message,round,date,a_label,b_label,c_label,d_suffer_in_favour,d_suffer_against,d_suffer_other,d_control
0,subject184,Ver cómo todo se cae a pedazos y a nadie le in...,25068367545,16,2022-05-23 07:54:47,1,0.7,suffer+in favour,0.5,0.2,0.0,0.3
1,subject185,Dios dicen que es buen piadoso y amoroso yo no...,38588707957,16,2020-07-22 22:41:08,1,1.0,suffer+in favour,1.0,0.0,0.0,0.0
2,subject186,Y no que mas me deprime es ver a mi ex llorand...,69766610192,16,2020-09-13 05:24:35,1,0.8,suffer+in favour,0.7,0.0,0.1,0.2
3,subject188,Por ejemplo las chicas de mi edad | siento que...,88987355467,16,2022-03-20 20:23:01,1,1.0,suffer+in favour,1.0,0.0,0.0,0.0
4,subject190,Pero no quiero dejar de amarla | He intentado ...,6779952067,16,2021-10-24 09:02:52,1,1.0,suffer+in favour,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
144,subject86,Si no analiza bien la situación no vas a ser g...,98888280801,16,2021-10-22 22:28:39,0,0.2,control,0.1,0.1,0.0,0.8
145,subject9,No sé digo lo que puede ser | La vd no quiere ...,63679281088,16,2021-11-09 20:20:21,0,0.2,control,0.2,0.0,0.0,0.8
146,subject91,Para eso te conviene no moverte | El sueldo no...,6023021908,16,2022-03-11 08:11:56,0,0.2,control,0.0,0.1,0.1,0.8
147,subject92,Pero podemos llamar a un italiano ! | Yo solo ...,87651939898,16,2020-10-09 23:42:31,0,0.0,control,0.0,0.0,0.0,1.0


In [6]:
import numpy as np
import pandas as pd
import torch
from transformers import (
    BertTokenizer, BertForSequenceClassification, 
    AdamW, BertConfig, Trainer, TrainingArguments,
    RobertaForSequenceClassification, RobertaTokenizer,
    RobertaPreTrainedModel, BertPreTrainedModel,
)
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [7]:
# config
from transformers import AutoConfig, AutoTokenizer
from roberta_regressor import RobertaRegressor, multi_reg_loss, train

model_name = "PlanTL-GOB-ES/roberta-base-bne"
model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
model = RobertaPreTrainedModel.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaRegressor.from_pretrained(model_name, num_outputs=4)

criterion = multi_reg_loss(sum_diff_penalty=0)
optimizer = AdamW(params=model.parameters(), lr=1e-3)

Some weights of the model checkpoint at hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es were not used when initializing RobertaPreTrainedModel: ['roberta.encoder.layer.10.attention.self.query.bias', 'roberta.encoder.layer.7.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.dense.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.10.output.dense.bias', 'roberta.encoder.layer.2.attention.self.key.bias', 'roberta.encoder.layer.8.attention.output.dense.bias', 'roberta.encoder.layer.4.output.dense.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.5.output.dense.weight', 'roberta.encoder.layer.3.attention.self.key.bias', 'roberta.encoder.layer.5.attention.self.key.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.self.value.bias', 'rob

In [8]:
# prepare the data
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class MentalRiskDataset(Dataset):

    def __init__(self, data: pd.DataFrame, tokenizer, max_len: int = 1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row.message
        label = torch.tensor(data_row.label)
        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
        if len(tokens) > self.max_len - 2:
            tokens = tokens[:self.max_len - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.max_len:
            tokens = tokens + ['[PAD]' for _ in range(self.max_len - len(tokens))] 
        else:
            tokens = tokens[:self.max_len-1] + ['[SEP]'] 
        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0) 
        # pooling layer
        
        attention_mask = (input_ids != 0).long()
        return input_ids, attention_mask, label


In [9]:
train_df = data.assign(
    label=lambda df: df[label_names].values.tolist()
)[['message', 'label']]
val_df = test_df.assign(
    label=lambda df: df[df.columns[df.columns.str.startswith('d_')]].values.tolist()
)[['message', 'label']]

MAX_LEN = 512

train_dataset = MentalRiskDataset(train_df, tokenizer, max_len=MAX_LEN)
# test_dataset = MentalRiskDataset(test_df, tokenizer)
val_dataset = MentalRiskDataset(val_df.sample(60), tokenizer, max_len=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
device = "cuda"
model.cuda()
input_ids, attention_mask, target = train_dataset.__getitem__(3)
input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
output = model(input_ids=input_ids, attention_mask=attention_mask).squeeze().to(device)
loss = criterion(output, target)

### Train the model:

In [11]:
from roberta_regressor import evaluate
from tqdm import trange
epochs = 15
device='cuda'
for epoch in trange(epochs, desc="Epoch"):
    model.train()
    train_loss = 0
    for i, (input_ids, attention_mask, target) in enumerate(iterable=train_dataset):
        optimizer.zero_grad()  
        
        input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
        
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # out = model.classifier(output)
        loss = criterion(output.squeeze(), target.type_as(output))
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
      
    if (epoch%5 == 0):
      print(f"Training loss is {train_loss/len(train_loader)}")
      val_loss = evaluate(model=model, criterion=criterion, dataloader=val_dataset, device=device)
      print("Epoch {} complete! Validation Loss : {}".format(epoch+1, val_loss))

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Training loss is 3.4810999775848663


Epoch:   7%|▋         | 1/15 [00:24<05:48, 24.93s/it]

Epoch 1 complete! Validation Loss : 0.07505334735227127


Epoch:  33%|███▎      | 5/15 [01:58<03:55, 23.57s/it]

Training loss is 2.19557720151109


Epoch:  40%|████      | 6/15 [02:24<03:37, 24.14s/it]

Epoch 6 complete! Validation Loss : 0.06892567244746411


Epoch:  67%|██████▋   | 10/15 [03:56<01:56, 23.32s/it]

Training loss is 2.1615504201423996


Epoch:  73%|███████▎  | 11/15 [04:22<01:36, 24.01s/it]

Epoch 11 complete! Validation Loss : 0.06891087817493827


Epoch:  87%|████████▋ | 13/15 [05:20<00:49, 24.67s/it]


In [13]:
# save the model
import pickle
pickle.dump(model.cpu(), open('/content/drive/MyDrive/Master/NLP/roberta-base-bne-finetuned.pickle', "wb"))

In [None]:
# load the model
import pickle
model = pickle.load(open('roberta-base-bne-finetuned.pickle', "rb"))
model

RobertaRegressor(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [None]:
from typing import Sequence, Tuple
model = model.cpu()
device="cpu"
def predict(model, text:str, device) -> Tuple[float, float, float, float]:
    input_ids, attention_mask = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt').values()
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
    output = model(input_ids, attention_mask).squeeze()
    return output/sum(output) # to normalize it
val_df_sampled = val_df.sample(30)
val_predictions = val_df_sampled.message.apply(lambda x: predict(model, x, device=device))

In [None]:
val_df

Unnamed: 0,message,label
0,Ver cómo todo se cae a pedazos y a nadie le in...,"[0.5, 0.2, 0.0, 0.3]"
1,Dios dicen que es buen piadoso y amoroso yo no...,"[1.0, 0.0, 0.0, 0.0]"
2,Y no que mas me deprime es ver a mi ex llorand...,"[0.7, 0.0, 0.1, 0.2]"
3,Por ejemplo las chicas de mi edad | siento que...,"[1.0, 0.0, 0.0, 0.0]"
4,Pero no quiero dejar de amarla | He intentado ...,"[1.0, 0.0, 0.0, 0.0]"
...,...,...
144,Si no analiza bien la situación no vas a ser g...,"[0.1, 0.1, 0.0, 0.8]"
145,No sé digo lo que puede ser | La vd no quiere ...,"[0.2, 0.0, 0.0, 0.8]"
146,Para eso te conviene no moverte | El sueldo no...,"[0.0, 0.1, 0.1, 0.8]"
147,Pero podemos llamar a un italiano ! | Yo solo ...,"[0.0, 0.0, 0.0, 1.0]"
