In [9]:
import os, glob, json, sys
import pickle
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

from src import data, utils
from src.roberta_regressor import RobertaRegressor

sys.path.append('src')

In [17]:
# load the fine-tuned model and its tokenizer
with open('models/roberta-base-bne-finetuned-simple-reg-final-20.pkl', 'rb') as f:
    model = pickle.load(f)

model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "mps"
model.to(device)
# predict = utils.make_predict(model.predict, tokenizer=tokenizer, device=device)

method = "fine-tuning"


In [3]:
#load the embeddings regression model and its sentece embeddings
from sentence_transformers import SentenceTransformer
from src.embeddings import EmbeddingsRegressor

regressor_file = 'models/2b_ridge_roberta-suicide-simple-regression-final.pkl'
model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
tokenizer = SentenceTransformer(model_name)
with open(regressor_file, 'rb') as f:
    regressor = pickle.load(f)
model = EmbeddingsRegressor(tokenizer, regressor, normalize_output=False)
# make the prediction function
predict = utils.make_predict(model.predict)
method = "embeddings"

No sentence-transformers model found with name /Users/simon/.cache/torch/sentence_transformers/hackathon-somos-nlp-2023_roberta-base-bne-finetuned-suicide-es. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/simon/.cache/torch/sentence_transformers/hackathon-somos-nlp-2023_roberta-base-bne-finetuned-suicide-es were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ABSOLUTE METRICS (predicting all observations):

In [13]:
# load all the test set messages and concatenate then into a single string per subject id
test_df = data.load('test')
test_df = data.concat_messages(test_df)
# # get the predictions of each subject from the model
results_df = test_df.assign(
    label = lambda df: df.filter(regex="^b_").iloc[:,0].values,
)
results_df['predicted'] = predict(test_df.message.tolist()).tolist()
results_df.head()

Unnamed: 0,subject_id,message,round,id_message,date,d_suffer_in_favour,d_suffer_against,d_suffer_other,d_control,a_label,b_label,c_label,label,predicted
0,subject184,"Un poco cansado , al menos me distraje al hace...",22,5086784470,2022-05-23 06:33:17,0.5,0.2,0.0,0.3,1,0.7,suffer+in favour,0.7,0.552381
1,subject185,No c como decirlo | Pero tengo deseos de suici...,26,10923780185,2020-07-22 22:17:47,1.0,0.0,0.0,0.0,1,1.0,suffer+in favour,1.0,1.0
2,subject186,Es la primera vez que ingreso a un grupo pero ...,17,38934710467,2020-09-13 05:10:17,0.7,0.0,0.1,0.2,1,0.8,suffer+in favour,0.8,1.0
3,subject188,últimamente he pensando en suicidarme | Pero s...,33,69981614487,2022-03-20 20:12:26,1.0,0.0,0.0,0.0,1,1.0,suffer+in favour,1.0,1.0
4,subject190,Es la unica persona que estuvo conmigo mientra...,44,19009535426,2021-10-24 08:48:22,1.0,0.0,0.0,0.0,1,1.0,suffer+in favour,1.0,1.0


In [14]:
preds_df = results_df.assign(
    a_pred=lambda df: df.predicted>0.5,
    b_pred=lambda df: np.clip(df.predicted.values, 0, 1)
)[['subject_id', 'a_pred', 'b_pred']]
preds_df.to_csv(f'data/b_{method}_test_predictions.csv', index=False)

In [16]:
f'data/{method}_test_predictions.csv'

'data/b_b_fine-tuning_test_predictions.csv'

In [30]:
from src.eval import ClassificationReport, RegressionReport

task_b_preds = results_df.predicted.apply(np.round,args=(5,))
task_a_preds = (task_b_preds > 0.5).astype(int)

taska_results = ClassificationReport.make_report(test_df['a_label'], task_a_preds)
taskb_results = RegressionReport.make_report(test_df['b_label'], task_b_preds)

print("Task A:")
display(taska_results.df.round(3))
print("Task B:")
display(taskb_results.df.round(3))
# print("Task C:")
# display(results.taskc.df.round(3))
# print("Task D:")
# display(results.taskd.df.round(3))

Task A:


Unnamed: 0,Accuracy,Precision (macro),Recall (macro),F1 (macro)
0,0.772,0.779,0.778,0.772


Task B:


Unnamed: 0,RMSE avg,R2 avg
0,0.244,0.581


EARLY-RISK METRICS (round predictions/evaluation)

In [31]:
import json, glob, os, re, tqdm
from codecarbon import EmissionsTracker
from src.class_eval import (
    ClassRegressionEvaluation,
    BinaryClassification,
    BinaryMultiClassification,
    ClassMultiRegressionEvaluation,
    Emissions
)
from src import data, eval

# load the test set
test_df = data.load('test').sort_values('round')
# carbon emissions tracker
tracker = EmissionsTracker(
        save_to_file = True,
        log_level= "INFO",
        tracking_mode= "process",
        output_dir= "reports")
# relevant columns for the tracker
relevant_cols = ['duration', 'emissions', 'cpu_energy', 'gpu_energy', 'ram_energy', 
            'energy_consumed', 'cpu_count', 'gpu_count', 'cpu_model', 'gpu_model', 'ram_total_size']

# initialize helper variables
prev_rounds_dfs = []
round_preds = {}
round_emmissions = {}
preds_dict = {}

[codecarbon INFO @ 12:02:33] [setup] RAM Tracking...
[codecarbon INFO @ 12:02:33] [setup] GPU Tracking...
[codecarbon INFO @ 12:02:33] No GPU found.
[codecarbon INFO @ 12:02:33] [setup] CPU Tracking...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[codecarbon INFO @ 12:02:33] CPU Model on constant consumption mode: Apple M1 Pro
[codecarbon INFO @ 12:02:33] >>> Tracker's metadata:
[codecarbon INFO @ 12:02:33]   Platform system: macOS-13.0-arm64-arm-64bit
[codecarbon INFO @ 12:02:33]   Python version: 3.8.16
[codecarbon INFO @ 12:02:33]   CodeCarbon version: 2.2.1
[codecarbon INFO @ 12:02:33]   Available RAM : 16.000 GB
[codecarbon INFO @ 12:02:33]   CPU count: 10
[codecarbon INFO @ 12:02:33]   CPU model: Apple M1 Pro
[codecarbon INFO @ 12:02:33]   GPU count: None
[codecarbon INFO @ 12:02:33]   GPU model: None


In [None]:
for round_num in tqdm.notebook.tqdm(sorted(test_df['round'].unique())):
    # print(f"Round {round_num} ======================")
    round_df = test_df[test_df['round'] == round_num]
    rounds_df = pd.concat(prev_rounds_dfs+[round_df], axis=0).drop_duplicates('id_message')
    rounds_df = rounds_df[rounds_df.subject_id.isin(round_df.subject_id.unique())]
    rounds_df = data.concat_messages(rounds_df)
    # make the predictions
    tracker.start()
    # pred_of_round = rounds_df.message.apply(predict).tolist()
    pred_of_round = predict(rounds_df.message.tolist()).tolist()
    # normalize on [0,1]
    task_b_preds = np.clip(np.array(pred_of_round), 0, 1)
    task_a_preds = (task_b_preds > 0.5).astype(int)
    pred_of_round_df = pd.DataFrame({'a_pred':task_a_preds, 'b_pred':task_b_preds})
    pred_of_round_df['nick'] = rounds_df['subject_id'].values
    tracker.stop()
    emissions_df = pd.read_csv('reports/emissions.csv') # pd.DataFrame([tracker.final_emissions_data])
    emissions_dict = emissions_df[relevant_cols].iloc[-1].to_dict()
    pred_of_round_df.columns = pred_of_round_df.columns.str.replace('[+|\s]', '_', regex=True).str.lower()
    pred_of_round_df['round'] = round_num
    # save the predictions and the round data
    round_preds[round_num] = pred_of_round_df
    round_emmissions[round_num] = emissions_dict
    preds_dict.update(pred_of_round_df.set_index('nick').to_dict(orient='index'))
    round_preds[round_num].to_csv(f'data/round_results/b_{method}_round_{round_num}_preds.csv', index=False)
    json.dump(round_emmissions[round_num],open(f"data/b_{method}_round_{round_num}_emissions.json",'w'))
    prev_rounds_dfs.append(round_df)

Get the metrics:

In [35]:
from src.class_eval import (
    ClassRegressionEvaluation,
    BinaryClassification,
    BinaryMultiClassification,
    ClassMultiRegressionEvaluation,
    Emissions
)
from src.eval import ClassificationReport, RegressionReport
import glob, re
import pandas as pd

method = "embeddings"
# method = "bert-fine-tuning"

# load round predictions obtained above:
round_preds = {}
round_preds_files = glob.glob(f'data/round_results/b_{method}_round_*_preds.csv')
for f in round_preds_files:
  round_num = re.search('round_(\d+)_preds.csv',f).group(1)
  round_preds[round_num] = pd.read_csv(f)#.drop(columns=['Unnamed: 0'])

  # to make a dictionary of dataframes with the predictions of each subject for all the rounds
round_preds_dict = {}
round_preds_seq_df = round_preds['1'].set_index('nick').sort_index()
for round_ in round_preds:
    # get the predictions of the last rounds
    prev_rounds_df = round_preds_seq_df.copy()
    # update the predictions with the current round
    this_round_df = round_preds[round_].set_index('nick').sort_index()
    prev_rounds_df.loc[this_round_df.index.values,this_round_df.columns] = this_round_df.values
    round_preds_dict[int(round_)] = prev_rounds_df.assign(round=int(round_)).reset_index()

round_preds_dict[100].to_csv(f'data/b_{method}_final_preds.csv', index=False)

Classification metrics:

In [36]:
def preprocess_binary_preds(binary_preds_df, pred_col='pred'):
  """ 
  preprocess the binary predictions dataframe to only keep the first round
  when each user was labeled as positive (if any)
  """
  results_dicts = []
  for nick, nick_df in binary_preds_df.groupby('nick'):
    df = nick_df.sort_values(['round'])
    # get the instance when this user was labeled as positive for the first time 
    result_df = df[df[pred_col]==1]
    if len(result_df) > 0: # if labeled positive in any round
      results_dicts.append(result_df.iloc[0].to_dict())
    else: # otherwise get the results of the last round
      results_dicts.append(df.iloc[-1].to_dict())
  return pd.DataFrame(results_dicts)


# predictions of each round
round_preds_df = pd.concat(round_preds.values()).sort_values(['nick','round'])

# binary classification
binary_preds_df = round_preds_df[['round','nick','a_pred']].rename(columns={'a_pred':'pred'})
binary_preds_df = preprocess_binary_preds(binary_preds_df)
binary_class = BinaryClassification(
    '2',  data=binary_preds_df, qrels='data/test/golden_truth/task2_gold_a.txt'
  )

# binary regression
binary_reg_df = round_preds_df[['round','nick','b_pred']].rename(columns={'b_pred':'pred'})
binary_reg_std = ClassRegressionEvaluation('2',  # standard regression metrics
                                       data=binary_reg_df.groupby(['nick']).last().reset_index(), 
                                       qrels='data/test/golden_truth/task2_gold_b.txt')
binary_reg_preds_dict = {k:v[['round','nick','b_pred']].rename(columns={'b_pred':'pred'}) for k,v in round_preds_dict.items()}
binary_reg_rank = ClassRegressionEvaluation('2',  # rank-based regression metrics
                                        data=binary_reg_preds_dict,
                                        qrels='data/test/golden_truth/task2_gold_b.txt')



149 lines read in qrels file!



149 lines read in qrels file!



149 lines read in qrels file!




In [37]:
print("Task A:")
task_a_metrics = binary_class.eval_performance()
task_a_metrics_df = pd.DataFrame([task_a_metrics]).round(3)
display(task_a_metrics_df)
print("Task B:")
task_b_metrics_std = binary_reg_std.eval_performance()
task_b_metrics_rank = binary_reg_rank.eval_performance_rank_based()
task_b_metrics_df_std = pd.DataFrame([task_b_metrics_std]).round(3)
display(task_b_metrics_df_std)
print('Rank-based metrics:')
display(pd.DataFrame(task_b_metrics_rank).T.rename_axis('round').round(3))

# save the results
# os.makedirs(f'reports/{method}', exist_ok=True)
json.dump(task_a_metrics, open(f'reports/b_embeddings_task_a_metrics.json', 'w'), indent=2)
json.dump(task_b_metrics_std, open(f'reports/b_embeddings_task_b_std_metrics.json', 'w'), indent=2)
json.dump(task_b_metrics_rank, open(f'reports/b_embeddings_tasl_b_rank_metrics.json', 'w'), indent=2)

Task A:
DECISION-BASED EVALUATION:
Accuracy:0.6912751677852349
Macro precision:0.7125090777051561
Macro recall:0.7643405600722675
Macro f1:0.6807935916542474
Micro precision:0.6912751677852349
Micro recall:0.6912751677852349
Micro f1:0.6912751677852349
ERDE_5:0.2860443554898514
ERDE_50:0.1518400072068826
Median latency:3.0
Speed:0.9821019115346772
latency-weightedF1:0.7254161846562956


Unnamed: 0,Accuracy,Macro_P,Macro_R,Macro_F1,Micro_P,Micro_R,Micro_F1,ERDE5,ERDE30,ERDE50,latencyTP,speed,latency-weightedF1
0,0.691,0.713,0.764,0.681,0.691,0.691,0.691,0.286,0.02,0.152,3.0,0.982,0.725


Task B:
RMSE:0.24366801978689823
Pearson correlation coefficient:0.7784207435974818
RANK-BASED EVALUATION:
Analizing ranking at round 1
P@5:0.6
P@10:0.7
P@20:0.55
P@30:0.36666666666666664
P@50:0.22
Analizing ranking at round 25
P@5:0.8
P@10:0.8
P@20:0.45
P@30:0.36666666666666664
P@50:0.24
Analizing ranking at round 50
P@5:0.8
P@10:0.7
P@20:0.55
P@30:0.36666666666666664
P@50:0.22
Analizing ranking at round 75
P@5:0.8
P@10:0.6
P@20:0.55
P@30:0.36666666666666664
P@50:0.22


Unnamed: 0,RMSE:,Pearson_coefficient
0,0.244,0.778


Rank-based metrics:


Unnamed: 0_level_0,@5,@10,@20,@30,@50
round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.6,0.7,0.55,0.367,0.22
25,0.8,0.8,0.45,0.367,0.24
50,0.8,0.7,0.55,0.367,0.22
75,0.8,0.6,0.55,0.367,0.22
