In [1]:
import os, glob, json, sys
import pickle
import pandas as pd
import numpy as np

from src import data, utils, eval
from typing import List

sys.path.append('src')

In [7]:
# load the fine-tuned model and its tokenizer
from transformers import AutoTokenizer

tokenizer_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
with open('models/roberta-base-bne-finetuned.final-30.pkl', 'rb') as f:
    model = pickle.load(f)
# make the prediction function
device = "mps"
model.to(device)
# predict = utils.make_predict(model.predict, tokenizer=tokenizer, device=device)
def predict(msgs:List[str]):
    predictions = np.zeros(len(msgs)).tolist()
    for i,msg in enumerate(msgs):
        predictions[i] = model.predict(msg, tokenizer=tokenizer, device=device)
    return np.array(predictions)
method = "fine-tuning"

In [10]:
#load the embeddings regression model and its sentece embeddings
from sentence_transformers import SentenceTransformer
from src.embeddings import EmbeddingsRegressor

regressor_file = 'models/ridge_regressor-final.pkl'
regressor_file = 'models/2d_ridge_roberta-suicide-regchain-pca-final.pkl'
model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
tokenizer = SentenceTransformer(model_name)
with open(regressor_file, 'rb') as f:
    regressor = pickle.load(f)
model = EmbeddingsRegressor(tokenizer, regressor, normalize_output=True)
# make the prediction function
predict = utils.make_predict(model.predict)
# method = "embeddings_multireg"
method = "embeddings_chain-pca"

No sentence-transformers model found with name /Users/simon/.cache/torch/sentence_transformers/hackathon-somos-nlp-2023_roberta-base-bne-finetuned-suicide-es. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/simon/.cache/torch/sentence_transformers/hackathon-somos-nlp-2023_roberta-base-bne-finetuned-suicide-es were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ABSOLUTE METRICS (predicting all observations):

In [None]:
# load all the test set messages and concatenate then into a single string per subject id
test_df = data.load('test')
test_df = data.concat_messages(test_df)
# get the predictions of each subject from the model
results_df = test_df.assign(
    label = lambda df: df.filter(regex="^d_").values.tolist(),
)
results_df['predicted'] = predict(test_df.message.tolist()).tolist()
# normalize
# results_df['predicted'] = results_df['predicted'].apply(lambda x: (np.array(x)/sum(x)).tolist())
results_df['predicted'] = utils.normalize(np.array(results_df.predicted.tolist())).tolist()
results_df.head()

In [9]:
predictions = results_df.predicted.apply(np.round,args=(5,)).tolist()
preds_df = data.make_task_labels_from_d(predictions, include_d=True).rename(
    columns={c:'d_'+c.replace('+','_').replace('|','_') for c in data.task_d_cols}
)
preds_df.to_csv(f'data/d_{method}_test_predictions.csv', index=False)
results = eval.absolute_results(test_df, preds_df)
print("Task A:")
display(results.taska.df.round(3))
print("Task B:")
display(results.taskb.df.round(3))
print("Task C:")
display(results.taskc.df.round(3))
print("Task D:")
display(results.taskd.df.round(3))

Task A:


Unnamed: 0,Accuracy,Precision (macro),Recall (macro),F1 (macro)
0,0.711,0.753,0.727,0.707


Task B:


Unnamed: 0,RMSE avg,R2 avg
0,0.304,0.349


Task C:


Unnamed: 0,Accuracy,Precision (macro),Recall (macro),F1 (macro)
0,0.577,0.439,0.468,0.431


Task D:


Unnamed: 0,RMSE avg,R2 avg,R2 control,R2 suffer_against,R2 suffer_in_favour,R2 suffer_other,RMSE control,RMSE suffer_against,RMSE suffer_in_favour,RMSE suffer_other
0,0.222,0.006,0.349,-0.144,0.358,-0.538,0.304,0.23,0.212,0.143


EARLY-RISK METRICS (round predictions/evaluation)

In [11]:
import json, glob, os, re, tqdm
from codecarbon import EmissionsTracker
from src.class_eval import (
    ClassRegressionEvaluation,
    BinaryClassification,
    BinaryMultiClassification,
    ClassMultiRegressionEvaluation,
    Emissions
)

# load the test set
test_df = data.load('test').sort_values('round')
# carbon emissions tracker
tracker = EmissionsTracker(
        save_to_file = True,
        log_level= "INFO",
        tracking_mode= "process",
        output_dir= "reports")
# relevant columns for the tracker
relevant_cols = ['duration', 'emissions', 'cpu_energy', 'gpu_energy', 'ram_energy', 
            'energy_consumed', 'cpu_count', 'gpu_count', 'cpu_model', 'gpu_model', 'ram_total_size']

# initialize helper variables
prev_rounds_dfs = []
round_preds = {}
round_emmissions = {}
preds_dict = {}

[codecarbon INFO @ 12:52:19] [setup] RAM Tracking...
[codecarbon INFO @ 12:52:19] [setup] GPU Tracking...
[codecarbon INFO @ 12:52:19] No GPU found.
[codecarbon INFO @ 12:52:19] [setup] CPU Tracking...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[codecarbon INFO @ 12:52:19] CPU Model on constant consumption mode: Apple M1 Pro
[codecarbon INFO @ 12:52:19] >>> Tracker's metadata:
[codecarbon INFO @ 12:52:19]   Platform system: macOS-13.0-arm64-arm-64bit
[codecarbon INFO @ 12:52:19]   Python version: 3.8.16
[codecarbon INFO @ 12:52:19]   CodeCarbon version: 2.2.1
[codecarbon INFO @ 12:52:19]   Available RAM : 16.000 GB
[codecarbon INFO @ 12:52:19]   CPU count: 10
[codecarbon INFO @ 12:52:19]   CPU model: Apple M1 Pro
[codecarbon INFO @ 12:52:19]   GPU count: None
[codecarbon INFO @ 12:52:19]   GPU model: None


In [None]:
os.makedirs(f'data/round_results/{method}', exist_ok=True)
for round_num in tqdm.notebook.tqdm(sorted(test_df['round'].unique())):
    # print(f"Round {round_num} ======================")
    round_df = test_df[test_df['round'] == round_num]
    rounds_df = pd.concat(prev_rounds_dfs+[round_df], axis=0).drop_duplicates('id_message')
    rounds_df = rounds_df[rounds_df.subject_id.isin(round_df.subject_id.unique())]
    rounds_df = data.concat_messages(rounds_df)
    # make the predictions
    tracker.start()
    # pred_of_round = rounds_df.message.apply(predict).tolist()
    pred_of_round = predict(rounds_df.message.tolist()).tolist()
    # normalize on [0,1]
    pred_of_round = np.array(pred_of_round).reshape(-1,4)
    pred_of_round = pred_of_round / pred_of_round.sum(axis=1, keepdims=True)
    pred_of_round_df = data.make_task_labels_from_d(pred_of_round.tolist(), include_d=True)
    pred_of_round_df['nick'] = rounds_df['subject_id'].values
    tracker.stop()
    emissions_df = pd.read_csv('reports/emissions.csv') # pd.DataFrame([tracker.final_emissions_data])
    emissions_dict = emissions_df[relevant_cols].iloc[-1].to_dict()
    pred_of_round_df.columns = pred_of_round_df.columns.str.replace('[+|\s]', '_', regex=True).str.lower()
    pred_of_round_df['round'] = round_num
    # save the predictions and the round data
    round_preds[round_num] = pred_of_round_df
    round_emmissions[round_num] = emissions_dict
    preds_dict.update(pred_of_round_df.set_index('nick').to_dict(orient='index'))
    round_preds[round_num].to_csv(f'data/round_results/{method}/round_{round_num}_preds.csv', index=False)
    json.dump(round_emmissions[round_num],open(f"data/round_results/{method}/round_{round_num}_emissions.json",'w'))
    prev_rounds_dfs.append(round_df)

Get the metrics:

In [15]:
from src.class_eval import (
    ClassRegressionEvaluation,
    BinaryClassification,
    BinaryMultiClassification,
    ClassMultiRegressionEvaluation,
    Emissions
)
import glob, re
import pandas as pd

# method = "embeddings"
# method = "bert-fine-tuning"

# load round predictions obtained above:
round_preds = {}
round_preds_files = glob.glob(f'data/round_results/{method}/round_*_preds.csv')
for f in round_preds_files:
  round_num = re.search('round_(\d+)_preds.csv',f).group(1)
  round_preds[round_num] = pd.read_csv(f)#.drop(columns=['Unnamed: 0'])

  # to make a dictionary of dataframes with the predictions of each subject for all the rounds
round_preds_dict = {}
round_preds_seq_df = round_preds['1'].set_index('nick').sort_index()
for round_ in round_preds:
    # get the predictions of the last rounds
    prev_rounds_df = round_preds_seq_df.copy()
    # update the predictions with the current round
    this_round_df = round_preds[round_].set_index('nick').sort_index()
    prev_rounds_df.loc[this_round_df.index.values,this_round_df.columns] = this_round_df.values
    round_preds_dict[int(round_)] = prev_rounds_df.assign(round=int(round_)).reset_index()

Classification metrics:

In [16]:
def preprocess_binary_preds(binary_preds_df, pred_col='pred'):
  """ 
  preprocess the binary predictions dataframe to only keep the first round
  when each user was labeled as positive (if any)
  """
  results_dicts = []
  for nick, nick_df in binary_preds_df.groupby('nick'):
    df = nick_df.sort_values(['round'])
    # get the instance when this user was labeled as positive for the first time 
    result_df = df[df[pred_col]==1]
    if len(result_df) > 0: # if labeled positive in any round
      results_dicts.append(result_df.iloc[0].to_dict())
    else: # otherwise get the results of the last round
      results_dicts.append(df.iloc[-1].to_dict())
  return pd.DataFrame(results_dicts)


# predictions of each round
round_preds_df = pd.concat(round_preds.values()).sort_values(['nick','round'])

# binary classification
binary_preds_df = round_preds_df[['round','nick','a_label']].rename(columns={'a_label':'pred'})
binary_preds_df = preprocess_binary_preds(binary_preds_df)
binary_class = BinaryClassification(
    '2',  data=binary_preds_df, qrels='data/test/golden_truth/task2_gold_a.txt'
  )

# binary regression
binary_reg_df = round_preds_df[['round','nick','b_label']].rename(columns={'b_label':'pred'})
binary_reg_std = ClassRegressionEvaluation('2',  # standard regression metrics
                                       data=binary_reg_df.groupby(['nick']).last().reset_index(), 
                                       qrels='data/test/golden_truth/task2_gold_b.txt')
binary_reg_preds_dict = {k:v[['round','nick','b_label']].rename(columns={'b_label':'pred'}) for k,v in round_preds_dict.items()}
binary_reg_rank = ClassRegressionEvaluation('2',  # rank-based regression metrics
                                        data=binary_reg_preds_dict,
                                        qrels='data/test/golden_truth/task2_gold_b.txt')

# multi-class classification
multi_class_preds_df = round_preds_df[['round','nick','c_label']].rename(columns={'c_label':'pred'})
# make the  binary predictions (1 if suffer, 0 otherwise)
multi_class_preds_df = multi_class_preds_df.assign(pred_b = lambda df: df.pred.str.contains('suffer').astype(int))
multi_class_preds_df = preprocess_binary_preds(multi_class_preds_df,pred_col='pred_b')
multiclass = BinaryMultiClassification(
    '2',  data=multi_class_preds_df,  qrels='data/test/golden_truth/task2_gold_c.txt'
)
del multiclass.qrels_multiclass['subject51'] # this user is not in the test set gold labels

# multi-regression
multi_reg_df = round_preds_df.set_index(['round','nick'])[['suffer_in_favour','suffer_against','suffer_other','control']].assign(
  pred=lambda df: df.values.tolist()
)
multi_reg_preds_dict = {}
for round_ in round_preds_dict:
  multi_reg_preds_dict[round_] = round_preds_dict[round_].set_index(['nick'])[['suffer_in_favour','suffer_against','suffer_other','control']].assign(
    pred=lambda df: df.values.tolist()
  )
multi_reg = ClassMultiRegressionEvaluation('2',  # standard regression metrics
                                        data=multi_reg_df.groupby(['nick']).last().reset_index(),
                                        qrels='data/test/golden_truth/task2_gold_d.txt')
multi_reg_rank = ClassMultiRegressionEvaluation('2',  # rank-based regression metrics
                                        data=multi_reg_preds_dict,
                                        qrels='data/test/golden_truth/task2_gold_d.txt')



149 lines read in qrels file!



149 lines read in qrels file!



149 lines read in qrels file!



150 lines read in qrels file!



149 lines read in qrels file!



149 lines read in qrels file!




In [19]:
print("Task A:")
task_a_metrics = binary_class.eval_performance()
task_a_metrics_df = pd.DataFrame([task_a_metrics]).round(3)
display(task_a_metrics_df)
print("Task B:")
task_b_metrics_std = binary_reg_std.eval_performance()
task_b_metrics_rank = binary_reg_rank.eval_performance_rank_based()
task_b_metrics_df_std = pd.DataFrame([task_b_metrics_std]).round(3)
display(task_b_metrics_df_std)
print('Rank-based metrics:')
display(pd.DataFrame(task_b_metrics_rank).T.rename_axis('round').round(3))
print("Task C:")
task_c_metrics = multiclass.eval_performance()
task_c_metrics_df = pd.DataFrame([task_c_metrics]).round(3)
display(task_c_metrics_df)
print("Task D:")
task_d_metrics_std = multi_reg.eval_performance()
task_d_metrics_rank = multi_reg_rank.eval_performance_rank_based()
task_d_metrics_df_std = pd.DataFrame([task_d_metrics_std]).round(3)
display(task_d_metrics_df_std)
print('Rank-based metrics:')
display(pd.DataFrame(task_d_metrics_rank).T.rename_axis('round').round(3))

# save the results
os.makedirs(f'reports/d_{method}', exist_ok=True)
json.dump(task_a_metrics, open(f'reports/d_{method}/task_a_metrics.json', 'w'), indent=2)
json.dump(task_b_metrics_std, open(f'reports/d_{method}/task_b_std_metrics.json', 'w'), indent=2)
json.dump(task_b_metrics_rank, open(f'reports/d_{method}/task_b_rank_metrics.json', 'w'), indent=2)
json.dump(task_c_metrics, open(f'reports/d_{method}/task_c_metrics.json', 'w'), indent=2)
json.dump(task_d_metrics_std, open(f'reports/d_{method}/task_d_std_metrics.json', 'w'), indent=2)
json.dump(task_d_metrics_rank, open(f'reports/d_{method}/task_d_rank_metrics.json', 'w'), indent=2)

Task A:
DECISION-BASED EVALUATION:
Accuracy:0.6912751677852349
Macro precision:0.7113289760348583
Macro recall:0.7553751645458534
Macro f1:0.682332220986281
Micro precision:0.6912751677852349
Micro recall:0.6912751677852349
Micro f1:0.6912751677852349
ERDE_5:0.2827975990970474
ERDE_50:0.15548849150939317
Median latency:3.0
Speed:0.9821019115346772
latency-weightedF1:0.7224657740025211


Unnamed: 0,Accuracy,Macro_P,Macro_R,Macro_F1,Micro_P,Micro_R,Micro_F1,ERDE5,ERDE30,ERDE50,latencyTP,speed,latency-weightedF1
0,0.691,0.711,0.755,0.682,0.691,0.691,0.691,0.283,0.027,0.155,3.0,0.982,0.722


Task B:
RMSE:0.2407803309766703
Pearson correlation coefficient:0.7746648207201776
RANK-BASED EVALUATION:
Analizing ranking at round 1
P@5:0.0
P@10:0.0
P@20:0.15
P@30:0.1
P@50:0.1
Analizing ranking at round 25
P@5:0.0
P@10:0.2
P@20:0.15
P@30:0.13333333333333333
P@50:0.08
Analizing ranking at round 50
P@5:0.0
P@10:0.0
P@20:0.15
P@30:0.1
P@50:0.1
Analizing ranking at round 75
P@5:0.0
P@10:0.0
P@20:0.15
P@30:0.1
P@50:0.1


Unnamed: 0,RMSE:,Pearson_coefficient
0,0.241,0.775


Rank-based metrics:


Unnamed: 0_level_0,@5,@10,@20,@30,@50
round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,0.0,0.15,0.1,0.1
25,0.0,0.2,0.15,0.133,0.08
50,0.0,0.0,0.15,0.1,0.1
75,0.0,0.0,0.15,0.1,0.1


Task C:
DECISION-BASED EVALUATION:
Accuracy:0.5302013422818792
Macro precision:0.4366506123058542
Macro recall:0.41759500189732746
Macro f1:0.39195194206714945
Micro precision:0.5302013422818792
Micro recall:0.5302013422818792
Micro f1:0.5302013422818792
ERDE_5:0.28381917470174994
ERDE_50:0.1565100671140957
Median latency:3.0
Speed:0.9821019115346772
latency-weightedF1:0.7183373981510782


Unnamed: 0,Accuracy,Macro_P,Macro_R,Macro_F1,Micro_P,Micro_R,Micro_F1,ERDE5,ERDE30,ERDE50,latencyTP,speed,latency-weightedF1
0,0.53,0.437,0.418,0.392,0.53,0.53,0.53,0.284,0.157,0.157,3.0,0.982,0.718


Task D:
RMSE:0.17865016808534367
Pearson correlation coefficient:
Pearson sf:0.7472435224903471
Pearson sa:0.4992946899714973
Pearson so:0.37469662952285493
Pearson c:0.7742933448336263
PRECISION AT - EVALUATION:
Analizing ranking at round 1
P@5:0.55
P@10:0.275
P@20:0.25
P@30:0.20833333333333331
P@50:0.2
Analizing ranking at round 25
P@5:0.55
P@10:0.275
P@20:0.275
P@30:0.25
P@50:0.24
Analizing ranking at round 50
P@5:0.49999999999999994
P@10:0.24999999999999997
P@20:0.275
P@30:0.24166666666666667
P@50:0.23
Analizing ranking at round 75
P@5:0.49999999999999994
P@10:0.24999999999999997
P@20:0.275
P@30:0.225
P@50:0.23


Unnamed: 0,RMSE:,Pearson_mean,Pearson_sf,Pearson_sa,Pearson_so,Pearson_c
0,0.179,0.599,0.747,0.499,0.375,0.774


Rank-based metrics:


Unnamed: 0_level_0,@5,@10,@20,@30,@50
round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.55,0.275,0.25,0.208,0.2
25,0.55,0.275,0.275,0.25,0.24
50,0.5,0.25,0.275,0.242,0.23
75,0.5,0.25,0.275,0.225,0.23
