In [1]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
import logging
import time
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from scipy.special import softmax
import numpy as np
import torch

In [2]:
train = pd.read_csv('train_soft_human.csv')
test = pd.read_csv('test_soft_human.csv')
val = pd.read_csv('val_soft_human.csv')

In [3]:
def no_maj(df):
    df = df.loc[df['majority_label'] != 'No majority']
    print(df.shape)
    return df

train = no_maj(train)
test = no_maj(test)
val = no_maj(val)

(619, 23)
(139, 23)
(139, 23)


In [4]:
label_encoding = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3}

train['labels'] = train['majority_label'].map(label_encoding)
val['labels'] = val['majority_label'].map(label_encoding)
test['labels'] = test['majority_label'].map(label_encoding)

In [5]:
train = train[['Input', 'labels']]
val = val[['Input', 'labels']]
test = test[['Input', 'labels']]

In [6]:
train_ = Dataset.from_pandas(train)
test_ = Dataset.from_pandas(test)
val_ = Dataset.from_pandas(val)


dataset = DatasetDict({'train': train_, 'test': test_, 'val': val_})

In [7]:
save_dir = '../output/'
model_name = 'FacebookAI/roberta-large' #google-bert/bert-large-uncased'
model_name_filename = model_name.replace("/", "-")

In [8]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')



In [9]:
def tokenize_func(examples):
    tokenized_inputs = tokenizer(examples['Input'], padding = 'max_length', truncation = True, max_length = 512)
    tokenized_inputs['label'] = examples['labels']
    return tokenized_inputs

In [10]:
train_tokenized = train_.map(tokenize_func, batched = True)
val_tokenized = val_.map(tokenize_func, batched = True)
test_tokenized = test_.map(tokenize_func, batched = True)

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

In [11]:
train_tokenized

Dataset({
    features: ['Input', 'labels', '__index_level_0__', 'input_ids', 'attention_mask', 'label'],
    num_rows: 619
})

In [12]:
train_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
val_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
test_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

In [17]:
num_labels = 4


In [18]:
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
output_dir = f'./output/baseline_{model_name_filename}'

In [21]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    
    # Compute cross-entropy loss
    probs = softmax(logits, axis=-1)
    cross_entropy = -np.sum(np.eye(probs.shape[1])[labels] * np.log(probs + 1e-9)) / len(labels)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'cross_entropy': cross_entropy
    }


In [22]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)


In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=train_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpraveenbushipaka942[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.3895,1.373397,0.305331,0.194049,1.373424
2,1.4012,1.361438,0.323102,0.157803,1.361593
3,1.3542,1.340516,0.379645,0.277219,1.340549
4,1.3055,1.23697,0.450727,0.392419,1.237038
5,1.1468,1.04158,0.558966,0.550432,1.041462
6,0.9564,0.692162,0.77706,0.776074,0.691982




TrainOutput(global_step=120, training_loss=1.281460984547933, metrics={'train_runtime': 418.681, 'train_samples_per_second': 8.871, 'train_steps_per_second': 0.287, 'total_flos': 3461216472686592.0, 'train_loss': 1.281460984547933, 'epoch': 6.0})

In [25]:
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 0.6921621561050415, 'eval_accuracy': 0.777059773828756, 'eval_f1': 0.7760738264740161, 'eval_cross_entropy': 0.6919822075399713, 'eval_runtime': 7.5146, 'eval_samples_per_second': 82.373, 'eval_steps_per_second': 2.661, 'epoch': 6.0}


In [26]:
best_model_dir = f'{output_dir}/best_model'

In [27]:
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('./output/baseline_FacebookAI-roberta-large/best_model/tokenizer_config.json',
 './output/baseline_FacebookAI-roberta-large/best_model/special_tokens_map.json',
 './output/baseline_FacebookAI-roberta-large/best_model/vocab.json',
 './output/baseline_FacebookAI-roberta-large/best_model/merges.txt',
 './output/baseline_FacebookAI-roberta-large/best_model/added_tokens.json')

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [64]:
tokenizer = RobertaTokenizer.from_pretrained(best_model_dir)
model = RobertaForSequenceClassification.from_pretrained(best_model_dir)

In [65]:
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [55]:
def predictions(text):
    inputs = tokenizer(text, return_tensors="pt", truncation = True, padding = 'max_length', max_length = 512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1).tolist()[0]
        predicted_class = np.argmax(probabilities)
        return probabilities, predicted_class

In [56]:
softmax_prob = []
softmax_pred = []

for i, row in test.iterrows():
    text = row['Input']
    probs, preds = predictions(text)
    softmax_prob.append(probs)
    softmax_pred.append(preds)

In [58]:
test['softmax_prob'] = softmax_prob
test['softmax_preds'] = softmax_pred

In [59]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [60]:
model.push_to_hub('Multiperspective/roberta-human-label')
tokenizer.push_to_hub('Multiperspective/roberta-human-label')

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Multiperspective/roberta-human-label/commit/e7855687fedc6dfede5d7ef20756dd7523952c9d', commit_message='Upload tokenizer', commit_description='', oid='e7855687fedc6dfede5d7ef20756dd7523952c9d', pr_url=None, pr_revision=None, pr_num=None)

In [62]:
y_true = test['labels']
y_pred = test['softmax_preds']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 56.11510791366906
Precision: 61.119791666666664
Recall: 58.04597701149425
F1 Score: 57.22824498586048
Confusion Matrix:
[[14  2 22  5]
 [ 1 19  7  2]
 [ 4  3 29  7]
 [ 2  0  6 16]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.33      0.44        43
           1       0.79      0.66      0.72        29
           2       0.45      0.67      0.54        43
           3       0.53      0.67      0.59        24

    accuracy                           0.56       139
   macro avg       0.61      0.58      0.57       139
weighted avg       0.60      0.56      0.55       139



## Temperature scaling

In [64]:
from temperature_scaling_roberta import TemperatureScalingCalibrationModule

In [66]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 619
    })
    test: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 139
    })
    val: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 139
    })
})

In [67]:
columns = ['Input', '__index_level_0__']

In [68]:
def tokenize_fn(example):
    # Tokenize the input text
    tokenized_example = tokenizer(example['Input'], padding='max_length', truncation=True)
    # Add the numerical majority label
    tokenized_example['label'] = example['labels']
    return tokenized_example

tokenized_dict = dataset.map(
    tokenize_fn,
    batched= True,
    remove_columns = columns

)

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

In [70]:
calibration_module = TemperatureScalingCalibrationModule(best_model_dir, tokenizer).to(device)
calibration_module.fit(tokenized_dict['val'], n_epochs = 6)

100%|██████████| 6/6 [00:23<00:00,  3.98s/it]


TemperatureScalingCalibrationModule(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfO

In [71]:
calibration_module.temperature

Parameter containing:
tensor([1.0055], device='cuda:0', requires_grad=True)

In [72]:
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np

In [73]:
def predict(model, examples, round_digits: int = 5):
    input_ids = examples['input_ids'].to(device)
    attention_mask = examples['attention_mask'].to(device)
    #token_type_ids = examples['token_type_ids'].to(device)
    batch_labels = examples['labels'].detach().cpu().numpy().tolist()
    model.eval()
    with torch.no_grad():
        batch_output = model(input_ids, attention_mask) #,token_type_ids

    batch_scores = np.round(batch_output.detach().cpu().numpy(), round_digits).tolist()
    predicted_labels = [np.argmax(scores) for scores in batch_scores]
    return batch_scores, batch_labels, predicted_labels


def predict_data_loader(model, data_loader: DataLoader) -> pd.DataFrame:
    scores = []
    true_labels = []
    pred_labels = []
    
    for examples in data_loader:
        batch_scores, batch_labels, batch_pred_labels = predict(model, examples)
        scores += batch_scores
        true_labels += batch_labels
        pred_labels += batch_pred_labels

    df_predictions = pd.DataFrame({'scores': scores, 'original_labels': true_labels, 'pred_labels': pred_labels})
    return df_predictions

In [74]:
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
data_loader = DataLoader(tokenized_dict['test'], collate_fn=data_collator, batch_size=128)
start = time.time()
df_calibrated_predictions = predict_data_loader(calibration_module, data_loader)
end = time.time()

print('elapsed: ', end - start)
print(df_calibrated_predictions.shape)
df_calibrated_predictions.head()

elapsed:  4.149695873260498
(139, 3)


Unnamed: 0,scores,original_labels,pred_labels
0,"[0.5756800174713135, 0.1309400051832199, 0.245...",0,0
1,"[0.5684800148010254, 0.04566999897360802, 0.27...",0,0
2,"[0.10339999943971634, 0.06179000064730644, 0.2...",1,3
3,"[0.18422000110149384, 0.042319998145103455, 0....",3,3
4,"[0.11984000355005264, 0.01844000071287155, 0.1...",3,3


In [75]:
y_true = df_calibrated_predictions['original_labels']
y_pred = df_calibrated_predictions['pred_labels']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 56.11510791366906
Precision: 61.119791666666664
Recall: 58.04597701149425
F1 Score: 57.22824498586048
Confusion Matrix:
[[14  2 22  5]
 [ 1 19  7  2]
 [ 4  3 29  7]
 [ 2  0  6 16]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.33      0.44        43
           1       0.79      0.66      0.72        29
           2       0.45      0.67      0.54        43
           3       0.53      0.67      0.59        24

    accuracy                           0.56       139
   macro avg       0.61      0.58      0.57       139
weighted avg       0.60      0.56      0.55       139



In [76]:
test.head()

Unnamed: 0,Input,labels,softmax_prob,softmax_preds
0,Should Social Security Be Privatized? Social S...,0,"[0.5773278474807739, 0.13025356829166412, 0.24...",0
1,Can Alternative Energy Effectively Replace Fos...,0,"[0.5700161457061768, 0.04516660422086716, 0.27...",0
2,Should the United States Maintain Its Embargo ...,1,"[0.10270910710096359, 0.061203762888908386, 0....",3
3,Should the United States Return to a Gold Stan...,3,"[0.18370606005191803, 0.041861020028591156, 0....",3
4,Is Obesity a Disease? Treatment for obesity an...,3,"[0.11896917223930359, 0.01811995729804039, 0.1...",3


In [79]:
df_calibrated_predictions.head()

Unnamed: 0,scores,original_labels,pred_labels
0,"[0.5756800174713135, 0.1309400051832199, 0.245...",0,0
1,"[0.5684800148010254, 0.04566999897360802, 0.27...",0,0
2,"[0.10339999943971634, 0.06179000064730644, 0.2...",1,3
3,"[0.18422000110149384, 0.042319998145103455, 0....",3,3
4,"[0.11984000355005264, 0.01844000071287155, 0.1...",3,3


In [77]:
test_df = pd.read_csv('test_soft_human.csv')

In [80]:
test_df['labels'] = test_df['labels'].tolist()
test_df['uncalib_scores'] = softmax_prob
test_df['uncalib_preds'] = softmax_pred
test_df['calib_scores'] = df_calibrated_predictions['scores'].tolist()
test_df['calib_preds'] = df_calibrated_predictions['pred_labels'].tolist()

In [81]:
test_df.to_csv('results_baseline_roberta_human.csv', index= False)