In [2]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
import logging
import time
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from scipy.special import softmax
import numpy as np
import torch

In [3]:
train = pd.read_csv('df_train_llm_soft.csv')
val = pd.read_csv('df_val_llm_soft.csv')
test = pd.read_csv('df_test_llm_soft.csv')

In [4]:
label2id = {
    'Pro': 0,
    'Against': 1,
    'Neutral': 2,
    'Not-about': 3
}

id2label = {v: k for k, v in label2id.items()}

In [5]:
def no_maj(df, col_name):
    df = df.loc[df[col_name] != 'No Majority']
    print(df.shape)
    return df

train = no_maj(train, 'majority_llm_noninst')
test = no_maj(test, 'majority_llm_noninst')
val = no_maj(val, 'majority_llm_noninst')   

(505, 22)
(102, 35)
(97, 35)


In [6]:
label_encoding = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3}

train['labels'] = train['majority_llm_noninst'].map(label_encoding)
val['labels'] = val['majority_llm_noninst'].map(label_encoding)
test['labels'] = test['majority_llm_noninst'].map(label_encoding)

In [7]:
train = train[['Input', 'labels']]
val = val[['Input', 'labels']]
test = test[['Input', 'labels']]

In [8]:
train_ = Dataset.from_pandas(train)
test_ = Dataset.from_pandas(test)
val_ = Dataset.from_pandas(val)


dataset = DatasetDict({'train': train_, 'test': test_, 'val': val_})

In [9]:
save_dir = '../output/llm/'
model_name = 'FacebookAI/roberta-large' #google-bert/bert-large-uncased'
model_name_filename = model_name.replace("/", "-")

In [10]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')



In [11]:
def tokenize_func(examples):
    tokenized_inputs = tokenizer(examples['Input'], padding = 'max_length', truncation = True, max_length = 512)
    tokenized_inputs['label'] = examples['labels']
    return tokenized_inputs

In [12]:
train_tokenized = train_.map(tokenize_func, batched = True)
val_tokenized = val_.map(tokenize_func, batched = True)
test_tokenized = test_.map(tokenize_func, batched = True)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

In [13]:
train_tokenized

Dataset({
    features: ['Input', 'labels', '__index_level_0__', 'input_ids', 'attention_mask', 'label'],
    num_rows: 505
})

In [14]:
train_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
val_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
test_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

In [15]:
num_labels = 4
len(label2id)

4

In [16]:
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=len(label2id))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
output_dir = f'./output/llm/baseline_{model_name_filename}'

In [19]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    
    # Compute cross-entropy loss
    probs = softmax(logits, axis=-1)
    cross_entropy = -np.sum(np.eye(probs.shape[1])[labels] * np.log(probs + 1e-9)) / len(labels)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'cross_entropy': cross_entropy
    }


In [20]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)


In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=train_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpraveenbushipaka942[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.5124,1.247733,0.447525,0.438081,1.247605
2,0.9982,0.977691,0.633663,0.491569,0.978228
3,1.0151,0.986554,0.633663,0.491569,0.987518
4,0.9977,0.930489,0.639604,0.508652,0.93116
5,0.9798,0.931812,0.637624,0.500544,0.932027
6,0.9529,0.866917,0.635644,0.496064,0.867351




TrainOutput(global_step=96, training_loss=1.0559033453464508, metrics={'train_runtime': 390.5207, 'train_samples_per_second': 7.759, 'train_steps_per_second': 0.246, 'total_flos': 2823771112611840.0, 'train_loss': 1.0559033453464508, 'epoch': 6.0})

In [23]:
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 0.9304889440536499, 'eval_accuracy': 0.6396039603960396, 'eval_f1': 0.5086521838997086, 'eval_cross_entropy': 0.9311601927964994, 'eval_runtime': 5.4132, 'eval_samples_per_second': 93.29, 'eval_steps_per_second': 2.956, 'epoch': 6.0}


In [24]:
best_model_dir = f'{output_dir}/best_model'

In [25]:
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('./output/llm/baseline_FacebookAI-roberta-large/best_model/tokenizer_config.json',
 './output/llm/baseline_FacebookAI-roberta-large/best_model/special_tokens_map.json',
 './output/llm/baseline_FacebookAI-roberta-large/best_model/vocab.json',
 './output/llm/baseline_FacebookAI-roberta-large/best_model/merges.txt',
 './output/llm/baseline_FacebookAI-roberta-large/best_model/added_tokens.json')

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [64]:
# tokenizer = RobertaTokenizer.from_pretrained(best_model_dir)
# model = RobertaForSequenceClassification.from_pretrained(best_model_dir)

In [27]:
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [28]:
def predictions(text):
    inputs = tokenizer(text, return_tensors="pt", truncation = True, padding = 'max_length', max_length = 512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1).tolist()[0]
        predicted_class = np.argmax(probabilities)
        return probabilities, predicted_class

In [29]:
softmax_prob = []
softmax_pred = []

for i, row in test.iterrows():
    text = row['Input']
    probs, preds = predictions(text)
    softmax_prob.append(probs)
    softmax_pred.append(preds)

In [30]:
test['softmax_prob'] = softmax_prob
test['softmax_preds'] = softmax_pred

In [47]:
from huggingface_hub import login
login() #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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [48]:
model.push_to_hub('Multiperspective/roberta-llm-noninstruct')
tokenizer.push_to_hub('Multiperspective/roberta-llm-noninstruct')

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Multiperspective/roberta-llm-noninstruct/commit/d83c51998be7db23920eb6ecbd2d3f407554db08', commit_message='Upload tokenizer', commit_description='', oid='d83c51998be7db23920eb6ecbd2d3f407554db08', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
y_true = test['labels']
y_pred = test['softmax_preds']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 61.76470588235294
Precision: 15.441176470588236
Recall: 25.0
F1 Score: 19.090909090909093
Confusion Matrix:
[[63  0  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [20  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      1.00      0.76        63
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        20

    accuracy                           0.62       102
   macro avg       0.15      0.25      0.19       102
weighted avg       0.38      0.62      0.47       102



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Temperature scaling

In [32]:
from temperature_scaling_roberta import TemperatureScalingCalibrationModule

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 505
    })
    test: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 102
    })
    val: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 97
    })
})

In [34]:
columns = ['Input', '__index_level_0__']

In [35]:
def tokenize_fn(example):
    # Tokenize the input text
    tokenized_example = tokenizer(example['Input'], padding='max_length', truncation=True)
    # Add the numerical majority label
    tokenized_example['label'] = example['labels']
    return tokenized_example

tokenized_dict = dataset.map(
    tokenize_fn,
    batched= True,
    remove_columns = columns

)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

In [36]:
calibration_module = TemperatureScalingCalibrationModule(best_model_dir, tokenizer).to(device)
calibration_module.fit(tokenized_dict['val'], n_epochs = 6)

100%|██████████| 6/6 [00:16<00:00,  2.67s/it]


TemperatureScalingCalibrationModule(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfO

In [37]:
calibration_module.temperature

Parameter containing:
tensor([1.0039], device='cuda:0', requires_grad=True)

In [38]:
from torch.utils.data import DataLoader

from transformers import (

    DataCollatorWithPadding
)
import numpy as np

In [39]:
def predict(model, examples, round_digits: int = 5):
    input_ids = examples['input_ids'].to(device)
    attention_mask = examples['attention_mask'].to(device)
    #token_type_ids = examples['token_type_ids'].to(device)
    batch_labels = examples['labels'].detach().cpu().numpy().tolist()
    model.eval()
    with torch.no_grad():
        batch_output = model(input_ids, attention_mask) #,token_type_ids

    batch_scores = np.round(batch_output.detach().cpu().numpy(), round_digits).tolist()
    predicted_labels = [np.argmax(scores) for scores in batch_scores]
    return batch_scores, batch_labels, predicted_labels


def predict_data_loader(model, data_loader: DataLoader) -> pd.DataFrame:
    scores = []
    true_labels = []
    pred_labels = []
    
    for examples in data_loader:
        batch_scores, batch_labels, batch_pred_labels = predict(model, examples)
        scores += batch_scores
        true_labels += batch_labels
        pred_labels += batch_pred_labels

    df_predictions = pd.DataFrame({'scores': scores, 'original_labels': true_labels, 'pred_labels': pred_labels})
    return df_predictions

In [40]:
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
data_loader = DataLoader(tokenized_dict['test'], collate_fn=data_collator, batch_size=128)
start = time.time()
df_calibrated_predictions = predict_data_loader(calibration_module, data_loader)
end = time.time()

print('elapsed: ', end - start)
print(df_calibrated_predictions.shape)
df_calibrated_predictions.head()

elapsed:  2.992555856704712
(102, 3)


Unnamed: 0,scores,original_labels,pred_labels
0,"[0.5389000177383423, 0.32190999388694763, 0.01...",0,0
1,"[0.6856099963188171, 0.2026199996471405, 0.012...",1,0
2,"[0.613510012626648, 0.22702999413013458, 0.013...",3,0
3,"[0.6389099955558777, 0.21601000428199768, 0.01...",0,0
4,"[0.508840024471283, 0.3368299901485443, 0.0137...",0,0


In [41]:
y_true = df_calibrated_predictions['original_labels']
y_pred = df_calibrated_predictions['pred_labels']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 61.76470588235294
Precision: 15.441176470588236
Recall: 25.0
F1 Score: 19.090909090909093
Confusion Matrix:
[[63  0  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [20  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      1.00      0.76        63
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        20

    accuracy                           0.62       102
   macro avg       0.15      0.25      0.19       102
weighted avg       0.38      0.62      0.47       102



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
test.head()

Unnamed: 0,Input,labels,softmax_prob,softmax_preds
0,Is Obesity a Disease? DNA variants significant...,0,"[0.5397554039955139, 0.3217791020870209, 0.018...",0
4,Is a Two-State Solution (Israel and Palestine)...,1,"[0.6869023442268372, 0.20204474031925201, 0.01...",0
5,Was Bill Clinton a Good President? Tipper Gore...,3,"[0.6146619319915771, 0.22658228874206543, 0.01...",0
8,Should the United States Use the Electoral Col...,0,"[0.64012211561203, 0.2155153453350067, 0.01212...",0
10,"Is Obesity a Disease? Sofas, Vinyl Flooring Mi...",0,"[0.5095676183700562, 0.3367694318294525, 0.013...",0


In [43]:
df_calibrated_predictions.head()

Unnamed: 0,scores,original_labels,pred_labels
0,"[0.5389000177383423, 0.32190999388694763, 0.01...",0,0
1,"[0.6856099963188171, 0.2026199996471405, 0.012...",1,0
2,"[0.613510012626648, 0.22702999413013458, 0.013...",3,0
3,"[0.6389099955558777, 0.21601000428199768, 0.01...",0,0
4,"[0.508840024471283, 0.3368299901485443, 0.0137...",0,0


In [44]:
test_df = pd.read_csv('df_test_llm_soft.csv')

test_df = no_maj(test_df, 'majority_llm_noninst')

(102, 35)


In [45]:
test_df['labels'] = test_df['labels'].tolist()
test_df['uncalib_scores'] = softmax_prob
test_df['uncalib_preds'] = softmax_pred
test_df['calib_scores'] = df_calibrated_predictions['scores'].tolist()
test_df['calib_preds'] = df_calibrated_predictions['pred_labels'].tolist()

In [46]:
test_df.to_csv('results_baseline_roberta_llm.csv', index= False)