In [83]:
import pandas as pd
from datasets import DatasetDict, Dataset
import logging
import time
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from scipy.special import softmax
import numpy as np
import torch

In [84]:
train = pd.read_csv('df_train_llm_soft.csv')
val = pd.read_csv('df_val_llm_soft.csv')
test = pd.read_csv('df_test_llm_soft.csv')

In [85]:
label2id = {
    'Pro': 0,
    'Against': 1,
    'Neutral': 2,
    'Not-about': 3
}

id2label = {v: k for k, v in label2id.items()}

In [87]:
def no_maj(df, col_name):
    df = df.loc[df[col_name] != 'No Majority']
    print(df.shape)
    return df

train = no_maj(train, 'majority_llm_noninst')
test = no_maj(test, 'majority_llm_noninst')
val = no_maj(val, 'majority_llm_noninst')   


(505, 22)
(102, 35)
(97, 35)


In [88]:
label_encoding = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3}

train['labels'] = train['majority_llm_noninst'].map(label_encoding)
val['labels'] = val['majority_llm_noninst'].map(label_encoding)
test['labels'] = test['majority_llm_noninst'].map(label_encoding)

In [89]:
train = train[['Input', 'labels']]
val = val[['Input', 'labels']]
test = test[['Input', 'labels']]


train['Input'] = train['Input'].str.lower()
val['Input'] = val['Input'].str.lower()
test['Input'] = test['Input'].str.lower()

In [90]:
train_ = Dataset.from_pandas(train)
test_ = Dataset.from_pandas(test)
val_ = Dataset.from_pandas(val)


dataset = DatasetDict({'train': train_, 'test': test_, 'val': val_})

In [11]:
save_dir = '../output/llm/'
model_name = 'google-bert/bert-large-uncased' #google-bert/bert-large-uncased'
model_name_filename = model_name.replace("/", "-")

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')



In [62]:
def tokenize_func(examples):
    tokenized_inputs = tokenizer(examples['Input'], padding = 'max_length', truncation = True, max_length = 512)
    tokenized_inputs['label'] = examples['labels']
    return tokenized_inputs

In [63]:
train_tokenized = train_.map(tokenize_func, batched = True)
val_tokenized = val_.map(tokenize_func, batched = True)
test_tokenized = test_.map(tokenize_func, batched = True)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

In [64]:
train_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask','token_type_ids', 'label'])
val_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask','token_type_ids', 'label'])
test_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask','token_type_ids', 'label'])

In [18]:
num_labels = 4


In [17]:
len(label2id)

4

In [19]:
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=len(label2id))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
output_dir = f'./output/llm/baseline_{model_name_filename}'

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    
    # Compute cross-entropy loss
    probs = softmax(logits, axis=-1)
    cross_entropy = -np.sum(np.eye(probs.shape[1])[labels] * np.log(probs + 1e-9)) / len(labels)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'cross_entropy': cross_entropy
    }


In [23]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)


In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=train_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mpraveenbushipaka942[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.7733,1.576668,0.192079,0.076832,1.575815
2,1.3382,1.23044,0.617822,0.506333,1.230652
3,1.2183,1.024783,0.635644,0.496091,1.025123
4,1.0361,0.940289,0.637624,0.500544,0.940513
5,0.9922,0.907816,0.649505,0.534052,0.907773
6,0.9605,0.82682,0.685149,0.591067,0.827186




TrainOutput(global_step=96, training_loss=1.1897473533948262, metrics={'train_runtime': 338.4999, 'train_samples_per_second': 8.951, 'train_steps_per_second': 0.284, 'total_flos': 2823771112611840.0, 'train_loss': 1.1897473533948262, 'epoch': 6.0})

In [26]:
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 0.8268200159072876, 'eval_accuracy': 0.6851485148514852, 'eval_f1': 0.5910669176374864, 'eval_cross_entropy': 0.8271858274641604, 'eval_runtime': 5.5386, 'eval_samples_per_second': 91.178, 'eval_steps_per_second': 2.889, 'epoch': 6.0}


In [27]:
best_model_dir = f'{output_dir}/best_model'

In [28]:
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('./output/llm/baseline_google-bert-bert-large-uncased/best_model/tokenizer_config.json',
 './output/llm/baseline_google-bert-bert-large-uncased/best_model/special_tokens_map.json',
 './output/llm/baseline_google-bert-bert-large-uncased/best_model/vocab.txt',
 './output/llm/baseline_google-bert-bert-large-uncased/best_model/added_tokens.json')

In [29]:
from huggingface_hub import login
login() #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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
model.push_to_hub('Multiperspective/bert-llm-noninstruct')
tokenizer.push_to_hub('Multiperspective/bert-llm-noninstruct')

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Multiperspective/bert-llm-noninstruct/commit/cb9abda26da8b0a2b601815fefcb51c7c9a5c66a', commit_message='Upload tokenizer', commit_description='', oid='cb9abda26da8b0a2b601815fefcb51c7c9a5c66a', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [32]:
#tokenizer = BertTokenizer.from_pretrained(best_model_dir)
#model = BertForSequenceClassification.from_pretrained(best_model_dir)

## Prediction

In [91]:
def predictions(text):
    inputs = tokenizer(text, return_tensors="pt", truncation = True, padding = 'max_length', max_length = 512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1).tolist()[0]
        predicted_class = np.argmax(probabilities)
        return probabilities, predicted_class

In [92]:
softmax_prob = []
softmax_pred = []

for i, row in test.iterrows():
    text = row['Input']
    probs, preds = predictions(text)
    softmax_prob.append(probs)
    softmax_pred.append(preds)

In [93]:
test['softmax_prob'] = softmax_prob
test['softmax_preds'] = softmax_pred

In [94]:
y_true = test['labels']
y_pred = test['softmax_preds']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 60.78431372549019
Precision: 15.5
Recall: 24.6031746031746
F1 Score: 19.01840490797546
Confusion Matrix:
[[62  1  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [19  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.98      0.76        63
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        20

    accuracy                           0.61       102
   macro avg       0.15      0.25      0.19       102
weighted avg       0.38      0.61      0.47       102



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Temperature scaling

In [95]:
from temperature_scaling_bert import TemperatureScalingCalibrationModule

In [96]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 505
    })
    test: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 102
    })
    val: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 97
    })
})

In [97]:
columns = ['Input', '__index_level_0__']

In [98]:
def tokenize_fn(example):
    # Tokenize the input text
    tokenized_example = tokenizer(example['Input'], padding='max_length', truncation=True)
    # Add the numerical majority label
    tokenized_example['label'] = example['labels']
    return tokenized_example

tokenized_dict = dataset.map(
    tokenize_fn,
    batched= True,
    remove_columns = columns

)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

In [99]:
calibration_module = TemperatureScalingCalibrationModule(best_model_dir, tokenizer).to(device)
calibration_module.fit(tokenized_dict['val'], n_epochs = 6)

100%|██████████| 6/6 [00:16<00:00,  2.69s/it]


TemperatureScalingCalibrationModule(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_

In [100]:
calibration_module.temperature

Parameter containing:
tensor([0.9986], device='cuda:0', requires_grad=True)

In [101]:
from torch.utils.data import DataLoader
from transformers import (
    DataCollatorWithPadding
)
import numpy as np

In [102]:
def predict(model, examples, round_digits: int = 5):
    input_ids = examples['input_ids'].to(device)
    attention_mask = examples['attention_mask'].to(device)
    token_type_ids = examples['token_type_ids'].to(device)
    batch_labels = examples['labels'].detach().cpu().numpy().tolist()
    model.eval()
    with torch.no_grad():
        batch_output = model(input_ids, attention_mask, token_type_ids) #,token_type_ids

    batch_scores = np.round(batch_output.detach().cpu().numpy(), round_digits).tolist()
    predicted_labels = [np.argmax(scores) for scores in batch_scores]
    return batch_scores, batch_labels, predicted_labels


def predict_data_loader(model, data_loader: DataLoader) -> pd.DataFrame:
    scores = []
    true_labels = []
    pred_labels = []
    
    for examples in data_loader:
        batch_scores, batch_labels, batch_pred_labels = predict(model, examples)
        scores += batch_scores
        true_labels += batch_labels
        pred_labels += batch_pred_labels

    df_predictions = pd.DataFrame({'scores': scores, 'original_labels': true_labels, 'pred_labels': pred_labels})
    return df_predictions

In [103]:
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
data_loader = DataLoader(tokenized_dict['test'], collate_fn=data_collator, batch_size=128)
start = time.time()
df_calibrated_predictions = predict_data_loader(calibration_module, data_loader)
end = time.time()

print('elapsed: ', end - start)
print(df_calibrated_predictions.shape)
df_calibrated_predictions.head()

elapsed:  3.041917324066162
(102, 3)


Unnamed: 0,scores,original_labels,pred_labels
0,"[0.4947099983692169, 0.3409300148487091, 0.046...",0,0
1,"[0.5944899916648865, 0.2287999987602234, 0.036...",1,0
2,"[0.6797099709510803, 0.165010005235672, 0.0247...",3,0
3,"[0.6097000241279602, 0.21901999413967133, 0.03...",0,0
4,"[0.5196200013160706, 0.3166399896144867, 0.045...",0,0


In [104]:
y_true = df_calibrated_predictions['original_labels']
y_pred = df_calibrated_predictions['pred_labels']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 60.78431372549019
Precision: 15.5
Recall: 24.6031746031746
F1 Score: 19.01840490797546
Confusion Matrix:
[[62  1  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [19  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.98      0.76        63
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        20

    accuracy                           0.61       102
   macro avg       0.15      0.25      0.19       102
weighted avg       0.38      0.61      0.47       102



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [105]:
test.head()

Unnamed: 0,Input,labels,softmax_prob,softmax_preds
0,is obesity a disease? dna variants significant...,0,"[0.494432657957077, 0.3409091830253601, 0.0464...",0
4,is a two-state solution (israel and palestine)...,1,"[0.5940539836883545, 0.22894276678562164, 0.03...",0
5,was bill clinton a good president? tipper gore...,3,"[0.6792035698890686, 0.1652161031961441, 0.024...",0
8,should the united states use the electoral col...,0,"[0.6092492341995239, 0.21917493641376495, 0.03...",0
10,"is obesity a disease? sofas, vinyl flooring mi...",0,"[0.5192981362342834, 0.3166601359844208, 0.045...",0


In [106]:
df_calibrated_predictions.head()

Unnamed: 0,scores,original_labels,pred_labels
0,"[0.4947099983692169, 0.3409300148487091, 0.046...",0,0
1,"[0.5944899916648865, 0.2287999987602234, 0.036...",1,0
2,"[0.6797099709510803, 0.165010005235672, 0.0247...",3,0
3,"[0.6097000241279602, 0.21901999413967133, 0.03...",0,0
4,"[0.5196200013160706, 0.3166399896144867, 0.045...",0,0


In [107]:
test_df = pd.read_csv('df_test_llm_soft.csv')

test_df = no_maj(test_df, 'majority_llm_noninst')

(102, 35)


In [108]:

test_df['uncalib_scores'] = softmax_prob
test_df['uncalib_preds'] = softmax_pred
test_df['calib_scores'] = df_calibrated_predictions['scores'].tolist()
test_df['calib_preds'] = df_calibrated_predictions['pred_labels'].tolist()

In [109]:
test_df.to_csv('results_baseline_bert_llm.csv', index= False)