In [1]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
import logging
import time
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from scipy.special import softmax
import numpy as np
import torch

In [2]:
train = pd.read_csv('train_soft_human.csv')
test = pd.read_csv('test_soft_human.csv')
val = pd.read_csv('val_soft_human.csv')

In [3]:
def no_maj(df):
    df = df.loc[df['majority_label'] != 'No majority']
    print(df.shape)
    return df

train = no_maj(train)
test = no_maj(test)
val = no_maj(val)

(619, 23)
(139, 23)
(139, 23)


In [4]:
label_encoding = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3}

train['labels'] = train['majority_label'].map(label_encoding)
val['labels'] = val['majority_label'].map(label_encoding)
test['labels'] = test['majority_label'].map(label_encoding)

In [6]:
train = train[['Input', 'labels']]
val = val[['Input', 'labels']]
test = test[['Input', 'labels']]


train['Input'] = train['Input'].str.lower()
val['Input'] = val['Input'].str.lower()
test['Input'] = test['Input'].str.lower()

In [7]:
train_ = Dataset.from_pandas(train)
test_ = Dataset.from_pandas(test)
val_ = Dataset.from_pandas(val)


dataset = DatasetDict({'train': train_, 'test': test_, 'val': val_})

In [8]:
save_dir = '../output/'
model_name = 'google-bert/bert-large-uncased' #google-bert/bert-large-uncased'
model_name_filename = model_name.replace("/", "-")

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')



In [10]:
def tokenize_func(examples):
    tokenized_inputs = tokenizer(examples['Input'], padding = 'max_length', truncation = True, max_length = 512)
    tokenized_inputs['label'] = examples['labels']
    return tokenized_inputs

In [11]:
train_tokenized = train_.map(tokenize_func, batched = True)
val_tokenized = val_.map(tokenize_func, batched = True)
test_tokenized = test_.map(tokenize_func, batched = True)

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

In [12]:
train_tokenized

Dataset({
    features: ['Input', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 619
})

In [13]:
train_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask','token_type_ids', 'label'])
val_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask','token_type_ids', 'label'])
test_tokenized.set_format('torch', columns = ['input_ids', 'attention_mask','token_type_ids', 'label'])

In [14]:
num_labels = 4


In [15]:
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
output_dir = f'./output/baseline_{model_name_filename}'

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    
    # Compute cross-entropy loss
    probs = softmax(logits, axis=-1)
    cross_entropy = -np.sum(np.eye(probs.shape[1])[labels] * np.log(probs + 1e-9)) / len(labels)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'cross_entropy': cross_entropy
    }


In [19]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=train_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpraveenbushipaka942[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.5461,1.50545,0.20517,0.069857,1.505567
2,1.4234,1.420465,0.219709,0.097953,1.420589
3,1.407,1.326948,0.369952,0.320858,1.327037
4,1.301,1.208233,0.484653,0.452118,1.208058
5,1.1702,1.078676,0.630048,0.625338,1.078991
6,1.1246,1.012385,0.602585,0.594844,1.012251




TrainOutput(global_step=120, training_loss=1.3560953696568807, metrics={'train_runtime': 370.7936, 'train_samples_per_second': 10.016, 'train_steps_per_second': 0.324, 'total_flos': 3461216472686592.0, 'train_loss': 1.3560953696568807, 'epoch': 6.0})

In [22]:
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 1.0786762237548828, 'eval_accuracy': 0.630048465266559, 'eval_f1': 0.6253383531773795, 'eval_cross_entropy': 1.078990625064092, 'eval_runtime': 9.3128, 'eval_samples_per_second': 66.468, 'eval_steps_per_second': 2.148, 'epoch': 6.0}


In [23]:
best_model_dir = f'{output_dir}/best_model'

In [24]:
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('./output/baseline_google-bert-bert-large-uncased/best_model/tokenizer_config.json',
 './output/baseline_google-bert-bert-large-uncased/best_model/special_tokens_map.json',
 './output/baseline_google-bert-bert-large-uncased/best_model/vocab.txt',
 './output/baseline_google-bert-bert-large-uncased/best_model/added_tokens.json')

In [25]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
model.push_to_hub('Multiperspective/bert-human-label')
tokenizer.push_to_hub('Multiperspective/bert-human-label')

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Multiperspective/bert-human-label/commit/7588548b53bb51127cb00fc4ee41cea04cc04f5d', commit_message='Upload tokenizer', commit_description='', oid='7588548b53bb51127cb00fc4ee41cea04cc04f5d', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [None]:
#tokenizer = BertTokenizer.from_pretrained(best_model_dir)
#model = BertForSequenceClassification.from_pretrained(best_model_dir)

## Prediction

In [28]:
def predictions(text):
    inputs = tokenizer(text, return_tensors="pt", truncation = True, padding = 'max_length', max_length = 512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1).tolist()[0]
        predicted_class = np.argmax(probabilities)
        return probabilities, predicted_class

In [29]:
softmax_prob = []
softmax_pred = []

for i, row in test.iterrows():
    text = row['Input']
    probs, preds = predictions(text)
    softmax_prob.append(probs)
    softmax_pred.append(preds)

In [30]:
test['softmax_prob'] = softmax_prob
test['softmax_preds'] = softmax_pred

In [31]:
y_true = test['labels']
y_pred = test['softmax_preds']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 36.69064748201439
Precision: 39.035364145658264
Recall: 35.93791766907245
F1 Score: 33.80628161542504
Confusion Matrix:
[[ 6  4 26  7]
 [ 2  6 19  2]
 [ 5  4 29  5]
 [ 3  0 11 10]]
Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.14      0.20        43
           1       0.43      0.21      0.28        29
           2       0.34      0.67      0.45        43
           3       0.42      0.42      0.42        24

    accuracy                           0.37       139
   macro avg       0.39      0.36      0.34       139
weighted avg       0.38      0.37      0.33       139



## Temperature scaling

In [32]:
from temperature_scaling_bert import TemperatureScalingCalibrationModule

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 619
    })
    test: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 139
    })
    val: Dataset({
        features: ['Input', 'labels', '__index_level_0__'],
        num_rows: 139
    })
})

In [34]:
columns = ['Input', '__index_level_0__']

In [35]:
def tokenize_fn(example):
    # Tokenize the input text
    tokenized_example = tokenizer(example['Input'], padding='max_length', truncation=True)
    # Add the numerical majority label
    tokenized_example['label'] = example['labels']
    return tokenized_example

tokenized_dict = dataset.map(
    tokenize_fn,
    batched= True,
    remove_columns = columns

)

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

In [36]:
calibration_module = TemperatureScalingCalibrationModule(best_model_dir, tokenizer).to(device)
calibration_module.fit(tokenized_dict['val'], n_epochs = 6)

100%|██████████| 6/6 [00:24<00:00,  4.02s/it]


TemperatureScalingCalibrationModule(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_

In [37]:
calibration_module.temperature

Parameter containing:
tensor([0.9974], device='cuda:0', requires_grad=True)

In [38]:
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np

In [40]:
def predict(model, examples, round_digits: int = 5):
    input_ids = examples['input_ids'].to(device)
    attention_mask = examples['attention_mask'].to(device)
    token_type_ids = examples['token_type_ids'].to(device)
    batch_labels = examples['labels'].detach().cpu().numpy().tolist()
    model.eval()
    with torch.no_grad():
        batch_output = model(input_ids, attention_mask, token_type_ids) #,token_type_ids

    batch_scores = np.round(batch_output.detach().cpu().numpy(), round_digits).tolist()
    predicted_labels = [np.argmax(scores) for scores in batch_scores]
    return batch_scores, batch_labels, predicted_labels


def predict_data_loader(model, data_loader: DataLoader) -> pd.DataFrame:
    scores = []
    true_labels = []
    pred_labels = []
    
    for examples in data_loader:
        batch_scores, batch_labels, batch_pred_labels = predict(model, examples)
        scores += batch_scores
        true_labels += batch_labels
        pred_labels += batch_pred_labels

    df_predictions = pd.DataFrame({'scores': scores, 'original_labels': true_labels, 'pred_labels': pred_labels})
    return df_predictions

In [41]:
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
data_loader = DataLoader(tokenized_dict['test'], collate_fn=data_collator, batch_size=128)
start = time.time()
df_calibrated_predictions = predict_data_loader(calibration_module, data_loader)
end = time.time()

print('elapsed: ', end - start)
print(df_calibrated_predictions.shape)
df_calibrated_predictions.head()

elapsed:  4.228758811950684
(139, 3)


Unnamed: 0,scores,original_labels,pred_labels
0,"[0.17215999960899353, 0.3062399923801422, 0.46...",0,2
1,"[0.32607999444007874, 0.13131000101566315, 0.2...",0,0
2,"[0.2809799909591675, 0.11432000249624252, 0.28...",1,3
3,"[0.251010000705719, 0.1273300051689148, 0.3838...",3,2
4,"[0.2943600118160248, 0.12541000545024872, 0.27...",3,3


In [42]:
y_true = df_calibrated_predictions['original_labels']
y_pred = df_calibrated_predictions['pred_labels']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy*100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision*100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall*100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1*100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 36.69064748201439
Precision: 39.035364145658264
Recall: 35.93791766907245
F1 Score: 33.80628161542504
Confusion Matrix:
[[ 6  4 26  7]
 [ 2  6 19  2]
 [ 5  4 29  5]
 [ 3  0 11 10]]
Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.14      0.20        43
           1       0.43      0.21      0.28        29
           2       0.34      0.67      0.45        43
           3       0.42      0.42      0.42        24

    accuracy                           0.37       139
   macro avg       0.39      0.36      0.34       139
weighted avg       0.38      0.37      0.33       139



In [43]:
test.head()

Unnamed: 0,Input,labels,softmax_prob,softmax_preds
0,should social security be privatized? social s...,0,"[0.17241421341896057, 0.30623528361320496, 0.4...",2
1,can alternative energy effectively replace fos...,0,"[0.32589882612228394, 0.1315443515777588, 0.27...",0
2,should the united states maintain its embargo ...,1,"[0.28093746304512024, 0.11456725001335144, 0.2...",3
3,should the united states return to a gold stan...,3,"[0.2510535717010498, 0.12757351994514465, 0.38...",2
4,is obesity a disease? treatment for obesity an...,3,"[0.29427507519721985, 0.12564603984355927, 0.2...",3


In [44]:
df_calibrated_predictions.head()

Unnamed: 0,scores,original_labels,pred_labels
0,"[0.17215999960899353, 0.3062399923801422, 0.46...",0,2
1,"[0.32607999444007874, 0.13131000101566315, 0.2...",0,0
2,"[0.2809799909591675, 0.11432000249624252, 0.28...",1,3
3,"[0.251010000705719, 0.1273300051689148, 0.3838...",3,2
4,"[0.2943600118160248, 0.12541000545024872, 0.27...",3,3


In [45]:
test_df = pd.read_csv('test_soft_human.csv')

In [46]:
test_df['uncalib_scores'] = softmax_prob
test_df['uncalib_preds'] = softmax_pred
test_df['calib_scores'] = df_calibrated_predictions['scores'].tolist()
test_df['calib_preds'] = df_calibrated_predictions['pred_labels'].tolist()

In [47]:
test_df.to_csv('results_baseline_bert_human.csv', index= False)