Classification multi-perspective approach BERT-large HD 

In [3]:
import ast 
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
#formatting soft labels
train['soft_labels'] = train['soft_labels'].apply(ast.literal_eval).tolist()
test['soft_labels'] = test['soft_labels'].apply(ast.literal_eval).tolist()
val['soft_labels'] = val['soft_labels'].apply(ast.literal_eval).tolist()

In [4]:
#dataset
train = Dataset.from_pandas(train)
val = Dataset.from_pandas(val)
test = Dataset.from_pandas(test)

dataset = DatasetDict() 
dataset['train'] = train
dataset['val'] = val
dataset['test'] = test

In [5]:
#model
model_name = 'bert-large-uncased'
model_name_filename = model_name.replace("/", "-")

In [None]:
#tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize_func(examples):
    # Tokenize the input text and map the 'soft_labels' column to 'labels'
    tokenized_inputs = tokenizer(examples['Input'], padding='max_length', truncation=True, max_length=512)
    tokenized_inputs['labels'] = examples['soft_labels']  
    return tokenized_inputs

In [8]:
train_tokenized = train.map(tokenize_func, batched = True)
val_tokenized = val.map(tokenize_func, batched = True) 

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

In [9]:
train_tokenized

Dataset({
    features: ['Unnamed: 0', 'docID', 'Query', 'docTitle', 'doc', 'Input', 'labels', 'majority_label', 'label_index', 'soft_labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 619
})

In [10]:
train_tokenized.set_format('torch', columns =['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
val_tokenized.set_format('torch', columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [11]:
train_tokenized['labels'][0]

tensor([0.9000, 0.0000, 0.0000, 0.0000])

In [12]:
num_labels=len(train_tokenized['labels'][0])

In [13]:
num_labels

4

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
output_dir = f"./multiclassification/{model_name_filename}/results/human"

In [18]:
output_dir

'./multiclassification/bert-large-uncased/results/human'

In [None]:
#training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [20]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
#definition soft loss function 

def softmax(logits):
    """
    Compute softmax probabilities from logits.
    
    Parameters:
    - logits: A numpy array of shape (n, num_classes) containing the logits.
    
    Returns:
    - probabilities: A numpy array of shape (n, num_classes) containing the softmax probabilities.
    """
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def soft_loss_function(true_probabilities, predicted_logits):
    """
    Compute the soft loss function (cross-entropy with soft labels) using PyTorch tensors.
    
    Parameters:
    - true_probabilities: A PyTorch tensor of shape (n, num_classes) containing the true probability distributions.
    - predicted_logits: A PyTorch tensor of shape (n, num_classes) containing the logits from the model.
    
    Returns:
    - loss: The computed soft loss.
    """
   
    predicted_probabilities = torch.nn.functional.softmax(predicted_logits, dim=-1)
    
  
    epsilon = 1e-15
    predicted_probabilities = torch.clamp(predicted_probabilities, epsilon, 1. - epsilon)
    
   
    loss = -torch.sum(true_probabilities * torch.log(predicted_probabilities))
    
    return loss

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Compute the loss for the model. This uses the soft_loss_function.

        Parameters:
        - model: The model to evaluate.
        - inputs: A dictionary of inputs to the model.
        - return_outputs: Whether to return model outputs along with loss.

        Returns:
        - loss: The computed loss.
        - outputs (optional): The model outputs, if return_outputs is True.
        """
        labels = inputs.pop("labels")
        
       
        outputs = model(**inputs)
        logits = outputs.logits
        
      
        true_probabilities = labels  
        logits = logits  
        
       
        loss = soft_loss_function(true_probabilities, logits)
        
        return (loss, outputs) if return_outputs else loss


In [22]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

In [None]:
eval_results = trainer.evaluate()
print(eval_results) 

In [26]:
model.save_pretrained(f'./multiclassification/{model_name_filename}/results/best_model_human')
tokenizer.save_pretrained(f'./multiclassification/{model_name_filename}/results/best_model_human') 

('./multiclassification/bert-large-uncased/results/best_model_human/tokenizer_config.json',
 './multiclassification/bert-large-uncased/results/best_model_human/special_tokens_map.json',
 './multiclassification/bert-large-uncased/results/best_model_human/vocab.txt',
 './multiclassification/bert-large-uncased/results/best_model_human/added_tokens.json',
 './multiclassification/bert-large-uncased/results/best_model_human/tokenizer.json')

In [27]:
model 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [28]:
tokenized_test = test.map(tokenize_func, batched=True)
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", 'token_type_ids', "labels"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [29]:
#predicting
def predict(texts, model, tokenizer, device):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.cpu().numpy()

In [30]:
all_predictions = []
all_labels = []
batch_size = 8  

for i in range(0, len(test), batch_size):
    batch = test[i:i+batch_size]
    batch_texts = batch['Input']
    batch_predictions = predict(batch_texts, model, tokenizer, device)
    all_predictions.extend(batch_predictions)
    all_labels.extend(batch['labels'])

In [31]:
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

In [32]:
from scipy.special import softmax
# Apply softmax to predictions
softmax_predictions = softmax(all_predictions, axis=1)

In [33]:
df_test = test.to_pandas()

In [35]:
#collecting predicted scores and normalized predicted scores with softmax
df_test['Predicted_scores'] = all_predictions.tolist()
df_test['Predicted_Softmax_scores'] = softmax_predictions.tolist()

In [36]:
df_test['predicted_labels'] = df_test['Predicted_scores'].apply(lambda x: x.index(max(x)))
df_test['predicted_softmax_labels'] = df_test['Predicted_Softmax_scores'].apply(lambda x: x.index(max(x)))

In [37]:
from datasets import Dataset
test = Dataset.from_pandas(df_test) 

In [40]:
#test file
m_test

Unnamed: 0.1,Unnamed: 0,docID,Query,docTitle,doc,Input,labels,majority_label,label_index,soft_labels,Predicted_scores,Predicted_Softmax_scores,predicted_labels,predicted_softmax_labels
0,0,1s54q9r,Should Social Security Be Privatized?,Social Security: Why America Can and Should Al...,The debate surrounding the privatization of So...,Should Social Security Be Privatized? Social S...,"['Pro', 'Pro', 'Pro']",Pro,"[0, 0, 0]",[0.9 0. 0. 0. ],[0.26883867 0.94456095 0.32207608 0.12423708],[0.20468739 0.40230322 0.21587968 0.17712969],1,1
1,1,2s28q5r,Can Alternative Energy Effectively Replace Fos...,Trump’s speech ignored global warming and clim...,"In 2018, the U.S. witnessed significant weathe...",Can Alternative Energy Effectively Replace Fos...,"['Pro', 'Pro', 'Pro']",Pro,"[2, 2, 2]",[0.9 0. 0. 0. ],[ 1.50775719 -0.40722397 0.03725779 -0.45741957],[0.65907133 0.09711061 0.15146163 0.09235639],0,0
2,2,2s39q1r,Should the United States Maintain Its Embargo ...,Cuba: Trouble in Paradise | Havana Times,The article examines the economic status and c...,Should the United States Maintain Its Embargo ...,"['Against', 'Neutral', 'Against']",Against,"[3, 3, 3]",[0.1 0.6 0.2 0.1],[-0.23275764 0.21257548 1.02882206 0.23093361],[0.1301755 0.20320526 0.45964906 0.2069702 ],2,2
3,3,1s52q3r,Should the United States Return to a Gold Stan...,Maduro calls for return of Venezuela's UK-depo...,Caracas (AFP) - Venezuela President Nicolas Ma...,Should the United States Return to a Gold Stan...,"['Not-about', 'Not-about', 'Not-about']",Not-about,"[2, 2, 0]",[0. 0. 0. 0.9],[-0.32070804 -0.24343689 0.90611458 -0.06835756],[0.14754142 0.15939415 0.50317174 0.18989272],2,2
4,4,2s8q3r,Is Obesity a Disease?,Treatment for obesity and fatty liver disease ...,Professor Amiram Goldblum and his team at the ...,Is Obesity a Disease? Treatment for obesity an...,"['Pro', 'Not-about', 'Not-about']",Not-about,"[2, 2, 2]",[0.2 0.1 0.1 0.6],[ 0.55524915 -0.13133954 0.01513 0.84196126],[0.29256797 0.14724652 0.17047316 0.38971242],3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,134,1s48q7r,Are the Olympic Games an Overall Benefit for T...,Business of the Olympics - InvestorGuide.com,"With this year’s Rio Olympics beginning, the b...",Are the Olympic Games an Overall Benefit for T...,"['Neutral', 'Pro', 'Pro']",Pro,"[2, 2, 2]",[0.6 0.1 0.2 0.1],[ 0.85651225 0.02331866 -0.26639318 0.58874345],[0.39602754 0.17213692 0.12884089 0.30299458],0,0
135,135,2s21q4r,Should the Drinking Age Be Lowered from 21 to ...,Lena Dunham's Comments Defending Jordyn Woods ...,Amid all the cheating rumors surrounding Jordy...,Should the Drinking Age Be Lowered from 21 to ...,"['Not-about', 'Not-about', 'Not-about']",Not-about,"[1, 1]",[0. 0. 0. 0.9],[-0.21372491 0.22113684 0.93482476 0.45593455],[0.13068587 0.20187694 0.41213301 0.25530422],2,2
136,136,2s7q8r,Should People Become Vegetarian?,Vegetarian man devastated after accidentally e...,Pizza Hut has apologisedA man who has never ea...,Should People Become Vegetarian? Vegetarian ma...,"['Neutral', 'Neutral', 'Not-about']",Neutral,"[1, 1, 1]",[0.1 0.1 0.6 0.2],[-0.56536305 0.18483636 1.13705373 0.32933161],[0.09048788 0.19160107 0.49652445 0.22138663],2,2
137,137,2s20q5r,Should Parents or Other Adults Be Able to Ban ...,Community Montessori’s 100th day of school: Ki...,Hi Already a subscriber?Subscribe today for fu...,Should Parents or Other Adults Be Able to Ban ...,"['Not-about', 'Not-about', 'Neutral']",Not-about,"[1, 2, 1]",[0.1 0.1 0.2 0.6],[-0.15691513 -0.14465177 0.72674149 -0.00257615],[0.17860469 0.18080847 0.43217543 0.20841138],2,2


In [41]:
label_ecnoding_multip = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3,}


m_test['true_labels'] = m_test['majority_label'].map(label_ecnoding_multip) 

In [42]:
#calculation confidence and classification metrics (accuracy, precision, recall, f1, confusion matrix)

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def calculate_confidences(df, model, tokenizer):
    """
    Calculate confidence scores and update the DataFrame with a new column.
    
    Args:
    - df: DataFrame with input data
    - model: Trained model with a method to get logits
    - tokenizer: Tokenizer to preprocess text
    
    Returns:
    - df: Updated DataFrame with a 'confidence_scores' column
    """
    confidences = []

    model.eval()  

    for index, row in df.iterrows():
        text = row['Input']  
        
    
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = F.softmax(logits, dim=-1) 
            probabilities = probabilities.cpu().numpy().flatten()  
            
        
            max_prob = np.max(probabilities)
            confidences.append(max_prob)
    

    df['confidence_scores'] = confidences
    return df


m_test = calculate_confidences(m_test, model, tokenizer)


y_true = m_test['true_labels']
y_pred = m_test['predicted_labels']
confidence_scores = m_test['confidence_scores']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy * 100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision * 100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall * 100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1 * 100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

# Average Confidence Score
avg_confidence = np.mean(confidence_scores)
print("Average Confidence Score:", avg_confidence * 100)

# Confidence for Correct and Incorrect Predictions
correct_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred == true])
incorrect_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred != true])

print("Average Confidence for Correct Predictions:", correct_confidence * 100)
print("Average Confidence for Incorrect Predictions:", incorrect_confidence * 100)

Accuracy: 46.76258992805755
Precision: 46.88025938025938
Recall: 47.16486233627372
F1 Score: 46.751735431792824
Confusion Matrix:
[[19  9 13  2]
 [ 6 17  4  2]
 [11  6 19  7]
 [ 3  5  6 10]]
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.44      0.46        43
           1       0.46      0.59      0.52        29
           2       0.45      0.44      0.45        43
           3       0.48      0.42      0.44        24

    accuracy                           0.47       139
   macro avg       0.47      0.47      0.47       139
weighted avg       0.47      0.47      0.47       139

Average Confidence Score: 45.82185447216034
Average Confidence for Correct Predictions: 47.3630455824045
Average Confidence for Incorrect Predictions: 44.46810850420514


In [43]:
#soft loss function

def softmax(logits):
    """
    Compute softmax probabilities from logits.
    
    Parameters:
    - logits: A numpy array of shape (n, num_classes) containing the logits.
    
    Returns:
    - probabilities: A numpy array of shape (n, num_classes) containing the softmax probabilities.
    """
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def soft_loss_function(true_probabilities, predicted_logits):
    """
    Compute the soft loss function (cross-entropy with soft labels) using PyTorch tensors.
    
    Parameters:
    - true_probabilities: A PyTorch tensor of shape (n, num_classes) containing the true probability distributions.
    - predicted_logits: A PyTorch tensor of shape (n, num_classes) containing the logits from the model.
    
    Returns:
    - loss: The computed soft loss.
    """

    predicted_probabilities = torch.nn.functional.softmax(predicted_logits, dim=-1)
    
  
    epsilon = 1e-15
    predicted_probabilities = torch.clamp(predicted_probabilities, epsilon, 1. - epsilon)
    
 
    loss = -torch.sum(true_probabilities * torch.log(predicted_probabilities))
    
    return loss

class TemperatureScalingCalibration(nn.Module):
    def __init__(self, model_path: str, tokenizer, device: torch.device):
        super().__init__()
        self.model_path = model_path
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = tokenizer
        self.device = device

       
        self.temperature = nn.Parameter(torch.ones(1)) 
        
        self.model.to(self.device)
        self.temperature.to(self.device)

    def forward(self, input_ids, attention_mask):
        """Forward method that returns softmax-ed confidence scores."""
        logits = self.forward_logit(input_ids, attention_mask)
        scaled_logits = logits / self.temperature
        scores = nn.functional.softmax(scaled_logits, dim=-1)
        return scores

    def forward_logit(self, input_ids, attention_mask):
        """Forward method that returns logits, to be used with cross-entropy loss."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        return outputs

    def fit(self, dataset_tokenized, n_epochs: int = 3, batch_size: int = 64, lr: float = 0.01):
        """Fits the temperature scaling parameter."""
        data_collator = DataCollatorWithPadding(self.tokenizer, padding=True)
        data_loader = DataLoader(dataset_tokenized, collate_fn=data_collator, batch_size=batch_size)

        self.freeze_base_model()
      
      
        optimizer = optim.SGD(self.parameters(), lr=lr)

      
        self.train()

        for epoch in trange(n_epochs):
            epoch_loss = 0.0
            for examples in data_loader:
            
                input_ids = examples['input_ids'].to(self.device)
                attention_mask = examples['attention_mask'].to(self.device)
                soft_labels = examples['labels'].to(self.device)  
                
                
                self.zero_grad()
                logits = self.forward_logit(input_ids, attention_mask)
                
             
                scaled_logits = logits / self.temperature
                
            
                loss = soft_loss_function(soft_labels, scaled_logits)
                loss.backward()
                optimizer.step()

             
                epoch_loss += loss.item() * input_ids.size(0)

       
            print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss / len(dataset_tokenized)}")

        return self

    def freeze_base_model(self):
        
        self.model.eval()
        for parameter in self.model.parameters():
            parameter.requires_grad = False

        return self

In [45]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir) 

('./multiclassification/bert-large-uncased/results/human/tokenizer_config.json',
 './multiclassification/bert-large-uncased/results/human/special_tokens_map.json',
 './multiclassification/bert-large-uncased/results/human/vocab.txt',
 './multiclassification/bert-large-uncased/results/human/added_tokens.json',
 './multiclassification/bert-large-uncased/results/human/tokenizer.json')

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
calibration_module = TemperatureScalingCalibration(model_path=output_dir, tokenizer=tokenizer, device=device)
calibration_module.to(device) 

TemperatureScalingCalibration(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_fe

In [47]:
def tokenize_func(examples):
    # Tokenize the input text and map the 'soft_labels' column to 'labels'
    tokenized_inputs = tokenizer(examples['Input'], padding='max_length', truncation=True, max_length=512)
    tokenized_inputs['labels'] = examples['soft_labels']  # Rename 'soft_labels' to 'labels'
    return tokenized_inputs

In [48]:
val_tokenized_cal = val.map(tokenize_func, batched = True) 

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

In [49]:
val_tokenized_cal.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])

In [50]:
calibration_module.fit(val_tokenized_cal,n_epochs=6, batch_size=64, lr=0.01) 

 17%|█▋        | 1/6 [00:03<00:19,  3.91s/it]

Epoch 1/6, Loss: 77.91530701067808


 33%|███▎      | 2/6 [00:07<00:15,  3.91s/it]

Epoch 2/6, Loss: 77.5023280493647


 50%|█████     | 3/6 [00:11<00:11,  3.91s/it]

Epoch 3/6, Loss: 77.3937425682013


 67%|██████▋   | 4/6 [00:15<00:07,  3.91s/it]

Epoch 4/6, Loss: 78.01580020849653


 83%|████████▎ | 5/6 [00:19<00:03,  3.92s/it]

Epoch 5/6, Loss: 78.29172865778422


100%|██████████| 6/6 [00:23<00:00,  3.92s/it]

Epoch 6/6, Loss: 77.63028305383037





TemperatureScalingCalibration(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_fe

In [51]:
calibration_module.eval()

TemperatureScalingCalibration(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_fe

In [52]:
def predict_cal(texts, model, tokenizer, device):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items() if k in ['input_ids', "token_ids", 'attention_mask']}  
    with torch.no_grad():
        logits = model(**inputs)  
    return logits.cpu().numpy()

In [53]:
all_predictions = []
all_labels = []
batch_size = 8  

for i in range(0, len(test), batch_size):
    batch = test[i:i+batch_size]
    batch_texts = batch['Input']
    batch_predictions = predict_cal(batch_texts, calibration_module, tokenizer, device)
    all_predictions.extend(batch_predictions)
    all_labels.extend(batch['labels']) 

In [54]:
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

In [55]:
from scipy.special import softmax
# Apply softmax to predictions
softmax_predictions = softmax(all_predictions, axis=1)

In [56]:
df_test_cal = test.to_pandas()
df_test_cal['Predicted_scores'] = all_predictions.tolist()
df_test_cal['Predicted_Softmax_scores'] = softmax_predictions.tolist()

In [57]:
df_test_cal['predicted_labels'] = df_test['Predicted_scores'].apply(lambda x: x.index(max(x)))
df_test_cal['predicted_softmax_labels'] = df_test['Predicted_Softmax_scores'].apply(lambda x: x.index(max(x)))

In [58]:
from datasets import Dataset
test_cal = Dataset.from_pandas(df_test_cal) 

In [60]:
label_ecnoding_multip = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3,} 


m_test_cal['true_labels'] = m_test_cal['majority_label'].map(label_ecnoding_multip) 

In [61]:
from torch.nn import functional as F

def calculate_confidences_cal(df, model, tokenizer):
    """
    Calculate confidence scores and update the DataFrame with a new column.
    
    Args:
    - df: DataFrame with input data
    - model: Trained model with a method to get logits
    - tokenizer: Tokenizer to preprocess text
    
    Returns:
    - df: Updated DataFrame with a 'confidence_scores' column
    """
    confidences = []

    calibration_module.eval() 

    for index, row in df.iterrows():
        text = row['Input']  
       
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = F.softmax(logits, dim=-1) 
            probabilities = probabilities.cpu().numpy().flatten()  
            
            # Get the maximum probability (confidence) and corresponding class index
            max_prob = np.max(probabilities)
            confidences.append(max_prob)
    
   
    df['confidence_scores'] = confidences
    return df


m_test_cal = calculate_confidences_cal(m_test_cal, model, tokenizer)

# Proceed with metrics calculation
y_true = m_test_cal['true_labels']
y_pred = m_test_cal['predicted_labels']
confidence_scores = m_test_cal['confidence_scores']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy * 100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision * 100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall * 100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1 * 100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

# Average Confidence Score
avg_confidence = np.mean(confidence_scores)
print("Average Confidence Score:", avg_confidence * 100)

# Confidence for Correct and Incorrect Predictions
correct_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred == true])
incorrect_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred != true])

print("Average Confidence for Correct Predictions:", correct_confidence * 100)
print("Average Confidence for Incorrect Predictions:", incorrect_confidence * 100)

Accuracy: 46.76258992805755
Precision: 46.88025938025938
Recall: 47.16486233627372
F1 Score: 46.751735431792824
Confusion Matrix:
[[19  9 13  2]
 [ 6 17  4  2]
 [11  6 19  7]
 [ 3  5  6 10]]
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.44      0.46        43
           1       0.46      0.59      0.52        29
           2       0.45      0.44      0.45        43
           3       0.48      0.42      0.44        24

    accuracy                           0.47       139
   macro avg       0.47      0.47      0.47       139
weighted avg       0.47      0.47      0.47       139

Average Confidence Score: 45.82185447216034
Average Confidence for Correct Predictions: 47.3630455824045
Average Confidence for Incorrect Predictions: 44.46810850420514
