Classification multi-perspective approach with BERT-large LLMD

In [None]:
import numpy as np
import ast
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [3]:
#function to delete no majority instances 
def clean_data(df, col):
     df = df.loc[~(df[col] == 'No Majority')] 
     return df


 train = clean_data(train_df, 'majority_llm_noninst')
 test = clean_data(test_df, 'majority_llm_noninst')
 val = clean_data(val_df, 'majority_llm_noninst')

In [4]:
#mapping labels
labels = ['Pro', 'Against', 'Neutral', 'Not-about']
num_labels = len(labels)
id2label = {id:label for id,label in enumerate(labels)}
label2id = {label:id for id,label in enumerate(labels)}

In [6]:
#dataset

train = Dataset.from_pandas(train)
val = Dataset.from_pandas(val)
test = Dataset.from_pandas(test)

dataset = DatasetDict() 
dataset['train'] = train
dataset['val'] = val
dataset['test'] = test

In [7]:
#model 
model_name = 'bert-large-uncased'
model_name_filename = model_name.replace("/", "-")

In [None]:
#tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
#parsing soft labels


def parse_soft_labels(example):
    example['soft_labels_noninst'] = ast.literal_eval(example['soft_labels_noninst'])
    return example


train = train.map(parse_soft_labels)
test = test.map(parse_soft_labels)
val = val.map(parse_soft_labels)




In [10]:
def tokenize_func(examples):
    # Tokenize the input text and map the 'soft_labels' column to 'labels'
    tokenized_inputs = tokenizer(examples['Input'], padding='max_length', truncation=True, max_length=512)
    tokenized_inputs['labels'] = examples['soft_labels_noninst']  # Rename 'soft_labels' to 'labels'
    return tokenized_inputs

In [None]:
train_tokenized = train.map(tokenize_func, batched = True)
val_tokenized = val.map(tokenize_func, batched = True) 

In [12]:
train_tokenized.set_format('torch', columns =['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
val_tokenized.set_format('torch', columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
train_tokenized['labels'][0]

In [14]:
num_labels=len(train_tokenized['labels'][0])

In [None]:
num_labels

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [50]:
output_dir = f"./multiclassification/{model_name_filename}/results"

In [None]:
output_dir

In [None]:
#training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [53]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [54]:
#definition of soft_loss_function 

def softmax(logits):
    """
    Compute softmax probabilities from logits.
    
    Parameters:
    - logits: A numpy array of shape (n, num_classes) containing the logits.
    
    Returns:
    - probabilities: A numpy array of shape (n, num_classes) containing the softmax probabilities.
    """
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def soft_loss_function(true_probabilities, predicted_logits):
    """
    Compute the soft loss function (cross-entropy with soft labels) using PyTorch tensors.
    
    Parameters:
    - true_probabilities: A PyTorch tensor of shape (n, num_classes) containing the true probability distributions.
    - predicted_logits: A PyTorch tensor of shape (n, num_classes) containing the logits from the model.
    
    Returns:
    - loss: The computed soft loss.
    """
  
    predicted_probabilities = torch.nn.functional.softmax(predicted_logits, dim=-1)
    
   
    epsilon = 1e-15
    predicted_probabilities = torch.clamp(predicted_probabilities, epsilon, 1. - epsilon)
    
  
    loss = -torch.sum(true_probabilities * torch.log(predicted_probabilities))
    
    return loss

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Compute the loss for the model. This uses the soft_loss_function.

        Parameters:
        - model: The model to evaluate.
        - inputs: A dictionary of inputs to the model.
        - return_outputs: Whether to return model outputs along with loss.

        Returns:
        - loss: The computed loss.
        - outputs (optional): The model outputs, if return_outputs is True.
        """
        labels = inputs.pop("labels")
        
      
        outputs = model(**inputs)
        logits = outputs.logits
        
      
        true_probabilities = labels  
        logits = logits  
        
      
        loss = soft_loss_function(true_probabilities, logits)
        
        return (loss, outputs) if return_outputs else loss


In [55]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

In [None]:
eval_results = trainer.evaluate()
print(eval_results) 

In [None]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir) 

In [None]:
tokenized_test = test.map(tokenize_func, batched=True)
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", 'token_type_ids', "labels"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set the model to evaluation mode

In [62]:
#prediction phase
def predict(texts, model, tokenizer, device):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.cpu().numpy()

In [63]:
all_predictions = []
all_labels = []
batch_size = 8  

for i in range(0, len(test), batch_size):
    batch = test[i:i+batch_size]
    batch_texts = batch['Input']
    batch_predictions = predict(batch_texts, model, tokenizer, device)
    all_predictions.extend(batch_predictions)
    all_labels.extend(batch['labels'])

In [64]:
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

In [65]:
from scipy.special import softmax
# Apply softmax to predictions
softmax_predictions = softmax(all_predictions, axis=1)

In [66]:
df_test = test.to_pandas()

In [68]:
#collecting predicted scores and normalized predicted scores with softmax
df_test['Predicted_scores'] = all_predictions.tolist()
df_test['Predicted_Softmax_scores'] = softmax_predictions.tolist()

In [69]:
df_test['predicted_labels'] = df_test['Predicted_scores'].apply(lambda x: x.index(max(x)))
df_test['predicted_softmax_labels'] = df_test['Predicted_Softmax_scores'].apply(lambda x: x.index(max(x)))

In [70]:
#reading test file
from datasets import Dataset
test = Dataset.from_pandas(df_test) 

In [None]:
#test file but changed name 
m_test["majority_llm_noninst"]

In [75]:
#label encoding
label_ecnoding_multip = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3,}


m_test['true_labels'] = m_test['majority_llm_noninst'].map(label_ecnoding_multip) 

In [None]:
#calculation confidence and classification metrics (accuracy, precision, recall, f1, confusion matrix)
from torch.nn import functional as F

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def calculate_confidences(df, model, tokenizer):
    """
    Calculate confidence scores and update the DataFrame with a new column.
    
    Args:
    - df: DataFrame with input data
    - model: Trained model with a method to get logits
    - tokenizer: Tokenizer to preprocess text
    
    Returns:
    - df: Updated DataFrame with a 'confidence_scores' column
    """
    confidences = []

    model.eval()  

    for index, row in df.iterrows():
        text = row['Input']  
        
       
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = F.softmax(logits, dim=-1)  
            probabilities = probabilities.cpu().numpy().flatten()  
            
            
            max_prob = np.max(probabilities)
            confidences.append(max_prob)
    
   
    df['confidence_scores'] = confidences
    return df


m_test = calculate_confidences(m_test, model, tokenizer)


y_true = m_test['true_labels']
y_pred = m_test['predicted_labels']
confidence_scores = m_test['confidence_scores']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy * 100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision * 100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall * 100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1 * 100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

# Average Confidence Score
avg_confidence = np.mean(confidence_scores)
print("Average Confidence Score:", avg_confidence * 100)

# Confidence for Correct and Incorrect Predictions
correct_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred == true])
incorrect_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred != true])

print("Average Confidence for Correct Predictions:", correct_confidence * 100)
print("Average Confidence for Incorrect Predictions:", incorrect_confidence * 100)

Accuracy: 61.76470588235294
Precision: 15.441176470588236
Recall: 25.0
F1 Score: 19.090909090909093
Confusion Matrix:
[[63  0  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [20  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      1.00      0.76        63
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        20

    accuracy                           0.62       102
   macro avg       0.15      0.25      0.19       102
weighted avg       0.38      0.62      0.47       102

Average Confidence Score: 45.91071307659149
Average Confidence for Correct Predictions: 46.44718411422911
Average Confidence for Incorrect Predictions: 45.044117325391525

In [78]:
#soft loss function definition and application on temperature scaling

def softmax(logits):
    """
    Compute softmax probabilities from logits.
    
    Parameters:
    - logits: A numpy array of shape (n, num_classes) containing the logits.
    
    Returns:
    - probabilities: A numpy array of shape (n, num_classes) containing the softmax probabilities.
    """
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def soft_loss_function(true_probabilities, predicted_logits):
    """
    Compute the soft loss function (cross-entropy with soft labels) using PyTorch tensors.
    
    Parameters:
    - true_probabilities: A PyTorch tensor of shape (n, num_classes) containing the true probability distributions.
    - predicted_logits: A PyTorch tensor of shape (n, num_classes) containing the logits from the model.
    
    Returns:
    - loss: The computed soft loss.
    """
    
    predicted_probabilities = torch.nn.functional.softmax(predicted_logits, dim=-1)
    
    
    epsilon = 1e-15
    predicted_probabilities = torch.clamp(predicted_probabilities, epsilon, 1. - epsilon)
    
  
    loss = -torch.sum(true_probabilities * torch.log(predicted_probabilities))
    
    return loss

class TemperatureScalingCalibration(nn.Module):
    def __init__(self, model_path: str, tokenizer, device: torch.device):
        super().__init__()
        self.model_path = model_path
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = tokenizer
        self.device = device

     
        self.temperature = nn.Parameter(torch.ones(1)) 
        
        self.model.to(self.device)
        self.temperature.to(self.device)

    def forward(self, input_ids, attention_mask):
        """Forward method that returns softmax-ed confidence scores."""
        logits = self.forward_logit(input_ids, attention_mask)
        scaled_logits = logits / self.temperature
        scores = nn.functional.softmax(scaled_logits, dim=-1)
        return scores

    def forward_logit(self, input_ids, attention_mask):
        """Forward method that returns logits, to be used with cross-entropy loss."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        return outputs

    def fit(self, dataset_tokenized, n_epochs: int = 3, batch_size: int = 64, lr: float = 0.01):
        """Fits the temperature scaling parameter."""
        data_collator = DataCollatorWithPadding(self.tokenizer, padding=True)
        data_loader = DataLoader(dataset_tokenized, collate_fn=data_collator, batch_size=batch_size)

        self.freeze_base_model()
      
       
        optimizer = optim.SGD(self.parameters(), lr=lr)

      
        self.train()

        for epoch in trange(n_epochs):
            epoch_loss = 0.0
            for examples in data_loader:
           
                input_ids = examples['input_ids'].to(self.device)
                attention_mask = examples['attention_mask'].to(self.device)
                soft_labels = examples['labels'].to(self.device)  
                
              
                self.zero_grad()
                logits = self.forward_logit(input_ids, attention_mask)
                
               
                scaled_logits = logits / self.temperature
                
                
                loss = soft_loss_function(soft_labels, scaled_logits)
                loss.backward()
                optimizer.step()

               
                epoch_loss += loss.item() * input_ids.size(0)

           
            print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss / len(dataset_tokenized)}")

        return self

    def freeze_base_model(self):
        """Remember to freeze base model's parameters when training temperature scaler."""
        self.model.eval()
        for parameter in self.model.parameters():
            parameter.requires_grad = False

        return self


In [None]:
#saving calibration module
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
calibration_module = TemperatureScalingCalibration(model_path=output_dir, tokenizer=tokenizer, device=device)
calibration_module.to(device) 

In [81]:
def tokenize_func(examples):
    # Tokenize the input text and map the 'soft_labels' column to 'labels'
    tokenized_inputs = tokenizer(examples['Input'], padding='max_length', truncation=True, max_length=512)
    tokenized_inputs['labels'] = examples['soft_labels_noninst']  # Rename 'soft_labels' to 'labels'
    return tokenized_inputs

In [None]:
#tokenizaton of evaluation data
val_tokenized_cal = val.map(tokenize_func, batched = True) 

In [83]:
val_tokenized_cal.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])

In [None]:
#fitting of the calibrated module 
calibration_module.fit(val_tokenized_cal,n_epochs=6, batch_size=64, lr=0.01) 

In [None]:
calibration_module.eval()

In [86]:
def predict_cal(texts, model, tokenizer, device):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items() if k in ['input_ids', "token_ids", 'attention_mask']}  
    with torch.no_grad():
        logits = model(**inputs)  
    return logits.cpu().numpy()

In [87]:
all_predictions = []
all_labels = []
batch_size = 8  

for i in range(0, len(test), batch_size):
    batch = test[i:i+batch_size]
    batch_texts = batch['Input']
    batch_predictions = predict_cal(batch_texts, calibration_module, tokenizer, device)
    all_predictions.extend(batch_predictions)
    all_labels.extend(batch['labels']) 

In [88]:
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

In [89]:
from scipy.special import softmax
# Apply softmax to predictions
softmax_predictions = softmax(all_predictions, axis=1)

In [90]:
df_test_cal = test.to_pandas()

In [91]:
df_test_cal['Predicted_scores'] = all_predictions.tolist()
df_test_cal['Predicted_Softmax_scores'] = softmax_predictions.tolist()

In [92]:
df_test_cal['predicted_labels'] = df_test['Predicted_scores'].apply(lambda x: x.index(max(x)))
df_test_cal['predicted_softmax_labels'] = df_test['Predicted_Softmax_scores'].apply(lambda x: x.index(max(x)))

In [93]:
from datasets import Dataset
test_cal = Dataset.from_pandas(df_test_cal) 

In [95]:
label_ecnoding_multip = {'Pro': 0,
'Against': 1,
'Neutral': 2,
'Not-about': 3,} 


m_test_cal['true_labels'] = m_test_cal['majority_llm_noninst'].map(label_ecnoding_multip) 

In [None]:
#calculation of confidence and classification metrics (accuracy, precision, recall, f1, confusion matrix) for calibrated module

def calculate_confidences_cal(df, model, tokenizer):
    """
    Calculate confidence scores and update the DataFrame with a new column.
    
    Args:
    - df: DataFrame with input data
    - model: Trained model with a method to get logits
    - tokenizer: Tokenizer to preprocess text
    
    Returns:
    - df: Updated DataFrame with a 'confidence_scores' column
    """
    confidences = []

    calibration_module.eval()  

    for index, row in df.iterrows():
        text = row['Input']  
        
   
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = F.softmax(logits, dim=-1)  
            probabilities = probabilities.cpu().numpy().flatten()  
            
            
            max_prob = np.max(probabilities)
            confidences.append(max_prob)
    
   
    df['confidence_scores'] = confidences
    return df


m_test_cal = calculate_confidences_cal(m_test_cal, model, tokenizer)


y_true = m_test_cal['true_labels']
y_pred = m_test_cal['predicted_labels']
confidence_scores = m_test_cal['confidence_scores']

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy * 100)

# Precision
precision = precision_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Precision:", precision * 100)

# Recall
recall = recall_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("Recall:", recall * 100)

# F1 Score
f1 = f1_score(y_true, y_pred, average='macro')  # 'macro' averaging for multiclass
print("F1 Score:", f1 * 100)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

# Average Confidence Score
avg_confidence = np.mean(confidence_scores)
print("Average Confidence Score:", avg_confidence * 100)

# Confidence for Correct and Incorrect Predictions
correct_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred == true])
incorrect_confidence = np.mean([confidence for pred, true, confidence in zip(y_pred, y_true, confidence_scores) if pred != true])

print("Average Confidence for Correct Predictions:", correct_confidence * 100)
print("Average Confidence for Incorrect Predictions:", incorrect_confidence * 100)

Accuracy: 61.76470588235294
Precision: 15.441176470588236
Recall: 25.0
F1 Score: 19.090909090909093
Confusion Matrix:
[[63  0  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [20  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      1.00      0.76        63
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        20

    accuracy                           0.62       102
   macro avg       0.15      0.25      0.19       102
weighted avg       0.38      0.62      0.47       102

Average Confidence Score: 45.91071307659149
Average Confidence for Correct Predictions: 46.44718411422911
Average Confidence for Incorrect Predictions: 45.044117325391525