In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

from torch.nn import CrossEntropyLoss


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Python311\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Python311\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


In [2]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device 

device(type='cuda')

### Load data

In [4]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

# Load the Excel file into a DataFrame
df = pd.read_excel("dataset_for_transformer_v3.xlsx")

# Extract the "texts" column
texts = df["text"].tolist()

# Encoding the labels
encoder = LabelEncoder()
df['encoded_correct_answer'] = encoder.fit_transform(df['CorrectAnswer'])
df['encoded_student_answer'] = encoder.transform(df['AnswerByStudent'])
labels = df['encoded_student_answer'].tolist()

# Tokenizing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
input_data = tokenizer(texts, padding='max_length', truncation=True, max_length=512, return_tensors="pt")

# Convert the list of labels to a tensor
label_tensor = torch.tensor(labels, dtype=torch.long)

# Combine the training inputs into a TensorDataset
dataset = TensorDataset(input_data["input_ids"], input_data["attention_mask"], input_data["token_type_ids"], label_tensor)

# Splitting data in a stratified manner and ensuring that the same StudentId doesn't appear in multiple sets
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, test_size=0.2, random_state=42)
train_idx, temp_idx = next(gss.split(texts, groups=df['StudentId']))

# Now split the remaining (temp) into validation and test sets
gss_val_test = GroupShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5, random_state=42)
val_idx, test_idx = next(gss_val_test.split([texts[i] for i in temp_idx], groups=df['StudentId'][temp_idx]))

# Create train, val, test datasets
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, [temp_idx[i] for i in val_idx])
test_dataset = torch.utils.data.Subset(dataset, [temp_idx[i] for i in test_idx])

# Create DataLoaders for training, validation, and testing
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train size: {len(train_dataloader.dataset)}")
print(f"Validation size: {len(validation_dataloader.dataset)}")
print(f"Test size: {len(test_dataloader.dataset)}")


Train size: 7279
Validation size: 1243
Test size: 1363


### Init model

In [5]:
# Model initialization (for binary classification)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Training loop

In [6]:
from transformers import AdamW

# Initialize the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

best_val_loss = float("inf")
patience_counter = 0
patience_limit = 1  # Adjust based on your requirements

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Load batch to GPU
        b_input_ids, b_attention_mask, b_token_type_ids, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)
        b_token_type_ids = b_token_type_ids.to(device)
        b_labels = b_labels.to(device)

        # Zero out any previously calculated gradients
        model.zero_grad()

        # Perform a forward pass
        outputs = model(b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids)
        
        # Get the logits from the model
        logits = outputs.logits 

        # Compute loss and accumulate the loss values
        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()

        # Perform a backward pass to calculate gradients
        loss.backward()

        # Update parameters
        optimizer.step()

    # Average loss over the training epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch: {epoch + 1}, Train Loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    total_val_loss = 0

    # Disable gradient updates
    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids, b_attention_mask, b_token_type_ids, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)
            b_token_type_ids = b_token_type_ids.to(device)
            b_labels = b_labels.to(device)
        
            outputs = model(b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids)
            logits = outputs.logits 
            loss = loss_fn(logits, b_labels)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(validation_dataloader)
    print(f"Epoch: {epoch + 1}, Validation Loss: {avg_val_loss:.4f}")
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience_limit:
        print("Early stopping triggered!")
        break




Epoch: 1, Train Loss: 0.5700
Epoch: 1, Validation Loss: 0.0117
Epoch: 2, Train Loss: 0.0092
Epoch: 2, Validation Loss: 0.0037
Epoch: 3, Train Loss: 0.0039
Epoch: 3, Validation Loss: 0.0020


### Save model

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device 

device(type='cuda')

In [8]:
# Directory where model will be saved
save_directory = "./model_part2_v2"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer 
tokenizer.save_pretrained(save_directory)


('./model_part2_v2\\tokenizer_config.json',
 './model_part2_v2\\special_tokens_map.json',
 './model_part2_v2\\vocab.txt',
 './model_part2_v2\\added_tokens.json')

#### Load the saved model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
model = BertForSequenceClassification.from_pretrained(save_directory)
model.to(device)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(save_directory)


### Evaluation

In [9]:
from sklearn.metrics import f1_score, confusion_matrix

model.to(device)

# Step 1: Predictions and True Labels
all_predictions = []
all_true_labels = []
total_eval_accuracy = 0

model.eval()
for batch in test_dataloader:
    b_input_ids, b_attention_mask, b_token_type_ids, b_labels = batch
    b_input_ids = b_input_ids.to(device)
    b_attention_mask = b_attention_mask.to(device)
    b_token_type_ids = b_token_type_ids.to(device)
    b_labels = b_labels.to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)
    
    all_predictions.extend(predictions.cpu().numpy())
    all_true_labels.extend(b_labels.cpu().numpy())
    
    # Calculate the accuracy rate
    total_eval_accuracy += (predictions == b_labels).sum().item()

# Average accuracy over all batches
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)


# Step 2: Compute F1-Score
f1_micro = f1_score(all_true_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_true_labels, all_predictions, average='weighted')
f1_weighted = f1_score(all_true_labels, all_predictions, average='weighted')


# Step 3: Compute Confusion Matrix
conf_matrix = confusion_matrix(all_true_labels, all_predictions)

print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
print(f"F1 Micro Score: {f1_micro:.4f}")
print(f"F1 Macro Score: {f1_macro:.4f}")
print(f"F1 Weighted Score: {f1_weighted:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Validation Accuracy: 1.0965
F1 Micro Score: 1.0000
F1 Macro Score: 1.0000
F1 Weighted Score: 1.0000
Confusion Matrix:
[[309   0   0   0   0]
 [  0 338   0   0   0]
 [  0   0 343   0   0]
 [  0   0   0 301   0]
 [  0   0   0   0  72]]


In [10]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, cohen_kappa_score, balanced_accuracy_score, 
    hamming_loss, zero_one_loss, jaccard_score, classification_report
)

# Basic Metrics
accuracy = accuracy_score(all_true_labels, all_predictions)
precision_macro = precision_score(all_true_labels, all_predictions, average='macro')
precision_micro = precision_score(all_true_labels, all_predictions, average='micro')
recall_macro = recall_score(all_true_labels, all_predictions, average='macro')
recall_micro = recall_score(all_true_labels, all_predictions, average='micro')
f1_macro = f1_score(all_true_labels, all_predictions, average='macro')
f1_micro = f1_score(all_true_labels, all_predictions, average='micro')
bal_acc = balanced_accuracy_score(all_true_labels, all_predictions)
kappa = cohen_kappa_score(all_true_labels, all_predictions)

# Additional Metrics
hamm_loss = hamming_loss(all_true_labels, all_predictions)
zero_one = zero_one_loss(all_true_labels, all_predictions)
jaccard = jaccard_score(all_true_labels, all_predictions, average='weighted')

# Detailed Metrics
clf_report = classification_report(all_true_labels, all_predictions)

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro Precision: {precision_macro:.4f}")
print(f"Micro Precision: {precision_micro:.4f}")
print(f"Macro Recall: {recall_macro:.4f}")
print(f"Micro Recall: {recall_micro:.4f}")
print(f"Macro F1-Score: {f1_macro:.4f}")
print(f"Micro F1-Score: {f1_micro:.4f}")
print(f"Balanced Accuracy: {bal_acc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Hamming Loss: {hamm_loss:.4f}")
print(f"Zero-One Loss: {zero_one:.4f}")
print(f"Weighted Jaccard Score: {jaccard:.4f}")
print("\nClassification Report:")
print(clf_report)


Accuracy: 1.0000
Macro Precision: 1.0000
Micro Precision: 1.0000
Macro Recall: 1.0000
Micro Recall: 1.0000
Macro F1-Score: 1.0000
Micro F1-Score: 1.0000
Balanced Accuracy: 1.0000
Cohen's Kappa: 1.0000
Hamming Loss: 0.0000
Zero-One Loss: 0.0000
Weighted Jaccard Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       309
           1       1.00      1.00      1.00       338
           2       1.00      1.00      1.00       343
           3       1.00      1.00      1.00       301
           4       1.00      1.00      1.00        72

    accuracy                           1.00      1363
   macro avg       1.00      1.00      1.00      1363
weighted avg       1.00      1.00      1.00      1363



In [25]:
for idx, (true_label, prediction) in enumerate(zip(all_true_labels, all_predictions)):
    decoded_true_label = encoder.inverse_transform([true_label])
    decoded_prediction = encoder.inverse_transform([prediction])
    if decoded_true_label != decoded_prediction:
        print(f"Sample {idx + 1}: Ground Truth: {decoded_true_label[0]}, Prediction: {decoded_prediction[0]}")
