In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizer, BertForSequenceClassification,
                          RobertaTokenizer, RobertaForSequenceClassification,
                          DistilBertTokenizer, DistilBertForSequenceClassification,
                          AlbertTokenizer, AlbertForSequenceClassification)
import torch
import random
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.optim as optim


# Set random seeds for reproducibility
seed_value = 42
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)  # if you use multi-GPU
np.random.seed(seed_value)
random.seed(seed_value)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load the provided ticket dataset
df = pd.read_csv('customer_support_tickets.csv')

# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define BERT-based models
bert_models = [
    ('BERT Base Uncased', 'bert-base-uncased'),
    ('RoBERTa Base', 'roberta-base'),
    ('DistilBERT Base Uncased', 'distilbert-base-uncased'),
    ('BERT Large Cased', 'bert-large-cased'),
    ('ALBERT Base V2', 'albert-base-v2'),
    ('DistilBERT Base Cased', 'distilbert-base-cased')
]

# Define a dictionary to store evaluation metrics for each model
evaluation_results = {}

# Iterate over BERT-based models
for model_name, model_type in bert_models:
    print(f"Evaluating {model_name}...")
    
    # Load tokenizer and model
    if 'roberta' in model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_type)
        model = RobertaForSequenceClassification.from_pretrained(model_type, num_labels=len(df['Ticket Type'].unique()))
    elif 'distilbert' in model_type:
        tokenizer = DistilBertTokenizer.from_pretrained(model_type)
        model = DistilBertForSequenceClassification.from_pretrained(model_type, num_labels=len(df['Ticket Type'].unique()))
    elif 'albert' in model_type:
        tokenizer = AlbertTokenizer.from_pretrained(model_type)
        model = AlbertForSequenceClassification.from_pretrained(model_type, num_labels=len(df['Ticket Type'].unique()))
    else:
        tokenizer = BertTokenizer.from_pretrained(model_type)
        model = BertForSequenceClassification.from_pretrained(model_type, num_labels=len(df['Ticket Type'].unique()))
    
    # Tokenize ticket descriptions and convert labels to indices
    train_encodings = tokenizer(train_df['Ticket Description'].tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_df['Ticket Description'].tolist(), truncation=True, padding=True)
    train_labels = torch.tensor(train_df['Ticket Type'].astype('category').cat.codes.tolist())
    test_labels = torch.tensor(test_df['Ticket Type'].astype('category').cat.codes.tolist())
    
    # Create PyTorch datasets and data loaders
    train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), train_labels)
    test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), test_labels)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
    
    # Fine-tune the model
    #optimizer = AdamW(model.parameters(), lr=1e-5)
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)
    model.train()
    for epoch in range(3):  # example of training for 3 epochs
        print("Start epoch : {}".format(epoch))
        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print("End epoch : {}".format(epoch))
    # Evaluate the model on the test data
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')
    f1 = f1_score(true_labels, predictions, average='weighted')
    
    # Store evaluation results
    evaluation_results[model_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

# Convert evaluation results to a DataFrame
evaluation_df = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Print evaluation metrics in tabular form
print("Evaluation Metrics:")
print(evaluation_df)


Evaluating BERT Base Uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start epoch : 0
End epoch : 0
Start epoch : 1
End epoch : 1
Start epoch : 2
End epoch : 2


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating RoBERTa Base...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start epoch : 0
End epoch : 0
Start epoch : 1
End epoch : 1
Start epoch : 2
End epoch : 2


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating DistilBERT Base Uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start epoch : 0
End epoch : 0
Start epoch : 1
End epoch : 1
Start epoch : 2
End epoch : 2


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating BERT Large Cased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start epoch : 0
End epoch : 0
Start epoch : 1
End epoch : 1
Start epoch : 2
End epoch : 2


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating ALBERT Base V2...


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start epoch : 0
End epoch : 0
Start epoch : 1
End epoch : 1
Start epoch : 2
End epoch : 2


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating DistilBERT Base Cased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start epoch : 0
End epoch : 0
Start epoch : 1
End epoch : 1
Start epoch : 2
End epoch : 2
Evaluation Metrics:
                         Accuracy  Precision    Recall  F1 Score
BERT Base Uncased        0.204250   0.168618  0.204250  0.102458
RoBERTa Base             0.206021   0.042445  0.206021  0.070388
DistilBERT Base Uncased  0.197757   0.162036  0.197757  0.144189
BERT Large Cased         0.206021   0.042445  0.206021  0.070388
ALBERT Base V2           0.206021   0.042445  0.206021  0.070388
DistilBERT Base Cased    0.202479   0.201535  0.202479  0.109438
