## Install Requirements

In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!pip install kagglehub
!pip install langdetect
!pip install transformers
!pip install datasets
!pip install -q wandb

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=582b87671c14271d285913c5cff538142769ae9f85782e6c8221641140eb5ace
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

## Import Libraries

In [3]:
import os
from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key from the Kaggle secret store so the key
# itself never appears in the notebook source.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("WANDB_API_KEY")

# Export the W&B credentials and project name as environment variables.
os.environ.update({
    "WANDB_API_KEY": api_key,
    "WANDB_PROJECT": 'nlp',
})

import re
import wandb
import kagglehub
import numpy as np
import pandas as pd
from langdetect import detect
from datasets import DatasetDict, Dataset
from transformers import DistilBertConfig, DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score, confusion_matrix, classification_report
import torch
import torch.nn as nn
from scipy.special import softmax
import matplotlib.pyplot as plt
import seaborn as sns

2025-05-10 22:27:28.225953: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746916048.498451      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746916048.572398      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data loading and preprocessing

In [None]:
# Download dataset
def load_dataset(used_column):
    """Download the support-ticket dataset and load the requested columns.

    Args:
        used_column: column names to read from the CSV (passed to
            ``pd.read_csv`` as ``usecols``).

    Returns:
        A DataFrame with the selected columns, where ``queue`` has been
        renamed to ``label``.
    """
    csv_name = 'aa_dataset-tickets-multi-lang-5-2-50-version.csv'

    # kagglehub returns the local directory the dataset was downloaded to.
    dataset_path = kagglehub.dataset_download("tobiasbueck/multilingual-customer-support-tickets")
    print("dataset downloaded to this path:", dataset_path)

    csv_file = os.path.join(dataset_path, csv_name)
    frame = pd.read_csv(csv_file, usecols=used_column)
    return frame.rename(columns={'queue': 'label'})

# Cleansing text for unnecessary characters
def cleanse_text(text):
    """Normalize a raw ticket body for tokenization.

    The text is lower-cased, then HTML-like tags, URLs, e-mail addresses and
    digit runs of 10 or more characters are removed, and all whitespace
    (including line breaks) is collapsed to single spaces.

    Args:
        text: the raw text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for missing cells) are treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``""`` for missing / non-string input.
    """
    # Missing bodies arrive as NaN (a float) when applied over a pandas column.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)            # HTML-like tags
    text = re.sub(r"http\S+|www\S+", "", text)   # URLs
    text = re.sub(r"\S+@\S+", "", text)          # e-mail addresses
    text = re.sub(r"\b\d{10,}\b", "", text)      # long digit runs (10+ digits)
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Read only the columns the pipeline needs ('queue' comes back renamed to 'label')
used_column = ['body', 'queue', 'language']
ds = load_dataset(used_column)

# Rows without a label cannot be used for supervised training, so drop them
has_missing_label = ds['label'].isnull().any()
if has_missing_label:
    print('There are some rows that has null label value, discard the rows')
    ds = ds.dropna(subset=['label'])

# Normalize the ticket text
ds["body"] = ds['body'].apply(cleanse_text)

# Keep English tickets only; the language column is not needed afterwards
english_rows = ds['language'] == 'en'
ds = ds[english_rows].drop(columns=['language'])

# Map each label name to an integer id, in order of first appearance
unique_labels = ds['label'].unique()
label2id = dict(zip(unique_labels, range(len(unique_labels))))
ds['label'] = ds['label'].map(label2id)

# Inverse mapping: integer id -> label name
id2label = {idx: label for label, idx in label2id.items()}

## Display Statistics

In [None]:
label_column = ds['label']

# Encoded label ids present in the data
print('Unique value for category:', label_column.unique())

# Share of each label, in percent
print('Label freq:', label_column.value_counts(normalize=True) * 100)

# Absolute number of rows per label
print('Label freq:', label_column.value_counts())

## Data Splitting and Oversampling

In [None]:
def rebalance_dataframe_by_oversampling(df, label_col='label', random_state=42, target='mean'):
    """Oversample under-represented classes up to a target size.

    Classes with fewer rows than the target are resampled with replacement
    until they have exactly ``target`` rows; classes at or above the target
    are left untouched (no undersampling). The result is shuffled and
    re-indexed.

    Args:
        df: input DataFrame.
        label_col: name of the column holding the class label.
        random_state: seed used for both the resampling and the final shuffle.
        target: ``'mean'`` (default) raises minority classes to the average
            class size, i.e. ``total_rows // n_classes``; ``'max'`` raises
            every class to the size of the largest class.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index. An input without
        any labelled rows is returned unchanged (re-indexed) instead of
        raising ``ZeroDivisionError``.

    Raises:
        ValueError: if ``target`` is neither ``'mean'`` nor ``'max'``.
    """
    if target not in ('mean', 'max'):
        raise ValueError(f"target must be 'mean' or 'max', got {target!r}")

    class_counts = df[label_col].value_counts()

    # Nothing to balance: avoid dividing by zero classes below.
    if class_counts.empty:
        return df.reset_index(drop=True)

    if target == 'max':
        target_count = int(class_counts.max())
    else:
        # Average class size; only classes below it get oversampled.
        target_count = int(class_counts.sum() / df[label_col].nunique())

    balanced_dfs = []
    for label, count in class_counts.items():
        df_label = df[df[label_col] == label]

        if count < target_count:
            df_label = resample(
                df_label,
                replace=True,  # sample with replacement => oversampling
                n_samples=target_count,
                random_state=random_state
            )

        balanced_dfs.append(df_label)

    # Concatenate the per-class frames and shuffle so classes are interleaved
    df_balanced = pd.concat(balanced_dfs).sample(frac=1, random_state=random_state).reset_index(drop=True)
    return df_balanced

In [None]:
ds_ready = ds.copy()

# Stratified 70/30 split so both parts keep the original label proportions
train_ds, test_ds = train_test_split(
    ds_ready,
    test_size=0.3,
    random_state=42,
    stratify=ds_ready['label'],
)

# Oversample only the training part; the test part keeps its natural distribution
train_ds = rebalance_dataframe_by_oversampling(train_ds)
print("Data distribution after oversampling:\n",train_ds['label'].value_counts())

# Wrap both parts as Hugging Face datasets (index reset so it is not carried along)
train_dataset = Dataset.from_pandas(train_ds.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_ds.reset_index(drop=True))
ready_dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

## Tokenization

In [None]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

def tokenize_function(datas):
    """Tokenize a batch of ticket bodies to fixed-length (115 token) inputs."""
    encode_kwargs = dict(padding="max_length", truncation=True, max_length=115)
    return tokenizer(datas["body"], **encode_kwargs)

# Tokenize both splits in batches and expose only the tensors the model needs
tokenized_dataset = ready_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Number of distinct classes in the training data
num_labels = train_ds['label'].nunique()

In [None]:
# Original model without focal loss
# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def compute_metrics(pred):
    """Compute accuracy plus macro/weighted F1, precision and recall.

    Args:
        pred: evaluation output exposing ``predictions`` (logits, or a tuple
            whose first element is the logits) and ``label_ids``.

    Returns:
        Dict with the keys ``accuracy``, ``macro_f1``, ``macro_precision``,
        ``macro_recall``, ``weighted_f1``, ``weighted_precision`` and
        ``weighted_recall``.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = pred.label_ids

    # argmax over the raw logits picks the same class as argmax over softmax
    # probabilities, so no softmax pass is needed.
    preds = np.argmax(logits, axis=1)

    scores = {'accuracy': accuracy_score(labels, preds)}

    # zero_division=0 is passed to every score (F1 included) so a class that is
    # never predicted counts as 0 without emitting a warning on each evaluation.
    for average in ('macro', 'weighted'):
        scores[f'{average}_f1'] = f1_score(labels, preds, average=average, zero_division=0)
        scores[f'{average}_precision'] = precision_score(labels, preds, average=average, zero_division=0)
        scores[f'{average}_recall'] = recall_score(labels, preds, average=average, zero_division=0)

    return scores

# 'balanced' class weights derived from the (oversampled) training labels
train_labels = train_ds['label']
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

# Float tensor form, as expected by the loss function
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    Per sample the loss is ``alpha[y] * (1 - p_y) ** gamma * CE``, where
    ``CE`` is the unweighted cross-entropy and ``p_y = exp(-CE)`` is the
    softmax probability assigned to the true class. The batch mean is returned.

    Args:
        alpha: optional per-class weights (tensor or array-like of length
            ``num_classes``); ``None`` disables class weighting.
        gamma: focusing exponent; ``0`` reduces the loss to (weighted)
            cross-entropy.
    """

    def __init__(self, alpha=None, gamma=1.0):
        super().__init__()
        self.alpha = alpha  # Class weights
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss.

        Args:
            inputs: logits of shape ``(batch, num_classes)``.
            targets: integer class ids of shape ``(batch,)``.
        """
        # The cross-entropy must be unweighted here: exp(-CE) only equals the
        # true-class probability when no class weight is folded into CE.
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.alpha is not None:
            # alpha is a plain attribute, so it does not follow the module
            # across devices; align it with the logits on every call.
            alpha = torch.as_tensor(self.alpha, dtype=focal_loss.dtype, device=inputs.device)
            focal_loss = alpha[targets] * focal_loss

        return focal_loss.mean()

class DistilBERTWithFocalLoss(DistilBertForSequenceClassification):
    """DistilBERT sequence classifier whose training loss is the focal loss.

    ``transformers.Trainer`` takes the loss from the output of the model's
    ``forward`` and never calls a ``compute_loss`` method defined on the model,
    so the focal loss is applied inside ``forward``: whenever ``labels`` are
    supplied, the returned loss is the focal loss instead of the default
    cross-entropy.

    Args:
        config: model configuration.
        class_weights: per-class weights handed to ``FocalLoss`` as ``alpha``.
        gamma: focusing exponent handed to ``FocalLoss``.
    """

    def __init__(self, config, class_weights, gamma=1.0):
        super().__init__(config)
        self.focal = FocalLoss(alpha=class_weights, gamma=gamma)

    def _focal_from_logits(self, logits, labels):
        """Focal loss for ``logits``/``labels``, with class weights on the logits' device."""
        alpha = self.focal.alpha
        # The weights are a plain attribute of the loss module, so moving the
        # model to another device does not move them.
        if torch.is_tensor(alpha) and alpha.device != logits.device:
            self.focal.alpha = alpha.to(logits.device)
        return self.focal(logits.view(-1, self.num_labels), labels.view(-1))

    def forward(self, input_ids=None, attention_mask=None, head_mask=None,
                inputs_embeds=None, labels=None, output_attentions=None,
                output_hidden_states=None, return_dict=None):
        """Run the classifier; with ``labels`` the returned loss is the focal loss.

        The parameters are spelled out explicitly (no ``*args``/``**kwargs``)
        because the Trainer inspects this signature to decide which dataset
        columns to pass to the model.
        """
        optional = {
            "head_mask": head_mask,
            "inputs_embeds": inputs_embeds,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        # Forward only the optional arguments that were actually supplied.
        passthrough = {key: value for key, value in optional.items() if value is not None}

        # Labels are withheld so the parent does not compute its own loss.
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, **passthrough)
        if labels is None:
            return outputs

        if isinstance(outputs, dict):
            logits = outputs["logits"]
            loss = self._focal_from_logits(logits, labels)
            # Rebuild the output so that the loss is its first field.
            return type(outputs)(
                loss=loss,
                logits=logits,
                hidden_states=outputs.get("hidden_states"),
                attentions=outputs.get("attentions"),
            )

        # Tuple output (return_dict=False): logits come first, prepend the loss.
        loss = self._focal_from_logits(outputs[0], labels)
        return (loss,) + tuple(outputs)

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the focal loss for ``inputs`` using ``model``.

        Kept for backward compatibility with direct callers; the Trainer does
        not call this method.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self._focal_from_logits(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Pretrained configuration extended with this task's label set
config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # seq_classif_dropout=0.3,  # optional: override the classifier head's default dropout
)

# Pretrained weights loaded into the focal-loss model variant
model = DistilBERTWithFocalLoss.from_pretrained(
    "distilbert-base-uncased",
    config=config,
    class_weights=class_weights_tensor,
)

In [None]:
# Print the module tree first, then the configuration, each under its own heading
for heading, described in (
    ("Model Structure:\n\n", model),
    ("Model Config:\n\n", model.config),
):
    print(heading)
    print(described)

In [None]:
# Optional: freeze the DistilBERT encoder so only the classifier head is trained
# for param in model.distilbert.parameters():
#     param.requires_grad = False

# List every parameter that will receive gradient updates
trainable_names = [name for name, param in model.named_parameters() if param.requires_grad]
for name in trainable_names:
    print(f"Trainable: {name}")

## Fine-Tuning Model

In [None]:
# Hyper-parameters for the fine-tuning run: evaluate and checkpoint once per
# epoch, and restore the checkpoint with the best macro F1 at the end.
training_args = TrainingArguments(
    # output / logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=5,
    report_to="wandb",
    run_name="Version 11",
    # schedule
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=500,
    lr_scheduler_type="linear",
    # evaluation / checkpoint selection
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
)

# NOTE(review): the "test" split is used both to pick the best checkpoint and
# for the final report below, so the reported scores may be optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Run the fine-tuning
print("\nFine-tuning")
trainer.train()

# Metrics of the restored best checkpoint on the evaluation split
print("\nResult")
trainer_results = trainer.evaluate()
print(trainer_results)

# Per-class report: predicted class = index of the largest logit
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
label_names = [id2label[idx] for idx in sorted(id2label)]
print(classification_report(y_true, y_pred, target_names=label_names, digits=2))

# Row-normalized confusion matrix (each row sums to 1 over the predicted labels)
cm = confusion_matrix(y_true, y_pred, normalize='true')
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=label_names,
    yticklabels=label_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

# Close the wandb run
wandb.finish()

## Optional Scripts

In [None]:
# To remove kaggle output directory when full
# WARNING: irreversibly deletes every entry matched by the glob under
# /kaggle/working. Run this cell manually and only when needed; it is not
# meant to be part of a "Run All" pass.
!rm -rf /kaggle/working/*

In [None]:
# Save the fine-tuned model and tokenizer, zip them, and show a download link

import shutil
from IPython.display import FileLink

export_name = "my_distilbert_model_version12"
export_dir = f"./{export_name}"

# Model weights/config and tokenizer files go into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Creates "<export_name>.zip" in the current working directory
shutil.make_archive(export_name, 'zip', export_dir)
FileLink(f'{export_name}.zip')