In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Upload files through Colab's `files.upload()`; its return value is kept in `uploaded`.
# Presumably this is where train.csv (read by a later cell) is supplied — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
!pip install sentence-transformers

In [None]:
!pip install datasets

In [None]:
!pip install evaluate

In [None]:
# Mount Google Drive at /content/drive and put nb_path at the front of the import path.
import os, sys
from google.colab import drive
drive.mount('/content/drive')
nb_path = '/content/notebooks'
# NOTE(review): the symlink that would create nb_path is commented out below, so
# nothing in this cell creates that directory — confirm whether the sys.path
# entry (and the `os` import) are still needed.
#os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)
sys.path.insert(0, nb_path)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import Dataset, DatasetDict
import evaluate


In [None]:
# Load the same pretrained checkpoint twice: as a sequence classifier
# configured with num_labels=77, and as its matching tokenizer.
checkpoint = "distilbert-base-uncased"
classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=77)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Load the training CSV (Banking77 data) and keep only the text and label columns.
train_data_readin = pd.read_csv('train.csv')[["clean_text", "label"]]

In [None]:
# Build a two-way mapping between the string labels and integer class ids.
# Sorting the distinct labels first makes the id assignment deterministic.
unique_labels = sorted(train_data_readin["label"].unique())
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
id_to_label = dict(enumerate(unique_labels))

# Attach the integer id of each row's label as a new column.
train_data_readin["label_id"] = train_data_readin["label"].map(label_to_id)

In [None]:
# Preview the first five rows of the training frame.
train_data_readin.head(5)

In [None]:
# Keep 80% of the labelled rows for training and the rest for validation.
# The split is stratified on label_id and seeded so it is repeatable.
train_data, val_data = train_test_split(
    train_data_readin,
    train_size=0.8,
    random_state=1234,
    stratify=train_data_readin["label_id"],
)

In [None]:
# Report the (rows, columns) shape of each split.
print("training data : ",train_data.shape)
print("validation data : ",val_data.shape)

In [None]:
# Wrap both pandas splits as Hugging Face Datasets under the keys "train" and "eval".
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (("train", train_data), ("eval", val_data))
})

In [None]:
# Inspect the DatasetDict: its overall structure, the feature schema of the
# train split, and the first training example.
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", raw_datasets["train"].features)
print("\n\nFirst row of Train:\n", raw_datasets["train"][0])

In [None]:
# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize the ``clean_text`` field of a batch of examples.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: each batch is padded to its own longest sequence by the
    ``DataCollatorWithPadding`` given to the ``Trainer``, instead of every
    example being stored and processed at the full 512-token length.

    Args:
        examples: Mapping with a ``clean_text`` entry holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    return tokenizer(examples['clean_text'], truncation=True, max_length=512)

# Apply tokenize_function to every split in batches, then print the result.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)

In [None]:
# Sanity check: print the first tokenized training example.
print(tokenized_datasets["train"][0])

In [None]:
# Drop the raw text, the string label and the pandas index column, then expose
# the integer class id under the column name "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["clean_text", "label", "__index_level_0__"])
    .rename_column("label_id", "labels")
)

In [None]:
# Batch collator built from the tokenizer; it is handed to the Trainer as
# `data_collator` (per the Hugging Face docs it pads the examples of a batch
# to a common length).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Define metrics computation function for multiclass
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_preds):
    """Compute accuracy plus macro/weighted precision, recall and F1.

    Each metric is loaded through `_get_metric`, so `evaluate.load` runs once
    per metric for the whole session rather than on every evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of its logits along the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision_macro``,
        ``precision_weighted``, ``recall_macro``, ``recall_weighted``,
        ``f1_macro`` and ``f1_weighted``, in that order.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    results = {
        "accuracy": _get_metric("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"],
    }
    # Same key order as before: precision, recall, f1 — each macro then weighted.
    for metric_name in ("precision", "recall", "f1"):
        metric = _get_metric(metric_name)
        for average in ("macro", "weighted"):
            results[f"{metric_name}_{average}"] = metric.compute(
                predictions=predictions, references=labels, average=average
            )[metric_name]
    return results

In [None]:
# Define training arguments.
# NOTE(review): no class-imbalance handling is configured here (no class
# weights, no custom loss) — add it explicitly if the data needs it.
training_args = TrainingArguments(
    output_dir="distilbert-multiclass-classifier",
    num_train_epochs=5,  # five passes over the training split
    per_device_train_batch_size=16,  # examples per device per training step
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",  # key returned by compute_metrics
    weight_decay=1e-3,  # weight-decay coefficient
    learning_rate=3e-5,
    report_to="none",
    # Class weights are not applied. For severe imbalance, a custom weighted
    # loss would have to be implemented separately.
)

# Build the Trainer from the model, the arguments above, the tokenized
# train/eval splits, the padding collator and the metric function.
trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Score the validation split with the trained model; the predicted class of
# each example is the index of its largest logit.
eval_predictions = trainer.predict(tokenized_datasets["eval"])
y_pred = np.argmax(eval_predictions.predictions, axis=-1)

# Reference class ids for the same examples.
y_true = np.array(tokenized_datasets["eval"]["labels"])

# Class names listed in id order, used to label the rows of the report.
class_names = [id_to_label[class_id] for class_id in range(len(id_to_label))]

print("\nValidation Results:")
print(classification_report(y_true, y_pred, target_names=class_names, digits=3))

In [None]:
# Save the trainer's model under the mounted Google Drive.
trainer.save_model('/content/drive/MyDrive/Banking-DistilBERT')

# **Test Data Performance Checks**

In [None]:
# Second upload prompt; presumably supplies test.csv, which the next cell reads — confirm.
uploaded = files.upload()

In [None]:
# Reading Test Data
test_data_readin = pd.read_csv('test.csv')
test_data_readin = test_data_readin.loc[:,["clean_text", "label"]]

# Convert test labels to integers using the same mapping
test_data_readin["label_id"] = test_data_readin["label"].map(label_to_id)

# Series.map leaves NaN for any label missing from label_to_id; stop here with a
# clear message instead of passing NaN labels on to prediction and reporting.
unmapped_labels = test_data_readin.loc[test_data_readin["label_id"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"test.csv contains labels not seen in training: {sorted(map(str, unmapped_labels))}"
    )

print(test_data_readin.shape)
test_data_readin.head(5)

In [None]:
# Convert the test frame to a Dataset and tokenize it like the training data.
test_df = Dataset.from_pandas(test_data_readin).map(tokenize_function, batched=True)

# Drop the raw text / string label columns (only those actually present), then
# expose the integer class id under the column name "labels".
columns_to_remove = ["clean_text", "label"]
existing_columns = list(filter(test_df.column_names.__contains__, columns_to_remove))
test_df = test_df.remove_columns(existing_columns).rename_column("label_id", "labels")

# Predict on the test set; the predicted class is the argmax over the logits.
test_predictions = trainer.predict(test_df)
test_preds = np.argmax(test_predictions.predictions, axis=-1)

# Reference class ids for the same examples.
test_true = np.array(test_df["labels"])

In [None]:
# Reference class ids of the test examples.
test_true = np.array(test_df["labels"])

# Class names listed in id order, then the classification report as a nested dict.
target_names = [id_to_label[class_id] for class_id in range(len(id_to_label))]
report = classification_report(
    test_true,
    test_preds,
    target_names=target_names,
    digits=3,
    output_dict=True,
)

# Transposed so that each report entry becomes a row.
report_df = pd.DataFrame(report).T

In [None]:
# Save the classification report to Google Drive as CSV and echo the path.
# NOTE(review): this path spells the folder 'My Drive', while the model and
# requirements paths in this notebook use 'MyDrive' — confirm both resolve.
report_path = '/content/drive/My Drive/banking_distilbert_classification_report.csv'
report_df.to_csv(report_path)
print(f"Classification report saved to {report_path}")

In [None]:
# Print the text classification report for the test set, with class names
# listed in id order.
print("\nTest Results:")
print(classification_report(test_true, test_preds, digits=3, target_names=[id_to_label[i] for i in range(len(id_to_label))]))

In [None]:
# Pair every test example's true and predicted class, as ids and as label strings.
prediction_columns = {
    'true_label_id': test_true,
    'predicted_label_id': test_preds,
    'true_label': [id_to_label[class_id] for class_id in test_true],
    'predicted_label': [id_to_label[class_id] for class_id in test_preds],
}
predictions_df = pd.DataFrame(prediction_columns)

# Write the per-example predictions to Google Drive, without the index column.
predictions_path = '/content/drive/My Drive/banking_distilbert_predictions.csv'
predictions_df.to_csv(predictions_path, index=False)

In [None]:
# Preview the first five rows of the predictions table.
predictions_df.head(5)

In [None]:
# Create the Drive folder for the environment snapshot (-p: no error if it already exists).
!mkdir -p "/content/drive/MyDrive/fine-tune-envs/"

In [None]:
# Write the output of `pip freeze` to requirements.txt in that Drive folder.
!pip freeze > "/content/drive/MyDrive/fine-tune-envs/requirements.txt"