In [None]:
# Install necessary libraries
%pip install peft transformers trl
%pip install scikit-learn
%pip install matplotlib

In [None]:
# Import all required libraries
import inspect
import shutil

import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import load_dataset, list_datasets, Features, Value, ClassLabel
from google.colab import files
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer

In [None]:
# Placeholder array for label names. The mapping between labels & intent categories is implemented in fusion.
# Entries f0..f33 are generated; the four named classes follow them, so the
# position of each name in this list is its integer class id.
class_names = [f"f{i}" for i in range(34)] + ["part number", "model number", "rated", "gibberish"]

# Define features for the dataset: free text plus a label stored as a ClassLabel id
label_features = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

# Number of labels in the dataset.
# Derived from class_names so the classifier head has one output for every id
# that ClassLabel can produce; a smaller hard-coded value would leave the
# higher ids without a matching logit.
num_labels = len(class_names)

# Load train and validation datasets. `names` supplies the CSV column names
# (label first, then text); `features` applies the schema defined above.
categories = load_dataset(
    "csv",
    data_files={"train": "training.csv", "validation": "validation.csv"},
    sep=",",
    names=["label", "text"],
    features=label_features,
)

# Map integer class ids back to their label names
def label_int2str(row):
    """Return the label name(s) for the integer class id(s) in `row`.

    The lookup is delegated to the `label` feature of the training split of
    the module-level `categories` dataset.
    """
    label_feature = categories["train"].features["label"]
    return label_feature.int2str(row)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face checkpoint to fine-tune, and the tokenizer that belongs to it
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Batch tokenizer used with Dataset.map
def tokenize(batch):
    """Tokenize the `text` column of `batch` with the module-level tokenizer.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Tokenize every split. batch_size=None hands each split to `tokenize` in a
# single batch.
categories_encoded = categories.map(
    tokenize,
    batched=True,
    batch_size=None,
)

In [None]:
# Training parameters
batch_size = 60

# Log once every (train rows // batch_size) optimisation steps. The value is
# clamped to at least 1: a training split smaller than one batch would
# otherwise yield 0, which step-based logging does not accept.
logging_steps = max(1, len(categories_encoded["train"]) // batch_size)

# Directory name the fine-tuned model is saved under
model_name = f"{model_ckpt}-cat-categorizer"

# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# keyword the installed TrainingArguments accepts so the cell runs on both.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,  # Increase epochs for better accuracy
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
    **{eval_strategy_kwarg: "epoch"},  # evaluate on the validation split every epoch
)

# Build a sequence-classification model from the checkpoint with `num_labels`
# outputs, then move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

# Initialize the Trainer.
# Newer transformers releases receive the tokenizer through `processing_class`,
# older ones through `tokenizer`. Use whichever keyword the installed Trainer
# declares so the cell runs on both.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=categories_encoded["train"],
    eval_dataset=categories_encoded["validation"],
    **{tokenizer_kwarg: tokenizer},
)

# Train the model
trainer.train()

# Save the trained model under the `model_name` directory
trainer.save_model(model_name)


In [None]:
# Function to plot the confusion matrix
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a confusion matrix normalised over the predicted classes.

    Args:
        y_preds: predicted integer class ids.
        y_true: true integer class ids.
        labels: display labels, one per class; the entry at position i is
            used for class id i.

    The matrix is always built for class ids 0..len(labels)-1, so its size
    matches `labels` even when some classes never occur in `y_true` or
    `y_preds`. Without that, the matrix would only cover the classes present
    in the data and no longer line up with the display labels.
    """
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_preds, labels=class_ids, normalize="pred")

    # Keep the original 6x6 inch figure for small label sets and grow it for
    # larger ones so the per-cell values stay legible.
    side = max(6, 0.4 * len(labels))
    fig, ax = plt.subplots(figsize=(side, side))

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    ax.set_title("Normalized confusion matrix")
    plt.show()

# Score the validation split with the trained model, reduce the per-class
# scores to predicted class ids, and chart them against the true labels
preds_output = trainer.predict(categories_encoded["validation"])
y_preds = preds_output.predictions.argmax(axis=1)
y_valid = np.array(categories_encoded["validation"]["label"])
labels = [class_id for class_id in range(num_labels)]

plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
# Zip and download the trained model.
# The archive is built from `model_name` (the directory the model was saved
# to) instead of a hard-coded copy of that name, so changing the checkpoint or
# suffix cannot leave this cell pointing at a stale path. The saved directory
# is stored inside the archive under its own name.
archive_path = shutil.make_archive(model_name, "zip", root_dir=".", base_dir=model_name)
files.download(archive_path)