**Intent recognition**

distilbert-base-uncased

A distilled version of the BERT base model.

BERT is a model pretrained on a large corpus of English data in a self-supervised fashion, using a masked language modeling (MLM) objective.

BERT was pretrained with two objectives:


*   Masked language modeling (MLM)
*   Next sentence prediction (NSP)

DistilBERT was pretrained with three objectives:

*   Distillation loss: the model was trained to return the same probabilities as the BERT base model.
*   Masked language modeling (MLM): this is part of the original training loss of the BERT base model.
*   Cosine embedding loss: the model was also trained to generate hidden states as close as possible to those of the BERT base model.







In [None]:
from datasets import load_dataset,DatasetDict
import kagglehub

# Read the ticket CSV through the generic "csv" loader; the result is a
# DatasetDict holding a single "train" split.
dataset = load_dataset("csv", data_files="/content/final_data_set(in).csv")

# Show the loaded DatasetDict (note: the label says "Path", but the value
# printed is the dataset object itself).
print("Path to dataset files:", dataset)

Path to dataset files: DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'category', 'urgency'],
        num_rows: 750
    })
})


This dataset provides a comprehensive collection of real-world IT support ticket data.

Each row in the dataset represents a single IT support ticket, with the following key attributes:
- **id**: unique question ID
- **question**: the question asked
- **urgency**: indicates the urgency or criticality level assigned to the IT support ticket
- **category**: A comprehensive list of keywords or labels that provide more granular detail about the nature, specific topic, affected components, or sub-categories of the IT support ticket.


In [None]:
# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Rename "test" → "validation"
final_dataset = DatasetDict(
    {
        "train": split_dataset["train"],
        "validation": split_dataset["test"],
    }
)


In [None]:
# Drop rows whose category is missing or blank, in both splits.
for split in ("train", "validation"):
    final_dataset[split] = final_dataset[split].filter(
        lambda row: row["category"] is not None and len(row["category"].strip()) > 0
    )


Filter:   0%|          | 0/675 [00:00<?, ? examples/s]

Filter:   0%|          | 0/75 [00:00<?, ? examples/s]

In [None]:
# Peek at the first five raw category values of the training split.
print(final_dataset["train"]["category"][:5])


['Email & Communication', 'Classroom/Lab Support', 'Software & Applications', 'Classroom/Lab Support', 'Classroom/Lab Support']


In [None]:
import ast

def fix_tags(tag):
    """
    Normalise a raw category value into a list of non-empty, stripped strings.

    Handles:
      - None or empty / whitespace-only strings -> []
      - A list or tuple (each item is stringified and stripped)
      - A string holding a Python list/tuple literal, e.g. "['a', 'b']" or "'a', 'b'"
      - A plain or comma-separated string, e.g. "a, b"
      - Anything else -> wrapped as a single-element list

    Args:
        tag: The raw value read from the category column.

    Returns:
        A list of cleaned tag strings, in their original order.
    """
    def _clean(items):
        # Stringify, strip, and drop entries that end up empty.
        return [text for text in (str(item).strip() for item in items) if text]

    if tag is None:
        return []

    # Already a sequence of tags
    if isinstance(tag, (list, tuple)):
        return _clean(tag)

    if isinstance(tag, str):
        tag = tag.strip()
        if not tag:
            return []
        try:
            parsed = ast.literal_eval(tag)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal: treat it as comma-separated text.
            # Only the errors literal_eval is documented to raise are caught,
            # so interrupts and unrelated failures still propagate.
            return _clean(tag.split(","))
        # String representation of a sequence, e.g. "['a','b']" or "'a','b'"
        if isinstance(parsed, (list, tuple)):
            return _clean(parsed)
        # A literal that is not a sequence (e.g. "123"): keep the text as one tag.

    # Fallback: wrap as list if something else
    return _clean([tag])

# Normalise the category column in every split that is present.
for split in ("train", "validation", "test"):
    if split not in final_dataset:
        continue
    final_dataset[split] = final_dataset[split].map(
        # Keep every field as-is and replace only "category" with its list form.
        lambda row: {**row, "category": fix_tags(row.get("category"))}
    )


Map:   0%|          | 0/675 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Materialise the category column of each available split as a plain list.
train_tags = list(final_dataset["train"]["category"])

val_tags = []
if "validation" in final_dataset:
    val_tags = list(final_dataset["validation"]["category"])

test_tags = []
if "test" in final_dataset:
    test_tags = list(final_dataset["test"]["category"])

# One combined list so the binarizer sees every tag that occurs anywhere.
all_tags = [*train_tags, *val_tags, *test_tags]

# fit() returns the binarizer itself, so construction and fitting can be chained.
mlb = MultiLabelBinarizer().fit(all_tags)

# Keep the class vocabulary and its size for building the model head later.
tag_classes = mlb.classes_
num_tags = len(tag_classes)
print("âœ… Number of tags:", num_tags)


âœ… Number of tags: 10


In [None]:
from sklearn.preprocessing import LabelEncoder

# Gather urgency values from every available split so the encoder knows all classes.
all_urgency = list(final_dataset["train"]["urgency"])
if "validation" in final_dataset:
    all_urgency.extend(list(final_dataset["validation"]["urgency"]))

# fit() returns the encoder itself, so construction and fitting can be chained.
urgency_encoder = LabelEncoder().fit(all_urgency)

# Number of urgency classes, used to size the urgency head.
num_urgency = len(urgency_encoder.classes_)
print("âœ… Urgency classes:", urgency_encoder.classes_)


âœ… Urgency classes: ['critical' 'high' 'low' 'medium']


**Label Preparation**

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Category lists for the train split and, when present, the validation split.
train_tags = list(final_dataset["train"]["category"])
val_tags = []
if "validation" in final_dataset:
    val_tags = list(final_dataset["validation"]["category"])

# Merge both splits before fitting.
all_tags = [*train_tags, *val_tags]

# Re-fit the binarizer on the merged tags and expose its class vocabulary.
mlb = MultiLabelBinarizer().fit(all_tags)
tag_classes = mlb.classes_

print("âœ… Classes after merging splits:", tag_classes)


âœ… Classes after merging splits: ['Account & Access' 'Classroom/Lab Support' 'Data Management'
 'Email & Communication' 'General IT Support' 'Hardware & Equipment'
 'Network & Connectivity' 'Security & Compliance'
 'Software & Applications' 'System Administration']


In [None]:
# Display the splits, their columns and row counts after cleaning.
final_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'category', 'urgency'],
        num_rows: 675
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'category', 'urgency'],
        num_rows: 75
    })
})

**Tokenization**

In [None]:

from transformers import AutoTokenizer, DataCollatorWithPadding
# Hub identifier used for both the tokenizer and (later) the encoder weights.
# NOTE(review): the introduction describes distilbert-base-uncased and the reload
# cell rebuilds the model from "distilbert/distilbert-base-uncased", while this
# points at the repo the notebook uploads its own fine-tuned weights to —
# confirm which starting checkpoint is intended.
checkpoint = "Sandei/tech-support-classifier"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_with_labels(example):
    """Tokenize one ticket and attach its two training targets.

    Args:
        example: A row with "question", "category" and "urgency" fields.

    Returns:
        The tokenizer output with two extra entries: "category_labels"
        (multi-hot list of floats from the fitted binarizer) and
        "urgency_label" (integer class id from the fitted label encoder).
    """
    question = example["question"]
    has_text = isinstance(question, str) and question.strip() != ""

    # Missing or blank questions are replaced by a fixed placeholder string.
    encoded = tokenizer(
        question if has_text else "[EMPTY]",
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Multi-hot vector for categories
    multi_hot = mlb.transform([example["category"]])[0]
    encoded["category_labels"] = multi_hot.astype(float).tolist()

    # Encode urgency as integer
    urgency_id = urgency_encoder.transform([example["urgency"]])[0]
    encoded["urgency_label"] = int(urgency_id)

    return encoded


# Tokenize row by row and drop the raw columns so that only model inputs
# and the two label fields remain.
tokenized_datasets = final_dataset.map(
    tokenize_with_labels,
    batched=False,
    remove_columns=[
        "id",
        "question",
        "answer",
        "category",
        "urgency",
    ],
)

# Return torch tensors when rows are accessed.
tokenized_datasets.set_format("torch")

# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/675 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [None]:
# Show the columns and row counts of the tokenized splits.
display(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'category_labels', 'urgency_label'],
        num_rows: 675
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'category_labels', 'urgency_label'],
        num_rows: 75
    })
})

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches keep the dataset order.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch and list the shape of every tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 256]),
 'attention_mask': torch.Size([8, 256]),
 'category_labels': torch.Size([8, 10]),
 'urgency_label': torch.Size([8])}

In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch

class MultiTaskModel(nn.Module):
    """Shared pretrained encoder with two linear classification heads.

    One head emits a logit per category (multi-label); the other emits a
    logit per urgency class (single-label).
    """

    def __init__(self, checkpoint, num_category_labels, num_urgency_labels):
        """Load the encoder from ``checkpoint`` and create both heads.

        Args:
            checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
            num_category_labels: Output size of the category head.
            num_urgency_labels: Output size of the urgency head.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint)
        width = self.encoder.config.hidden_size

        # Head 1: multi-label category
        self.category_classifier = nn.Linear(width, num_category_labels)
        # Head 2: single-label urgency
        self.urgency_classifier = nn.Linear(width, num_urgency_labels)

    def forward(self, input_ids, attention_mask, category_labels=None, urgency_label=None, pos_weight=None):
        """Run the encoder and both heads; compute the joint loss if labelled.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            category_labels: Optional multi-hot category targets.
            urgency_label: Optional urgency class indices.
            pos_weight: Optional per-category positive weight for the BCE term.

        Returns:
            A dict with "loss", "category_logits" and "urgency_logits".
            "loss" is the sum of the category BCE-with-logits loss and the
            urgency cross-entropy loss, and is None unless both label
            arguments are given.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Representation at the first sequence position ([CLS] token).
        first_token = encoded.last_hidden_state[:, 0, :]

        result = {
            "loss": None,
            "category_logits": self.category_classifier(first_token),
            "urgency_logits": self.urgency_classifier(first_token),
        }

        # Without both targets there is nothing to optimise.
        if category_labels is None or urgency_label is None:
            return result

        # Multi-label BCE on the category logits (pos_weight=None means unweighted).
        category_loss = nn.functional.binary_cross_entropy_with_logits(
            result["category_logits"],
            category_labels.float(),
            pos_weight=pos_weight,
        )
        # Multi-class cross-entropy on the urgency logits.
        urgency_loss = nn.functional.cross_entropy(result["urgency_logits"], urgency_label)

        # Total loss (sum)
        result["loss"] = category_loss + urgency_loss
        return result


In [None]:
# Build the two-head model, sized from the fitted label encoders.
model = MultiTaskModel(
    checkpoint=checkpoint,
    num_category_labels=num_tags,
    num_urgency_labels=num_urgency,
)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MultiTaskModel(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Line

In [None]:
from torch.optim import AdamW
# AdamW over every model parameter (encoder and both heads) with a 5e-5 learning rate.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

# One scheduler step is taken per batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with no warmup.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

255


In [None]:
import torch
import numpy as np

# Multi-hot label matrix for the training split, shape (samples, classes).
# A single batched transform replaces one transform call per row; the
# resulting array is the same.
all_category_labels = mlb.transform(list(final_dataset["train"]["category"]))

# Compute class frequencies
class_counts = all_category_labels.sum(axis=0)
total_samples = all_category_labels.shape[0]

# pos_weight = (# negative samples / # positive samples)
# The small epsilon only guards against division by zero for a class that
# never appears in the training split.
pos_weight = torch.tensor(
    (total_samples - class_counts) / (class_counts + 1e-5),
    dtype=torch.float32,
).to(device)
print("âœ… Pos weight for categories:", pos_weight)


âœ… Pos weight for categories: tensor([8.9265, 9.0746, 8.9265, 9.7143, 8.5070, 9.0746, 9.0746, 8.9265, 8.7826,
        9.0746], device='cuda:0')


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm


# Scheduler: linear schedule with no warmup, one step per training batch.
# NOTE(review): an identical scheduler is already built on the same optimizer
# in an earlier cell; running this cell rebinds num_epochs, num_training_steps
# and lr_scheduler to freshly created equivalents.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [None]:
progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass: the batch keys (input_ids, attention_mask,
        # category_labels, urgency_label) match the model's argument names.
        outputs = model(**batch, pos_weight=pos_weight)
        loss = outputs["loss"]

        # Backward pass, then clip the global gradient norm to 1.0.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Apply the update, advance the schedule, and clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Only the loss of the final batch of the epoch is reported.
    print(f"Epoch {epoch+1}/{num_epochs} completed. Last batch loss: {loss.item():.4f}")


  0%|          | 0/255 [00:00<?, ?it/s]

Epoch 1/3 completed. Last batch loss: 1.6402
Epoch 2/3 completed. Last batch loss: 0.8227
Epoch 3/3 completed. Last batch loss: 0.2887


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, accuracy_score

def evaluate_multi_task_model(threshold=0.5):
    """Evaluate both heads of the global ``model`` and print a short report.

    The validation split is used when it exists, otherwise the training split.

    Args:
        threshold: Sigmoid probability above which a category counts as predicted.

    Returns:
        A dict with the micro/macro F1 of the category head, the accuracy of
        the urgency head, and the per-sample predictions, true labels and
        category probabilities that were collected.
    """
    model.eval()

    eval_split = "validation" if "validation" in tokenized_datasets else "train"
    print(f"Evaluating on: {eval_split}")

    loader = DataLoader(
        tokenized_datasets[eval_split],
        batch_size=16,
        collate_fn=data_collator,
    )

    all_category_probs, all_category_preds, all_category_true = [], [], []
    all_urgency_preds, all_urgency_true = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}

            # Batch keys match the model's argument names (labels included).
            outputs = model(**batch)

            # ===== Category =====
            probs = torch.sigmoid(outputs["category_logits"])
            all_category_probs.extend(probs.cpu().numpy())
            all_category_preds.extend((probs > threshold).int().cpu().numpy())
            all_category_true.extend(batch["category_labels"].cpu().numpy())

            # ===== Urgency =====
            urgency_ids = torch.argmax(outputs["urgency_logits"], dim=-1)
            all_urgency_preds.extend(urgency_ids.cpu().numpy())
            all_urgency_true.extend(batch["urgency_label"].cpu().numpy())

    # ===== Metrics =====
    category_micro_f1 = f1_score(
        all_category_true, all_category_preds, average="micro", zero_division=0
    )
    category_macro_f1 = f1_score(
        all_category_true, all_category_preds, average="macro", zero_division=0
    )
    urgency_acc = accuracy_score(all_urgency_true, all_urgency_preds)

    print(f"ðŸ“Š Category Micro F1: {category_micro_f1:.4f}")
    print(f"ðŸ“Š Category Macro F1: {category_macro_f1:.4f}")
    print(f"ðŸ“Š Urgency Accuracy: {urgency_acc:.4f}")

    # Show at most five decoded samples (fewer if the split is smaller).
    for i in range(min(5, len(all_category_true))):
        true_tags = [tag for tag, flag in zip(tag_classes, all_category_true[i]) if flag == 1]
        pred_tags = [tag for tag, flag in zip(tag_classes, all_category_preds[i]) if flag == 1]
        true_urgency, pred_urgency = urgency_encoder.inverse_transform(
            [all_urgency_true[i], all_urgency_preds[i]]
        )
        print(f"Sample {i+1}: Category True={true_tags}, Pred={pred_tags}, Urgency True={true_urgency}, Pred={pred_urgency}")

    return {
        "category_micro_f1": category_micro_f1,
        "category_macro_f1": category_macro_f1,
        "urgency_accuracy": urgency_acc,
        "category_predictions": all_category_preds,
        "category_true_labels": all_category_true,
        "category_probabilities": all_category_probs,
        "urgency_predictions": all_urgency_preds,
        "urgency_true_labels": all_urgency_true,
    }

In [None]:
import json
import torch
import numpy as np

def to_serializable(obj):
    """Recursively convert objects to JSON-serializable Python types.

    Args:
        obj: A torch tensor, NumPy array or scalar, list, tuple, dict, or any
            other value.

    Returns:
        Tensors and arrays become (nested) lists, or a plain scalar when they
        are 0-dimensional; NumPy bool/integer/floating scalars become
        bool/int/float; lists and tuples become lists; dict values are
        converted recursively (keys are kept as given). Any other object is
        returned unchanged.
    """
    if isinstance(obj, torch.Tensor):
        return to_serializable(obj.detach().cpu().numpy())
    if isinstance(obj, np.ndarray):
        converted = obj.tolist()
        # tolist() already produces builtin scalars for numeric dtypes and a
        # bare scalar for 0-d arrays; only object arrays need another pass.
        if obj.dtype == object:
            return to_serializable(converted)
        return converted
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, (list, tuple)):
        return [to_serializable(item) for item in obj]
    if isinstance(obj, dict):
        return {key: to_serializable(value) for key, value in obj.items()}
    return obj

# Run the updated multi-task evaluation and turn the result into plain
# Python types that the json module can encode.
evaluation_results = evaluate_multi_task_model()
evaluation_results_serializable = to_serializable(evaluation_results)

# Write the report as indented JSON.
with open("evaluation_results.json", "w") as f:
    f.write(json.dumps(evaluation_results_serializable, indent=4))

print("âœ… Evaluation results successfully serialized to JSON.")


Evaluating on: validation


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

ðŸ“Š Category Micro F1: 0.7541
ðŸ“Š Category Macro F1: 0.7577
ðŸ“Š Urgency Accuracy: 0.8400
Sample 1: Category True=['Data Management'], Pred=['Data Management', 'General IT Support'], Urgency True=low, Pred=low
Sample 2: Category True=['Hardware & Equipment'], Pred=['Hardware & Equipment'], Urgency True=critical, Pred=critical
Sample 3: Category True=['Account & Access'], Pred=['Account & Access', 'Security & Compliance'], Urgency True=high, Pred=high
Sample 4: Category True=['Security & Compliance'], Pred=['Account & Access', 'Security & Compliance'], Urgency True=medium, Pred=medium
Sample 5: Category True=['Security & Compliance'], Pred=['Account & Access', 'Email & Communication', 'Security & Compliance'], Urgency True=high, Pred=medium
âœ… Evaluation results successfully serialized to JSON.


In [None]:
import gradio as gr
import torch

# Make sure the model is in eval mode before it is used for predictions.
model.eval()

def predict_with_ui(question, threshold=0.5):
    """Predict categories and urgency for a single question.

    Args:
        question: The ticket text; non-string or blank input is replaced by
            the placeholder "[EMPTY]".
        threshold: Sigmoid probability above which a category is reported.

    Returns:
        A pair ``(predicted_categories, predicted_urgency)``: a list of
        {"category", "confidence"} dicts for every category above the
        threshold, and one {"urgency", "confidence"} dict for the
        highest-scoring urgency class.
    """
    is_usable = isinstance(question, str) and question.strip() != ""
    text = question if is_usable else "[EMPTY]"

    # Tokenize
    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="pt",
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without labels, so no loss is computed.
    with torch.no_grad():
        outputs = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )

    # ===== Multi-label category =====
    category_probs = torch.sigmoid(outputs["category_logits"])[0].cpu().numpy()

    # Only show categories above threshold
    predicted_categories = [
        {"category": label, "confidence": float(prob)}
        for label, prob in zip(tag_classes, category_probs)
        if prob > threshold
    ]

    # ===== Urgency (single-label) =====
    urgency_logits = outputs["urgency_logits"]
    urgency_index = int(torch.argmax(urgency_logits, dim=-1)[0].cpu().item())
    urgency_probs = torch.softmax(urgency_logits, dim=-1)[0].cpu().numpy()
    predicted_urgency = {
        "urgency": urgency_encoder.inverse_transform([urgency_index])[0],
        "confidence": float(urgency_probs[urgency_index]),
    }

    return predicted_categories, predicted_urgency

# Create Gradio interface: one text box in, two JSON panels out (the category
# list and the urgency dict returned by predict_with_ui).
# Only the question is wired as an input, so predict_with_ui always runs with
# its default threshold.
ui = gr.Interface(
    fn=predict_with_ui,
    inputs=gr.Textbox(lines=2, placeholder="Type your question here..."),
    outputs=[
        gr.JSON(label="Predicted Categories with Confidence"),
        gr.JSON(label="Predicted Urgency with Confidence")
    ],
    title="Multi-task Support Ticket Classifier",
    description="Enter a question to get predicted categories and urgency level along with confidence scores."
)

# Start the app.
ui.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://659de50024350a8f75.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Sanity check: run the model on whatever `batch` currently holds and report
# the range of the raw category logits.
outputs = model(**batch)
category_logits = outputs["category_logits"]
print("Logits min/max:", category_logits.min().item(), category_logits.max().item())


Logits min/max: -2.7113037109375 3.2476887702941895


In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub (shows a login widget in the notebook).
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [None]:
import os
import torch

model_path = "tech-support-classifier"

# Create the folder if it doesn't exist
os.makedirs(model_path, exist_ok=True)

# Write the weights of the whole MultiTaskModel (encoder plus both heads).
# NOTE(review): only the weights and the tokenizer files are written here; the
# label order (tag_classes, urgency_encoder.classes_) is not saved alongside
# them — confirm whether it should be.
torch.save(model.state_dict(), os.path.join(model_path, "pytorch_model.bin"))

# Store the tokenizer files in the same folder.
tokenizer.save_pretrained(model_path)

print("âœ… Model weights and tokenizer saved successfully!")


âœ… Model weights and tokenizer saved successfully!


In [None]:
from transformers import AutoTokenizer
import torch

# Recreate the model (same architecture as before)
model_loaded = MultiTaskModel(
    checkpoint="distilbert/distilbert-base-uncased",
    num_category_labels=num_tags,
    num_urgency_labels=num_urgency
)

# Load trained weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint written from GPU tensors also loads on a
# CPU-only runtime.
model_loaded.load_state_dict(
    torch.load(f"{model_path}/pytorch_model.bin", map_location=device)
)
model_loaded.to(device)
model_loaded.eval()

# Load tokenizer
tokenizer_loaded = AutoTokenizer.from_pretrained(model_path)

print("âœ… Model and tokenizer loaded for inference")


âœ… Model and tokenizer loaded for inference


In [None]:
from huggingface_hub import upload_folder

# Push the contents of the local folder to the model repository on the Hub
# as a single commit.
upload_folder(
    folder_path="tech-support-classifier",   # local folder holding the saved weights and tokenizer
    repo_id="Sandei/tech-support-classifier",
    repo_type="model",                       # default
    commit_message="Upload trained MultiTaskModel"
)


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ssifier/pytorch_model.bin:   0%|          |  575kB /  266MB            

CommitInfo(commit_url='https://huggingface.co/Sandei/tech-support-classifier/commit/fb033fe6d40089c5d4b8ff7e4ea105a0a0f4bdbc', commit_message='Upload trained MultiTaskModel', commit_description='', oid='fb033fe6d40089c5d4b8ff7e4ea105a0a0f4bdbc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sandei/tech-support-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='Sandei/tech-support-classifier'), pr_revision=None, pr_num=None)