In [7]:
import kagglehub
from datasets import load_dataset,DatasetDict
# Download latest version
path = kagglehub.dataset_download("tobiasbueck/multilingual-customer-support-tickets")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'multilingual-customer-support-tickets' dataset.
Path to dataset files: /kaggle/input/multilingual-customer-support-tickets


Importing dataset

In [8]:
import os
import pandas as pd

# Required columns
required_cols = ["subject", "body", "answer"]

# Output folder (writeable in Kaggle)
output_folder = "/kaggle/working/clean/"
os.makedirs(output_folder, exist_ok=True)

# Loop through CSVs in the downloaded dataset
for file in os.listdir(path):
    if file.endswith(".csv"):
        file_path = os.path.join(path, file)
        df = pd.read_csv(file_path)

        # Keep only English rows
        df = df[df["language"].astype(str).str.startswith("en")]

        # Keep only required columns
        df = df[[col for col in df.columns if col in required_cols]]

        # Add missing columns if any
        for col in required_cols:
            if col not in df.columns:
                df[col] = ""

        # Reorder columns
        df = df[required_cols]

        # Save cleaned CSV
        output_path = os.path.join(output_folder, file.replace(".csv", "_clean.csv"))
        df.to_csv(output_path, index=False)

print("Finished cleaning all English-only CSVs! Saved in /kaggle/working/clean/")


Finished cleaning all English-only CSVs! Saved in /kaggle/working/clean/


In [9]:
raw_dataset=load_dataset(path=output_folder)
raw_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['subject', 'body', 'answer'],
        num_rows: 29652
    })
})

In [10]:
split_dataset = raw_dataset['train'].train_test_split(
    test_size=0.1,
    seed=42
)

# Rename "test" → "validation"
final_dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"]
})


In [11]:
final_dataset.column_names


{'train': ['subject', 'body', 'answer'],
 'validation': ['subject', 'body', 'answer']}

In [13]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(example):
    question = example["body"] if isinstance(example["body"], str) else "[EMPTY]"
    answer = example["answer"] if isinstance(example["answer"], str) else ""

    model_input = tokenizer(
        question,
        max_length=256,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            answer,
            max_length=128,
            truncation=True,
            padding="max_length"
        )["input_ids"]

    model_input["labels"] = labels
    return model_input

tokenized_datasets = final_dataset.map(
    tokenize_fn,
    batched=False,
    remove_columns=["subject", "body", "answer"]
)

tokenized_datasets.set_format("torch")


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/26686 [00:00<?, ? examples/s]



Map:   0%|          | 0/2966 [00:00<?, ? examples/s]

In [14]:
display(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 26686
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2966
    })
})

In [18]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(
    checkpoint
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [20]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


{'input_ids': torch.Size([8, 256]),
 'attention_mask': torch.Size([8, 256]),
 'labels': torch.Size([8, 128]),
 'decoder_input_ids': torch.Size([8, 128])}

In [21]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [22]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

10008


In [23]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [25]:
from tqdm.auto import tqdm
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 0/10008 [00:00<?, ?it/s]

In [27]:
import json
import numpy as np

evaluation_results = evaluate_tag_model()

# Convert numpy arrays in evaluation_results to lists for JSON serialization
for key in ['predictions', 'true_labels', 'probabilities']:
    if key in evaluation_results:
        evaluation_results[key] = [arr.tolist() if isinstance(arr, np.ndarray) else arr for arr in evaluation_results[key]]



Evaluating on: validation


Evaluating:   0%|          | 0/186 [00:00<?, ?it/s]

NotImplementedError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'

### Fixing the `evaluate_tag_model` function

The error indicates that the `labels` tensor was cast to `torch.float32`, but the loss function for sequence generation expects `torch.long` (integer) labels. I will redefine the `evaluate_tag_model` function to ensure labels are of type `torch.long` and do not undergo an incorrect type cast.

In [28]:
import torch
from tqdm.auto import tqdm

def evaluate_tag_model():
    model.eval() # Set model to evaluation mode
    total_eval_loss = 0

    print("Evaluating on: validation")
    progress_bar_eval = tqdm(eval_dataloader, desc="Evaluating")

    with torch.no_grad(): # Disable gradient calculations
        for batch in progress_bar_eval:
            batch = {k: v.to(device) for k, v in batch.items()}

            # Ensure labels are of type long, not float
            # The problematic line was: batch["labels"] = batch["labels"].to(torch.float32)
            # We remove this line and ensure labels are long if they aren't already.
            if batch["labels"].dtype != torch.long:
                batch["labels"] = batch["labels"].to(torch.long)

            outputs = model(**batch)
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    print(f"Validation Loss: {avg_eval_loss}")

    return {"validation_loss": avg_eval_loss}

In [29]:
import json
import numpy as np

evaluation_results = evaluate_tag_model()

# Convert numpy arrays in evaluation_results to lists for JSON serialization
# (This part was from the original cell, kept for consistency if other keys were added)
for key in evaluation_results:
    if isinstance(evaluation_results[key], np.ndarray):
        evaluation_results[key] = evaluation_results[key].tolist()

print(evaluation_results)

Evaluating on: validation


Evaluating:   0%|          | 0/371 [00:00<?, ?it/s]

Validation Loss: 0.8114696774842605
{'validation_loss': 0.8114696774842605}


In [30]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/it_support/')

# Create a directory for your model in Drive
model_dir = "/content/drive/MyDrive/it_support_flan_t5"

# Create the directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

print(f"📁 Model will be saved to: {model_dir}")

Mounted at /content/it_support/
📁 Model will be saved to: /content/drive/MyDrive/it_support_flan_t5


In [32]:
import json
import os

# Make sure model_dir exists
os.makedirs(model_dir, exist_ok=True)

# Save model and tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print("💾 Model saved!")

# The following lines related to 'tag_mapping' and 'tag_classes' are not relevant
# for this sequence-to-sequence model and were causing a NameError.
# Therefore, they are removed.

# Optional: save training info
training_info = {
    "model_name": checkpoint, # Use the actual checkpoint name
    "training_date": "2024-07-29" # Update with current date
}

with open(f"{model_dir}/training_info.json", "w") as f:
    json.dump(training_info, f, indent=2)
print("📊 Training info saved")

💾 Model saved!
📊 Training info saved


In [34]:
# Install Gradio
!pip install gradio -q

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Changed from AutoModelForSequenceClassification
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/it_support/')

# Define model directory (already defined as model_dir)

print(f"🔍 Looking for model at: {model_dir}")

if not os.path.exists(model_dir):
    raise FileNotFoundError(f"Model directory not found at: {model_dir}")

# Load the fine-tuned T5 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir) # Changed to AutoModelForSeq2SeqLM

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

print("✅ Fine-tuned T5 model and tokenizer loaded successfully!")

# Gradio interface function for T5 generation
def generate_answer(question):
    if not question.strip():
        return "Please enter a question."

    # Prepare the input for the T5 model
    inputs = tokenizer(
        question,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate the answer
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128, num_beams=5, early_stopping=True)

    # Decode the generated output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

# Launch Gradio interface
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(
        lines=3,
        placeholder="Enter your IT support question here...",
        label="IT Support Question"
    ),
    outputs=gr.Textbox(label="Generated Answer"),
    title="📝 IT Support Ticket Answer Generator (Flan-T5)",
    description="Generate answers to IT support questions using a fine-tuned Flan-T5 model.",
    examples=[
        ["My internet is not working."],
        ["How do I reset my password?"],
        ["The printer in my office is not responding."]
    ]
)

print("🌐 Launching web interface...")
iface.launch(share=True)

Drive already mounted at /content/it_support/; to attempt to forcibly remount, call drive.mount("/content/it_support/", force_remount=True).
🔍 Looking for model at: /content/drive/MyDrive/it_support_flan_t5
✅ Fine-tuned T5 model and tokenizer loaded successfully!
🌐 Launching web interface...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6181e3f58a98534510.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


