##### 1. Downloading the dataset

In [None]:
import pandas as pd, os, io, requests
ASSIGNMENT2_CSV = "https://raw.githubusercontent.com/suralk/travel_domain_data/master/5000TravelQuestionsDataset.csv"
os.makedirs("data", exist_ok=True)
def fetch_csv(url, path):
    r = requests.get(url)
    r.raise_for_status()
    with open(path, "wb") as f:
        f.write(r.content)
csv_path = "data/travel-classification.csv"
if not os.path.exists(csv_path):
    fetch_csv(ASSIGNMENT2_CSV, csv_path)
df = pd.read_csv(csv_path,encoding="latin-1")
print("\nDataset shape:", df.shape)
df.head()


Dataset shape: (4999, 3)


Unnamed: 0,What are the special things we (husband and me) can do during a 5 day stay at Cape Town?,TTD,TTDSIG
0,What are the companies which organize shark fe...,TTD,TTDOTH
1,Is it safe for female traveller to go alone to...,TGU,TGUHEA
2,What are the best places around Cape Town for ...,TTD,TTDSIG
3,What are the best places to stay for a family ...,ACM,ACMOTH
4,What are the train services that travels from ...,TRS,TRSTRN


###### 1.1 Data cleaning and sanity checking

In [None]:
# Renaming columns for clarity
df.columns = ["question", "coarse_label", "fine_label"]
# Droping the fine-grain column
df = df.drop(columns=["fine_label"])
# Removing duplicates and nulls
df = df.drop_duplicates(subset="question")
df = df.dropna()
# Strip whitespace and normalize spaces
df["question"] = df["question"].str.strip()
df["question"] = df["question"].str.replace(r'\s+', ' ', regex=True)
# Sanity checks
print("\nAfter cleaning:")
print("Shape:", df.shape)
print("Any nulls?\n", df.isnull().sum())
# Coarse label distribution
print("\nCoarse label counts:")
print(df["coarse_label"].value_counts())
# Check for expected classes
expected_classes = {"TTD","TGU","ACM","TRS","WTH","FOD","ENT"}
missing_classes = expected_classes - set(df["coarse_label"].unique())
if missing_classes:
    print("\nWarning: Missing expected classes ->", missing_classes)
else:
    print("\nAll expected coarse classes present!")
# Question length sanity
df["q_length"] = df["question"].str.len()
print("\nQuestion length stats:")
print(df["q_length"].describe())
print("Shortest question example:", df.loc[df["q_length"].idxmin(), "question"])
print("Longest question example:", df.loc[df["q_length"].idxmax(), "question"])
# Normalize coarse labels: strip whitespace and newlines, uppercase
df["coarse_label"] = df["coarse_label"].str.strip().str.upper()
# Verify again
print("\nUnique labels after normalization:", df["coarse_label"].unique())



After cleaning:
Shape: (4992, 2)
Any nulls?
 question        0
coarse_label    0
dtype: int64

Coarse label counts:
coarse_label
TGU      1216
TTD      1137
TRS      1011
ACM       717
FOD       521
ENT       214
WTH       170
TGU\n       3
\nENT       2
TTD\n       1
Name: count, dtype: int64

All expected coarse classes present!

Question length stats:
count    4992.000000
mean       60.470553
std        20.360721
min        17.000000
25%        46.000000
50%        57.000000
75%        70.000000
max       181.000000
Name: q_length, dtype: float64
Shortest question example: Do we need shoes?
Longest question example: Can anyone tell me if the spa at the Hilton Moorea is massages only or if there is an area with jacuzzi's and any other relaxing things like that that is for the spa customers only?

Unique labels after normalization: ['TTD' 'TGU' 'ACM' 'TRS' 'WTH' 'FOD' 'ENT']


##### 2. Train/validation/test split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# splitting the data into 700 for test and the rest for training/validation
train_val_df, test_df = train_test_split(df, test_size=700, random_state=42, shuffle=True)

# splitting the remaining data (train_val_df) into 4000 for training and 300 for validation
train_df, val_df = train_test_split(train_val_df, train_size=4000, random_state=42, shuffle=True)

# Quick sanity check
print("Training set shape:", train_df.shape)  # Should be (4000, X)
print("Validation set shape:", val_df.shape)  # Should be (300, X)
print("Test set shape:", test_df.shape)      # Should be (700, X)

# Checking label distribution
print("\nTraining label distribution:\n", train_df['coarse_label'].value_counts())
print("\nValidation label distribution:\n", val_df['coarse_label'].value_counts())
print("\nTest label distribution:\n", test_df['coarse_label'].value_counts())

Training set shape: (4000, 3)
Validation set shape: (292, 3)
Test set shape: (700, 3)

Training label distribution:
 coarse_label
TGU    967
TTD    914
TRS    820
ACM    565
FOD    423
ENT    172
WTH    139
Name: count, dtype: int64

Validation label distribution:
 coarse_label
TTD    66
TGU    64
TRS    60
ACM    51
FOD    31
WTH    11
ENT     9
Name: count, dtype: int64

Test label distribution:
 coarse_label
TGU    188
TTD    158
TRS    131
ACM    101
FOD     67
ENT     35
WTH     20
Name: count, dtype: int64


##### Installing unsloth beore model selection

In [None]:

!pip install unsloth


##### 3. Model Selection : The selected model is a 4-bit quantized  version of LLaMA-3.2B-Instruct from Unsloth





In [None]:
!pip install transformers datasets torch


##### 4. Supervised Fine-tuning

In [14]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm
import pandas as pd
import itertools
import os
import shutil

from unsloth import FastLanguageModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from sklearn.metrics import accuracy_score


# Dataset class
class QADataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prompt = f"Q: {row['question']}\nA:"
        completion = row['coarse_label']
        enc = self.tokenizer(
            prompt,
            text_target=completion,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "labels": enc["labels"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

# Tokenizer & Model Config
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = torch.cuda.is_available()
scaler = torch.cuda.amp.GradScaler() if use_amp else None

# Datasets
train_dataset = QADataset(train_df, tokenizer)
val_dataset = QADataset(val_df, tokenizer)
test_dataset = QADataset(test_df, tokenizer)

# Hyperparameter grid
learning_rates = [5e-6]
batch_sizes = [4]
epochs_list = [3]

# Tracking best config
best_val_loss = float("inf")
best_config = None
best_model_path = "best_model"


# Grid Search
for lr, batch_size, epochs in itertools.product(learning_rates, batch_sizes, epochs_list):
    print(f"\n=== Testing configuration: LR={lr}, Batch Size={batch_size}, Epochs={epochs} ===")

    # Load base model fresh
    model, _ = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=1024,
        load_in_4bit=True
    )
    model = prepare_model_for_kbit_training(model)

    # Apply LoRA
    lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1)
    model = get_peft_model(model, lora_config)

    model.train()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=True)

    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = CrossEntropyLoss(ignore_index=-100)

    # Training
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            if scaler:
                scaler.scale(loss).backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            train_loss += loss.item()
        print(f"Train Loss: {train_loss / len(train_loader):.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Save best model
    if avg_val_loss < best_val_loss:
        print(">>> New best model found!")
        best_val_loss = avg_val_loss
        best_config = {"learning_rate": lr, "batch_size": batch_size, "epochs": epochs}
        if os.path.exists(best_model_path):
            shutil.rmtree(best_model_path)
        model.save_pretrained(best_model_path)
        tokenizer.save_pretrained(best_model_path)

    # Clean up GPU memory
    torch.cuda.empty_cache()

# Final Evaluation
print(f"\n=== Best Config: {best_config} ===")
print(f"Best Validation Loss: {best_val_loss:.4f}")

print("\nLoading best model for final test evaluation...")

# Load base + LoRA model
# Load the quantized base model
base_model, _ = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=1024,
    load_in_4bit=True,
    dtype=None,  # Automatically sets to float16 for 4bit models
    device_map="auto"
)

# Attach the LoRA adapters
best_model = PeftModel.from_pretrained(base_model, best_model_path)
best_model.eval()

test_loader = DataLoader(test_dataset, batch_size=best_config["batch_size"], pin_memory=True)

test_loss = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = best_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

        test_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        mask = labels != -100

        masked_preds = preds[mask]
        masked_labels = labels[mask]

        all_preds.extend(masked_preds.cpu().numpy())
        all_labels.extend(masked_labels.cpu().numpy())

# Final metrics
avg_test_loss = test_loss / len(test_loader)
accuracy = accuracy_score(all_labels, all_preds)

print(f"\nFinal Test Loss: {avg_test_loss:.4f}")
print(f"Final Test Accuracy: {accuracy:.4f}")


  scaler = torch.cuda.amp.GradScaler() if use_amp else None



=== Testing configuration: LR=5e-06, Batch Size=4, Epochs=1 ===
==((====))==  Unsloth 2025.9.4: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Epoch 1/1


  with torch.cuda.amp.autocast(enabled=use_amp):
Training: 100%|██████████| 1000/1000 [13:10<00:00,  1.26it/s]
  with torch.cuda.amp.autocast(enabled=use_amp):


Train Loss: 9.8529
Validation Loss: 2.3970
>>> New best model found!

=== Best Config: {'learning_rate': 5e-06, 'batch_size': 4, 'epochs': 1} ===
Best Validation Loss: 2.3970

Loading best model for final test evaluation...
==((====))==  Unsloth 2025.9.4: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  with torch.cuda.amp.autocast(enabled=use_amp):
Testing: 100%|██████████| 175/175 [00:53<00:00,  3.27it/s]


Final Test Loss: 2.4379
Final Test Accuracy: 0.8069





##### 5. Prompt Development

In [None]:
import torch
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
import pandas as pd
import difflib

# Loading best_model and tokenizer
base_model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # original base model repo
lora_model_path = "best_model"  # fine-tuned LoRA checkpoint folder
tokenizer = AutoTokenizer.from_pretrained(lora_model_path)
tokenizer.pad_token = tokenizer.eos_token
base_model, _ = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,
    load_in_4bit=True,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, lora_model_path)

device = "cuda" if torch.cuda.is_available() else "cpu"

def create_prompt(question, examples=None):
    """
    examples: list of tuples [(q1, a1), ...] for few-shot prompting
    """
    prompt = (
        "Your task is to classify the following question into one of these categories: "
        "TTD, TGU, ACM, TRS, WTH, FOD, ENT. "
    )

    if examples:
        prompt += "Here are some examples:\n"
        for q, a in examples:
            prompt += f"Q: {q}\nA: {a}\n\n"

    prompt += f"Now classify this question:\nQ: {question}\nA:"
    return prompt

import os
import torch

if torch.cuda.is_available():
    device_capability = torch.cuda.get_device_capability()
    if device_capability[0] < 8:  # compute capability < 8.0
        os.environ["XFORMERS_NO_MEM_EFF_ATTENTION"] = "1"
        print(f"Memory-efficient attention disabled (GPU capability {device_capability})")
device = "cuda" if torch.cuda.is_available() else "cpu"
#predict function
def predict(question, model, tokenizer, examples_list=None, max_length=10, temperature=0.01, top_p=0.99):
    prompt = create_prompt(question, examples_list)
    enc = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids, attention_mask = enc["input_ids"], enc["attention_mask"]

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p
        )

    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    answer = answer.replace(prompt, "").strip()
    return map_to_valid_label(answer)

# --- Evaluation function ---
def evaluate(test_questions, test_labels, model, tokenizer, examples_list=None, temperature=0.01, top_p=0.99):
    preds = []
    for q in tqdm(test_questions, desc="Evaluation"):
        pred = predict(
            q,
            model,
            tokenizer,
            examples_list=examples_list,
            temperature=temperature,
            top_p=top_p
        )
        preds.append(pred)

    true_labels_norm = [normalize_label(l) for l in test_labels]
    acc = accuracy_score(true_labels_norm, preds)
    return acc, preds






##### 6. Zero-shot and few-shot testing

In [None]:
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
from sklearn.metrics import accuracy_score

#  Normalizing labels
def normalize_label(label):
    return label.strip().upper()

valid_labels = {"TTD","TGU","ACM","TRS","WTH","FOD","ENT"}



# Map prediction safely
def map_to_valid_label(pred):
    pred_norm = normalize_label(pred)
    for label in valid_labels:
        if label in pred_norm:   # substring match
            return label
    # fallback: picking closest match instead of UNKNOWN
    return max(valid_labels, key=lambda x: difflib.SequenceMatcher(None, pred_norm, x).ratio())

# Zero shot
zero_shot_acc, pred_labels_zero = evaluate(
    test_df['question'], test_df['coarse_label'], model, tokenizer,
    examples_list=None, temperature=0.01, top_p=0.9
)
print("Zero-shot accuracy:", zero_shot_acc)

# 1-shot
example_1 = train_df.sample(1, random_state=42)
examples_1 = list(zip(example_1['question'], example_1['coarse_label']))

acc_1shot, pred_labels_1shot = evaluate(
    test_df['question'], test_df['coarse_label'], model, tokenizer,
    examples_list=examples_1, temperature=0.01, top_p=0.9
)
print("1-shot accuracy:", acc_1shot)

# 3-shot
example_3 = train_df.sample(3, random_state=42)
examples_3 = list(zip(example_3['question'], example_3['coarse_label']))

acc_3shot, pred_labels_3shot = evaluate(
    test_df['question'], test_df['coarse_label'], model, tokenizer,
    examples_list=examples_3, temperature=0.01, top_p=0.9
)
print("3-shot accuracy:", acc_3shot)


