# 🔤 Text Classification with DistilBERT (HuggingFace Transformers)
**Task:** Predict labels from user prompts  
**Model:** `distilbert-base-uncased` (small and accurate)  
**Resources:** Optimized for Colab free tier

In [None]:
!pip install -q transformers datasets scikit-learn

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch

## 🔹 Step 1: Load Your Dataset

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Hand-written example set for intent classification. Each record holds a user
# prompt under "text" and its intent name under "label". Labels are plain
# strings here; the code further down replaces them in place with integer ids.
# NOTE(review): some labels (e.g. "bus_schedule", "bus_route") appear on a
# single example only, so they can end up entirely in the test split.
data = [
    {"text": "Get me the flight details from New York to London on June 10th", "label": "flight_info"},
    {"text": "Fetch the cheapest flights from Delhi to Mumbai next week", "label": "flight_search"},
    {"text": "What are the available hotel rooms in Paris for July 1st to July 5th?", "label": "hotel_search"},
    {"text": "Show bus schedules from Boston to Washington on May 30th", "label": "bus_schedule"},
    {"text": "Find all trains from Paris to Lyon available next Friday", "label": "train_search"},
    {"text": "List flights from San Francisco to Chicago departing on August 12th", "label": "flight_search"},
    {"text": "Get me hotels in Rome for the weekend of June 20th", "label": "hotel_search"},
    {"text": "Show bus tickets from Dallas to Houston leaving tomorrow morning", "label": "bus_search"},
    {"text": "Fetch train timings from Berlin to Munich on September 5th", "label": "train_schedule"},
    {"text": "What flights are available from Los Angeles to Miami on December 15th?", "label": "flight_search"},
    {"text": "Get hotel availability in Amsterdam from May 22nd for three nights", "label": "hotel_search"},
    {"text": "Show bus routes from Seattle to Portland on July 18th", "label": "bus_route"},
    {"text": "List train schedules from Madrid to Barcelona on October 10th", "label": "train_schedule"},
    {"text": "Get flight options from London to Dubai on November 3rd", "label": "flight_search"},
    {"text": "Find hotels in New Delhi available for a week starting April 10th", "label": "hotel_search"},
    {"text": "Show bus tickets from Chicago to Detroit on June 1st", "label": "bus_search"},
    {"text": "Fetch train availability from Sydney to Melbourne for next Monday", "label": "train_search"},
    {"text": "Get all flights from Toronto to Vancouver on September 22nd", "label": "flight_search"},
    {"text": "What hotels are available in Paris from July 10th to July 15th?", "label": "hotel_search"},
    {"text": "Show bus options from Atlanta to Orlando on August 5th", "label": "bus_search"},
    {"text": "List train seats available from Moscow to St. Petersburg on December 2nd", "label": "train_search"},
    {"text": "Fetch flight details from Beijing to Shanghai on October 30th", "label": "flight_info"},
    {"text": "Get hotel bookings in Cairo for two nights starting November 12th", "label": "hotel_search"},
    {"text": "Show bus tickets from Philadelphia to New York on May 25th", "label": "bus_search"},
    {"text": "List trains from Amsterdam to Brussels for next Wednesday", "label": "train_search"},
    {"text": "Book a flight from New York to London on June 10th", "label": "flight_booking"},
    {"text": "Reserve a hotel room in Tokyo from July 1st to July 5th", "label": "hotel_booking"},
    {"text": "I need a bus ticket from Boston to Washington on May 30th", "label": "bus_booking"},
    {"text": "Book a train ticket from Paris to Lyon for next Friday", "label": "train_booking"},
    {"text": "Find me a flight from San Francisco to Chicago on August 12th", "label": "flight_booking"},
    {"text": "Reserve a hotel in Rome for the weekend of June 20th", "label": "hotel_booking"},
    {"text": "Get me a bus from Dallas to Houston leaving tomorrow morning", "label": "bus_booking"},
    {"text": "Book a train from Berlin to Munich on September 5th", "label": "train_booking"},
    {"text": "Flight booking from Los Angeles to Miami on December 15th", "label": "flight_booking"},
    {"text": "I want to book a hotel room in Amsterdam for three nights starting May 22nd", "label": "hotel_booking"},
    {"text": "Bus ticket needed from Seattle to Portland on July 18th", "label": "bus_booking"},
    {"text": "Train reservation from Madrid to Barcelona on October 10th", "label": "train_booking"},
    {"text": "Book a flight to Dubai from London on November 3rd", "label": "flight_booking"},
    {"text": "Reserve a hotel in New Delhi for a week starting April 10th", "label": "hotel_booking"},
    {"text": "I want to buy a bus ticket from Chicago to Detroit on June 1st", "label": "bus_booking"},
    {"text": "Train ticket from Sydney to Melbourne for next Monday", "label": "train_booking"},
    {"text": "Book a flight from Toronto to Vancouver on September 22nd", "label": "flight_booking"},
    {"text": "Hotel booking in Paris from July 10th to July 15th", "label": "hotel_booking"},
    {"text": "Bus from Atlanta to Orlando on August 5th, please", "label": "bus_booking"},
    {"text": "Reserve a train seat from Moscow to St. Petersburg on December 2nd", "label": "train_booking"},
    {"text": "Flight from Beijing to Shanghai on October 30th", "label": "flight_booking"},
    {"text": "Book a hotel room in Cairo for two nights starting November 12th", "label": "hotel_booking"},
    {"text": "I need a bus ticket from Philadelphia to New York on May 25th", "label": "bus_booking"},
    {"text": "Train ticket booking from Amsterdam to Brussels for next Wednesday", "label": "train_booking"},
    {"text": "Flight reservation from Miami to Cancun on January 15th", "label": "flight_booking"},
]


# Give every distinct label name an integer id, numbered in alphabetical order.
label_names = sorted({record["label"] for record in data})
label2id = dict(zip(label_names, range(len(label_names))))
id2label = dict(enumerate(label_names))

# Replace the string labels with their ids, in place on the records of `data`.
for record in data:
    record["label"] = label2id[record["label"]]

# Hold out 20% of the records for evaluation; the fixed seed keeps the split
# reproducible. Both halves are then wrapped as HuggingFace datasets.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)
train_ds, test_ds = Dataset.from_list(train_data), Dataset.from_list(test_data)


## 🔹 Step 2: Tokenize and Prepare

In [None]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the "text" column of a batch, with padding and truncation on."""
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


# Run the tokenizer over both splits in batched mode (train first, then test),
# rebinding each name to the mapped dataset.
train_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, test_ds)
)

## 🔹 Step 3: Load Model and Fine-tune

In [None]:
# Load the pretrained DistilBERT checkpoint for sequence classification with
# one output per known label. The id <-> name maps are handed to the model as
# well, alongside the label count.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

import inspect

# This notebook installs an unpinned `transformers`, and two keyword names used
# below were renamed across releases:
#   * TrainingArguments: `evaluation_strategy` -> `eval_strategy`
#   * Trainer:           `tokenizer`           -> `processing_class`
# Pick whichever spelling the installed version accepts, so the cell runs on
# both older and newer releases instead of failing with a TypeError.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kwarg = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    **{_eval_kwarg: "epoch"},  # evaluate once per epoch
)

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    **{_tokenizer_kwarg: tokenizer},
)

# Fine-tune the model on train_ds, evaluating on test_ds as configured above.
trainer.train()

## 🔹 Step 4: Test on a New Example

In [None]:
def predict_label(text):
    """Return the predicted label name for a single prompt.

    Args:
        text: One prompt string to classify.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Training may have moved the model to an accelerator; the input tensors
    # must be on the same device or the forward pass raises a device mismatch.
    inputs = inputs.to(model.device)

    model.eval()  # inference mode: turn dropout off
    with torch.no_grad():  # no gradients needed for a prediction
        outputs = model(**inputs)

    # Take the argmax over the class dimension of the first (only) row rather
    # than over the flattened logits tensor.
    predicted_class_id = torch.argmax(outputs.logits, dim=-1)[0].item()
    return id2label[predicted_class_id]

predict_label("Book a table at an Italian restaurant for tonight")