In [None]:
# Install required libraries
!pip install transformers datasets scikit-learn torch --quiet

In [None]:
pip install --upgrade transformers datasets --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.[0m[31m
[0m

In [None]:
!pip uninstall -y transformers datasets pyarrow



Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: datasets 4.2.0
Uninstalling datasets-4.2.0:
  Successfully uninstalled datasets-4.2.0
Found existing installation: pyarrow 21.0.0
Uninstalling pyarrow-21.0.0:
  Successfully uninstalled pyarrow-21.0.0


In [None]:

!pip install -U transformers datasets scikit-learn


Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Using cached datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Using cached pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached datasets-4.2.0-py3-none-any.whl (506 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_6

In [None]:
# -------------------------------
# Step 1: Imports
# -------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import re

# -------------------------------
# Step 2: Load dataset
# -------------------------------
df = pd.read_csv("airline_customer_requests.csv")  # replace with your CSV file
print("Dataset size:", len(df))
print(df.head())

# -------------------------------
# Step 3: Preprocessing function
# -------------------------------
def preprocess_text(text):
    text = str(text).lower()  # lowercase
    text = re.sub(r"\s+", " ", text)  # remove extra spaces
    text = re.sub(r"[^a-z0-9\s?.!,]", "", text)  # remove unwanted chars
    text = text.strip()
    return text

df['utterance'] = df['utterance'].apply(preprocess_text)

# -------------------------------
# Step 4: Encode labels
# -------------------------------
labels = df['intent'].unique().tolist()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
df['label'] = df['intent'].map(label2id)

# -------------------------------
# Step 5: Split dataset
# -------------------------------
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['utterance'], df['label'], test_size=0.1, random_state=42, stratify=df['label']
)

# -------------------------------
# Step 6: Tokenization
# -------------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['utterance'], padding='max_length', truncation=True, max_length=128)

train_dataset = Dataset.from_dict({'utterance': train_texts.tolist(), 'label': train_labels.tolist()})
test_dataset = Dataset.from_dict({'utterance': test_texts.tolist(), 'label': test_labels.tolist()})

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# -------------------------------
# Step 7: Load model
# -------------------------------
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# -------------------------------
# Step 8: Training arguments
# -------------------------------
training_args = TrainingArguments(
    output_dir='./distilbert_airline',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1,
    learning_rate=2e-5
)

# -------------------------------
# Step 9: Metrics
# -------------------------------
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

# -------------------------------
# Step 10: Trainer
# -------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# -------------------------------
# Step 11: Train
# -------------------------------
trainer.train()

# -------------------------------
# Step 12: Evaluate
# -------------------------------
results = trainer.evaluate()
print("✅ Evaluation Results:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")

# -------------------------------
# Step 13: Save model
# -------------------------------
model.save_pretrained('./distilbert_airline')
tokenizer.save_pretrained('./distilbert_airline')
print("✅ Model and tokenizer saved successfully!")


Dataset size: 31024
                                      utterance                intent
0                  Are knives allowed on board?  Prohibited Items Faq
1                  I need wheelchair assistance    Special Assistance
2      Does the insurance cover booking delays?             Insurance
3  Assistance at airport for elderly passengers    Special Assistance
4                Is ticket insurance included??             Insurance


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/27921 [00:00<?, ? examples/s]

Map:   0%|          | 0/3103 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m22i261[0m ([33m22i261-psg-college-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,2.9086
200,1.5216
300,0.5135
400,0.1345
500,0.0497
600,0.0293
700,0.0203
800,0.0146
900,0.0113
1000,0.0092


✅ Evaluation Results:
eval_loss: 0.0003
eval_accuracy: 1.0000
eval_precision: 1.0000
eval_recall: 1.0000
eval_f1: 1.0000
eval_runtime: 11.2674
eval_samples_per_second: 275.3960
eval_steps_per_second: 17.2180
epoch: 3.0000
✅ Model and tokenizer saved successfully!


In [9]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Load saved model & tokenizer
model_path = './distilbert_airline'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Example text
text = ""

# Tokenize
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()

# Get label using id2label from model config
predicted_label = model.config.id2label[predicted_class_id]

print("Predicted intent:", predicted_label)


Predicted intent: Change Flight


In [None]:
from google.colab import drive
drive.mount('/content/drive')