In [1]:
# Cell 1 — Imports
import re
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)

import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch: 2.5.1+cu121
CUDA available: True


In [2]:
# Cell 2 — Load & Clean Data
ds = load_dataset("7Xan7der7/us_airline_sentiment")
df = pd.DataFrame(ds["train"])

# Clean duplicates and NaN
df = df.drop_duplicates(subset=['text'])
df = df.dropna(subset=['text'])
df = df[df['text'].str.strip() != '']

# Minimal preprocessing (BERT handles most things)
def clean_text(text):
    text = re.sub(r"@\w+", "", text)          # Remove @mentions
    text = re.sub(r"http\S+|www\S+", "", text) # Remove URLs
    text = re.sub(r"\s+", " ", text).strip()   # Clean whitespace
    return text

df["text_clean"] = df["text"].apply(clean_text)
df = df[df['text_clean'].str.strip() != '']

# Encode labels
label_map = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["airline_sentiment"].map(label_map)

df = df.reset_index(drop=True)
print(f"Dataset size: {len(df)}")
print(f"Label distribution:\n{df['label'].value_counts()}")

Dataset size: 14427
Label distribution:
label
0    9080
1    3057
2    2290
Name: count, dtype: int64


In [3]:
# Cell 3 — Train/Test Split
train_df, test_df = train_test_split(
    df[["text_clean", "label"]],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

print(f"Train: {len(train_dataset)} | Test: {len(test_dataset)}")

Train: 11541 | Test: 2886


here we going to train our model


In [4]:
# Cell 4 — Load Tokenizer & Tokenize
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["text_clean"],
        padding="max_length",
        truncation=True,
        max_length=128  # Tweets are short
    )

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

print(f"Tokenized! Sample keys: {train_tokenized[0].keys()}")

Map:   0%|          | 0/11541 [00:00<?, ? examples/s]

Map:   0%|          | 0/2886 [00:00<?, ? examples/s]

Tokenized! Sample keys: dict_keys(['label', 'input_ids', 'attention_mask'])


In [5]:
# Cell 5 — Load Pre-trained Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # negative, neutral, positive
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2}
)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model loaded on: {device}")
print(f"Parameters: {model.num_parameters():,}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda
Parameters: 66,955,779


In [6]:
# Cell 6 — Define Training Arguments
training_args = TrainingArguments(
    output_dir="../models/distilbert_sentiment",
    
    # Training hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    
    # Evaluation
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    
    # Logging
    logging_dir="../logs",
    logging_steps=100,
    
    # Performance
    fp16=torch.cuda.is_available(),  # Mixed precision if GPU
    
    # Disable wandb
    report_to="none"
)

print("Training arguments configured!")

Training arguments configured!


In [7]:
# Cell 7 — Define Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

print("Metrics function ready!")

Metrics function ready!


In [8]:
# Cell 8 — Create Trainer & Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
)

print("Starting training...")
print("(This may take 10-30 minutes depending on your hardware)")
trainer.train()

Starting training...
(This may take 10-30 minutes depending on your hardware)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4484,0.433233,0.836798
2,0.3584,0.43824,0.834373
3,0.2509,0.475285,0.845461


TrainOutput(global_step=2166, training_loss=0.4045304054478639, metrics={'train_runtime': 435.3155, 'train_samples_per_second': 79.535, 'train_steps_per_second': 4.976, 'total_flos': 1146625133983488.0, 'train_loss': 0.4045304054478639, 'epoch': 3.0})

In [9]:
# Cell 9 — Evaluate
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

# Detailed predictions
predictions = trainer.predict(test_tokenized)
y_pred = np.argmax(predictions.predictions, axis=-1)
y_true = predictions.label_ids

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["negative", "neutral", "positive"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Test Accuracy: 0.8455

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.92      0.91      1816
     neutral       0.73      0.66      0.69       612
    positive       0.78      0.82      0.80       458

    accuracy                           0.85      2886
   macro avg       0.80      0.80      0.80      2886
weighted avg       0.84      0.85      0.84      2886


Confusion Matrix:
[[1665  113   38]
 [ 144  401   67]
 [  46   38  374]]


In [10]:
# Cell 10 — Save Model
import os
save_path = "../models/distilbert_sentiment_final"
os.makedirs(save_path, exist_ok=True)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")

Model saved to ../models/distilbert_sentiment_final


In [11]:
# Cell 11 — Quick Inference Test
classifier = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

test_texts = [
    "The flight was amazing! Best service ever.",
    "Terrible experience. Lost my luggage and rude staff.",
    "It was okay, nothing special about the flight."
]

for text in test_texts:
    result = classifier(text)[0]
    print(f"{result['label']:10} ({result['score']:.2f}) → {text[:50]}...")

Device set to use cuda:0


positive   (0.99) → The flight was amazing! Best service ever....
negative   (1.00) → Terrible experience. Lost my luggage and rude staf...
negative   (0.84) → It was okay, nothing special about the flight....


## Summary: Embedding (Notebook 01) vs Fine-tuning (Notebook 02)

| Aspect | Embeddings + LogReg | DistilBERT Fine-tuning |
|--------|---------------------|------------------------|
| **Training time** | ~2 min | ~20 min |
| **Accuracy** | ~78-82% | ~85-90% |
| **Model size** | ~1 MB | ~250 MB |
| **Inference speed** | Fast | Slower |
| **Best for** | Quick baseline, limited resources | Best accuracy, production |

**Recommendation:** Use embeddings for development/testing, fine-tuned model for production.