In [8]:
from transformers import pipeline

# 1. Load the sentiment-analysis pipeline
# This will automatically download and load the 'distilbert-base-uncased-finetuned-sst-2-english' model
# along with its tokenizer.

# 2. Prepare your text data (call/conversation snippets)
texts_to_analyze = [
    "The customer was very satisfied with the resolution.",
    "I had a terrible experience and the issue was not fixed.",
    "The agent was polite but couldn't help me resolve the problem.",
    "This is a neutral statement.",
    "Everything worked perfectly,except my last order took more than expected time but eggs were spoiled. I expect quickly delivery.... thank you!",
]

# 3. Perform sentiment analysis
results = sentiment_pipeline(texts_to_analyze)

# 4. Print the results
print("Sentiment Analysis Results:")
for i, text in enumerate(texts_to_analyze):
    label = results[i]['label']
    score = results[i]['score']
    print(f"  Text: \"{text}\"")
    print(f"  Sentiment: {label} (Score: {score:.4f})\n")

# Example for a single text
single_text = "This product is absolutely amazing and exceeded my expectations!"
single_result = sentiment_pipeline(single_text)
print(f"Single Text Sentiment: {single_result[0]['label']} (Score: {single_result[0]['score']:.4f})")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentiment Analysis Results:
  Text: "The customer was very satisfied with the resolution."
  Sentiment: POSITIVE (Score: 0.9983)

  Text: "I had a terrible experience and the issue was not fixed."
  Sentiment: NEGATIVE (Score: 0.9997)

  Text: "The agent was polite but couldn't help me resolve the problem."
  Sentiment: NEGATIVE (Score: 0.9364)

  Text: "This is a neutral statement."
  Sentiment: NEGATIVE (Score: 0.8372)

  Text: "Everything worked perfectly,except my last order took more than expected time but eggs were spoiled. I expect quickly delivery.... thank you!"
  Sentiment: NEGATIVE (Score: 0.9307)

Single Text Sentiment: POSITIVE (Score: 0.9999)


In [2]:
# ✅ distilbert-base-uncased is a base language model: trained for understanding language, not for specific tasks like sentiment analysis.
#It is not fine-tuned for sentiment analysis by default, so you'll need to fine-tune it yourself, or manually load a classifier head on top and use a fine-tuned checkpoint.

In [4]:
#🧠 1. Topic Modeling (Unsupervised)
#✅ Best for:
#Exploring unknown topics in conversation data
#Grouping conversations by theme without labels
#BERTopic (modern, works well on short texts like chat)
# BERTopic uses embeddings (from Sentence-BERT or DistilBERT) + clustering to find interpretable topics.


!pip install bertopic


Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.6/150.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?

In [9]:
#1. Topic Modeling (Unsupervised)

from bertopic import BERTopic
import umap
from sklearn.cluster import KMeans
import pandas as pd

# Example: small set of airline chat snippets
conversations = [
    "I need to reset my account password",
    "I was charged twice for my ticket",
    "Please call me back about my delayed flight",
    "How do I apply for a refund?",
    "Can I change my payment method?",
    "I want to update my contact number",
    "When will my refund be processed?",
    "Schedule a callback for tomorrow",
]

# Custom UMAP and KMeans for small dataset
umap_model = umap.UMAP(n_neighbors=3, n_components=2, metric='cosine', random_state=42)
kmeans_model = KMeans(n_clusters=4, random_state=42)

# Fit BERTopic model
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=kmeans_model)
topics, probs = topic_model.fit_transform(conversations)

# Get topic info as a DataFrame
df_topics = topic_model.get_topic_info()

# Generate topic names from top 3 keywords
df_topics["Topic_Name"] = df_topics["Representation"].apply(lambda x: "_".join(x[:3]))

# Display with readable topic names
print(df_topics[["Topic", "Count", "Topic_Name", "Representative_Docs"]])


   Topic  Count           Topic_Name  \
0      0      3  for_tomorrow_ticket   
1      1      2   to_password_update   
2      2      2   how_method_payment   
3      3      1  when_will_processed   

                                 Representative_Docs  
0  [Schedule a callback for tomorrow, I was charg...  
1  [I want to update my contact number, I need to...  
2  [How do I apply for a refund?, Can I change my...  
3                [When will my refund be processed?]  


In [23]:
# !pip install datasets
# !pip install --upgrade transformers
# !pip install --upgrade datasets transformers
!pip install "numpy<2.0"


Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you ha

In [2]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()) if torch.cuda.is_available() else "No GPU")


CUDA available: True
Number of GPUs: 1
Current device: 0
Device name: Tesla T4


In [1]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import torch
import numpy as np # It's good practice to import numpy

# --- 1. Data Preparation ---
data = [
    ("I need to reset my account password", "account"),
    ("I forgot my login credentials", "account"),
    ("Can you help me update my email address?", "account"),
    ("I want to change my account phone number", "account"),
    ("Unable to access my profile", "account"),
    ("My account is locked after multiple attempts", "account"),
    ("Can I delete my account permanently?", "account"),
    ("How do I link my frequent flyer number?", "account"),
    ("How do I apply for a refund?", "refund"),
    ("When will I get my money back?", "refund"),
    ("The flight was cancelled, I need a refund", "refund"),
    ("I haven’t received the refund yet", "refund"),
    ("Can I get a refund for my missed flight?", "refund"),
    ("What is the refund policy for international tickets?", "refund"),
    ("I was promised a refund but it hasn’t arrived", "refund"),
    ("Is there a cancellation fee if I want a refund?", "refund"),
    ("Can I change my payment method?", "payment"),
    ("The transaction failed but money was deducted", "payment"),
    ("How do I add a new credit card?", "payment"),
    ("Is EMI available for ticket bookings?", "payment"),
    ("I was charged twice for one booking", "payment"),
    ("Do you support PayPal payments?", "payment"),
    ("Can I pay later for my ticket?", "payment"),
    ("I need a payment receipt for reimbursement", "payment"),
    ("Please call me back later", "callback"),
    ("I missed your call, please try again", "callback"),
    ("Can you schedule a callback at 5 PM?", "callback"),
    ("I need a call from a supervisor", "callback"),
    ("Is it possible to get a callback for support?", "callback"),
    ("Call me after 2 hours", "callback"),
    ("I requested a callback but no one called", "callback"),
    ("How do I request a callback online?", "callback")
]

texts, labels = zip(*data)
unique_labels = sorted(list(set(labels)))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
encoded_labels = [label2id[label] for label in labels]

# --- 2. Tokenizer ---
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# --- 3. Dataset ---
dataset_dict = {
    "text": list(texts),
    "label": encoded_labels
}
dataset = Dataset.from_dict(dataset_dict)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

print("Tokenizing dataset...")
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(["text"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# --- 4. Model ---
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# --- 5. Training ---
training_args = TrainingArguments(
    output_dir="./flight_chat_classifier",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    report_to=[]  # disables W&B integration
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

print("\n--- Starting Model Training ---")
trainer.train()
print("--- Training Complete ---")

# --- 6. Inference ---
print("\n--- Performing Inference ---")
# Use the trained model directly from the Trainer object or load from the output directory
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# --- 7. Test Predictions ---
test_inputs = [
    "I need a refund for my flight",
    "I missed my password",
    "I have not got reverse payment for my cancell",
    "Can you call me back tomorrow morning?",
    "I want to know my account balance"
]

for text in test_inputs:
    print(f"\nInput: {text}")
    prediction = classifier(text)
    print("Prediction:", prediction)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizing dataset...


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda

--- Starting Model Training ---


Step,Training Loss
10,1.3594
20,1.1599
30,0.8856


Device set to use cuda:0


--- Training Complete ---

--- Performing Inference ---

Input: I need a refund for my flight
Prediction: [{'label': 'refund', 'score': 0.4685707092285156}]

Input: I missed my password
Prediction: [{'label': 'account', 'score': 0.5573247671127319}]

Input: I have not got reverse payment for my cancell
Prediction: [{'label': 'payment', 'score': 0.36302199959754944}]

Input: Can you call me back tomorrow morning?
Prediction: [{'label': 'callback', 'score': 0.32531049847602844}]

Input: I want to know my account balance
Prediction: [{'label': 'account', 'score': 0.5365785360336304}]
