# 🤖 BERT-based Intent Recognition Chatbot

In [None]:
!pip install -q transformers datasets gradio scikit-learn pandas torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import gradio as gr
import pickle

In [None]:
# Read the raw CSV, name the two columns of interest, and drop everything else.
# Columns are picked by position: index 1 holds the customer query and
# index 3 holds the intent label (e.g. 'cancel_order').
df = (
    pd.read_csv("Customer_Service_Training_Dataset.csv")
    .pipe(
        lambda raw: raw.rename(
            columns={raw.columns[1]: "text", raw.columns[3]: "intent"}
        )
    )
    [["text", "intent"]]
)
df.head()


Unnamed: 0,text,intent
0,I do not know how to speak with customer service,contact_customer_service
1,can you help me submitting some feedback?,review
2,how do I get refunds?,get_refund
3,could you help me use another account?,switch_account
4,can I make a consumer reclamation against your...,complaint


In [None]:
# Learn the intent vocabulary, then store each row's integer class id in 'label'.
le = LabelEncoder()
le.fit(df['intent'])
df['label'] = le.transform(df['intent'])

# Persist the fitted encoder so class ids can be mapped back to intent names later.
with open("label_encoder.pkl", "wb") as encoder_file:
    encoder_file.write(pickle.dumps(le))

In [None]:
# Wrap the text/label columns in a Hugging Face Dataset and load the BERT tokenizer.
dataset = Dataset.from_pandas(df[["text", "label"]])
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` entries of one mapped batch.

    Used with ``dataset.map(..., batched=True)`` below, so ``batch['text']``
    is handed to the tokenizer as a group rather than one row at a time.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


dataset = dataset.map(tokenize, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

In [None]:
def tokenize_function(example):
    """Tokenize ``example["text"]`` to a fixed length of 128 tokens.

    ``max_length`` is given explicitly so that ``padding="max_length"`` pads
    every row to 128 tokens, the same limit used when tokenizing for training
    and prediction elsewhere in this notebook. Without it the tokenizer pads
    to the model's maximum input length, which makes every row far longer
    than these short customer queries need.

    Args:
        example: A batch from ``Dataset.map(..., batched=True)`` with a
            ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)


# Tokenize the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, as torch tensors, when rows are read
# (set_format changes what is returned; it does not delete the other columns)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Check if tokenization was successful
print(tokenized_dataset[0])

Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

{'label': tensor(8), 'input_ids': tensor([ 101, 1045, 2079, 2025, 2113, 2129, 2000, 3713, 2007, 8013, 2326,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
      

In [None]:
# 1. Import the necessary libraries
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizer
from datasets import Dataset
import os
import pickle

# 2. Disable W&B (Weights and Biases) logging completely to avoid API key prompts
os.environ["WANDB_DISABLED"] = "true"

# 3. Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 4. Re-encode the intents. `df` and `le` must already exist from the earlier cells.
df['label'] = le.fit_transform(df['intent'])

# The encoder was just refit, so refresh the pickle as well: the file on disk must
# describe the same label ids the model below is trained on, otherwise predictions
# decoded with a stale encoder map to the wrong intents.
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# 5. Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df[["text", "label"]])

# 6. Tokenize the dataset (fixed length of 128 tokens per example)
def tokenize_function(example):
    """Tokenize ``example["text"]``, padding/truncating every row to 128 tokens."""
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 7. Return only the model inputs and the label, as torch tensors
#    (set_format changes what is returned; it does not delete the other columns)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 8. Load BERT model for sequence classification with one output per intent class
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))

# 9. Set up training arguments (no evaluation is configured)
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for checkpoints
    per_device_train_batch_size=16,  # Examples per device per forward/backward pass
    num_train_epochs=3,  # Keep epochs low to save time
    logging_dir="./logs",  # Logging directory for logs (no W&B)
    logging_steps=10,  # Log every 10 steps
    save_strategy="epoch",  # Save a checkpoint after each epoch
    disable_tqdm=True,  # Hide progress bars to keep the cell output compact
    gradient_accumulation_steps=2,  # Accumulate 2 batches per optimizer step (effective batch of 32 per device)
    load_best_model_at_end=False,  # No eval set is passed to the Trainer, so there is no "best" checkpoint to reload
)

# 10. Initialize Trainer with the model and training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# 11. Start training
trainer.train()

# 12. Save the trained model and tokenizer
model.save_pretrained("./bert_intent_model")
tokenizer.save_pretrained("./bert_intent_model")


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Canned chatbot reply for each intent name.
# Keys are intent labels as they appear in the dataset's intent column
# (e.g. "get_refund"); values are the user-facing reply text.
# predict_intent() looks up the predicted intent here and falls back to a
# generic message when the intent has no entry.
# NOTE(review): presumably there is one entry per class in `le.classes_` — verify
# against the dataset if intents are added or renamed.
intent_response_map = {
    "contact_customer_service": "📞 Sure! Connecting you to customer support.",
    "review": "📝 Thanks for your valuable feedback!",
    "get_refund": "💸 I'll help you with the refund process.",
    "switch_account": "🔄 Switching your account now.",
    "complaint": "🚨 Sorry to hear that! Please tell us more about your issue.",
    "check_payment_methods": "💳 You can pay via credit card, UPI, or net banking.",
    "contact_human_agent": "👩‍💼 Let me get a human agent for you.",
    "delivery_period": "⏱ Deliveries usually take 3–5 business days.",
    "check_invoice": "🧾 I can check your invoice details.",
    "get_invoice": "📄 Here’s your invoice. You should also receive it via email.",
    "newsletter_subscription": "📬 You've been subscribed to our newsletter!",
    "place_order": "🛒 Placing your order now!",
    "check_cancellation_fee": "⚠ Let me check the cancellation fee details.",
    "delete_account": "⚠ Deleting your account. Please confirm once more.",
    "cancel_order": "❌ Your order has been canceled.",
    "payment_issue": "⚠ I can help you resolve your payment issue.",
    "create_account": "🧾 Let’s get your account created!",
    "track_refund": "🔍 Let me check the status of your refund.",
    "set_up_shipping_address": "📦 Setting up your shipping address.",
    "track_order": "📦 Tracking your order now.",
    "change_order": "✏ Let’s update your order details.",
    "edit_account": "🛠 I can help you update your account info.",
    "check_refund_policy": "📜 Here’s our refund policy.",
    "delivery_options": "🚚 We offer standard and express delivery options.",
    "recover_password": "🔐 No worries! I’ll help you recover your password.",
    "registration_problems": "🚫 Let me help you fix your registration issue.",
    "change_shipping_address": "📍 Let’s update your shipping address."
}

In [None]:
!pip install transformers torch gradio




In [None]:
# Load the saved model and tokenizer
import os

model_dir = "./bert_intent_model"

# Check the folder first: when it is missing, from_pretrained treats the path as a
# Hub repo id and fails with a confusing HFValidationError instead of saying
# that the model has not been saved yet.
if not os.path.isdir(model_dir):
    raise FileNotFoundError(
        f"Model directory '{model_dir}' not found. Run the training cell "
        "(or upload and unzip the saved model) before loading it."
    )

model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)


HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './bert_intent_model'.

In [None]:
# Restore the fitted label encoder that maps class ids back to intent names.
# NOTE: unpickling runs code stored in the file, so only load a
# label_encoder.pkl that this notebook produced.
with open("label_encoder.pkl", "rb") as encoder_file:
    le = pickle.loads(encoder_file.read())

# Predict function: classify the message and return the matching canned reply
def predict_intent(text):
    """Return the chatbot reply for the intent predicted from ``text``.

    Args:
        text: The user's message.

    Returns:
        The reply from ``intent_response_map`` for the predicted intent; the
        generic "didn't get that" message when ``text`` is blank, not a string,
        or the predicted intent has no mapped reply; or ``"Error: <details>"``
        when tokenization or inference raises.
    """
    fallback = "🤖 Sorry, I didn't get that."

    # Nothing to classify: answer with the fallback instead of letting the
    # model pick an arbitrary intent for an empty message.
    if not isinstance(text, str) or not text.strip():
        return fallback

    try:
        # Tokenize with the same 128-token limit used for training
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

        # Keep the inputs on the model's device (the model may still be on the
        # GPU when this runs in the same session as training)
        inputs = inputs.to(model.device)

        # Perform inference without gradients
        with torch.no_grad():
            outputs = model(**inputs)

        # Highest-scoring class id -> intent name via the label encoder
        pred = torch.argmax(outputs.logits, dim=1).item()
        intent = le.inverse_transform([pred])[0]

        # Return the response based on the intent
        return intent_response_map.get(intent, fallback)
    except Exception as e:
        # Deliberate best-effort: show the failure in the chat UI rather than
        # crashing the Gradio callback.
        return f"Error: {e}"

# Build the Gradio chat UI and start it
import gradio as gr


def chatbot_ui(user_input):
    """Gradio callback: return the bot's reply for ``user_input``."""
    return predict_intent(user_input)


gr.Interface(
    title="🛍️ E-commerce Chatbot",
    description="Ask me anything about orders, refunds, accounts, and more!",
    fn=chatbot_ui,
    inputs=gr.Textbox(label="You"),
    outputs=gr.Textbox(label="🤖 Bot"),
).launch()



FileNotFoundError: [Errno 2] No such file or directory: 'label_encoder.pkl'

In [None]:
# Colab file helper, used here to download artifacts from the runtime.
from google.colab import files
# Download the pickled label encoder.
files.download("label_encoder.pkl")
# Zip the saved model folder (shell command); this must run before the archive is downloaded.
!zip -r bert_intent_model.zip bert_intent_model
# Download the zipped model.
files.download("bert_intent_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: bert_intent_model/ (stored 0%)
  adding: bert_intent_model/model.safetensors (deflated 7%)
  adding: bert_intent_model/tokenizer_config.json (deflated 75%)
  adding: bert_intent_model/special_tokens_map.json (deflated 80%)
  adding: bert_intent_model/vocab.txt (deflated 53%)
  adding: bert_intent_model/config.json (deflated 65%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Zip the model folder
import os
import shutil

# Fail early with a clear message: if the folder is missing there is nothing to
# archive, and the later download of bert_intent_model.zip would fail with an
# unrelated-looking "Cannot find file" error.
if not os.path.isdir("bert_intent_model"):
    raise FileNotFoundError(
        "Folder 'bert_intent_model' not found - train and save the model before packaging it."
    )

# Creates bert_intent_model.zip with bert_intent_model/ as its top-level entry,
# the same layout `zip -r bert_intent_model.zip bert_intent_model` produces.
shutil.make_archive("bert_intent_model", "zip", root_dir=".", base_dir="bert_intent_model")

# Source code of the standalone Gradio app (written to app.py after this string)
app_code = '''
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pickle
import gradio as gr

# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert_intent_model")
tokenizer = BertTokenizer.from_pretrained("bert_intent_model")

# Load label encoder
with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

intent_response_map = {
    "contact_customer_service": "📞 Sure! Connecting you to customer support.",
    "review": "📝 Thanks for your valuable feedback!",
    "get_refund": "💸 I'll help you with the refund process.",
    "switch_account": "🔄 Switching your account now.",
    "complaint": "🚨 Sorry to hear that! Please tell us more about your issue.",
    "check_payment_methods": "💳 You can pay via credit card, UPI, or net banking.",
    "contact_human_agent": "👩‍💼 Let me get a human agent for you.",
    "delivery_period": "⏱ Deliveries usually take 3–5 business days.",
    "check_invoice": "🧾 I can check your invoice details.",
    "get_invoice": "📄 Here’s your invoice. You should also receive it via email.",
    "newsletter_subscription": "📬 You've been subscribed to our newsletter!",
    "place_order": "🛒 Placing your order now!",
    "check_cancellation_fee": "⚠ Let me check the cancellation fee details.",
    "delete_account": "⚠ Deleting your account. Please confirm once more.",
    "cancel_order": "❌ Your order has been canceled.",
    "payment_issue": "⚠ I can help you resolve your payment issue.",
    "create_account": "🧾 Let’s get your account created!",
    "track_refund": "🔍 Let me check the status of your refund.",
    "set_up_shipping_address": "📦 Setting up your shipping address.",
    "track_order": "📦 Tracking your order now.",
    "change_order": "✏ Let’s update your order details.",
    "edit_account": "🛠 I can help you update your account info.",
    "check_refund_policy": "📜 Here’s our refund policy.",
    "delivery_options": "🚚 We offer standard and express delivery options.",
    "recover_password": "🔐 No worries! I’ll help you recover your password.",
    "registration_problems": "🚫 Let me help you fix your registration issue.",
    "change_shipping_address": "📍 Let’s update your shipping address.",
}

def predict_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    intent = le.inverse_transform([pred])[0]
    return intent_response_map.get(intent, "🤖 Sorry, I didn't get that.")

gr.Interface(
    fn=predict_intent,
    inputs=gr.Textbox(lines=2, placeholder="Ask about orders, refunds, account..."),
    outputs="text",
    title="🛍️ E-commerce Chatbot",
    description="Ask me anything related to e-commerce support!"
).launch()
'''
# Write app.py. app_code contains emoji, so the encoding is set to UTF-8
# explicitly; the platform default encoding may be unable to encode them.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

# Save requirements.txt
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write("transformers\ntorch\nscikit-learn\ngradio")

# Download every deployment artifact from the Colab runtime, in order
from google.colab import files

for artifact in (
    "bert_intent_model.zip",
    "label_encoder.pkl",
    "app.py",
    "requirements.txt",
):
    files.download(artifact)



zip error: Nothing to do! (try: zip -r bert_intent_model.zip . -i bert_intent_model)


FileNotFoundError: Cannot find file: bert_intent_model.zip