<a href="https://colab.research.google.com/github/shubhu1026/AI-ML/blob/main/NLP_FInal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle transformers tensorflow scikit-learn



In [None]:
# Set up kaggle credentials.
# Expects the API token file `kaggle.json` in the current working directory;
# it is copied to ~/.kaggle/ and restricted to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the chatbot dataset (arrives as a zip archive in the working directory)
!kaggle datasets download -d niraliivaghani/chatbot-dataset

Dataset URL: https://www.kaggle.com/datasets/niraliivaghani/chatbot-dataset
License(s): DbCL-1.0


In [None]:
!unzip chatbot-dataset.zip

Archive:  chatbot-dataset.zip
  inflating: intents.json            


In [None]:
# Load and preprocess the dataset
import json
import pandas as pd

# Load JSON file.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding.
with open("intents.json", encoding="utf-8") as f:
    data = json.load(f)

# Convert to DataFrame: one row per example pattern, labelled with its intent tag.
# Both lists are built by walking the intents in the same order, so they stay aligned.
patterns = [
    pattern
    for intent in data['intents']
    for pattern in intent['patterns']
]
tags = [
    intent['tag']
    for intent in data['intents']
    for _ in intent['patterns']
]

df = pd.DataFrame({'text': patterns, 'label': tags})

# Fail early with a clear message instead of a confusing error further down.
assert not df.empty, "intents.json contains no training patterns"

# Encode labels
from sklearn.preprocessing import LabelEncoder
# Map each intent tag to an integer id and attach the ids as a `label_id` column.
label_encoder = LabelEncoder()
df = df.assign(label_id=label_encoder.fit_transform(df['label']))

# Number of distinct tags the encoder learned; later used as the classifier's label count.
num_classes = label_encoder.classes_.shape[0]

In [None]:
# Preview the first five rows to sanity-check the text/label/label_id columns.
df.head(n=5)

Unnamed: 0,text,label,label_id
0,Hi,greeting,14
1,How are you?,greeting,14
2,Is anyone there?,greeting,14
3,Hello,greeting,14
4,Good day,greeting,14


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for validation. The split is stratified on the
# label ids and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=df['label_id'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label_id'].tolist(), **split_options
)

In [None]:
from transformers import BertTokenizer
# Tokenizer matching the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the train and validation texts with identical settings.
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import tensorflow as tf

def encode_dataset(encodings, labels):
    """Pair tokenizer output with labels as a `tf.data.Dataset`.

    Args:
        encodings: Mapping-like tokenizer output; converted to a plain dict.
        labels: Labels passed alongside the encodings (presumably one per
            example, in the same order - confirm at the call site).

    Returns:
        The dataset produced by slicing the `(features, labels)` pair.
    """
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))

BATCH_SIZE = 16
SHUFFLE_SEED = 42

# Keras does not shuffle a tf.data.Dataset passed to fit(), so the training set is
# shuffled here. The buffer covers the whole split (a full shuffle), and by default
# the order is redrawn every epoch. Shuffle before batching so batches are re-formed.
train_dataset = (
    encode_dataset(train_encodings, train_labels)
    .shuffle(buffer_size=len(train_labels), seed=SHUFFLE_SEED)
    .batch(BATCH_SIZE)
)

# Validation order does not affect the metrics, so it is only batched.
val_dataset = encode_dataset(val_encodings, val_labels).batch(BATCH_SIZE)


In [None]:
from transformers import TFBertForSequenceClassification

# Seed TensorFlow's global RNG before the model is built: the classification head
# is newly initialised (not loaded from the checkpoint), so without a seed each
# run starts from different weights and reports different accuracy.
tf.random.set_seed(42)

model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits and the labels are integer ids, hence the
# sparse categorical loss with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Keep the History object so per-epoch loss/accuracy can be inspected afterwards,
# instead of leaving an opaque repr as the cell output.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=3)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x79743a709ed0>

In [None]:
# Score the trained model on the validation split; the result is unpacked
# into the loss and the accuracy metric configured at compile time.
loss, accuracy = model.evaluate(x=val_dataset)
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 0.52


In [None]:
def predict_intent(text):
    """Return the intent tag the trained classifier predicts for `text`.

    Relies on the notebook-level `tokenizer`, `model` and `label_encoder`.

    Args:
        text: The user utterance to classify.

    Returns:
        The tag whose logit is highest, decoded back to its original label.
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="tf")
    scores = model(encoded).logits
    # One input row, so take the arg-max of the first (only) row.
    best_index = tf.argmax(scores, axis=1).numpy()[0]
    (tag,) = label_encoder.inverse_transform([best_index])
    return tag


# Smoke-test the classifier on a single example question.
sample_question = "What are the admission requirements?"
predict_intent(sample_question)

'course'

In [None]:
# Interactive Chatbot in Colab
import ipywidgets as widgets
from IPython.display import display, clear_output
import random

# Load your original intent-response mapping.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding.
with open("intents.json", encoding="utf-8") as f:
    intents_data = json.load(f)

# Build a dictionary: tag -> list of responses
response_dict = {intent["tag"]: intent["responses"] for intent in intents_data["intents"]}

# Function to get model prediction and a matching canned reply
def get_response(user_input):
    """Classify `user_input` and pick a reply for the predicted intent.

    Delegates classification to `predict_intent` (defined in an earlier cell)
    rather than repeating the tokenize / forward-pass / decode steps here.

    Args:
        user_input: The text typed by the user.

    Returns:
        A `(tag, response)` tuple: the predicted intent tag and one response
        chosen at random from that tag's list in `response_dict`.
    """
    tag = predict_intent(user_input)
    # `or` also covers a tag whose response list is present but empty, which
    # would otherwise make random.choice raise IndexError.
    candidates = response_dict.get(tag) or ["I'm not sure how to help with that."]
    response = random.choice(candidates)
    return tag, response

# Create chatbot UI using widgets.
# continuous_update=False makes the Text widget sync `value` when the user presses
# Enter (or the box loses focus) instead of on every keystroke, so the model runs
# once per submitted message rather than once per typed character.
input_box = widgets.Text(
    placeholder='Ask me something about university...',
    continuous_update=False,
)
output_area = widgets.Output()

def on_enter_key(change):
    """Handle a submitted message: classify it and show the bot's reply.

    Args:
        change: The traitlets change dict for the Text widget's `value` trait;
            `change['new']` holds the submitted text.
    """
    user_input = change['new'].strip()
    # Ignore blank submissions; this also stops the handler from re-running
    # when the box is cleared below.
    if not user_input:
        return
    with output_area:
        clear_output()
        tag, response = get_response(user_input)
        print(f"🧑 You: {user_input}")
        print(f"🤖 Bot ({tag}): {response}")
    input_box.value = ''  # Clear input

# Subscribe to `value` changes only; without `names` the handler is called for
# every trait change on the widget.
input_box.observe(on_enter_key, names='value')
display(input_box, output_area)

Text(value='', placeholder='Ask me something about university...')

Output()