In [19]:
import json
import random
import nltk
from nltk.stem import WordNetLemmatizer # Lemmatizer to reduce words to their base form
import re
from nltk.tokenize import word_tokenize  # Tokenizer to split text into words
from nltk.corpus import stopwords  # List of common stopwords to remove
import pandas as pd

In [20]:
# Download the NLTK resources this notebook relies on.
# Packages that are already present are reported as up-to-date and not fetched again.
nltk.download('punkt')      # Punkt tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')  # Punkt tables that word_tokenize looks up on newer NLTK releases (3.9+)
nltk.download('wordnet')    # WordNet corpus backing WordNetLemmatizer
nltk.download('stopwords')  # Stopword list

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
# ----- Load the Dataset -----
# JSON text is UTF-8; pass the encoding explicitly so the file is not decoded with
# the platform default (e.g. cp1252 on Windows), which would corrupt or reject
# non-ASCII characters in patterns and responses.
with open("intents.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [22]:
# def preprocess(text):
#     # Convert text to lowercase
#     text = text.lower()
#     # Step 2: Removing special characters and numbers (keep only alphabets and spaces)
#     text = re.sub(r'[^a-z\s]', '', text)
#     # step 3: Tokenize the text into words
#     words = word_tokenize(text)
#     # step 4:Remove stopwords; you can customize this list or use nltk's stopwords
#     stop_words = set(stopwords.words('english'))
#     filtered_words = [word for word in words if word not in stop_words]
#     # Lemmatize the filtered words
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
#     return lemmatized_words
# # Use the preprocess function as our tokenizer
# def tokenize_and_lemmatize(text):
#     return preprocess(text)
    
# NOTE: The dataset is already fairly clean, and I don't want to risk removing words that might be important for intent classification,
# so the heavier preprocessing steps above (special-character and stopword removal) are intentionally skipped.

In [65]:
# Single WordNet lemmatizer instance, created once and shared by every call below.
lemmatizer = WordNetLemmatizer()


def tokenize_and_lemmatize(text):
    """Split ``text`` into lowercase tokens and lemmatize each one.

    The text is lowercased, tokenized with NLTK's ``word_tokenize`` and every
    token is passed through the shared ``lemmatizer``. Punctuation tokens
    produced by the tokenizer are kept in the result.

    Args:
        text: The raw sentence to process.

    Returns:
        A list of lemmatized, lowercase tokens in their original order.
    """
    lemmas = []
    for word in word_tokenize(text.lower()):
        lemmas.append(lemmatizer.lemmatize(word.lower()))
    return lemmas

In [63]:
# Build the training set: one entry per example pattern, paired with its intent tag.
patterns = []
tags = []
for intent in data["intents"]:
    examples = intent["patterns"]
    # Add every example sentence of this intent ...
    patterns.extend(examples)
    # ... and repeat the intent's tag once per example so both lists stay aligned.
    tags.extend(intent["tag"] for _ in examples)

# Put sentences and labels side by side in a DataFrame and preview the first rows.
df = pd.DataFrame({'text': patterns, 'label': tags})
df.head(15)

Unnamed: 0,text,label
0,Hi,greeting
1,Hello,greeting
2,Hey,greeting
3,Good morning,greeting
4,Good evening,greeting
5,Bye,goodbye
6,See you later,goodbye
7,Goodbye,goodbye
8,Take care,goodbye
9,Thanks,thanks


In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features using the custom tokenizer/lemmatizer.
# token_pattern=None: the default regex is ignored whenever a tokenizer is supplied,
# and leaving it at its default makes scikit-learn warn that it will not be used.
vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize, token_pattern=None)
# Learn the vocabulary from all patterns and transform them into numerical features
X = vectorizer.fit_transform(df['text'])
y = df['label']



In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data so a held-out score can be reported
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the training split and evaluate on the held-out split.
# NOTE: the dataset is tiny, so this score is very noisy; the TF-IDF vocabulary
# was also fitted on all rows before the split, which makes it slightly optimistic.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Refit on every pattern for the model the chatbot actually uses: with so few
# examples per intent, holding 20% back would leave some patterns (possibly whole
# intents) unseen by the deployed classifier.
model.fit(X, y)

In [47]:
# Sanity-check the training lists: sizes first, then the first entry of each.
for label, items in (("Total patterns:", patterns), ("Total tags:", tags)):
    print(label, len(items))
for label, items in (("Sample pattern:", patterns), ("Sample tag:", tags)):
    print(label, items[0])

# Show what the tokenizer/lemmatizer produces for a typical question.
sample = "What are your opening hours?"
tokens = tokenize_and_lemmatize(sample)
print("Tokens:", tokens)


Total patterns: 22
Total tags: 22
Sample pattern: Hi
Sample tag: greeting
Tokens: ['what', 'are', 'your', 'opening', 'hour', '?']


In [55]:
# function to get chatbot response
def predict_intent(user_input):
    """Return a chatbot reply for ``user_input``.

    The text is converted to TF-IDF features with the fitted ``vectorizer``,
    classified with ``model``, and a random response of the predicted intent
    is picked from ``data["intents"]``.

    Args:
        user_input: The message typed by the user.

    Returns:
        A response string for the predicted intent, or the fallback message
        when the input is blank, contains no token known to the vectorizer,
        or the predicted tag has no entry in the dataset.
    """
    fallback = "I'm sorry, I didn't understand that."

    # Blank input carries nothing to classify.
    if not user_input or not user_input.strip():
        return fallback

    # Converting to TF-IDF features
    text_vec = vectorizer.transform([user_input])
    # No stored entries means none of the tokens are in the fitted vocabulary;
    # the classifier would then pick an intent with no evidence from the text,
    # so answer with the fallback instead of a misleading reply.
    if text_vec.nnz == 0:
        return fallback

    # Predicting the intent
    intent = model.predict(text_vec)[0]
    # finding a random response for the predicted intent
    for item in data["intents"]:
        if item["tag"] == intent:
            return random.choice(item["responses"])
    return fallback

In [56]:
# Smoke test: run one hand-written query through the chatbot and show its reply.
sample_input = "What hour?"
sample_reply = predict_intent(sample_input)
print("Predicted Response:", sample_reply)

Predicted Response: The current time is 2 PM.


In [58]:
# Read one message from the user and print the chatbot's reply to it.
user_message = input("Type your message: ")
chatbot_reply = predict_intent(user_message)
print("Chatbot response:", chatbot_reply)


Type your message:  hey,how are you?


Chatbot response: Good to see you!
