In [1]:
# Import required libraries
import random
import warnings
import numpy as np
import pandas as pd
import json
import ssl
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import nltk

# Setup SSL for NLTK downloads
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sridevi.tandley\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sridevi.tandley\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [2]:
# Load the dataset
with open('data.json') as file:
    corpus = json.load(file)

corpus

{'intents': [{'tag': 'Intro',
   'patterns': ['hi',
    'how are you',
    'is anyone there',
    'hello',
    'whats up',
    'hey',
    'yo',
    'listen',
    'please help me',
    'i am learner from',
    'i belong to',
    'aiml team',
    'data analytics team',
    'software team',
    'i am from',
    'my manager is',
    'online',
    'i am from',
    'hey ya',
    'talking to you for first time'],
   'responses': ['Hello! how can i help you ?'],
   'context_set': ''},
  {'tag': 'Exit',
   'patterns': ['thank you',
    'thanks',
    'cya',
    'see you',
    'later',
    'see you later',
    'goodbye',
    'i am leaving',
    'have a Good day',
    'you helped me',
    'thanks a lot',
    'thanks a ton',
    'you are the best',
    'great help',
    'too good',
    'you are a good learning buddy'],
   'responses': ['I hope I was able to assist you, Good Bye'],
   'context_set': ''},
  {'tag': 'SL',
   'patterns': ['i am not able to understand svm',
    'explain me how machine l

In [3]:
# Prepare the data
patterns = []
tags = []
for intent in corpus['intents']:
    for pattern in intent['patterns']:
        patterns.append(pattern)
        tags.append(intent['tag'])

print(patterns)
print('-'*50)
print(tags)

['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml team', 'data analytics team', 'software team', 'i am from', 'my manager is', 'online', 'i am from', 'hey ya', 'talking to you for first time', 'thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy', 'i am not able to understand svm', 'explain me how machine learning works', 'i am not able to understand naive bayes', 'i am not able to understand logistic regression', 'i am not able to understand ensemble techb=niques', 'i am not able to understand knn', 'i am not able to understand knn imputer', 'i am not able to understand cross validation', 'i am not able to understand boosting', 'i am not able to understand random forest', 'i am not able to understand ada boos

In [11]:
# Text preprocessing

# Initialize Stemmer
stemmer = PorterStemmer()

# Text preprocessing function
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(token.lower()) for token in tokens if token.isalpha()]
    return ' '.join(tokens)

# Preprocess the patterns
patterns = [preprocess_text(pattern) for pattern in patterns]
print(patterns)
len(patterns)

['hi', 'how are you', 'is anyon there', 'hello', 'what up', 'hey', 'yo', 'listen', 'plea help me', 'i am learner from', 'i belong to', 'aiml team', 'data analyt team', 'softwar team', 'i am from', 'my manag is', 'onlin', 'i am from', 'hey ya', 'talk to you for first time', 'thank you', 'thank', 'cya', 'see you', 'later', 'see you later', 'goodbi', 'i am leav', 'have a good day', 'you help me', 'thank a lot', 'thank a ton', 'you are the best', 'great help', 'too good', 'you are a good learn buddi', 'i am not abl to understand svm', 'explain me how machin learn work', 'i am not abl to understand naiv bay', 'i am not abl to understand logist regress', 'i am not abl to understand ensembl', 'i am not abl to understand knn', 'i am not abl to understand knn imput', 'i am not abl to understand cross valid', 'i am not abl to understand boost', 'i am not abl to understand random forest', 'i am not abl to understand ada boost', 'i am not abl to understand gradient boost', 'machin learn', 'ml', 's

115

In [5]:
# Vectorization

# Vectorize the patterns using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(patterns).toarray()

X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.47211097,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [6]:
# Encode the labels
#Y is coming from Tag which is nothing but target features which consists of responses to various queries from input features (X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(tags)
y

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6,
       6, 6, 6, 6, 6], dtype=int64)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Define models to compare their performance
models = {
    "Neural Network": Sequential([
        Dense(128, input_dim=X_train.shape[1], activation='relu'),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dropout(0.5),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', probability=True)
}

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
# Compile the neural network model
models["Neural Network"].compile(optimizer=Adam(learning_rate=0.01),
                                 loss='sparse_categorical_crossentropy',
                                 metrics=['accuracy'])

In [10]:
models["Neural Network"].summary()

In [12]:
# Train and evaluate each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    if model_name == "Neural Network":
        # Train Neural Network
        model.fit(X_train, y_train, epochs=30, batch_size=16, verbose=1)
        y_pred = np.argmax(model.predict(X_test), axis=1)
    else:
        # Train Logistic Regression and SVM
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Evaluate the model
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Training Neural Network...
Epoch 1/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.1497 - loss: 1.9396
Epoch 2/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1252 - loss: 1.9429 
Epoch 3/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2995 - loss: 1.7639 
Epoch 4/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4521 - loss: 1.5253 
Epoch 5/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4538 - loss: 1.3559 
Epoch 6/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5789 - loss: 1.1119 
Epoch 7/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5026 - loss: 1.1380  
Epoch 8/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5939 - loss: 1.0465 
Epoch 9/30
[1m6/6[0m [32m━━━━━━━━

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Random Forest Classification Report:
              precision    recall  f1-score   support

         Bot       1.00      1.00      1.00         1
        Exit       0.00      0.00      0.00         1
       Intro       0.75      0.50      0.60         6
          NN       0.75      0.50      0.60         6
     Profane       0.00      0.00      0.00         2
          SL       0.38      1.00      0.56         5
      Ticket       0.00      0.00      0.00         2

    accuracy                           0.52        23
   macro avg       0.41      0.43      0.39        23
weighted avg       0.52      0.52      0.48        23


Training Gradient Boosting...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Gradient Boosting Classification Report:
              precision    recall  f1-score   support

         Bot       1.00      1.00      1.00         1
        Exit       0.00      0.00      0.00         1
       Intro       0.75      0.50      0.60         6
          NN       1.00      0.67      0.80         6
     Profane       0.00      0.00      0.00         2
          SL       0.36      1.00      0.53         5
      Ticket       0.00      0.00      0.00         2

    accuracy                           0.57        23
   macro avg       0.44      0.45      0.42        23
weighted avg       0.58      0.57      0.52        23


Training Support Vector Machine...

Support Vector Machine Classification Report:
              precision    recall  f1-score   support

         Bot       1.00      1.00      1.00         1
        Exit       0.00      0.00      0.00         1
       Intro       0.75      0.50      0.60         6
          NN       0.80      0.67      0.73         6
     Pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- From the metrics we can clearly see that Neural Network performs better compared to other models

In [13]:
# Function to predict user input using the best model
def predict_class(text, model):
    processed_text = preprocess_text(text)
    text_vector = vectorizer.transform([processed_text]).toarray()
    if isinstance(model, Sequential):
        prediction = np.argmax(model.predict(text_vector), axis=1)
    else:
        prediction = model.predict(text_vector)
    tag = label_encoder.inverse_transform(prediction)[0]
    return tag


In [14]:
# Chat function
def chat(model):
    print("Chat with the bot (type 'quit' to stop):")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "quit":
            print("Goodbye!")
            break
        tag = predict_class(user_input, model)
        responses = [intent['responses'] for intent in corpus['intents'] if intent['tag'] == tag][0]
        print(random.choice(responses))

In [15]:
# Use the best model for chatting (choose based on performance metrics)
best_model = models["Neural Network"]
chat(best_model)

Chat with the bot (type 'quit' to stop):


You:  Hi


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
Link: Neural Nets wiki


You:  hi


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Link: Neural Nets wiki


You:  hi, how are you


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Hello! how can i help you ?


You:  explain me how machine learning works


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
Link: Machine Learning wiki 


You:  i am not able to understand ada boosting


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Link: Machine Learning wiki 


You:  what is deep learning


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Link: Neural Nets wiki


You:  ftmax


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Please use respectful words


You:  softmax


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Link: Neural Nets wiki


You:  have a Good day


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
I hope I was able to assist you, Good Bye


You:  quit


Goodbye!
