# Chatbot tutorial
## A basic chatbot to answer queries of Customers of HDFC Bank.
### Data Source: [Click here](https://github.com/priyamsekra10/ADC_chatbot_event)

### Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import json
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.linear_model import SGDClassifier
from collections import Counter
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [1]:
!pip install nlpaug



In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Dataset

In [4]:
import pandas as pd
df = pd.read_csv("hdfc.csv")

In [5]:
# print the first 5 rows
df.head()

Unnamed: 0,question,answer,found_duplicate
0,How do I change my password?,"After you have logged in, you can change your ...",False
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...,False
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ...",False
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...,False
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us...",False


## Preprocessing Phase

In [6]:
# drop the found_duplicate row
df.drop(["found_duplicate"],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,question,answer
0,How do I change my password?,"After you have logged in, you can change your ..."
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ..."
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us..."


In [8]:
# check for null values in the dataset
# will return the count of null values in each column
df.isnull().sum()

question    0
answer      0
dtype: int64

# Natural language processing
## enables them to understand and generate human-like text

## Stopwords
# unnecesary words

In [9]:
# Original Text:
# ['This', 'is', 'an', 'example', 'sentence', 'for', 'demonstrating', 'the', 'removal', 'of', 'stopwords', '.']

# Text after removing stopwords:
# ['example', 'sentence', 'demonstrating', 'removal', 'stopwords', '.']


In [10]:
# Import the NLTK library and download the stopwords for the English language
# initializing stopwords to use it later
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Get the list of English stop words
# stop_words = stopwords.words('english')
stop_words = set(stopwords.words('english'))
stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

# Lemmatization
## reduce words to their base form
## Example: eating --> eat, cats--> cat

In [11]:
# Import the NLTK library
# initializing
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Create a WordNet Lemmatizer instance
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
import re
# re regular expresion
# sub subsitute
def clean_data(text):
    # Convert the text to lowercase
    text = text.lower()

    # Remove non-word characters and whitespace
    text = re.sub(r'[^\w\s]', '', text)   # NOT a word character or a whitespace character.

    # Split the cleaned text into individual words (tokenization)
    words = text.split()

    # Remove stopwords
    # words = [word for word in words if word not in stop_words]

    # Lemmatize each word using the WordNet Lemmatizer
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words back into a sentence
    text = " ".join(words)

    return text


In [13]:
# applying the clean_text function on our data
df["question"] = df["question"].apply(clean_data)

# Data Augmentation
## increase the diversity and quantity of training data
## how:

*   Pre trained models
*   reshuffling sentences
*   replace words by there synonyms



In [143]:
# download the GloVe word vectors dataset
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2023-09-24 20:18:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-09-24 20:18:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-09-24 20:18:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.3’

gl

In [144]:
!unzip glove*.zip

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## Word embeddings are vector representations of words in a multi-dimensional space, capable of capturing semantic relationships between words.

In [14]:
aug = naw.WordEmbsAug(model_type='glove', model_path="./glove.6B.100d.txt",action="substitute")

In [15]:
# Import the tqdm library to create a progress bar
from tqdm import tqdm

# Create an empty dictionary to store augmented data
aug_data = {}

# Iterate through the 'question' and 'answer' columns of the DataFrame 'df' using tqdm for progress visualization
for ques, ans in tqdm(zip(df['question'], df['answer'])):
    if isinstance(ques, list):
        # Check if 'ques' is a list, and if it is, convert it to a string by joining its elements with spaces
        ques_str = ' '.join(ques)
    else:
        # If 'ques' is not a list, keep it as is (presumed to be a string)
        ques_str = ques

    # Iterate four times (for augmenting the data in this example)
    for i in range(4):
        # Augment the 'ques_str' using some function or library called 'aug.augment'
        aug_question = aug.augment(ques_str)

        # Check if the result of 'aug.augment' is a list
        if isinstance(aug_question, list):
            # If it's a list, convert it to a string by joining its elements with spaces
            aug_question = ' '.join(aug_question)

        # Store the augmented question as a key and its corresponding answer in the 'aug_data' dictionary
        aug_data[aug_question] = ans



2236it [13:44,  2.71it/s]


In [16]:
aug_df = pd.DataFrame(aug_data.items(),columns=['question','answer'])
aug_df

Unnamed: 0,question,answer
0,how give i change my code,"After you have logged in, you can change your ..."
1,might do i change my overdraft,"After you have logged in, you can change your ..."
2,how you actually change my password,"After you have logged in, you can change your ..."
3,how do i change going deleting,"After you have logged in, you can change your ..."
4,when will i receive my until subscriber hat,You will receive your new ATM PIN by post with...
...,...,...
8938,those is the promo code same had entered in th...,Promo Code is an optional field and has to be ...
8939,after loan disbursal how way receive the non l...,"Login to NetBanking, Click Cards tab > Credit ..."
8940,after grants re-assessment how well check this...,"Login to NetBanking, Click Cards tab > Credit ..."
8941,after loan palliation how to check the very lo...,"Login to NetBanking, Click Cards tab > Credit ..."


In [17]:
final_df = pd.concat([df,aug_df])
final_df

Unnamed: 0,question,answer
0,how do i change my password,"After you have logged in, you can change your ..."
1,when will i receive my changed atm pin,You will receive your new ATM PIN by post with...
2,can i get my newly generated pin online,"No, for security reasons we send you your ATM ..."
3,how can i register for autopay,To register for Autopay: Step 1: Click on the ...
4,can chip credit card be used anywhere,"Yes, your HDFC Bank Chip Credit card can be us..."
...,...,...
8938,those is the promo code same had entered in th...,Promo Code is an optional field and has to be ...
8939,after loan disbursal how way receive the non l...,"Login to NetBanking, Click Cards tab > Credit ..."
8940,after grants re-assessment how well check this...,"Login to NetBanking, Click Cards tab > Credit ..."
8941,after loan palliation how to check the very lo...,"Login to NetBanking, Click Cards tab > Credit ..."


In [18]:
final_df['question_type'] = final_df['question'].apply(lambda q: type(q).__name__)
final_df['answer_type'] = final_df['answer'].apply(lambda q: type(q).__name__)
final_df

Unnamed: 0,question,answer,question_type,answer_type
0,how do i change my password,"After you have logged in, you can change your ...",str,str
1,when will i receive my changed atm pin,You will receive your new ATM PIN by post with...,str,str
2,can i get my newly generated pin online,"No, for security reasons we send you your ATM ...",str,str
3,how can i register for autopay,To register for Autopay: Step 1: Click on the ...,str,str
4,can chip credit card be used anywhere,"Yes, your HDFC Bank Chip Credit card can be us...",str,str
...,...,...,...,...
8938,those is the promo code same had entered in th...,Promo Code is an optional field and has to be ...,str,str
8939,after loan disbursal how way receive the non l...,"Login to NetBanking, Click Cards tab > Credit ...",str,str
8940,after grants re-assessment how well check this...,"Login to NetBanking, Click Cards tab > Credit ...",str,str
8941,after loan palliation how to check the very lo...,"Login to NetBanking, Click Cards tab > Credit ...",str,str


In [19]:
final_df.to_csv("augmented.csv",index=False)

## Model Building Phase

In [20]:
final_df = pd.read_csv("augmented.csv")

In [21]:
X = final_df['question']  # X represents your input features (questions) # input
y = final_df['answer']  # y represents the corresponding labels (answers) # expected output

In [22]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of the LabelEncoder
# used for converting categorical variables into numerical variables
le = LabelEncoder()

In [23]:
y = le.fit_transform(y)

## Split into train and test data


In [24]:
from sklearn.model_selection import train_test_split
# test size 0.2 means 20% test data
# stratify: number of cases equal in X and y.
# random state: must use format, ensures same labels as many times code is running
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100,test_size=0.2,stratify=y)

### TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical representation of text data that measures the importance of words in a document relative to a collection of documents. It's commonly used in information retrieval, text mining, and natural language processing to identify the significance of words for text analysis and search.

In [25]:
# Import the necessary library
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer instance with specified settings
# ngram_range=(1, 3) allows for unigrams, bigrams, and trigrams (word combinations)
# min_df=0 specifies that words should be included if they occur in at least 1 document
# stop_words='english' removes common English stopwords from the text
tf = TfidfVectorizer(ngram_range=(1, 3), min_df=0, stop_words='english')

X_train_tf = tf.fit_transform(X_train)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming you have a TfidfVectorizer instance 'tf' and training data 'X_train'
# tf.fit(X_train)  # Fit the vectorizer on your training data

# Now, you can transform your test data
X_test_tf = tf.transform(X_test)

In [27]:
X_test_tf = tf.transform(X_test)

In [28]:
from sklearn.linear_model import SGDClassifier

# Create a SGDClassifier model with specified settings
# n_jobs=-1: Utilize all available CPU cores for faster training
# random_state=100: Set a fixed random seed for reproducible results
# loss='modified_huber': Use the modified Huber loss function
# alpha=0.0005: Specify the regularization strength
model = SGDClassifier(n_jobs=-1, random_state=100, loss='modified_huber', alpha=0.0005)

# Fit the model to the training data
model.fit(X_train_tf, y_train)


In [29]:
# Make predictions using the trained model
# X_test_tf: The TF-IDF transformed features of the test data
y_pred = model.predict(X_test_tf)

In [30]:
from sklearn.preprocessing import label_binarize

# Obtain the unique labels in the test data
labels = np.unique(y_test)

# Convert the true labels (y_test) to binary format
ytest_prob = label_binarize(y_test, classes=labels)

# Convert the predicted labels (y_pred) to binary format
ypred_prob = label_binarize(y_pred, classes=labels)

In [31]:
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

ROC-AUC Score: 0.8635325272334224


## Testing Phase

In [34]:
idx = 8 # (index on question)
print(f"Question: {X_test.iloc[idx]}")
# Make a prediction for the selected test example and obtain the predicted answer
# Retrieve the actual answer for the selected test example
print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(X_test_tf[idx]))[0]}")
# Retrieve the actual answer for the selected test example
print(f"\nActual Answer:\n{le.inverse_transform([y_test[idx]])[0]}")

Question: what majid all contribution mean

Predicted Answer:
Own Contribution is the total cost of the property less HDFCs home loan.

Actual Answer:
Own Contribution is the total cost of the property less HDFCs home loan.


In [35]:
# Define the new question
questn = "how to open new savings account"

# Clean the new question using the 'clean_data' function
clean_ques = clean_data(questn)

# Transform the cleaned question using the TF-IDF vectorizer
clean_ques = tf.transform([clean_ques])

print(f"Question: {questn}")

# If the confidence is above a certain threshold (0.1 in this case), print the predicted answer
if np.amax(model.predict_proba(clean_ques))>0.1:
    print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(clean_ques))[0]}")
else:
    print(f"\nPredicted Answer:\n(Not sure about your question, This might help you):\n\n{le.inverse_transform(model.predict(clean_ques))[0]}")

Question: how to open new current account

Predicted Answer:
(Not sure about your question, This might help you):

In order to open a new Savings Account, simply walk into the nearest HDFC Bank and speak to a customer service executive. Remember to carry the following documents (original for verification and self-attested copies for submission):Identity ProofAddress ProofLatest passport size photographsClick here to see the List of valid identity/addres proof.
