In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import json
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.linear_model import SGDClassifier
from collections import Counter
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

## Data Loading & Preprocessing

In [2]:
faq_file = open("./chatbot.txt").read()
data = json.loads(faq_file)
df = pd.json_normalize(data)

In [3]:
df.head()

Unnamed: 0,question,answer,found_duplicate
0,Where can I enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...,False
1,What are the documents required for enrolment ...,You need to fill up an application form availa...,False
2,Do I need to bring original documents for Aadh...,"Yes, you need to bring original copies of supp...",False
3,Do I have to pay any fee for Aadhaar enrolment?,"No, Aadhaar enrolment is totally free of cost ...",False
4,What kind of data gets captured during Aadhaar...,There are two types of data gets captured for ...,False


In [4]:
df.drop(["found_duplicate"],axis=1,inplace=True)

In [5]:
df.isnull().sum()

question    0
answer      0
dtype: int64

In [6]:
df.head()

Unnamed: 0,question,answer
0,Where can I enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...
1,What are the documents required for enrolment ...,You need to fill up an application form availa...
2,Do I need to bring original documents for Aadh...,"Yes, you need to bring original copies of supp..."
3,Do I have to pay any fee for Aadhaar enrolment?,"No, Aadhaar enrolment is totally free of cost ..."
4,What kind of data gets captured during Aadhaar...,There are two types of data gets captured for ...


In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stop_words=stopwords.words('english')

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
def clean_data(text):
    text=text.lower() #lower the text
    text = re.sub(r'[^\w\s]', '', text) #remove irrelevant characters    
    text = text.split() #convert sentence to tokens
    text = [lemmatizer.lemmatize(word) for word in text] #lemmatization
    text = " ".join(text) #converting tokens to sentence
    return text

In [11]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
import nlpaug.augmenter.word as naw

In [14]:
import os
os.getcwd()

'C:\\Users\\sovan\\Downloads\\faq-chatbot-main'

In [15]:
# model_type: word2vec, glove or fasttext
aug = naw.SynonymAug(aug_src='wordnet')

In [16]:
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
aug_data = {}
for ques,ans in tqdm(zip(df['question'],df['answer'])):
    for i in range(4):
        aug_data.update({aug.augment(ques):ans})

0it [00:00, ?it/s][nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
0it [00:00, ?it/s]


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng[0m

  Searched in:
    - 'C:\\Users\\sovan/nltk_data'
    - 'C:\\Users\\sovan\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data'
    - 'C:\\Users\\sovan\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data'
    - 'C:\\Users\\sovan\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data'
    - 'C:\\Users\\sovan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [18]:
aug_df = pd.DataFrame(aug_data.items(),columns=['question','answer'])
aug_df.head()

Unnamed: 0,question,answer


In [19]:
final_df = pd.concat([df,aug_df])

In [20]:
final_df.to_csv("augmented.csv",index=False)

## Modelling

In [21]:
final_df = pd.read_csv("augmented.csv")

In [22]:
X = final_df['question']
y = final_df['answer']

In [23]:
le = LabelEncoder()

In [24]:
y = le.fit_transform(y)

In [25]:
import pandas as pd
pd.Series(y).value_counts()


6     2
20    1
21    1
15    1
3     1
9     1
19    1
1     1
14    1
4     1
5     1
2     1
12    1
13    1
10    1
8     1
0     1
11    1
7     1
17    1
16    1
18    1
Name: count, dtype: int64

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.2
)


In [27]:
tf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=1,
    stop_words='english'
)


In [28]:
tf = TfidfVectorizer(
    ngram_range=(1,3),
    min_df=1,
    stop_words='english'
)

X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)


In [29]:
model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
model.fit(X_train_tf,y_train)

0,1,2
,loss,'modified_huber'
,penalty,'l2'
,alpha,0.0005
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [30]:
y_pred = model.predict(X_test_tf)

In [31]:
labels = np.unique(y_test)
ytest_prob = label_binarize(y_test, classes=labels)
ypred_prob = label_binarize(y_pred, classes=labels)

In [32]:
print("Accuracy Score:",accuracy_score(y_test,y_pred))
print("Precision Score:",precision_score(y_test,y_pred,average='micro'))
print("Recall Score:",recall_score(y_test,y_pred,average='micro'))
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

Accuracy Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
ROC-AUC Score: 0.5


## Testing

In [33]:
idx = 2 
print("Question:", X_test.iloc[idx])
print("Predicted Answer:", le.inverse_transform([model.predict(X_test_tf[idx])[0]])[0])
print("Actual Answer:", le.inverse_transform([y_test[idx]])[0])


Question: Is there any age limit for Aadhaar enrolment?
Predicted Answer: No, Aadhaar enrolment is totally free of cost therefore you need not pay anything at the enrolment centre.
Actual Answer: No, there is no age limit defined for Aadhaar Enrolment. Even a new born baby can also get Enroled for Aadhaar.


In [34]:
questn = "how to open new savings account"
clean_ques = clean_data(questn)
clean_ques = tf.transform([clean_ques])
print(f"Question: {questn}")
if np.amax(model.predict_proba(clean_ques))>0.1:
    print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(clean_ques))[0]}")
else:
    print(f"\nPredicted Answer:\n(Not sure about your question, This might help you):\n\n{le.inverse_transform(model.predict(clean_ques))[0]}")

Question: how to open new savings account

Predicted Answer:
(Not sure about your question, This might help you):

Yes. The resident may be allowed to add minor fields such as House No., Lane No., Street Name, correcting typographic errors, minor changes/ corrections to pin code etc. to the address listed in the PoA document as long as these additions/modifications do not alter the base address mentioned in the PoA document. If the changes requested are substantial and change the base address that is listed in the PoA, the resident will require to provide an alternate PoA or enrol through an Introducer.


In [35]:
test_questions = [
    "how to open new savings account",
    "how to update address in aadhaar",
    "what is minimum age for aadhaar",
    "how to link aadhaar with bank",
    "how to download aadhaar card"
]

for question in test_questions:
    clean_q = clean_data(question)
    vec_q = tf.transform([clean_q])
    
    prediction = model.predict(vec_q)
    confidence = np.max(model.predict_proba(vec_q))
    
    print("Question:", question)
    print("Confidence:", round(confidence, 3))
    print("Predicted Answer:", le.inverse_transform(prediction)[0])
    print("-" * 80)


Question: how to open new savings account
Confidence: 0.09
Predicted Answer: Yes. The resident may be allowed to add minor fields such as House No., Lane No., Street Name, correcting typographic errors, minor changes/ corrections to pin code etc. to the address listed in the PoA document as long as these additions/modifications do not alter the base address mentioned in the PoA document. If the changes requested are substantial and change the base address that is listed in the PoA, the resident will require to provide an alternate PoA or enrol through an Introducer.
--------------------------------------------------------------------------------
Question: how to update address in aadhaar
Confidence: 1.0
Predicted Answer: Yes. The resident may be allowed to add minor fields such as House No., Lane No., Street Name, correcting typographic errors, minor changes/ corrections to pin code etc. to the address listed in the PoA document as long as these additions/modifications do not alter the