In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import json
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.linear_model import SGDClassifier
from collections import Counter
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [2]:
faq_file = open("./chatbot.txt").read()
data = json.loads(faq_file)
df = pd.json_normalize(data)

In [3]:
df.head()

Unnamed: 0,question,answer,found_duplicate
0,Where can I enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...,False
1,What are the documents required for enrolment ...,You need to fill up an application form availa...,False
2,Do I need to bring original documents for Aadh...,"Yes, you need to bring original copies of supp...",False
3,Do I have to pay any fee for Aadhaar enrolment?,"No, Aadhaar enrolment is totally free of cost ...",False
4,What kind of data gets captured during Aadhaar...,There are two types of data gets captured for ...,False


In [4]:
df.drop(["found_duplicate"],axis=1,inplace=True)

In [5]:
df.isnull().sum()

question    0
answer      0
dtype: int64

In [6]:
df.head()

Unnamed: 0,question,answer
0,Where can I enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...
1,What are the documents required for enrolment ...,You need to fill up an application form availa...
2,Do I need to bring original documents for Aadh...,"Yes, you need to bring original copies of supp..."
3,Do I have to pay any fee for Aadhaar enrolment?,"No, Aadhaar enrolment is totally free of cost ..."
4,What kind of data gets captured during Aadhaar...,There are two types of data gets captured for ...


In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stop_words=stopwords.words('english')

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
def clean_data(text):
    text=text.lower() #lower the text
    text = re.sub(r'[^\w\s]', '', text) #remove irrelevant characters    
    text = text.split() #convert sentence to tokens
    text = [lemmatizer.lemmatize(word) for word in text] #lemmatization
    text = " ".join(text) #converting tokens to sentence
    return text

In [11]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
import nlpaug.augmenter.word as naw

In [14]:
import os
os.getcwd()

'C:\\Users\\sovan\\Downloads\\CodeAlpha Project\\faq-chatbot-main'

In [15]:
# model_type: word2vec, glove or fasttext
aug = naw.SynonymAug(aug_src='wordnet')

In [16]:
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
aug_data = {}

for ques, ans in tqdm(zip(df['question'], df['answer'])):
    for i in range(4):
        augmented_questions = aug.augment(ques)

        # if it returns a list
        if isinstance(augmented_questions, list):
            for aug_q in augmented_questions:
                aug_data[aug_q] = ans
        else:
            aug_data[augmented_questions] = ans


23it [00:00, 276.22it/s]


In [19]:
aug_df = pd.DataFrame(aug_data.items(),columns=['question','answer'])
aug_df.head()

Unnamed: 0,question,answer
0,Where can I enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...
1,Where terminate 1 enter for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...
2,Where buttocks One enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...
3,Where send away Ace enrol for Aadhaar?,You have to visit an Aadhaar Enrolment centre ...
4,What be the documents required for enrolment f...,You need to fill up an application form availa...


In [20]:
final_df = pd.concat([df,aug_df])

In [21]:
final_df.to_csv("augmented.csv",index=False)

In [22]:
final_df = pd.read_csv("augmented.csv")

In [23]:
X = final_df['question']
y = final_df['answer']

In [24]:
le = LabelEncoder()

In [26]:
print(df.columns)


Index(['question', 'answer'], dtype='object')


In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("augmented.csv")

print(df.columns)   # check columns

y = df["answer"]    # use actual column name

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(y_encoded)


Index(['question', 'answer'], dtype='object')
[20 21 15  3  9  6  6 19  1 14  4  5  2 12 13 10  8  0 11  7 17 16 18 20
 20 20 20 21 21 21 21 15 15 15 15  3  3  3  3  9  9  9  9  6  6  6  6  6
  6  6  6 19 19 19 19  1  1  1  1 14 14 14 14  4  4  4  4  5  5  5  5  2
  2  2  2 12 12 12 12 13 13 13 13 10 10 10 10  8  8  8  8  0  0  0  0 11
 11 11 11  7  7  7  7 17 17 17 17 16 16 16 16 18 18 18 18]


In [28]:
y = le.fit_transform(y)

In [29]:
import pandas as pd
pd.Series(y).value_counts()


6     10
20     5
21     5
15     5
3      5
9      5
19     5
1      5
14     5
4      5
5      5
2      5
12     5
13     5
10     5
8      5
0      5
11     5
7      5
17     5
16     5
18     5
Name: count, dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.2
)


In [31]:
tf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=1,
    stop_words='english'
)


In [32]:
tf = TfidfVectorizer(
    ngram_range=(1,3),
    min_df=1,
    stop_words='english'
)

X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)


In [33]:
model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
model.fit(X_train_tf,y_train)

0,1,2
,loss,'modified_huber'
,penalty,'l2'
,alpha,0.0005
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [34]:
y_pred = model.predict(X_test_tf)

In [35]:
labels = np.unique(y_test)
ytest_prob = label_binarize(y_test, classes=labels)
ypred_prob = label_binarize(y_pred, classes=labels)

  y_type = type_of_target(y)
  y_type = type_of_target(y)


In [36]:
print("Accuracy Score:",accuracy_score(y_test,y_pred))
print("Precision Score:",precision_score(y_test,y_pred,average='micro'))
print("Recall Score:",recall_score(y_test,y_pred,average='micro'))
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

Accuracy Score: 0.9565217391304348
Precision Score: 0.9565217391304348
Recall Score: 0.9565217391304348
ROC-AUC Score: 0.9767080745341615


  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)


In [37]:
idx = 2 
print("Question:", X_test.iloc[idx])
print("Predicted Answer:", le.inverse_transform([model.predict(X_test_tf[idx])[0]])[0])
print("Actual Answer:", le.inverse_transform([y_test[idx]])[0])


Question: Can I experience myself Enroled for Aadhaar just by institutionalise require documents by Office?
Predicted Answer: No, you have to personally visit Aadhaar enrolment centre for getting yourself Enroled as your Biometrics will be captured.
Actual Answer: No, you have to personally visit Aadhaar enrolment centre for getting yourself Enroled as your Biometrics will be captured.


In [42]:
import numpy as np

questn = "Does the online downloaded Aadhaar letter have same validity as that of original?"

clean_ques = clean_data(questn)
clean_ques = tf.transform([clean_ques])

# Get probabilities
probs = model.predict_proba(clean_ques)
confidence = np.max(probs)

# Prediction
pred_class = model.predict(clean_ques)[0]
pred_answer = le.inverse_transform([pred_class])[0]

print(f"Question: {questn}")
print(f"Confidence: {confidence*100:.2f}%")
print(f"Predicted Answer: {pred_answer}")
print("------------------------------------------------------------")


Question: Does the online downloaded Aadhaar letter have same validity as that of original?
Confidence: 100.00%
Predicted Answer: Yes, online downloaded e-Aadhaar letter has the same validity as that of original.
------------------------------------------------------------


In [43]:
test_questions = [
    "how to open new savings account",
    "how to update address in aadhaar",
    "what is minimum age for aadhaar",
    "how to link aadhaar with bank",
    "how to download aadhaar card"
]

for question in test_questions:
    clean_q = clean_data(question)
    vec_q = tf.transform([clean_q])
    
    prediction = model.predict(vec_q)
    confidence = np.max(model.predict_proba(vec_q))
    
    print("Question:", question)
    print("Confidence:", round(confidence, 3))
    print("Predicted Answer:", le.inverse_transform(prediction)[0])
    print("-" * 80)


Question: how to open new savings account
Confidence: 0.109
Predicted Answer: Yes. The resident may be allowed to add minor fields such as House No., Lane No., Street Name, correcting typographic errors, minor changes/ corrections to pin code etc. to the address listed in the PoA document as long as these additions/modifications do not alter the base address mentioned in the PoA document. If the changes requested are substantial and change the base address that is listed in the PoA, the resident will require to provide an alternate PoA or enrol through an Introducer.
--------------------------------------------------------------------------------
Question: how to update address in aadhaar
Confidence: 0.045
Predicted Answer: No, there is no age limit defined for Aadhaar Enrolment. Even a new born baby can also get Enroled for Aadhaar.
--------------------------------------------------------------------------------
Question: what is minimum age for aadhaar
Confidence: 0.381
Predicted Ans