In [1]:
import pandas as pd


import nltk   #This is a core NLP library
from nltk.corpus import stopwords  #
import re

#Used for creating a feature vector
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#Used for labelling the feature_output
from sklearn.preprocessing import LabelEncoder

#some classification algorithms to check into
from sklearn.svm import SVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

#Used to compare similarity between docs
from sklearn.metrics.pairwise import cosine_similarity

#Used to return a sample response.
import numpy as np

In [2]:
#Reading initial sample knowledge base i created 
df=pd.read_excel('data.xlsx')

In [3]:
df.head()

Unnamed: 0,Category,Tag,Question,Answer
0,My Account,Rating and Reviews,How to Review Product as well as Seller?,Everything is right here. Please check this ou...
1,My Account,Rating and Reviews,How to Filter Search Results with Product Rati...,Products can also be filtered according to the...
2,My Account,Rating and Reviews,What are Ratings & Reviews and how do I write ...,Daraz customers have the option of rating and ...
3,My Account,Product Warranty,How do I know if a product comes with free ins...,Unfortunately we are not providing the Free In...
4,My Account,Product Warranty,How do I know if a product comes with warranty?,If a warranty is offered on a product the warr...


In [4]:
txt=df['Answer'][2]
txt

'Daraz customers have the option of rating and reviewing their product and seller experiences.,The customers can both leave star ratings and add text to describe what they liked or disliked about their experience.'

In [5]:
#Some answers may have more than 1 responses so splitting them using ',' and checking it
txt.split(',') 

['Daraz customers have the option of rating and reviewing their product and seller experiences.',
 'The customers can both leave star ratings and add text to describe what they liked or disliked about their experience.']

In [6]:
df['Question'].head()

0             How to Review Product as well as Seller?
1    How to Filter Search Results with Product Rati...
2    What are Ratings & Reviews and how do I write ...
3    How do I know if a product comes with free ins...
4      How do I know if a product comes with warranty?
Name: Question, dtype: object

In [7]:
def cleantext(txt):
    txt=txt.split() #spliting all text into single words like nltk.word_tokenize()
    txt=[i.lower() for i in txt if i not in stopwords.words('english') ] #removing stop words
    txt=' '.join(txt)    #again joining the words to make a sentence
    txt=re.sub(r'[^A-Za-z0-9]',' ',txt)   #removing all punctuations and special symbols 
    txt=' '.join(txt.split()) #splitting texts because some of the sentences were having more than 3,4 times and rejoining
                            #them
    return txt #returning the final cleaned text.
    
    
print(cleantext('My name  is  rishav..'))
df['Cleaned']=df['Question'].apply(lambda x :cleantext(x)) #Creating new feature cleaned with cleaned text

my name rishav


In [8]:
df['Cleaned'].head(10)

0                how review product well seller
1     how filter search results product ratings
2     what ratings reviews i write good reviews
3    how i know product comes free installation
4             how i know product comes warranty
5     i lost warranty card how i claim warranty
6        are hidden costs charges i order daraz
7                   are prices daraz negotiable
8            why i see different prices product
9                            how contact vendor
Name: Cleaned, dtype: object

In [9]:
#This is Tfidf vectorizer object
vectorizer=TfidfVectorizer()
X_=vectorizer.fit_transform(df['Cleaned']).toarray()

#This is count vectorizer object
cv=CountVectorizer()
X_cv_=cv.fit_transform(df['Cleaned']).toarray()

In [10]:
X_[:2]  #Checking tfidf vectorized input vector.

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28192401, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [11]:
X_.shape   #shape of input vector

(44, 123)

In [12]:
labelencoder=LabelEncoder() #trying to encode label of feature with only category.
df['Label']=labelencoder.fit_transform(df['Category'])

In [13]:
df.head()

Unnamed: 0,Category,Tag,Question,Answer,Cleaned,Label
0,My Account,Rating and Reviews,How to Review Product as well as Seller?,Everything is right here. Please check this ou...,how review product well seller,1
1,My Account,Rating and Reviews,How to Filter Search Results with Product Rati...,Products can also be filtered according to the...,how filter search results product ratings,1
2,My Account,Rating and Reviews,What are Ratings & Reviews and how do I write ...,Daraz customers have the option of rating and ...,what ratings reviews i write good reviews,1
3,My Account,Product Warranty,How do I know if a product comes with free ins...,Unfortunately we are not providing the Free In...,how i know product comes free installation,1
4,My Account,Product Warranty,How do I know if a product comes with warranty?,If a warranty is offered on a product the warr...,how i know product comes warranty,1


In [14]:
y=df['Label']

# Since the knowledge base is very small so there is no point of splitting data into training data and test data. So i have taken whole data as a training data. When we try to predict with some input question text than we compare how similar is this question to the one we have in the knowledge base

In [15]:
#Creating all the objects for Classifiers
logistic=LogisticRegression().fit(X_,y)
passive=PassiveAggressiveClassifier().fit(X_,y)
svm=SVC().fit(X_,y)
naive=MultinomialNB().fit(X_,y)
dectr=DecisionTreeClassifier().fit(X_,y)
rand_for=RandomForestClassifier().fit(X_,y)
sgd=SGDClassifier().fit(X_,y)

In [16]:
#Checking whats the training score of particular data.
print(logistic.score(X_,y))
print(passive.score(X_,y))
print(svm.score(X_,y))
print(naive.score(X_,y))
print(dectr.score(X_,y))
print(rand_for.score(X_,y))
print(sgd.score(X_,y))

0.9090909090909091
1.0
1.0
0.9318181818181818
1.0
1.0
1.0


In [17]:
models=[logistic,passive,svm,naive,dectr,rand_for,sgd]

In [18]:
def return_tfidf_data(text):
    return vectorizer.transform([text])    #It returns tfidf vectorized output for the text that we want to predict

def return_cv_data(text):                 #It returns count vectorized output for the text that we want to predict
    return cv.transform([text])

def return_vectorized_data(text):
    text=cleantext(text)                #Firstly, we clean the text to be predicted and pass the text to above 
                                        #tfidf function.
    print('Asked Question: ',text)
    print('-----------------------------')
    text=return_tfidf_data(text)     #Finally we get the vector for particular sentence
    return text.toarray()



def return_countvectorized_data(text):   #This function is similar to the above one and only change is this function is
                                         # for count vectorizer
    text=cleantext(text)
    print('Asked Question: ',text)
    print('-----------------------------')
    text=return_cv_data(text)
    return text.toarray()


#I could have created a single function and pass the parameter whether i want cv or tfidf, but since i didnt want things
#to be messy so i created individual functions for both.


    
def predict_label(model):              #This function is used to predict the vectorized data with the output label
    prediction=model.predict(temp)[0]  #This will return predicted label of index 0 so as to smooth process.
    return prediction
    
def find_relevant_ques_ans_model(temp,items):   #I am running this function to check with all the models and how it perorms
                        #and finding similarity score with the only data with the output label 
                        #that is returned from model prediction which reduces burden of finding similarity score with
                        #the whole data. so we only check similarity with data of predicted output babel
                
        
    for i in models:
        prediction=predict_label(i)  #Trying to predict with available models
        index=df[df['Label']==prediction].index

        cos_sim=[cosine_similarity(j.reshape(1,-1),temp.reshape(1,-1)) for j in X_[df[df['Label']==prediction].index]]
        cos_sim=np.array(cos_sim)

        cos_sim=cos_sim.flatten()
        temp_df=df.iloc[index,:].copy()
        temp_df['cosine_sim']=cos_sim
        temp_df=temp_df.sort_values('cosine_sim',ascending=False)
        print(i)
        print(temp_df[['Question','cosine_sim']].head(items))
        print('----------------------------------------------------------')
  

#Checking with some questions.
temp=return_vectorized_data('How to check warrenty of product?')
find_relevant_ques_ans_model(temp,3)

Asked Question:  how check warrenty product
-----------------------------
LogisticRegression()
                                            Question  cosine_sim
4    How do I know if a product comes with warranty?    0.313713
0           How to Review Product as well as Seller?    0.289147
3  How do I know if a product comes with free ins...    0.267625
----------------------------------------------------------
PassiveAggressiveClassifier()
                                            Question  cosine_sim
4    How do I know if a product comes with warranty?    0.313713
0           How to Review Product as well as Seller?    0.289147
3  How do I know if a product comes with free ins...    0.267625
----------------------------------------------------------
SVC()
                                             Question  cosine_sim
32  Problems with Check Out/ or any issues encount...    0.339369
27                          How do I cancel my order?    0.163451
43         How can I find oversea

In [19]:
#Checking with some other question
temp=return_vectorized_data('How to check product installation?')
find_relevant_ques_ans_model(temp,3)

Asked Question:  how check product installation
-----------------------------
LogisticRegression()
                                            Question  cosine_sim
3  How do I know if a product comes with free ins...    0.502639
4    How do I know if a product comes with warranty?    0.249904
0           How to Review Product as well as Seller?    0.230336
----------------------------------------------------------
PassiveAggressiveClassifier()
                                            Question  cosine_sim
3  How do I know if a product comes with free ins...    0.502639
4    How do I know if a product comes with warranty?    0.249904
0           How to Review Product as well as Seller?    0.230336
----------------------------------------------------------
SVC()
                                            Question  cosine_sim
3  How do I know if a product comes with free ins...    0.502639
4    How do I know if a product comes with warranty?    0.249904
0           How to Review Produc

In [20]:
#This part is checking cosine similarity with all the input data which didnt perform well and also if we have a huge
#data set, this will be veri impractical to compare with the whole dataset.
cos_sim=[cosine_similarity(X_[i].reshape(1,-1),temp.reshape(1,-1)) for i in range(len(X_))]
cos_sim=np.array(cos_sim)
cos_sim=cos_sim.flatten()

In [21]:
df_temp=df.copy()
df_temp['Cos_sim']=cos_sim #creating a temporary dataset to store similarity score for all datas.

In [22]:
df_temp.head()#T

Unnamed: 0,Category,Tag,Question,Answer,Cleaned,Label,Cos_sim
0,My Account,Rating and Reviews,How to Review Product as well as Seller?,Everything is right here. Please check this ou...,how review product well seller,1,0.230336
1,My Account,Rating and Reviews,How to Filter Search Results with Product Rati...,Products can also be filtered according to the...,how filter search results product ratings,1,0.208753
2,My Account,Rating and Reviews,What are Ratings & Reviews and how do I write ...,Daraz customers have the option of rating and ...,what ratings reviews i write good reviews,1,0.0
3,My Account,Product Warranty,How do I know if a product comes with free ins...,Unfortunately we are not providing the Free In...,how i know product comes free installation,1,0.502639
4,My Account,Product Warranty,How do I know if a product comes with warranty?,If a warranty is offered on a product the warr...,how i know product comes warranty,1,0.249904


In [23]:
df_temp.sort_values('Cos_sim',ascending=False).head()  #Finding out top similar questions from whole dataset.

Unnamed: 0,Category,Tag,Question,Answer,Cleaned,Label,Cos_sim
3,My Account,Product Warranty,How do I know if a product comes with free ins...,Unfortunately we are not providing the Free In...,how i know product comes free installation,1,0.502639
32,Ordering,Big mart,Problems with Check Out/ or any issues encount...,If you are facing any error while placing the ...,problems check out issues encountered,2,0.270342
15,DarazMall,What is Darazmall,How can I identify a DarazMall product in the ...,"You can identify"" DarazMall"" products by the ""...",how i identify darazmall product website,0,0.250737
4,My Account,Product Warranty,How do I know if a product comes with warranty?,If a warranty is offered on a product the warr...,how i know product comes warranty,1,0.249904
0,My Account,Rating and Reviews,How to Review Product as well as Seller?,Everything is right here. Please check this ou...,how review product well seller,1,0.230336


In [24]:
def return_response(temp_df):
    most_relevant=temp_df.copy()
    relevant_index=most_relevant.index[0] #This will try to give the index of data with highest score.
    response_list=df.iloc[relevant_index,:]['Answer'] #This will store list of reponses separated by comma
    
    try:   #This function is checking either a string can be splitted into other items or not using comma.
        responses=df.iloc[relevant_index,:]['Answer'].split(',')
        print('Response: ',np.random.choice(responses))
        #This will give a random response corrosponding to particular question 
    except:
        print('Response: ',responses) #This will run if it is a single string.
    
#For finding relavant questions that matches the given question using TFIDF 
def find_relevant(temp,items):
    prediction=predict_label(passive) #Predicting label for test data
    index=df[df['Label']==prediction].index  #finding index of dataset with the predicted label so as to compare with

    cos_sim=[cosine_similarity(j.reshape(1,-1),temp.reshape(1,-1)) for j in X_[index]] #This is vectorized data for TFIDF
    cos_sim=np.array(cos_sim)
    cos_sim=cos_sim.flatten() #Creating a numpy array and flattening to a 1d list.
    temp_df=df.iloc[index,:].copy() #This returns the dataset with the predicted labels only.
    temp_df['cosine_sim']=cos_sim #Store the cosine similarity for particular input data.
    temp_df=temp_df.sort_values('cosine_sim',ascending=False) #Finding out top similarity scores.
    return(temp_df[['Question','cosine_sim']].head(items)),return_response(temp_df) 
    #this will return top questions that are matched with the test data with particular limit specified by items.
    #and also will try to give response to that question.

#For finding relavant questions that matches the given question using CV
def find_relevant_cv(temp,items):
    prediction=predict_label(passive)
    index=df[df['Label']==prediction].index
    cos_sim=[cosine_similarity(j.reshape(1,-1),temp.reshape(1,-1)) for j in X_cv_[index]] #This is vectorized data for CV
    cos_sim=np.array(cos_sim)

    cos_sim=cos_sim.flatten()
    temp_df=df.iloc[index,:].copy()
    temp_df['cosine_sim']=cos_sim
    temp_df=temp_df.sort_values('cosine_sim',ascending=False)
    return(temp_df[['Question','cosine_sim']].head(items)),return_response(temp_df)


#The above two functions can be merged together by specifying a parameter either to take cv or tfidf.

# Using Countvectorizer

In [25]:
temp=return_countvectorized_data('How to know good reviews?')
returned_df,relevant_answer=find_relevant_cv(temp,3)
returned_df

Asked Question:  how know good reviews
-----------------------------
Response:  The customers can both leave star ratings and add text to describe what they liked or disliked about their experience.


Unnamed: 0,Question,cosine_sim
2,What are Ratings & Reviews and how do I write ...,0.53033
4,How do I know if a product comes with warranty?,0.447214
3,How do I know if a product comes with free ins...,0.408248


In [26]:
temp=return_countvectorized_data('How to check product installation?')
returned_df,relevant_answer=find_relevant_cv(temp,3)
returned_df

Asked Question:  how check product installation
-----------------------------
Response:  Make sure to check the product description of products to get more details about the availability of the free installation


Unnamed: 0,Question,cosine_sim
3,How do I know if a product comes with free ins...,0.612372
0,How to Review Product as well as Seller?,0.447214
4,How do I know if a product comes with warranty?,0.447214


In [27]:
temp=return_countvectorized_data('How to write my name?')
returned_df,relevant_answer=find_relevant_cv(temp,3)
returned_df

Asked Question:  how write name
-----------------------------
Response:  Now you can easily contact vendor through Daraz App and ask a query about any product of your choice


Unnamed: 0,Question,cosine_sim
9,How to contact Vendor?,0.408248
0,How to Review Product as well as Seller?,0.316228
4,How do I know if a product comes with warranty?,0.316228


In [28]:
temp=return_countvectorized_data('can i play daraz first games desktop too?')
returned_df,relevant_answer=find_relevant_cv(temp,3)
returned_df

Asked Question:  play daraz first games desktop too
-----------------------------
Response:  No you can only play Daraz First Games on Daraz Android App.


Unnamed: 0,Question,cosine_sim
21,Can I play Daraz First Games on Desktop too?,0.92582
22,What is Daraz First Games?,0.612372
18,Why am I not able to see my Collectible Vouche...,0.0


# Using TFIDF Vectorizer

In [29]:
temp=return_vectorized_data('How to check product installation?')
returned_df,relevant_answer=find_relevant(temp,3)
returned_df

Asked Question:  how check product installation
-----------------------------
Response:  Make sure to check the product description of products to get more details about the availability of the free installation


Unnamed: 0,Question,cosine_sim
3,How do I know if a product comes with free ins...,0.502639
4,How do I know if a product comes with warranty?,0.249904
0,How to Review Product as well as Seller?,0.230336


In [30]:
temp=return_vectorized_data('why i see different prices product?')
returned_df,relevant_answer=find_relevant(temp,3)
returned_df

Asked Question:  see different prices product
-----------------------------
Response:  Daraz is a marketplace. We have a huge seller base and each one sources their product differently due to which prices vary for the same product but you can choose depending on your preference as the product quality remains the same.


Unnamed: 0,Question,cosine_sim
8,Why do I see different prices for the same pro...,0.922422
7,Are the prices on Daraz negotiable?,0.279414
4,How do I know if a product comes with warranty?,0.140928


In [31]:
temp=return_vectorized_data('How to play daraz games desktop too?')
returned_df,relevant_answer=find_relevant(temp,3)
returned_df

Asked Question:  how play daraz games desktop too
-----------------------------
Response:   We will be releasing the games on other platforms soon. Stay tuned!


Unnamed: 0,Question,cosine_sim
21,Can I play Daraz First Games on Desktop too?,0.834274
22,What is Daraz First Games?,0.365411
18,Why am I not able to see my Collectible Vouche...,0.0


In [32]:
temp=return_vectorized_data('I want to play daraz games')
returned_df,relevant_answer=find_relevant(temp,3)
returned_df

Asked Question:  i want play daraz games
-----------------------------
Response:  No you can only play Daraz First Games on Daraz Android App.


Unnamed: 0,Question,cosine_sim
21,Can I play Daraz First Games on Desktop too?,0.621956
22,What is Daraz First Games?,0.525183
18,Why am I not able to see my Collectible Vouche...,0.0


# Saving models:

In [53]:
with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(vectorizer,fin)
with open('svm.pk', 'wb') as fin:
    pickle.dump(svm,fin)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [54]:
mod=pickle.load(open('vectorizer.pk','rb'))

<IPython.core.display.Javascript object>

In [58]:
sv_mod=pickle.load(open('svm.pk','rb'))

<IPython.core.display.Javascript object>

In [56]:
arr=mod.transform(['how review product well seller']).toarray()
arr

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28192401, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [59]:
sv_mod.predict(arr)

array([1])

# Doc 2 vec

# This thing can also be tried creating a vector for particular document using doc2vec model