In [95]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
import re
import pprint
from sklearn.utils import shuffle
import gensim

from gensim import corpora
from gensim.parsing.preprocessing import remove_stopwords
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer


In [96]:
data = pd.read_csv('../input/amazon-questionanswer-dataset/single_qna.csv')

In [97]:
data.head()

Unnamed: 0,QuestionType,Asin,AnswerTime,UnixTime,Question,AnswerType,Answer,Category
0,yes/no,B00004U9JP,"Jun 27, 2014",1403852000.0,I have a 9 year old Badger 1 that needs replac...,?,I replaced my old one with this without a hitch.,Appliances
1,open-ended,B00004U9JP,"Apr 28, 2014",1398668000.0,model number,,This may help InSinkErator Model BADGER-1: Bad...,Appliances
2,yes/no,B00004U9JP,"Aug 25, 2014",1408950000.0,can I replace Badger 1 1/3 with a Badger 5 1/2...,?,Plumbing connections will vary with different ...,Appliances
3,yes/no,B00004U9JP,"Nov 3, 2014",1415002000.0,Does this come with power cord and dishwasher ...,?,It does not come with a power cord. It does co...,Appliances
4,open-ended,B00004U9JP,"Jun 21, 2014",1403334000.0,loud noise inside when turned on. sounds like ...,,Check if you dropped something inside.Usually ...,Appliances


In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1396896 entries, 0 to 1396895
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   QuestionType  1396896 non-null  object 
 1   Asin          1396896 non-null  object 
 2   AnswerTime    1396896 non-null  object 
 3   UnixTime      1346991 non-null  float64
 4   Question      1396895 non-null  object 
 5   AnswerType    663866 non-null   object 
 6   Answer        1396798 non-null  object 
 7   Category      1396896 non-null  object 
dtypes: float64(1), object(7)
memory usage: 85.3+ MB


In [99]:
data = data.drop(['QuestionType','Asin', 'AnswerTime','UnixTime','AnswerType'],axis =1)

In [100]:
data.head()

Unnamed: 0,Question,Answer,Category
0,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.,Appliances
1,model number,This may help InSinkErator Model BADGER-1: Bad...,Appliances
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...,Appliances
3,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...,Appliances
4,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...,Appliances


In [101]:
data.describe()

Unnamed: 0,Question,Answer,Category
count,1396895,1396798,1396896
unique,1199374,1162056,21
top,What are the dimensions?,Yes,Electronics
freq,924,33076,314263


In [102]:
data.Category.value_counts()

Electronics                    314263
Home and Kitchen               184439
Sports and Outdoors            146891
Tools and Home Improvement     101088
Automotive                      89923
Cell Phones and Accessories     85865
Health and Personal Care        80496
Patio Lawn and Garden           59595
Toys and Games                  51486
Office Products                 43608
Beauty                          42422
Pet Supplies                    36607
Baby                            28933
Musical Instruments             23322
Clothing Shoes and Jewelry      22068
Arts Crafts and Sewing          21262
Grocery and Gourmet Food        19538
Video Games                     13307
Industrial and Scientific       12136
Software                        10636
Appliances                       9011
Name: Category, dtype: int64

In [103]:
print(data.isnull().sum())
data = data.dropna()

Question     1
Answer      98
Category     0
dtype: int64


In [104]:
data.isnull().sum()

Question    0
Answer      0
Category    0
dtype: int64

In [105]:
digit =[]
for i in range(0,len(data)):
    digit.append(int(i))

In [106]:
digit = pd.DataFrame(digit)

In [107]:
data['index'] = digit

In [108]:
data['index']=data.index.astype('int')

In [109]:
data=data.set_index('index')

In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396797 entries, 0 to 1396895
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   Question  1396797 non-null  object
 1   Answer    1396797 non-null  object
 2   Category  1396797 non-null  object
dtypes: object(3)
memory usage: 42.6+ MB


In [111]:
data =data.reindex(list(range(data.index.min(),data.index.max()+1)),fill_value=0)
#filling missing indexes with zeros so that we donot get any error

In [112]:
#shuffling the data
data = shuffle(data)

In [113]:
data= data.reset_index()
data = data.drop(['index'],axis =1)


In [114]:
#data = data.drop(['Category'],axis =1)

In [115]:
def processing(sentence,stopwords =False):
    #word =[]
    #for i in range(len(sentence)):
    #word = sentence.lower().strip()
    word = re.sub('^[a-z0-9\s-]'' , ', ' ',str(sentence))
    word = word.lower().strip()
        
        
        #word.append(words)
    #if stopwords:
        #sentence = remove_stopwords(sentence)
    return word   


def get_processed_text(sentence,stopwords =False):
    sents = data[['Question']]
    cleaned_sentences = []
    
    for index,row in data.iterrows():
        cleaned = processing(row['Question'],stopwords=False)
        cleaned_sentences.append(cleaned)
        
    return cleaned_sentences


#print(cleaned_sentences[0])
    

In [116]:
cleaned_sentences = get_processed_text(data,stopwords=False)

In [117]:
sentences = shuffle(cleaned_sentences)
sentences = cleaned_sentences
#sentences = shuffle(sentences)

In [118]:
len(sentences)

1396896

In [119]:

def retreive_and_print_answer(question_embedding,bow_corpus,data_model,word):
    max_sim =-1
    index_sim =-1
    
    for index,embedding_amz in enumerate(bow_corpus):
        sim = cosine_similarity(embedding_amz,question_embedding)[0][0]
        #sim = cosine_similarity(np.array(embedding_amz).reshape(1,-1),np.array(question_embedding).reshape(1,-1))
        #print(index,sim,word[index])
        
    
        if sim>max_sim:
            max_sim=sim
            index_sim =index
            
        
        
    print('\n')
    print("Your Entered Question : ",text)
    print('\n')
    print("Similar Question We Got :  ",data_model.iloc[index_sim,0])
    print('\n')
    print("ANSWER:  ",data_model.iloc[index_sim,1])
    
#retreive_and_print_answer(question_embedding,bow_corpus,data,sentences)
    

In [120]:
sentence_words = [[words for words in str(document).split()]for document in sentences]
dictionary = corpora.Dictionary(sentence_words)

def bagofword_technique(text):
    processed_text = processing(text,stopwords = False)
    bow_corpus = [dictionary.doc2bow(text) for text in sentence_words]
    question_embedding = dictionary.doc2bow(text.split())
    retreive_and_print_answer(question_embedding,bow_corpus,data,sentences)



In [121]:
text = input('please enter a query related to product or general : ')
print('result from bag of word technique')
bagofword_technique(text)

please enter a query related to product or general :  my battery is not working properly. how should i fix it?


result from bag of word technique


Your Entered Question :  my battery is not working properly. how should i fix it?


Similar Question We Got :   Does the product have a strong odor? I jsut purchased another brand with pvc backing that has an overpowering odor and would like an alternative.


ANSWER:   Totally smells normal. No odor.
