In [None]:
from google.colab import files
# Upload a file from the local machine into the Colab runtime; the saved
# output below shows database.sqlite.zip being stored.
files.upload()

Saving database.sqlite.zip to database.sqlite.zip


In [101]:
# List the working directory to check which data files are present.
%ls

amzn_food_cleansed_stemmed_100k.csv  database.sqlite      [0m[01;34msample_data[0m/
AMZN_FOOD_REVIEWS_CLNSD.db           database.sqlite.zip


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

In [None]:
# Extract the uploaded archive with the shell `unzip` tool (yields database.sqlite).
!unzip database.sqlite.zip

Archive:  database.sqlite.zip
  inflating: database.sqlite         


In [None]:
# Pure-Python alternative to the shell `unzip` cell: extract every member of
# the archive into the current working directory.
import zipfile
with zipfile.ZipFile("database.sqlite.zip", mode="r") as archive:
    archive.extractall()

In [None]:
# Load every review whose Score is not 3 from the SQLite file into a DataFrame
# and preview the first three rows.
conn = sqlite3.connect('database.sqlite')
df = pd.read_sql_query(sql='SELECT * FROM REVIEWS WHERE Score != 3', con=conn)
df.head(n=3)

In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded reviews.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525814 entries, 0 to 525813
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      525814 non-null  int64 
 1   ProductId               525814 non-null  object
 2   UserId                  525814 non-null  object
 3   ProfileName             525814 non-null  object
 4   HelpfulnessNumerator    525814 non-null  int64 
 5   HelpfulnessDenominator  525814 non-null  int64 
 6   Score                   525814 non-null  int64 
 7   Time                    525814 non-null  int64 
 8   Summary                 525814 non-null  object
 9   Text                    525814 non-null  object
dtypes: int64(5), object(5)
memory usage: 40.1+ MB


In [None]:
def map_positive_negative(score):
    """Label a rating as 'positive' when it is above 3, otherwise 'negative'."""
    if score > 3:
        return 'positive'
    return 'negative'


def map_zero_one(score):
    """Binarise a rating: 1 when it is above 3, otherwise 0."""
    is_positive = score > 3
    return int(is_positive)


# Replace the star rating with a binary label (1 = above 3, 0 = otherwise).
# Not idempotent: once Score holds 0/1, re-running this cell maps every row
# to 0, so reload `df` before executing it again.
scores = df['Score'].map(map_zero_one)
df['Score'] = scores

In [None]:
# Order the reviews by ProductId (NaNs last) before de-duplication.
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for the
# rest of the session; renaming it requires the same change in the cell that
# calls `sorted.drop_duplicates(...)`.
# NOTE(review): kind='quicksort' is not a stable sort, so the order of rows
# that share a ProductId is not guaranteed — confirm this is acceptable.
sorted = df.sort_values('ProductId',axis=0,ascending=True,kind='quicksort',na_position='last')

In [79]:
# Treat rows that share user, profile name, timestamp and review text as the
# same review and keep only the first occurrence in the sorted order.
deduped_df = sorted.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"],
    keep='first',
)
deduped_df.shape

(364173, 10)

In [80]:
# Share of the loaded rows that survived de-duplication, to two decimals.
print("Retained {p}% of data on deduping".format(
    p=format(len(deduped_df.index) / len(df.index) * 100, '.2f')))

Retained 69.26% of data on deduping


From domain knowledge, HelpfulnessNumerator should always be less than or equal to HelpfulnessDenominator, so rows that violate this are dropped.

In [81]:
# Keep only rows where HelpfulnessNumerator does not exceed
# HelpfulnessDenominator.
# `.copy()` makes the result an independent frame: later cells mutate it
# (dropna, adding the 'Cleansed' column), which on a boolean-indexed selection
# triggers pandas' SettingWithCopyWarning.
deduped_df = deduped_df[
    deduped_df['HelpfulnessNumerator'] <= deduped_df['HelpfulnessDenominator']
].copy()

In [82]:
# Same retention figure as before, but `deduped_df` now also reflects the
# helpfulness filter, so the percentage covers both cleaning steps.
print("Retained {p}% of data on deduping".format(
    p=format(len(deduped_df.index) / len(df.index) * 100, '.2f')))
deduped_df.shape

Retained 69.26% of data on deduping


(364171, 10)

In [83]:
# Class balance of the binarised label (1 = rating above 3, 0 = rating below 3).
deduped_df['Score'].value_counts()

1    307061
0     57110
Name: Score, dtype: int64

In [84]:
# Report, per column, whether any value is missing, then drop rows with NaNs.
# Reassigning instead of `inplace=True` keeps the step free of in-place
# mutation of a filtered frame and makes the cell safe to re-run.
print(deduped_df.isna().any())
deduped_df = deduped_df.dropna(axis=0)

Id                        False
ProductId                 False
UserId                    False
ProfileName               False
HelpfulnessNumerator      False
HelpfulnessDenominator    False
Score                     False
Time                      False
Summary                   False
Text                      False
dtype: bool


In [85]:
import nltk
# Fetch NLTK's stop-word corpus (a no-op when it is already up to date, as the
# saved log shows).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [86]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords  

import re

# English Snowball stemmer plus NLTK's English stop-word list; the list is
# stored as a set so membership tests during cleaning are constant time.
stemmer = SnowballStemmer(language='english')
stop_words = set(stopwords.words('english'))


# Contraction rewrites, applied in order while apostrophes are still present.
# Suffix rules require a preceding letter and a trailing word boundary so that
# single-quoted words such as 'tasty' are not mistaken for contractions.
_CONTRACTION_RULES = (
    (re.compile(r"\bwon't\b", re.IGNORECASE), "will not"),
    (re.compile(r"\bcan't\b", re.IGNORECASE), "can not"),
    (re.compile(r"(?<=[A-Za-z])n't\b", re.IGNORECASE), " not"),
    (re.compile(r"(?<=[A-Za-z])'re\b", re.IGNORECASE), " are"),
    (re.compile(r"(?<=[A-Za-z])'s\b", re.IGNORECASE), " is"),
    (re.compile(r"(?<=[A-Za-z])'d\b", re.IGNORECASE), " would"),
    (re.compile(r"(?<=[A-Za-z])'ll\b", re.IGNORECASE), " will"),
    (re.compile(r"(?<=[A-Za-z])'t\b", re.IGNORECASE), " not"),
    (re.compile(r"(?<=[A-Za-z])'ve\b", re.IGNORECASE), " have"),
    (re.compile(r"(?<=[A-Za-z])'m\b", re.IGNORECASE), " am"),
)


def clean_text(txt, doStem = True):
    """Normalise one review into a string of space-separated lowercase tokens.

    Processing order:
      1. remove ``http...`` URIs;
      2. expand English contractions (this has to happen before step 3,
         otherwise the apostrophes are already gone and no rule can match);
      3. replace every run of non-alphanumeric characters with one space;
      4. remove every token that contains a digit;
      5. lowercase, drop tokens found in the module-level ``stop_words`` and,
         when ``doStem`` is true, stem the rest with the module-level
         ``stemmer``.

    Parameters
    ----------
    txt : str
        Raw review text.
    doStem : bool, default True
        Whether the surviving tokens are stemmed.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    txt = re.sub(r'http\S+', '', txt)  # remove http uri
    for pattern, replacement in _CONTRACTION_RULES:
        txt = pattern.sub(replacement, txt)
    txt = re.sub(r'[^A-Za-z0-9]+', ' ', txt)  # only alphanumeric
    # e.g. "saj3434jkdsd": drop digits and any token containing a digit
    txt = re.sub(r'\S*\d\S*', '', txt)

    # NOTE(review): if `stop_words` contains negations such as "not", the
    # expanded negation is removed again here — confirm that is intended.
    tokens = (tok.lower() for tok in txt.split())
    kept = (tok for tok in tokens if tok not in stop_words)
    if doStem:
        kept = (stemmer.stem(tok) for tok in kept)
    return ' '.join(kept)

In [76]:
from tqdm.notebook import tqdm  # notebook widget bar; avoids one printed line per update
tqdm.pandas()  # adds the progress_* methods to pandas objects

# Clean every review text (with stemming) and preview rows 1..9 by position.
cleansed_text = deduped_df['Text'].progress_map(clean_text)
cleansed_text.iloc[1:10]


HBox(children=(FloatProgress(value=0.0, max=364171.0), HTML(value='')))




138688    grew read sendak book watch realli rosi movi i...
138689    fun way children learn month year learn poem t...
138690    great littl book read aloud nice rhythm well g...
138691    book poetri month year goe month cute littl po...
138693    charm rhyme book describ circumst eat chicken ...
138694    set asid least hour day read son point consid ...
138695    rememb book childhood got kid good rememb kid ...
138696    great book ador illustr true classic kid love ...
138697    book famili favorit read children small order ...
Name: Text, dtype: object

In [87]:
# Attach the cleaned text as a new 'Cleansed' column (pandas aligns the Series
# on the index) and preview the first three rows.
deduped_df['Cleansed'] = cleansed_text
deduped_df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleansed
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month year learn poem t...


In [89]:
"""
df0 =  deduped_df.iloc[0:10000]
df0.to_csv('amzn_food_cleansed_stemmed_100k.csv')

conn = sqlite3.connect('AMZN_FOOD_REVIEWS_CLNSD.db')
df0.to_sql('AMZN_FOOD_REVIEWS_CLNSD_100K', conn, if_exists='replace', index=False)

conn = sqlite3.connect('AMZN_FOOD_REVIEWS_CLNSD_500K.db')
deduped_df.to_sql('REVIEWS', conn, if_exists='replace', index=False)
"""

# Featurize

**BOW**

In [111]:
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(data,min_df_=5,max_features_=100000):
    """Fit a bag-of-words ``CountVectorizer`` on ``data`` and return it.

    ``min_df_`` and ``max_features_`` are forwarded as the vectorizer's
    ``min_df`` and ``max_features``. The fitted vectorizer is returned;
    ``data`` itself is not transformed here.
    """
    vectorizer = CountVectorizer(min_df=min_df_, max_features=max_features_)
    return vectorizer.fit(data)

**TF-IDF**

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(data,min_df_=5,max_features_=100000):
    """Fit a unigram+bigram ``TfidfVectorizer`` on ``data`` and return it.

    ``min_df_`` and ``max_features_`` are forwarded as the vectorizer's
    ``min_df`` and ``max_features``. The fitted vectorizer is returned;
    ``data`` itself is not transformed here.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=min_df_,
        max_features=max_features_,
    )
    return vectorizer.fit(data)

**W2V**

In [114]:
# Split every cleansed review on single spaces into a token list; the list of
# token lists is the training corpus for Word2Vec.
words = [doc.split(' ') for doc in tqdm(deduped_df['Cleansed'])]

print(words[10])
print(len(words))

HBox(children=(FloatProgress(value=0.0, max=364171.0), HTML(value='')))


['get', 'movi', 'sound', 'track', 'sing', 'along', 'carol', 'king', 'great', 'stuff', 'whole', 'extend', 'famili', 'know', 'song', 'heart', 'qualiti', 'kid', 'storytel', 'music']
364171


In [115]:
from gensim.models import Word2Vec  

# Train Word2Vec on the tokenised reviews: 150-dimensional vectors, context
# window of 10, words seen fewer than 5 times ignored, 4 worker threads.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the gensim version this notebook is pinned to.
model = Word2Vec(words,size=150,window=10,min_count=5,workers=4)

In [119]:
# Sanity-check the embedding: nearest neighbours and a pairwise similarity.
# Queries use stemmed forms (e.g. 'littl') because the corpus was stemmed.
print(model.wv.most_similar('book'))
print('\n')
print(model.wv.similarity('song','music'))
print('\n')
print(model.wv.most_similar('littl'))

# Peek at a slice of the learned vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute — confirm the installed version.
list(model.wv.vocab)[90:101]

[('kindl', 0.6801992058753967), ('blog', 0.6647010445594788), ('magazin', 0.6586419343948364), ('fife', 0.6439862847328186), ('paperback', 0.6323655247688293), ('author', 0.6044520735740662), ('interview', 0.601325273513794), ('cookbook', 0.5968289971351624), ('bruce', 0.589389443397522), ('articl', 0.5881955027580261)]


0.7307091


[('bit', 0.7552055716514587), ('tad', 0.7194303870201111), ('tini', 0.5689075589179993), ('alittl', 0.4922039806842804), ('slight', 0.47102490067481995), ('wee', 0.4697117209434509), ('teensi', 0.4623396396636963), ('thicker', 0.4342820346355438), ('tend', 0.4267103970050812), ('kinda', 0.42324498295783997)]


['head',
 'long',
 'time',
 'even',
 'came',
 'surpris',
 'poetri',
 'goe',
 'cute',
 'creativ',
 'author']

**AVG W2VEC**

In [120]:
def average_w2v(data):
    """Represent each review by the mean of its words' Word2Vec vectors.

    Uses the module-level ``model`` (trained Word2Vec) and ``tqdm``.

    Parameters
    ----------
    data : iterable of str
        Reviews whose tokens are separated by single spaces.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of reviews, model.wv.vector_size)``. A review
        with no in-vocabulary word is represented by a zero vector.
    """
    # Take the dimension from the model instead of hard-coding 150, so the
    # function keeps working if the embedding size is changed.
    dim = model.wv.vector_size
    avg_list = []
    for review in tqdm(data):
        vec = np.zeros(dim)
        count = 0
        for word in review.split(' '):
            try:
                vec += model.wv[word]
            except KeyError:
                # Out-of-vocabulary word (e.g. below min_count): skip it.
                # Only KeyError is caught so real errors and Ctrl-C surface.
                continue
            count += 1
        if count:
            vec /= count
        avg_list.append(vec)

    print("Total number of the vector : ", len(avg_list))  # number of reviews
    # Report `dim` directly; indexing avg_list[1] fails for fewer than two reviews.
    print("Dimension of Vector : ", dim)
    # Explicit reshape keeps the (n, dim) shape even for empty input.
    return np.array(avg_list).reshape(len(avg_list), dim)