In [32]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline
import pickle
import gensim
import seaborn as sns
import sqlite3
import nltk
from nltk.corpus import stopwords
import re 
from sklearn.feature_extraction.text import CountVectorizer  ## BOW Model
from sklearn.feature_extraction.text import TfidfVectorizer  ## TFIDF Model

from sklearn.manifold import TSNE    ## To visualize high dimensional data


In [31]:
con.close()

In [33]:
con=sqlite3.connect("final_database.sqlite")
final=pd.read_sql_query("""
select * from Reviews 

""",con)

In [34]:
stop = set(stopwords.words('english')) #set of stopwords

In [35]:
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

In [36]:
def cleanhtml(sentence):
    '''This function removes all the html tags in the given sentence'''
    cleantext = re.sub('<.*?>', ' ', sentence)  ## Substitute <space> in place of any html tag
    return cleantext

In [37]:
def cleanpunc(sentence):
    '''This function cleans all the punctuation or special characters from a given sentence'''
    cleaned = re.sub(r'[?|@|!|^|%|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [38]:
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here
    

In [39]:
def preprocessing(series):
    '''The function takes a Pandas Series object containing text in all the cells
       And performs following Preprocessing steps on each cell:
       1. Clean text from html tags
       2. Clean text from punctuations and special characters
       3. Retain only non-numeric Latin characters with lenght > 2
       4. Remove stopwords from the sentence
       5. Apply stemming to all the words in the sentence
       
       Return values:
       1. final_string - List of cleaned sentences
       2. list_of_sent - List of lists which can be used as input to the W2V model'''
    
    i = 0
    str1=" "
    final_string = []    ## This list will contain cleaned sentences
    list_of_sent = []    ## This is a list of lists used as input to the W2V model at a later stage
    
    
    
    for sent in series.values:
        ## 
        filtered_sent = []
        sent = cleanhtml(sent)    ## Clean the HTML tags
        sent = cleanpunc(sent)    ## Clean the punctuations and special characters
        ## Sentences are cleaned and words are handled individually
        for cleaned_words in sent.split():
            ## Only consider non-numeric words with length at least 3
            if((cleaned_words.isalpha()) and (len(cleaned_words) > 2)):
                ## Only consider words which are not stopwords and convert them to lowet case
                if(cleaned_words.lower() not in stop):
                    ## Apply snowball stemmer and add them to the filtered_sent list
                    s = (sno.stem(cleaned_words.lower()))#.encode('utf-8')
                    filtered_sent.append(s)    ## This contains all the cleaned words for a sentence
                    if (final['Score'].values)[i] == 'positive':
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews
        ## Below list is a list of lists used as input to W2V model later
        list_of_sent.append(filtered_sent)
        ## Join back all the words belonging to the same sentence
        str1 = " ".join(filtered_sent)
        ## Finally add the cleaned sentence in the below list
        final_string.append(str1)
        #print(i)
        i += 1
    return final_string, list_of_sent

In [11]:
print(stop)

{'below', 'what', 'yourselves', 'other', "hadn't", 'yours', "aren't", 'too', "you'd", 'how', 'y', 'himself', 'very', 'have', "it's", 'our', 'myself', 'as', 'been', 'over', "she's", 'with', 'they', 'o', 'in', "that'll", 'her', 'this', 'between', 'again', "weren't", "wouldn't", 'when', 'don', 'by', 'on', 'isn', "you'll", 'ain', 'aren', 'because', 'into', 'him', 'your', 'mustn', 'through', 'only', 'it', 've', 'of', 'd', 'you', 'themselves', 'were', 'to', 'both', 'ma', 'had', 'did', 't', 'above', 'that', 'but', 'their', 'mightn', 'most', 'should', "mustn't", "shouldn't", 'does', 'for', 'be', 'haven', 'itself', 'whom', 'being', 're', 'about', 'doesn', 'has', "isn't", 'a', 'me', 'up', "you've", 'such', 'from', 'now', 'hers', "shan't", 'during', 'at', 'here', 'own', "won't", 'weren', 'once', 'why', 'the', 'doing', 'each', 'all', 'any', 'we', 'and', 'theirs', 'hadn', 'shan', 'he', 'am', 'off', 'same', 'an', 'are', 'having', 'herself', 's', 'those', 'where', "mightn't", 'before', 'just', 'yours

In [40]:
lst = ['won', 'nor', 'not', 'against']
for word in lst:
    stop.remove(word)
print(stop)

{'below', 'what', "needn't", 'yourselves', 'other', "hadn't", 'yours', "aren't", 'too', "don't", "you'd", 'how', 'y', 'himself', 'very', 'have', "it's", 'our', 'myself', 'as', 'wasn', 'been', 'over', "she's", 'with', 'they', 'o', 'in', "that'll", 'her', 'this', 'between', 'again', "weren't", "wouldn't", 'when', 'don', 'by', 'on', 'isn', "you'll", 'ain', "couldn't", 'aren', 'because', 'into', 'him', 'your', 'mustn', 'through', 'only', 'it', 've', 'hasn', "hasn't", 'of', 'didn', 'd', 'you', 'themselves', 'were', 'to', 'both', "didn't", 'ma', "haven't", 'had', 'did', 't', 'above', 'that', 'but', 'their', 'needn', 'mightn', 'most', 'should', "mustn't", "shouldn't", 'does', 'for', 'be', 'haven', 'itself', 'whom', 'being', 're', 'about', 'doesn', 'has', "isn't", 'a', 'me', 'up', "you've", 'such', 'from', 'now', 'hers', "shan't", 'during', 'at', 'here', 'own', "won't", 'weren', 'once', 'why', 'the', 'doing', 'each', 'all', 'any', 'we', 'and', 'theirs', 'hadn', 'shan', 'he', 'am', 'off', 'same

In [41]:
## This takes around 1 hour
final_string, list_of_sent=preprocessing(final["Text"])

In [42]:
len(list_of_sent)

364171

In [43]:
type(final["Text"])

pandas.core.series.Series

In [132]:
final_string[0]

'charm rhyme book describ circumst eat dont chicken soup rice sound like kind thing kid would make theyr recess sing drive teacher crazi cute catchi sound realli childlik skill written'

In [133]:
final["Text"][0]

"A charming, rhyming book that describes the circumstances under which you eat (or don't) chicken soup with rice, month-by-month. This sounds like the kind of thing kids would make up while they're out of recess and sing over and over until they drive the teachers crazy. It's cute and catchy and sounds really childlike but is skillfully written."

In [45]:
with open('list_of_positive_words.pkl', 'wb') as pickle_file:
    pickle.dump(all_positive_words, pickle_file)

In [46]:
all_positive_words

['charm',
 'rhyme',
 'book',
 'describ',
 'circumst',
 'eat',
 'dont',
 'chicken',
 'soup',
 'rice',
 'sound',
 'like',
 'kind',
 'thing',
 'kid',
 'would',
 'make',
 'theyr',
 'recess',
 'sing',
 'drive',
 'teacher',
 'crazi',
 'cute',
 'catchi',
 'sound',
 'realli',
 'childlik',
 'skill',
 'written',
 'daughter',
 'love',
 'realli',
 'rosi',
 'book',
 'introduc',
 'realli',
 'rosi',
 'perform',
 'carol',
 'king',
 'also',
 'avail',
 'amazon',
 'birthday',
 'year',
 'later',
 'know',
 'song',
 'far',
 'book',
 'one',
 'johnni',
 'allig',
 'around',
 'chicken',
 'soup',
 'rice',
 'book',
 'well',
 'written',
 'clever',
 'art',
 'work',
 'mauric',
 'sendak',
 'plus',
 'realli',
 'cheap',
 'high',
 'recommend',
 'witti',
 'littl',
 'book',
 'make',
 'son',
 'laugh',
 'loud',
 'recit',
 'car',
 'drive',
 'along',
 'alway',
 'sing',
 'refrain',
 'hes',
 'learn',
 'whale',
 'india',
 'droop',
 'love',
 'new',
 'word',
 'book',
 'introduc',
 'silli',
 'classic',
 'book',
 'will',
 'bet',
 's

In [47]:
final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review

In [24]:
(all_negative_words)

['one',
 'best',
 'children',
 'book',
 'ever',
 'written',
 'mini',
 'version',
 'book',
 'not',
 'portray',
 'one',
 'price',
 'product',
 'sent',
 'email',
 'regard',
 'bewilder',
 'amazon',
 'got',
 'respons',
 'give',
 'five',
 'star',
 'mauric',
 'sendak',
 'stori',
 'one',
 'star',
 'print',
 'edit',
 'book',
 'children',
 'older',
 'copi',
 'book',
 'familiar',
 'previous',
 'softcov',
 'version',
 'order',
 'granddaught',
 'embarrass',
 'give',
 'gift',
 'look',
 'puni',
 'book',
 'size',
 'postcard',
 'think',
 'overpr',
 'ive',
 'learn',
 'not',
 'buy',
 'softcov',
 'children',
 'book',
 'next',
 'time',
 'ill',
 'get',
 'use',
 'copi',
 'dog',
 'love',
 'chicken',
 'product',
 'china',
 'wont',
 'buy',
 'anymor',
 'hard',
 'find',
 'chicken',
 'product',
 'made',
 'usa',
 'one',
 'isnt',
 'bad',
 'good',
 'product',
 'wont',
 'take',
 'chanc',
 'till',
 'know',
 'go',
 'china',
 'import',
 'price',
 'foster',
 'smith',
 'select',
 'compani',
 'even',
 'though',
 'price',
 '

In [48]:
conn = sqlite3.connect('final.sqlite')
c=conn.cursor()
final.to_sql('Reviews', conn, if_exists='replace', index = False)
conn.close()

In [28]:
final

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138693,150511,0006641040,A1C9K534BCI9GO,Laura Purdie Salas,0,0,positive,1344211200,Charming and childlike,"A charming, rhyming book that describes the ci...",charm rhyme book describ circumst eat dont chi...
1,138708,150526,0006641040,A3E9QZFE9KXH8J,R. Mitchell,11,18,negative,1129507200,awesome book poor size,This is one of the best children's books ever ...,one best children book ever written mini versi...
2,138707,150525,0006641040,A2QID6VCFTY51R,Rick,1,2,positive,1025481600,"In December it will be, my snowman's anniversa...","My daughter loves all the ""Really Rosie"" books...",daughter love realli rosi book introduc realli...
3,138706,150524,0006641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
4,138705,150523,0006641040,A2P4F2UO0UMP8C,"Elizabeth A. Curry ""Lovely Librarian""",0,0,positive,1096675200,MMMM chicken soup....,Summary: A young boy describes the usefulness...,young boy describ use chicken soup rice month ...
5,138704,150522,0006641040,A1S3C5OFU508P3,Charles Ashbacher,0,0,positive,1219536000,Children will find it entertaining and a gener...,This book contains a collection of twelve shor...,book contain collect twelv short statement end...
6,138703,150521,0006641040,A3RMCRB2NDTDYP,Carol Carruthers,0,0,positive,1243468800,This book is great!,My 7 year old daughter brought this book home ...,year old daughter brought book home school lib...
7,138702,150520,0006641040,ADBFSA9KTQANE,"James L. Hammock ""Pucks Buddy""",0,0,positive,1256688000,Great Gift,This book was purchased as a birthday gift for...,book purchas birthday gift year old boy squeal...
8,138701,150519,0006641040,A12HY5OZ2QNK4N,Elizabeth H. Roessner,0,0,positive,1256774400,It's a great book!,I've always loved chicken soup and rice. My la...,ive alway love chicken soup rice late ethel al...
9,138700,150518,0006641040,AK1L4EJBA23JF,L. M. Kraus,0,0,positive,1288224000,love this book,"Great book, perfect condition arrived in a sho...",great book perfect condit arriv short amount t...
