## import all the libraries

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sqlite3    ## SQL Interface
import pickle     ## Used to save your data - Converts objects to byte stream and vice versa

import re
import nltk
from nltk.corpus import stopwords

## read sqlite file

In [2]:
# Open the SQLite database and load the complete Reviews table into a DataFrame.
reviews_query = "SELECT * FROM Reviews"
conn = sqlite3.connect('final.sqlite')
final = pd.read_sql_query(reviews_query, conn)

# Preview the first rows of the loaded frame.
final.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month year learn poem t...
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great littl book read nice rhythm well good re...
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetri month year goe month cute littl po...


In [4]:
# Inspect the (rows, columns) dimensions of the loaded reviews frame.
final.shape

(364171, 12)

## text preprocessing

In [7]:
# Look for HTML-like tags in the summaries: print each summary that contains
# one and stop once five such summaries have been shown.
html_tag = re.compile('<.*?>')
i = 0
for sen in final['Summary'].values:
    if html_tag.search(sen) is not None:
        print(sen, "\n\n")
        i += 1
    if i == 5:
        break

There are no HTML tags in the summaries (the check above printed nothing).

#### creating the set of stop words

In [8]:
# Start from NLTK's English stop-word list, but take the words in `lst` out of
# it so they are NOT filtered from the cleaned text (presumably because
# negations such as "not" / "nor" carry sentiment).
stop = set(stopwords.words('english'))
lst = ['won', 'nor', 'not', 'against']
# difference_update() ignores entries that are absent from the set, whereas
# set.remove() raises KeyError on the first word missing from the installed
# NLTK list.
stop.difference_update(lst)
print(stop)

{'that', 'are', 'them', 'their', 'mightn', "won't", 'don', 've', 'it', 'until', 'at', 'about', 'why', 'between', 'than', "needn't", 'or', 'out', 'didn', "you'll", 'now', 'was', 'here', 'no', 'll', "weren't", 'himself', "don't", 'each', 'hers', "hasn't", 'from', 'most', 'is', 'into', 'other', "shouldn't", 'themselves', 'do', 'our', 'some', 'itself', 'few', "mustn't", 'own', 'what', 'while', 'me', 'its', 'where', 'shan', 'who', 'have', "should've", 'herself', "that'll", "you'd", "mightn't", 'under', "isn't", 'yourself', 'through', 'myself', 'ain', 'again', 'further', 'mustn', 'you', "hadn't", 'had', 'by', 'shouldn', 'm', 'does', 'did', 'needn', 'her', 'of', "aren't", "haven't", 'on', 'my', 'yours', 'been', 'as', 'y', 'but', 'when', "you're", 'the', 'wasn', 'she', 'such', 'up', 'which', 'below', "shan't", 'has', 'being', 'very', 'these', 'him', 'we', 'your', 'this', 'if', "doesn't", 'to', 'wouldn', 'down', "couldn't", 'all', 'so', 'will', 'then', 'both', 'those', 'a', 'during', "wasn't", 

#### stemming

In [9]:
# English Snowball stemmer; preprocessing() calls sno.stem() on every word it keeps.
sno = nltk.stem.SnowballStemmer('english')

#### function to remove punctuation and special characters

In [10]:
def cleanpunc(sentence):
    '''Strip punctuation and special characters from a sentence.

    Two passes are applied:
      1. ? @ ! ^ % ' " # and | are deleted outright.
      2. . , ) ( \\ and / are each replaced by a single space, so the words
         on either side of them stay separate tokens.

    Parameters:
        sentence - the text to clean (str)

    Returns:
        The cleaned string. Whitespace is not collapsed here; callers that
        tokenise with str.split() are unaffected by the extra spaces.
    '''
    # Inside [...] a '|' is a literal pipe, not alternation, so the characters
    # are listed directly. The pipe is kept in the delete set on purpose, so
    # text containing '|' is cleaned exactly as before.
    cleaned = re.sub(r'[?@!^%\'"#|]', r'', sentence)
    # The backslash must be written as '\\' to be a member of the class; the
    # previous pattern spelled it '\|', which is just an escaped pipe, so
    # backslashes were never replaced.
    cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
    return cleaned

#### function to implement preprocessing steps

In [11]:
def preprocessing(series):
    '''Clean every text cell of a pandas Series and return it in two forms.

    Steps applied to each cell:
       1. Remove punctuation and special characters (via cleanpunc)
       2. Split on whitespace and keep only purely alphabetic tokens
          (str.isalpha) that are longer than 2 characters
       3. Lower-case the token and drop it if it is in the `stop` set
       4. Stem the remaining tokens with the Snowball stemmer `sno`

    Note: HTML tags are NOT stripped by this function.

    Parameters:
       series - Pandas Series whose cells are strings

    Return values:
       1. final_string - List of cleaned sentences (one string per cell,
                         stemmed words joined by single spaces)
       2. list_of_sent - List of lists (one list of stemmed words per cell)
                         which can be used as input to the W2V model

    Both lists have the same length and order as `series`.'''

    final_string = []    ## Cleaned sentences, one per input cell
    ## Token lists, one per input cell. This must be created once, outside the
    ## loop: re-creating it per sentence would leave only the last sentence's
    ## tokens in the returned value.
    list_of_sent = []

    for sent in series.values:
        filtered_sent = []    ## Stemmed words kept from this sentence
        for word in cleanpunc(sent).split():
            ## Only consider alphabetic words with length at least 3
            if word.isalpha() and len(word) > 2:
                lowered = word.lower()
                ## Stop words are matched on the lower-cased form
                if lowered not in stop:
                    filtered_sent.append(sno.stem(lowered))
        list_of_sent.append(filtered_sent)
        ## Join the surviving words back into one cleaned sentence
        final_string.append(" ".join(filtered_sent))
    return final_string, list_of_sent

#### first 5 rows without preprocessing

In [12]:
# Show the first five summaries exactly as stored, before any cleaning.
raw_preview = final['Summary'].iloc[:5].values
for raw_summary in raw_preview:
    print(raw_summary, "\n\n")

EVERY book is educational 


Love the book, miss the hard cover version 


chicken soup with rice months 


a good swingy rhythm for reading aloud 


A great way to learn the months 




#### first 5 rows after preprocessing

In [13]:
# Run the cleaning pipeline on the same five summaries and print the results
# for a side-by-side comparison with the raw text.
first_five = final['Summary'].iloc[:5]
final_string, list_of_sent = preprocessing(first_five)
for cleaned_summary in final_string:
    print(cleaned_summary, "\n\n")

everi book educ 


love book miss hard cover version 


chicken soup rice month 


good swingi rhythm read aloud 


great way learn month 




In [14]:
# Run the cleaning pipeline over every review summary. final_string holds the
# cleaned sentences and list_of_sent the per-sentence token lists.
final_string, list_of_sent = preprocessing(final['Summary'])

In [16]:
# Sanity check: there should be one cleaned sentence per row of `final`.
len(final_string)

364171

#### add the list of cleaned summaries to the data frame as a new column

In [17]:
# Attach the cleaned summaries as a new column. final_string was built by
# iterating final['Summary'] in order, so entry k belongs to row k.
final['CleanedSummary']=final_string
final.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText,CleanedSummary
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...,everi book educ
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...,love book miss hard cover version
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month year learn poem t...,chicken soup rice month
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great littl book read nice rhythm well good re...,good swingi rhythm read aloud
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetri month year goe month cute littl po...,great way learn month


### save the updated data frame back to the SQLite file

In [18]:
# Write the frame (now including CleanedSummary) back to the Reviews table,
# replacing the previous contents of that table.
conn = sqlite3.connect('final.sqlite')
try:
    # index=False: the frame already has an 'index' column holding the
    # original row ids. Letting to_sql also write the positional DataFrame
    # index would add another redundant column to the table on every save.
    final.to_sql('Reviews', conn, if_exists='replace', index=False)
finally:
    # Release the database handle even if the write fails.
    conn.close()

In [19]:
# Serialise the tokenised sentences to disk so the W2V step can load them later.
w2v_input_path = 'list_of_sent_for_input_to_w2v.pkl'
with open(w2v_input_path, 'wb') as out_file:
    pickle.dump(list_of_sent, out_file)