In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline
import pickle
import gensim
import seaborn as sns
import sqlite3
import nltk
from nltk.corpus import stopwords
import re 
from sklearn.feature_extraction.text import CountVectorizer  ## BOW Model
from sklearn.feature_extraction.text import TfidfVectorizer  ## TFIDF Model

from sklearn.manifold import TSNE    ## To visualize high dimensional data




## loading data file

In [2]:
# Load every row of the Reviews table into a DataFrame.
# try/finally releases the connection even when the query itself fails
# (for example when the table is missing); the straight-line version only
# reached close() on success.
con = sqlite3.connect("saved_data//final.sqlite")
try:
    data = pd.read_sql_query("select * from Reviews", con)
finally:
    con.close()

## extracting Summary column

In [20]:
data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,540125,B002W08W56,A2STPOZAT37RNE,Betty Baker,1,1,1,1300320000,"Wonderful, but surprised to find this size can...",Great beans....the best texture hands down. T...,great bean best textur hand seem great tast do...
1,31761,B000UXWQMC,A3AKMRCK9C8KWO,Traveller,3,4,1,1263686400,Better than store bought,We bought the Roland Kalamata Halves (Barchett...,bought roland kalamata halv barchetta recent e...
2,509496,B000P09RJA,A29WSMQQ9WTM7E,Kari Carper,0,0,1,1286496000,Better than a regular granola bar,These bars are very tasty (as are the chocolat...,bar tasti chocol strawberri theyr larger regul...
3,27728,B000K8ESBY,A1RYTIMIWPDJ4O,"Bill Ison ""Bill""",0,0,1,1256169600,Coffee flavoring,"<a href=""http://www.amazon.com/gp/product/B000...",flavour creation coffe flavor tablet french va...
4,128657,B001AVJT0K,AJ6SZ4YAPOOO7,Mwebi,0,0,1,1348099200,They all come running,"I have the pickiest cats, they hate 99/100 kin...",pickiest cat hate kind food better vari otherw...
5,399982,B002IEVJRY,A3LXXYBYUHZWS5,Monica Garcia,0,0,1,1334102400,A pretty good Starbucks replacement,I love frappucinos from Starbucks so I decided...,love frappucino starbuck decid give one tri al...
6,393311,B006GIS3EA,A250FFYP8R4OMJ,Cee H,0,0,0,1344902400,disappointing,I don't know if I got a bad box but mine did n...,dont know got bad box mine not resembl pictur ...
7,59452,B000FBQ5GW,A1TPJYBZVDMP4G,Venkatesh Nagarajan,0,0,1,1252713600,Excellent..,Now it is available only through nutricity. Th...,avail nutric avail status wrong order aug toda...
8,96635,B008KZ5KZ2,A3G1VVKZ3DJS76,"M. Krumm ""mom of three""",0,0,1,1346284800,Best Gluten Free Cookie Mix on the Market,We all know there are good and bad gluten free...,know good bad gluten free product rare find on...
9,506467,B003V8Y5KY,A2VJV5HA2C20E4,Willa,0,0,1,1340064000,Housewife,"Our family loves this product, and my daughter...",famili love product daughter like especi must ...


In [3]:
# Pull out the review summaries; this is the text that gets cleaned below.
final = data.loc[:, "Summary"]

##  creating set of stop words which are to be removed later

In [4]:
# English stop words from NLTK, held in a set for fast membership tests.
stop = {word for word in stopwords.words('english')}

In [5]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## taking negation words out of the stop set so that they are kept in the cleaned text

In [6]:
# Words taken out of the stop set so that they survive stop-word filtering.
lst = ['won', 'nor', 'not', 'against',"doesn't","couldn't","wasn't","wouldn't","won't","weren't","isn't","haven't","hasn't","hadn't","don't","didn't","aren't","shouldn't","needn't","mustn't","mightn't"]
# difference_update ignores entries that are not in the set, so this cell can
# be re-run safely; set.remove raised KeyError on the second execution (or
# whenever the installed NLTK list lacked one of these words).
stop.difference_update(lst)
print(stop)

{'just', 'further', 'are', 'all', 'about', 'he', 'doesn', 'they', 'me', 'doing', 'each', 'few', 'but', "you're", 'him', 'from', 'then', 'wouldn', 'themselves', 'very', 'such', 'theirs', 'been', 'if', 'did', 'myself', 'and', 'off', 'were', 'have', 'on', 'the', 'only', 'down', 'o', 'why', 'didn', 'this', 't', "you'll", 'than', 'for', 'with', 'yourselves', 'had', 'until', 'your', 'isn', 'while', "shan't", 'mustn', 'hasn', 'ourselves', 'ain', "it's", 'when', 'herself', 'y', 'again', 'to', 'in', 'too', 'most', 'during', 'itself', 'as', 'between', 'those', 'you', 'their', "you've", 'that', 'above', 'before', 'no', 'so', 'should', 'at', 'does', 'how', 'or', 'i', 'below', 'shan', 'ma', 'some', 'up', 'where', 'mightn', 'once', 'm', 'couldn', 'has', 'can', 'hers', 'don', 'haven', 'by', 'am', 've', 'having', 'because', 'over', 'be', 'more', 'an', 'needn', 'it', "she's", 'her', 'these', 'here', 'other', 'my', 'of', 'weren', "you'd", 'own', 'is', 'we', 'them', 'its', 'both', 'his', 's', 'out', 'our


## defining stemming object which will be used to find root of words

In [7]:
# Snowball stemmer for English, used to reduce each word to its stem.
sno = nltk.stem.SnowballStemmer(language='english')

In [8]:
# Quick check of the stemmer on two inflections of the same word.
for sample in ("works", "working"):
    print(sno.stem(sample))

work
work


## defining function to remove HTML tags from the sentences

In [9]:
def cleanhtml(sentence):
    '''Replace every "<...>" span in the sentence with a single space.

    The match is non-greedy, so each tag is replaced separately and the text
    between tags is kept. "." does not match a newline here, so a tag that
    spans several lines is left untouched.
    '''
    tag_pattern = re.compile("<.*?>")
    return tag_pattern.sub(" ", sentence)

## defining function to remove punctuation marks from the sentences

In [10]:
def cleanpunc(sentence):
    '''Strip or blank out punctuation and special characters in a sentence.

    Two passes are applied in order:
    1. The characters ? @ ! ^ % # and both quote characters are deleted.
    2. The characters . , ) ( / are each replaced by one space.

    The pipe characters inside the character classes are literal members of
    the class, so a pipe in the text is deleted by the first pass. The second
    class contains an escaped pipe, not a backslash, so backslashes in the
    text are left as they are.
    '''
    deleted_marks = re.compile(r'[?|@|!|^|%|\'|"|#]')
    spaced_marks = re.compile(r'[.|,|)|(|\|/]')
    without_marks = deleted_marks.sub(r'', sentence)
    return spaced_marks.sub(r' ', without_marks)

## defining function which will clean the Summary Column of HTML tags and punctuation marks

In [11]:
def preprocessing(series):
    '''Clean every text cell of a pandas Series.

    Each cell goes through the following steps:
    1. HTML tags are replaced by spaces (cleanhtml).
    2. Punctuation and special characters are removed (cleanpunc).
    3. The text is split on whitespace and only purely alphabetic tokens
       (str.isalpha) with length > 2 are kept.
    4. Tokens whose lower-cased form is in the module-level `stop` set are
       dropped.
    5. The remaining tokens are lower-cased and stemmed with the module-level
       `sno` stemmer.

    A cell that is not a string (for example None or NaN coming from a NULL
    in the database) produces an empty sentence instead of raising, so both
    returned lists always have one entry per input row.

    Return values:
    1. final_string - list of cleaned sentences (stems joined by one space)
    2. list_of_sent - list of lists of stems, which can be used as input to
       the W2V model
    '''
    final_string = []    ## cleaned sentences, one string per input cell
    list_of_sent = []    ## cleaned sentences as lists of stems, one per input cell

    for sent in series.values:
        if not isinstance(sent, str):
            ## re.sub would raise TypeError on a non-string; keep the row as an
            ## empty sentence so the outputs stay aligned with the input
            list_of_sent.append([])
            final_string.append("")
            continue

        sent = cleanpunc(cleanhtml(sent))

        filtered_sent = []
        for word in sent.split():
            ## keep alphabetic tokens of at least 3 characters only
            if not (word.isalpha() and len(word) > 2):
                continue
            lowered = word.lower()
            ## NOTE: cleanpunc has already deleted apostrophes, so a
            ## contraction reaches this test without its apostrophe
            ## ("don't" arrives as "dont")
            if lowered in stop:
                continue
            filtered_sent.append(sno.stem(lowered))

        list_of_sent.append(filtered_sent)
        final_string.append(" ".join(filtered_sent))

    return final_string, list_of_sent

## calling the above function for Summary Column

In [12]:
# Read the column straight from `data` rather than from `final`: this cell
# rebinds `final` to a list of cleaned sentences, so passing `final` back in
# failed when the cell was executed a second time.
final, list_of_sent = preprocessing(data["Summary"])

In [13]:
list_of_sent

[['wonder', 'surpris', 'find', 'size'],
 ['better', 'store', 'bought'],
 ['better', 'regular', 'granola', 'bar'],
 ['coffe', 'flavor'],
 ['come', 'run'],
 ['pretti', 'good', 'starbuck', 'replac'],
 ['disappoint'],
 ['excel'],
 ['best', 'gluten', 'free', 'cooki', 'mix', 'market'],
 ['housewif'],
 ['stuf'],
 ['good', 'ginger', 'ale'],
 ['not', 'sure', 'qualiti'],
 [],
 ['awesom', 'cook'],
 ['amaz'],
 ['smoke', 'salmon', 'foil', 'seal'],
 ['chocol', 'cake'],
 ['guilt', 'free', 'chocol'],
 ['textur', 'tast', 'total', 'differ', 'press', 'oat'],
 ['right', 'kick', 'alright'],
 ['excel', 'product'],
 ['babi', 'move'],
 ['skinless', 'boneless', 'sardin'],
 ['fabul', 'healthi'],
 ['healthier', 'coffe'],
 ['good', 'skin', 'thing', 'homework'],
 ['great', 'product'],
 ['morn', 'ritual'],
 ['good', 'tast', 'decaf'],
 ['lipton', 'spring', 'veget'],
 ['world', 'best', 'ginger', 'ale'],
 ['like'],
 ['even', 'better', 'arrowmil', 'pancak', 'mix'],
 ['tasti'],
 ['excel', 'aroma', 'good', 'flavor'],
 ['

## saving the above list_of_sentences as a pickle file 

In [14]:
# Serialise the tokenised summaries to disk; the with-block closes the file
# once the dump has finished.
with open("saved_data//list_of_sent.pkl", mode="wb") as pkl_file:
    pickle.dump(list_of_sent, pkl_file)

## adding the preprocessed summaries as a new "Cleaned Summary" column of the reviews data

In [19]:
# Interactive help lookup for pd.DataFrame removed: the "?" prefix is
# IPython-only syntax and the cell was not part of the pipeline.

In [23]:
# One-column frame holding the cleaned summaries.
# The column label is passed as a list: a set has no defined order and is not
# accepted as `columns` by recent pandas releases.
temp = pd.DataFrame(final, columns=["Cleaned Summary"])

In [24]:
temp

Unnamed: 0,Cleaned Summary
0,wonder surpris find size
1,better store bought
2,better regular granola bar
3,coffe flavor
4,come run
5,pretti good starbuck replac
6,disappoint
7,excel
8,best gluten free cooki mix market
9,housewif


In [29]:
# Append the cleaned summaries as a new column (rows are aligned on the index).
# The Reviews table is overwritten with this frame further down, so on a
# second run `data` already carries a "Cleaned Summary" column; it is dropped
# first so the result never ends up with two columns of the same name.
final = pd.concat(
    [data.drop(columns=["Cleaned Summary"], errors="ignore"), temp],
    axis=1,
)

In [30]:
final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText,Cleaned Summary
0,540125,B002W08W56,A2STPOZAT37RNE,Betty Baker,1,1,1,1300320000,"Wonderful, but surprised to find this size can...",Great beans....the best texture hands down. T...,great bean best textur hand seem great tast do...,wonder surpris find size
1,31761,B000UXWQMC,A3AKMRCK9C8KWO,Traveller,3,4,1,1263686400,Better than store bought,We bought the Roland Kalamata Halves (Barchett...,bought roland kalamata halv barchetta recent e...,better store bought
2,509496,B000P09RJA,A29WSMQQ9WTM7E,Kari Carper,0,0,1,1286496000,Better than a regular granola bar,These bars are very tasty (as are the chocolat...,bar tasti chocol strawberri theyr larger regul...,better regular granola bar
3,27728,B000K8ESBY,A1RYTIMIWPDJ4O,"Bill Ison ""Bill""",0,0,1,1256169600,Coffee flavoring,"<a href=""http://www.amazon.com/gp/product/B000...",flavour creation coffe flavor tablet french va...,coffe flavor
4,128657,B001AVJT0K,AJ6SZ4YAPOOO7,Mwebi,0,0,1,1348099200,They all come running,"I have the pickiest cats, they hate 99/100 kin...",pickiest cat hate kind food better vari otherw...,come run
5,399982,B002IEVJRY,A3LXXYBYUHZWS5,Monica Garcia,0,0,1,1334102400,A pretty good Starbucks replacement,I love frappucinos from Starbucks so I decided...,love frappucino starbuck decid give one tri al...,pretti good starbuck replac
6,393311,B006GIS3EA,A250FFYP8R4OMJ,Cee H,0,0,0,1344902400,disappointing,I don't know if I got a bad box but mine did n...,dont know got bad box mine not resembl pictur ...,disappoint
7,59452,B000FBQ5GW,A1TPJYBZVDMP4G,Venkatesh Nagarajan,0,0,1,1252713600,Excellent..,Now it is available only through nutricity. Th...,avail nutric avail status wrong order aug toda...,excel
8,96635,B008KZ5KZ2,A3G1VVKZ3DJS76,"M. Krumm ""mom of three""",0,0,1,1346284800,Best Gluten Free Cookie Mix on the Market,We all know there are good and bad gluten free...,know good bad gluten free product rare find on...,best gluten free cooki mix market
9,506467,B003V8Y5KY,A2VJV5HA2C20E4,Willa,0,0,1,1340064000,Housewife,"Our family loves this product, and my daughter...",famili love product daughter like especi must ...,housewif


## saving the final data frame back to the sqlite file

In [28]:
# Interactive help lookup for pd.concat removed: the "?" prefix is
# IPython-only syntax and the cell was not part of the pipeline.

In [37]:
# Overwrite the Reviews table with the frame that now includes the cleaned
# summaries.
conn = sqlite3.connect("saved_data//final.sqlite")
try:
    # index=False: the row index is not written. With the default, an extra
    # "index" column would be stored in the table and come back as a data
    # column the next time the table is loaded.
    final.to_sql("Reviews", conn, if_exists="replace", index=False)
finally:
    # The connection was previously left open; close it whether or not the
    # write succeeded.
    conn.close()

  chunksize=chunksize, dtype=dtype)


In [36]:
# Interactive help lookup for final.to_sql removed: the "?" prefix is
# IPython-only syntax and the cell was not part of the pipeline.