# Sentiment Analysis for Amazon reviews on cellphones and accessories
# Cellphone reviews dataset is at http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz


In [33]:
### Import necessary dependencies

import gzip
import json
import re
import nltk
import pandas as pd
import utils
from nltk.tokenize.toktok import ToktokTokenizer


# ============================================
#   Overall plan for pre-processing the dataset
# ============================================
### 0. Load data
### 1. Reformat dates and times for visualization
### 2. Prune for local development
### 3. Cleaning review text
     a. strip HTML
     b. Removing accented characters
     c. Expanding Contractions
     d. Removing Special Characters
     e. Lemmatizing text
     f. Removing Stopwords
     g. Remove special characters and repeating characters
     h. Spelling corrections





In [34]:
def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The value returned by ``json.loads`` for each line, in file order.
  """
  # The context manager closes the file once the generator is exhausted or
  # closed; previously the handle was opened and never released.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)


In [35]:
def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream; that position
  becomes the row label of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')


# ============================================
# 0.  Load data 
# ============================================


In [36]:
dfraw = getDF('./data/cell_reviews_5core.json.gz')

# Report how many reviews were loaded, then preview the raw dataset
row_count = len(dfraw)
print('Total Rows: ' + str(row_count))
dfraw.head(20)


Total Rows: 1128437


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,
1,5.0,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,
2,3.0,True,"02 8, 2014",AB6CHQWHZW4TV,7508492919,,Kai,"so the case came on time, i love the design. I...",Its okay,1391817600,,
3,2.0,True,"02 4, 2014",A1M117A53LEI8,7508492919,,Sharon Williams,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,CASE,1391472000,,
4,4.0,True,"02 3, 2014",A272DUT8M88ZS8,7508492919,,Bella Rodriguez,"I liked it because it was cute, but the studs ...",Cute!,1391385600,,
5,2.0,True,"01 27, 2014",A1DW2L6XCC5TJS,7508492919,,Amazon Customer,The product looked exactly like the picture an...,Not so happy,1390780800,,
6,3.0,True,"01 23, 2014",AQC61R4UST7UH,7508492919,,DaMara Estep,I FINALLY got my case today. It took forever t...,It's cute!,1390435200,,
7,5.0,True,"01 17, 2014",A31OVFL91BCKXG,7508492919,,Ashley Nicole Miller,It is a very cute case. None of the jewels hav...,Cute case,1389916800,,
8,1.0,True,"12 27, 2013",A1K0VLK6O5Z22M,7508492919,,BeeLove21,DO NOT BUY! this item is seriously cheap as he...,WORST ITEM!,1388102400,,
9,4.0,True,"12 16, 2013",A1K3BWU73YB44P,7508492919,,Mrs. Ochoa,I really love this case... you have to keep yo...,Pretty Cute!,1387152000,,


# ============================================
# 1. Reformat dates for visualization
# ============================================

In [37]:
# Convert the raw review date strings ("MM D, YYYY", e.g. "08 4, 2014") to
# datetimes. The format is given explicitly so pandas does not have to infer
# it for every row of the full dataset and cannot confuse month and day.
# A value that does not match the format raises instead of being guessed at.
dfraw['Review_Time'] = pd.to_datetime(dfraw['reviewTime'], format='%m %d, %Y')
# Break month and year out into their own columns.
dfraw['Month'] = dfraw['Review_Time'].dt.month
dfraw['Year'] = dfraw['Review_Time'].dt.year


print('Total Rows: ' + str(len(dfraw)))
# Sample of raw dataset with dates reformatted. Notice new month and year columns
dfraw.head(20)

Total Rows: 1128437


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,Review_Time,Month,Year
0,5.0,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,,2014-08-04,8,2014
1,5.0,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,,2014-02-12,2,2014
2,3.0,True,"02 8, 2014",AB6CHQWHZW4TV,7508492919,,Kai,"so the case came on time, i love the design. I...",Its okay,1391817600,,,2014-02-08,2,2014
3,2.0,True,"02 4, 2014",A1M117A53LEI8,7508492919,,Sharon Williams,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,CASE,1391472000,,,2014-02-04,2,2014
4,4.0,True,"02 3, 2014",A272DUT8M88ZS8,7508492919,,Bella Rodriguez,"I liked it because it was cute, but the studs ...",Cute!,1391385600,,,2014-02-03,2,2014
5,2.0,True,"01 27, 2014",A1DW2L6XCC5TJS,7508492919,,Amazon Customer,The product looked exactly like the picture an...,Not so happy,1390780800,,,2014-01-27,1,2014
6,3.0,True,"01 23, 2014",AQC61R4UST7UH,7508492919,,DaMara Estep,I FINALLY got my case today. It took forever t...,It's cute!,1390435200,,,2014-01-23,1,2014
7,5.0,True,"01 17, 2014",A31OVFL91BCKXG,7508492919,,Ashley Nicole Miller,It is a very cute case. None of the jewels hav...,Cute case,1389916800,,,2014-01-17,1,2014
8,1.0,True,"12 27, 2013",A1K0VLK6O5Z22M,7508492919,,BeeLove21,DO NOT BUY! this item is seriously cheap as he...,WORST ITEM!,1388102400,,,2013-12-27,12,2013
9,4.0,True,"12 16, 2013",A1K3BWU73YB44P,7508492919,,Mrs. Ochoa,I really love this case... you have to keep yo...,Pretty Cute!,1387152000,,,2013-12-16,12,2013


# ============================================
# 2. Prune for local development
# ============================================

In [38]:
# Maximum number of reviews kept per star rating.
trial=5000

# Subset for local runs, will remove on final runs or on server.
# One boolean mask per star rating. Only `four` is a range (it admits any
# rating in [4, 5)); the others are exact matches, so the upper-bound
# comparisons that used to accompany them were redundant and are dropped.
ratings = dfraw['overall']
five = (ratings == 5.0)
four = (ratings >= 4.0) & (ratings < 5.0)
three = (ratings == 3.0)
two = (ratings == 2.0)
one = (ratings == 1.0)
zero = (ratings == 0.0)

# Keep the first `trial` rows of each rating, highest rating first, preserving
# the original row labels. A single pd.concat replaces the chained
# DataFrame.append calls: append re-copied the accumulated frame on every call
# and was removed in pandas 2.0.
df = pd.concat(
    [dfraw[mask].iloc[0:trial] for mask in (five, four, three, two, one, zero)]
)

print('Total Rows: ' + str(len(df)))
# Sample of Unprocessed reviews pruned for local development
df.head(10)

Total Rows: 25000


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,Review_Time,Month,Year
0,5.0,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,,2014-08-04,8,2014
1,5.0,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,,2014-02-12,2,2014
7,5.0,True,"01 17, 2014",A31OVFL91BCKXG,7508492919,,Ashley Nicole Miller,It is a very cute case. None of the jewels hav...,Cute case,1389916800,,,2014-01-17,1,2014
11,5.0,True,"10 23, 2013",A2ZB7KGUSBR9P3,7508492919,,E. Bryce,Another great product that my daughter she use...,Bling bling for iPhone 4S,1382486400,,,2013-10-23,10,2013
14,5.0,True,"09 10, 2013",A18U23JWTMQX5C,7508492919,,KaitlynxO625,Beautiful quality and outstanding product! Eve...,I can't stop using this case!,1378771200,,,2013-09-10,9,2013
15,5.0,True,"08 28, 2013",A1JQUCTFM4UKMQ,7508492919,,M. Antillon,It is such a good case for a low price. I have...,I love it,1377648000,,,2013-08-28,8,2013
18,5.0,True,"06 4, 2013",A29KSIE8BKYVQN,7508492919,,Janine B.,Super durable and I get compliments on it dail...,Good case,1370304000,,,2013-06-04,6,2013
19,5.0,True,"06 3, 2013",A2CQO0FORCTC2R,7508492919,,Sulli,I have used this case for a couple weeks & so ...,very sparkly,1370217600,,,2013-06-03,6,2013
21,5.0,True,"05 4, 2013",A2ROMLP8COJ6JA,7508492919,,Mary Beth Anderson,I chose this case because it is so beautiful. ...,Beautiful,1367625600,,,2013-05-04,5,2013
22,5.0,True,"04 24, 2013",A23MRCVKI8M8OY,7508492919,,T...,Of all my cases this is the one that I have on...,pink bow,1366761600,,,2013-04-24,4,2013


# ============================================
#   3. Clean data as below 
# ============================================
##     a.  strip HTML: using regex, not BeautifulSoup (which is slow)
##     b.  Removing accented characters using Regex
##     c.  Expanding Contractions using a map of common contractions
##     d.  Removing Special Characters using Regex
##     e.  Lemmatizing text using Spacy and WordNet
##     f.  Removing Stopwords using ToktokTokenizer (NLTK)  
##     g.  Removing special characters and repeating characters using Regex
##     h.  Spelling corrections using TextBlob


In [40]:
# NOTE(review): this fetches every NLTK resource on each run; which resources
# `utils` actually needs is not visible from this cell, so it is left as is.
nltk.download('all', halt_on_error = True)

tokenizer = ToktokTokenizer()

# Compiled once instead of once per review. Matches { . ( ) ! } so they can be
# padded with spaces. Inside the character class "(-)" is the range from "("
# to ")", so a literal "-" is NOT matched by this pattern.
special_char_pattern = re.compile(r'([{.(-)!}])')


def _normalize_review(text):
    """Return the normalized form of a single review text.

    Steps, in order: strip HTML, remove accented characters, expand
    contractions, replace line breaks with spaces, lemmatize, isolate and
    remove special characters, collapse runs of spaces, drop stopword tokens,
    remove repeated characters and lowercase each remaining token.

    Args:
        text: Raw review text. Any non-string value (e.g. NaN for a missing
            review) is treated as an empty review.

    Returns:
        The surviving tokens joined by single spaces ('' for an empty review).
    """
    if not isinstance(text, str):
        return ''

    # strip HTML
    doc = utils.strip_html_tags(text)

    # remove accented characters -- applied to the HTML-stripped text; the
    # raw text used to be passed here, which threw away the previous step
    doc = utils.remove_accented_chars(doc)

    # expand contractions
    doc = utils.expand_contractions(doc)

    # replace line breaks with a space so the words on either side are not
    # glued together ('|' is kept in the class because the original pattern
    # also consumed it)
    doc = re.sub(r'[\r\n|]+', ' ', doc)

    # lemmatize text
    doc = utils.lemmatize_text(doc)

    # insert spaces around special characters to isolate them, then remove
    # special characters (digits are kept)
    doc = special_char_pattern.sub(" \\1 ", doc)
    doc = utils.remove_special_characters(doc, remove_digits = False)

    # remove extra whitespaces
    doc = re.sub(' +', ' ', doc)

    # tokenize and process each token
    kept_tokens = []
    for token in tokenizer.tokenize(doc):
        # NOTE(review): utils.is_stopword appears to return '' for stopwords
        # and the token otherwise -- confirm. The displayed output still
        # contains "it" / "i", which suggests capitalised stopwords slip
        # through because lowercasing only happens afterwards.
        candidate = utils.is_stopword(token)
        # value comparison: `is not ''` tested object identity, not equality
        if candidate != '':
            # remove repeated characters
            kept_tokens.append(utils.remove_repeated_characters(candidate).lower())

    # Spelling correction (step h of the plan) is currently disabled:
    #doc=correct_spelling(doc)

    # bring list back into document
    return ' '.join(kept_tokens)


i=0
clean_reviews=[]
# Explicit copy so that adding the Clean_Review column below modifies an
# independent frame rather than a slice of `df`.
dflocal=df.head(100).copy()

# normalize each review in the dataframe
for i, review_text in enumerate(dflocal['reviewText'], start=1):
    clean_reviews.append(_normalize_review(review_text))

    if i % 1000 == 0:
        print('Processed ' + str(i))

dflocal.insert(8,'Clean_Review',clean_reviews)

# NOTE(review): this reports the size of `df`, not of the 100-row `dflocal`
# sample that was actually processed.
print('Total Rows: ' + str(len(df)))
print('Sample of processed dataset. Notice the column named Clean_Review')
dflocal.head(20)

Total Rows: 25000
Sample of processed dataset. Notice the column named Clean_Review


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,Clean_Review,summary,unixReviewTime,vote,image,Review_Time,Month,Year
0,5.0,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,look even well person careful drop phone often...,Can't stop won't stop looking at it,1407110400,,,2014-08-04,8,2014
1,5.0,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,want spend whole lot cash want great deal shop...,1,1392163200,,,2014-02-12,2,2014
7,5.0,True,"01 17, 2014",A31OVFL91BCKXG,7508492919,,Ashley Nicole Miller,It is a very cute case. None of the jewels hav...,it cute case none jewel fall bow glue well com...,Cute case,1389916800,,,2014-01-17,1,2014
11,5.0,True,"10 23, 2013",A2ZB7KGUSBR9P3,7508492919,,E. Bryce,Another great product that my daughter she use...,another great product daughter use long time i...,Bling bling for iPhone 4S,1382486400,,,2013-10-23,10,2013
14,5.0,True,"09 10, 2013",A18U23JWTMQX5C,7508492919,,KaitlynxO625,Beautiful quality and outstanding product! Eve...,beautiful quality outstanding product everyone...,I can't stop using this case!,1378771200,,,2013-09-10,9,2013
15,5.0,True,"08 28, 2013",A1JQUCTFM4UKMQ,7508492919,,M. Antillon,It is such a good case for a low price. I have...,it good case low price i right never problem b...,I love it,1377648000,,,2013-08-28,8,2013
18,5.0,True,"06 4, 2013",A29KSIE8BKYVQN,7508492919,,Janine B.,Super durable and I get compliments on it dail...,super durable i get compliment daily i find on...,Good case,1370304000,,,2013-06-04,6,2013
19,5.0,True,"06 3, 2013",A2CQO0FORCTC2R,7508492919,,Sulli,I have used this case for a couple weeks & so ...,i use case couple week far great little bling ...,very sparkly,1370217600,,,2013-06-03,6,2013
21,5.0,True,"05 4, 2013",A2ROMLP8COJ6JA,7508492919,,Mary Beth Anderson,I chose this case because it is so beautiful. ...,i choose case beautiful everyone like ask coul...,Beautiful,1367625600,,,2013-05-04,5,2013
22,5.0,True,"04 24, 2013",A23MRCVKI8M8OY,7508492919,,T...,Of all my cases this is the one that I have on...,case one i phone right i love love love fit gr...,pink bow,1366761600,,,2013-04-24,4,2013
