In [1]:
### Import necessary dependencies
import gzip
import json
import multiprocessing
import pickle
from multiprocessing import Process
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import utils


# Destination pickle file for the fully pre-processed reviews.
PROCESSED_FILENAME = './data/amazon_reviews_processed.pickle'
# Run counter; none of the visible cells reads it.
trial = 0
# Number of worker processes started by the multiprocessing step.
processors = 16


# ============================================
#   Overall plan for pre-processing the dataset 
# ============================================
### 0. Load data
### 1. Prune for local development
### 2. Reformat dates and times for visualization
### 3. Cleaning review text
     a. strip HTML
     b. Removing accented characters
     c. Expanding Contractions
     d. Removing Special Characters
     e. Lemmatizing text
     f. Removing Stopwords
     g. Remove special characters and repeating characters
     h. Spelling corrections





In [2]:
def child_process(number):
    """Reset ``number.value`` to zero and then keep incrementing it.

    ``number`` is any object exposing a writable ``value`` attribute
    (presumably a ``multiprocessing.Value`` -- confirm with the caller).
    The loop has no exit condition, so the call only ends when an
    exception is raised or the process is terminated from outside.
    """
    number.value = 0
    while 1:
        number.value = number.value + 1

def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: location of a gzip file holding one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a line is not valid JSON.
  """
  # The context manager closes the handle when the generator is exhausted,
  # closed early, or garbage-collected, instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)


In [3]:
def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are numbered 0, 1, 2, ... in the order they are read; that
  number becomes the row label and the record's keys become the columns.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')



# ============================================
#  Load Product Metadata 
# ============================================


In [None]:
# Read the gzip-compressed product metadata into a DataFrame (one row per JSON line).
metadfraw = getDF('./data/cell_reviews_meta.json.gz')

In [None]:
# Report how many metadata records were loaded, then preview the first rows.
print(f'Total products metadata : {len(metadfraw)}')
metadfraw.head(10)


In [None]:
# Discard metadata records that carry no category value.
metadfraw = metadfraw.dropna(subset=['category'])


In [None]:
#catlist=[]
#does main category contain cellphones
def isCellphone(x):
    """Tell whether a category value mentions one of the cell-phone categories.

    ``x`` may be any object supporting ``in`` (a list gives exact-item
    matching, a string gives substring matching). A value that does not
    support membership tests is printed and reported as ``False``.
    """
    phone_labels = ('Cell Phones', 'Unlocked Cell Phones', 'Carrier Cell Phones')
    try:
        return any(label in x for label in phone_labels)
    except TypeError:
        # Not a container (e.g. a float): echo the offending value and reject it.
        print(x)
        return False


# Flag every metadata row whose category value mentions a cell-phone category.
metadfraw['iscellphone'] = metadfraw.apply(lambda row: isCellphone(row['category']), axis=1)

# Keep only the flagged rows. The explicit copy makes the result an independent
# frame, so the in-place edits applied to it in later cells (dropna, column
# assignment) do not operate on a derived selection and cannot trigger
# SettingWithCopyWarning.
metadfraw = metadfraw[metadfraw['iscellphone']].copy()

#print(catlist)

In [None]:
# Row count after the cell-phone filter, followed by a preview.
print(f'Filtered cellphone metadata: {len(metadfraw)}')
metadfraw.head(20)


In [None]:
# A row is unusable when either its product id or its brand is missing;
# both columns are checked in a single pass.
metadfraw.dropna(subset=['asin', 'brand'], inplace=True)


In [None]:
# Row count after removing rows without asin/brand, followed by a preview.
print(f'Filtered further for missing bad asins and brands: {len(metadfraw)}')
metadfraw.head(20)



In [None]:
# Normalise the product id: trim surrounding whitespace, then cast to str.
trimmed_asin = metadfraw['asin'].str.strip()
metadfraw['asin'] = trimmed_asin.astype(str)

#metadfraw['asin'].replace('', np.NaN, inplace=True)
#metadfraw['brand'] = metadfraw['brand'].str.strip().astype(str)
#metadfraw['brand'].replace('', np.NaN, inplace=True)
#metadfraw['feature'] = metadfraw['feature'].str.strip().astype(str)
#metadfraw['feature'].replace('', np.NaN, inplace=True)
#metadfraw['title'] = metadfraw['title'].str.strip().astype(str)
#metadfraw['title'].replace('', np.NaN, inplace=True)
#metadfraw['price'] = metadfraw['price'].str.strip().astype(str)
#metadfraw['price'].replace('', np.NaN, inplace=True)

#metadfraw.dropna(subset=['asin'], inplace=True)
#drop where price and/or brand is na
#metadfraw.dropna(subset=['price'], inplace=True)
#metadfraw.dropna(subset=['brand'], inplace=True)



In [None]:
# One metadata row per product id: the first occurrence wins, later ones are removed.
metadfraw.drop_duplicates(subset=['asin'], keep='first', inplace=True)


In [None]:
# Row count after de-duplicating on asin, followed by a preview.
print(f'Filtered further to remove duplicate asins: {len(metadfraw)}')
metadfraw.head(20)


In [None]:
def convertCurrency(x):
    """Parse a price string such as ``'$12.99'`` into a float.

    The first character is assumed to be a currency symbol and is dropped;
    the remainder must be a valid float literal.

    Args:
        x: raw price value.

    Returns:
        The numeric price, or NaN when ``x`` cannot be parsed (missing value,
        non-sliceable object, empty or non-numeric text).
    """
    try:
        return float(x[1:])
    except Exception:
        # Deliberately best-effort: any unparseable value becomes NaN. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
        return np.nan

# Add a numeric price column by parsing each row's raw 'price' value.
def _parsed_price(row):
    return convertCurrency(row['price'])


metadfraw['price_parsed'] = metadfraw.apply(_parsed_price, axis=1)


In [None]:
# Row count after price parsing, followed by a preview.
print(f'Price is parsed for : {len(metadfraw)}')
metadfraw.head(20)


In [None]:
# Retain only the metadata columns that are used further on.
meta_columns = ['asin', 'title', 'brand', 'feature', 'price_parsed']
metadfraw = metadfraw.loc[:, meta_columns]

In [None]:
# Size of the finished metadata table, followed by a preview.
print(f'Final cellphone metadata for : {len(metadfraw)}')
metadfraw.head(20)



# ============================================
#   Load review data 
# ============================================


In [None]:
# Read the gzip-compressed review file into a DataFrame (one row per JSON line).
dfraw = getDF('./data/cell_reviews_5core.json.gz')


In [None]:
# Number of reviews as loaded, followed by a preview.
print(f'Total raw reviews: {len(dfraw)}')
dfraw.head(20)


# ======================================================================================================
#   concatenate columns summary and reviewText because sometimes summary is very indicative of sentiment
# ======================================================================================================

In [None]:
# Build the text analysed downstream: the review body followed by its summary.
# Missing values are replaced by empty strings BEFORE the cast to str, so a
# missing review or summary no longer leaks the literal text 'nan' into the corpus.
review_body = dfraw['reviewText'].fillna('').astype(str).str.strip()
review_summary = dfraw['summary'].fillna('').astype(str).str.strip()
has_text = (review_body != '') | (review_summary != '')

dfraw['summary'] = review_summary
# Rows with neither a body nor a summary are left empty (instead of ' . ') so
# the blank-review cleanup that follows can still recognise and drop them.
dfraw['reviewText'] = (review_body + ' . ' + review_summary).where(has_text, '')



# ============================================
#   Drop unneeded rows
# ============================================

In [None]:
def cleanDataFrame(frame):
    """Drop, in place, every review that lacks a field needed downstream.

    The 'asin' and 'reviewText' columns are trimmed, blank strings in the
    required columns are treated as missing, and rows with a missing value in
    any of 'reviewText', 'reviewTime', 'asin' or 'overall' are removed.

    Args:
        frame: review DataFrame containing the four columns above. It is
            modified in place.

    Returns:
        None.
    """
    required = ['reviewText', 'reviewTime', 'asin', 'overall']

    # Trim the text columns. Missing values are masked back to NaN because
    # astype(str) would otherwise turn them into ordinary text and hide them
    # from the dropna below.
    for column in ('asin', 'reviewText'):
        raw = frame[column]
        frame[column] = raw.astype(str).str.strip().mask(raw.isna())

    # Blank strings count as missing. The result is assigned back to the frame:
    # replace(inplace=True) on a column selection may act on a temporary object.
    for column in required:
        frame[column] = frame[column].replace('', np.nan)

    # Remove observations with any missing required field.
    frame.dropna(subset=required, inplace=True)

# Run the cleanup on the review frame.
cleanDataFrame(dfraw)

In [None]:
# Number of reviews left after the cleanup, followed by a preview.
print(f'Total reviews after removing NaNs and blanks: {len(dfraw)}')
dfraw.head(20)

# ============================================
#   Infer sentiment based on stars
# ============================================

In [None]:
# Derive a 'sentiment' label from the star rating in 'overall':
#   [0, 3)  -> 0   (negative)
#   [3, 4)  -> -1  (neutral, removed below)
#   [4, 5]  -> 1   (positive)
stars = dfraw['overall']
star_bands = [
    ((stars >= 0.0) & (stars < 3.0), 0),
    ((stars >= 3.0) & (stars < 4.0), -1),
    ((stars >= 4.0) & (stars <= 5.0), 1),
]
for band_mask, label in star_bands:
    dfraw.loc[band_mask, 'sentiment'] = label

# Neutral reviews carry no usable polarity; remove them by index label.
neutral_rows = dfraw.index[dfraw['sentiment'] == -1]
dfraw.drop(neutral_rows, inplace=True)


# =================================================================================
# Inner join dataset and metadata. Add columns brand and price_parsed from metadata 
# =================================================================================


In [None]:
print(f'Total reviews BEFORE inner join: {len(dfraw)}')

In [None]:
# Attach the product metadata columns to every review, matching on the product id.
# This is a LEFT join: reviews without a metadata match are kept here with NaN
# metadata, and are only removed when rows with a missing brand are dropped in a
# later cell -- the combined effect is that of an inner join.
# The key is passed once; the previous on=['asin', 'asin'] listed it twice.
df1 = pd.merge(dfraw, metadfraw, how='left', on='asin')


In [None]:
# Row count of the joined frame, followed by a preview.
print(f'Total reviews AFTER inner join: {len(df1)}')
df1.head(20)

In [None]:
# Reviews whose brand is missing after the join are removed.
df1 = df1.dropna(subset=['brand'])

In [None]:
# Row count after removing reviews without a brand, followed by a preview.
print(f'Total reviews AFTER dropping NaN brands: {len(df1)}')
df1.head(20)


# ============================================
#   Check how balanced the dataset is 
# ============================================

In [None]:
# Count reviews per sentiment label and show the negative:positive ratio.
target_count = df1['sentiment'].value_counts()
negatives = target_count[0]
positives = target_count[1]
print('Class 0:', negatives)
print('Class 1:', positives)
print('Proportion:', round(negatives / positives, 2), ': 1')

# Trailing semicolon suppresses the Axes repr in the cell output.
target_count.plot(kind='bar', title='Sentiment distribution BEFORE Balancing');


# ======================================================
# Balance dataset by picking equal numbers for each star
# =====================================================

In [5]:
#subset for local runs, will remove on final runs or on server
# One boolean mask per star rating still present (3-star rows were dropped
# earlier as neutral).
five = (df1['overall'] == 5.0)
four = (df1['overall'] >= 4.0) & (df1['overall'] < 5.0)
two = (df1['overall'] >= 2.0) & (df1['overall'] < 3.0)
one = (df1['overall'] >= 1.0) & (df1['overall'] < 2.0)

df5 = df1[five]
df4 = df1[four]
df2 = df1[two]
dfone = df1[one]

# The smallest star bucket decides how many rows are taken from each bucket.
slices = [len(df5), len(df4), len(df2), len(dfone)]
# NOTE: `slice` shadows the builtin of the same name; the name is kept because
# it is a notebook-level variable.
slice = min(slices)

# Take `slice` randomly chosen rows from every bucket and stack them.
# DataFrame.append no longer exists in pandas 2.x, so the buckets are combined
# with a single pd.concat (same rows, same order, original index kept).
star_buckets = [df5, df4, df2, dfone]
df = pd.concat([shuffle(bucket).iloc[0:slice] for bucket in star_buckets])

#randomize dataset
df = shuffle(df)


Total Rows: 800


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,
1,5.0,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,
7,5.0,True,"01 17, 2014",A31OVFL91BCKXG,7508492919,,Ashley Nicole Miller,It is a very cute case. None of the jewels hav...,Cute case,1389916800,,
11,5.0,True,"10 23, 2013",A2ZB7KGUSBR9P3,7508492919,,E. Bryce,Another great product that my daughter she use...,Bling bling for iPhone 4S,1382486400,,
14,5.0,True,"09 10, 2013",A18U23JWTMQX5C,7508492919,,KaitlynxO625,Beautiful quality and outstanding product! Eve...,I can't stop using this case!,1378771200,,
15,5.0,True,"08 28, 2013",A1JQUCTFM4UKMQ,7508492919,,M. Antillon,It is such a good case for a low price. I have...,I love it,1377648000,,
18,5.0,True,"06 4, 2013",A29KSIE8BKYVQN,7508492919,,Janine B.,Super durable and I get compliments on it dail...,Good case,1370304000,,
19,5.0,True,"06 3, 2013",A2CQO0FORCTC2R,7508492919,,Sulli,I have used this case for a couple weeks & so ...,very sparkly,1370217600,,
21,5.0,True,"05 4, 2013",A2ROMLP8COJ6JA,7508492919,,Mary Beth Anderson,I chose this case because it is so beautiful. ...,Beautiful,1367625600,,
22,5.0,True,"04 24, 2013",A23MRCVKI8M8OY,7508492919,,T...,Of all my cases this is the one that I have on...,pink bow,1366761600,,


# =====================================================
#   Check how balanced the dataset is after balancing
# ====================================================

In [None]:
# Count reviews per sentiment label in the balanced frame and show the ratio.
target_count = df['sentiment'].value_counts()
negatives = target_count[0]
positives = target_count[1]
print('Class 0:', negatives)
print('Class 1:', positives)
print('Proportion:', round(negatives / positives, 2), ': 1')

# Trailing semicolon suppresses the Axes repr in the cell output.
target_count.plot(kind='bar', title='Sentiment distribution AFTER Balancing');


# =================================================
# Clean reviews - reformat dates, add counts
# =================================================

In [6]:
# Parse the raw review date once, then derive the calendar parts from it.
review_dates = pd.to_datetime(df['reviewTime'])
df['Review_Time'] = review_dates
df['Month'] = review_dates.dt.month
df['Year'] = review_dates.dt.year
df['Day'] = review_dates.dt.day

# Size features of the review text: character count and number of
# whitespace-separated words.
df['review_len'] = df['reviewText'].astype(str).map(len)
df['word_count'] = df['reviewText'].map(lambda text: len(str(text).split()))

Total Rows: 800


Unnamed: 0,overall,verified,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,sentiment,Review_Time,Month,Year
0,5.0,True,A24E3SXTC62LJI,7508492919,Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,1,2014-08-04,8,2014
1,5.0,True,A269FLZCB4GIPV,7508492919,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,1,2014-02-12,2,2014
7,5.0,True,A31OVFL91BCKXG,7508492919,Ashley Nicole Miller,It is a very cute case. None of the jewels hav...,Cute case,1389916800,,1,2014-01-17,1,2014
11,5.0,True,A2ZB7KGUSBR9P3,7508492919,E. Bryce,Another great product that my daughter she use...,Bling bling for iPhone 4S,1382486400,,1,2013-10-23,10,2013
14,5.0,True,A18U23JWTMQX5C,7508492919,KaitlynxO625,Beautiful quality and outstanding product! Eve...,I can't stop using this case!,1378771200,,1,2013-09-10,9,2013
15,5.0,True,A1JQUCTFM4UKMQ,7508492919,M. Antillon,It is such a good case for a low price. I have...,I love it,1377648000,,1,2013-08-28,8,2013
18,5.0,True,A29KSIE8BKYVQN,7508492919,Janine B.,Super durable and I get compliments on it dail...,Good case,1370304000,,1,2013-06-04,6,2013
19,5.0,True,A2CQO0FORCTC2R,7508492919,Sulli,I have used this case for a couple weeks & so ...,very sparkly,1370217600,,1,2013-06-03,6,2013
21,5.0,True,A2ROMLP8COJ6JA,7508492919,Mary Beth Anderson,I chose this case because it is so beautiful. ...,Beautiful,1367625600,,1,2013-05-04,5,2013
22,5.0,True,A23MRCVKI8M8OY,7508492919,T...,Of all my cases this is the one that I have on...,pink bow,1366761600,,1,2013-04-24,4,2013


In [None]:
# Columns carried into the processed dataset, in output order.
columns1 = [
    'asin', 'overall', 'verified', 'reviewText', 'summary',
    'vote', 'sentiment', 'title', 'brand', 'feature',
    'price_parsed', 'Month', 'Year', 'Day',
    'review_len', 'word_count',
]
df = df.loc[:, columns1]

In [None]:
# Row count of the trimmed frame; the preview shows the derived date columns.
print(f'Total Rows: {len(df)}')
df.head(20)


# ============================================
#   Preprocess reviews 
# ============================================

In [7]:
def writeToDisk(dfdb):
    """Pickle ``dfdb`` into the file named by ``PROCESSED_FILENAME``.

    The file is opened in binary write mode, so any existing content is
    replaced.
    """
    with open(PROCESSED_FILENAME, mode="wb") as sink:
        pickle.dump(dfdb, sink)
        
        

In [8]:
#nltk.download('all', halt_on_error = True)

#number of observations
size=len(df)
#number of observations to normalize in each process
# Ceiling division: round(size/processors) can round DOWN, in which case
# processors*iterSize < size and the trailing rows were silently never processed.
iterSize=-(-size // processors)
#holds the processes
processes=[]

print('To process: ' + str(size))

# Parent ends of the pipes, in the same order as the chunks were handed out.
parent_conn_holder=[]

columns2=columns1 + ['Clean_Review','Clean_Review_Tokens']

# Empty result frame: defines the column layout and is the result when there is
# nothing to process.
dffile=pd.DataFrame(columns=columns2)

print(dffile.columns)

for i in range(0,processors):
    start=i*iterSize
    if start >= size:
        # every row is already assigned; do not start workers on empty chunks
        break
    stop=min(start+iterSize, size)
    #split df for parallel proc
    dflist=df.iloc[start:stop]
    # creating a pipe
    parent_conn, child_conn = multiprocessing.Pipe()
    p = Process(target=utils.multiprocNormalize, args=(dflist,child_conn,"proc"+str(i)))
    processes.append(p)
    p.start()
    parent_conn_holder.append(parent_conn)

# Results are received BEFORE the workers are joined, so a worker is never left
# waiting on its pipe while the parent waits for it to exit.
cleaned_parts=[]
for parent_conn in parent_conn_holder:
    # assumes each worker sends back one DataFrame with a 'Clean_Review' column -- TODO confirm in utils
    dfdb=parent_conn.recv()
    #take out leading and trailing blanks
    clean_text=dfdb['Clean_Review'].astype(str).str.strip()
    # Missing and blank reviews both become NaN. The mask is needed because
    # astype(str) turns a missing value into ordinary text that dropna would keep.
    dfdb['Clean_Review']=clean_text.mask(dfdb['Clean_Review'].isna() | (clean_text == ''))
    #now drop all nulls
    dfdb=dfdb.dropna(subset=['Clean_Review']).reset_index(drop=True)
    cleaned_parts.append(dfdb)

if cleaned_parts:
    # Single concat instead of growing the frame inside the loop.
    dffile=pd.concat(cleaned_parts, axis=0)
    # Same layout as before: the declared columns first, any extra worker columns after.
    ordered_columns=columns2 + [c for c in dffile.columns if c not in columns2]
    dffile=dffile.reindex(columns=ordered_columns)

for p in processes:
    p.join()

To process: 800


In [None]:
# Size of the processed dataset, followed by a preview that includes Clean_Review.
print(f'Total Rows on processed dataset: {len(dffile)}')
print('Sample of processed dataset. Notice the column named Clean_Review')
dffile.head(20)

In [None]:
# Persist the processed frame to PROCESSED_FILENAME.
writeToDisk(dffile)

In [9]:
# Visible marker that execution reached this cell.
print('done')

done


In [10]:

# Read the pickle back as a sanity check. The file was written by this notebook;
# pickle.load must not be pointed at files from an untrusted source.
with open(PROCESSED_FILENAME, mode="rb") as pickled:
    dfread = pickle.load(pickled)
print(f'Total Rows on processed dataset: {len(dfread)}')
    

Total Rows on processed dataset: 800
Sample of processed dataset. Notice the column named Clean_Review
