### Load and Clean Data

In [None]:
import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from collections import defaultdict
from itertools import groupby
import seaborn as sns

def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzipped file holding one JSON document per line.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.
    """
    # The context manager closes the gzip handle when the generator is
    # exhausted (or garbage-collected); the bare gzip.open() never did.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)


def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read and that
    number becomes the row index.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [None]:
# Gzipped JSON-lines review file; later cells also derive output file names
# from this path.
path = 'data/Office_Products_5.json.gz'
# Read every record into a single DataFrame.
df = getDF(path)

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Preview the first three rows.
df[:3]

At the moment, we aren't using **'image'** and **'style'**

In [None]:
# 'image' is not used in this analysis. Name the axis explicitly: the
# positional form drop('image', 1) is rejected by pandas >= 2.0.
df = df.drop(columns='image')
# Missing votes become 0. Assign the result back instead of calling
# fillna(inplace=True) on the selected column, which is a chained in-place
# update that pandas deprecates and does not guarantee to reach df.
df['vote'] = df['vote'].fillna(0)

In [None]:
# Work on a copy without 'style' and without exact duplicate rows.
# drop(columns=...) replaces the positional axis argument, which pandas >= 2.0
# no longer accepts.
uniDf = df.drop(columns='style').drop_duplicates()
uniDf.shape

In [None]:
# Shape after keeping one row per (reviewerID, asin) pair; the result is only
# displayed, uniDf itself is not modified.
uniDf.drop_duplicates(subset=['reviewerID', 'asin']).shape

In [None]:
# Votes may contain thousands separators (e.g. '1,234'): strip the commas and
# convert the column to numbers.
uniDf['vote'] = pd.to_numeric(
    uniDf['vote'].astype(str).str.replace(',', '', regex=False)
)
# Missing review bodies become empty strings. Assign back rather than using a
# chained fillna(inplace=True), which pandas deprecates and may apply to a
# temporary copy.
uniDf['reviewText'] = uniDf['reviewText'].fillna('')

In [None]:
# Keep reviews of at most 1000 characters, then renumber the rows.
# The original called reset_index() without storing the result (a no-op) and
# did so before filtering, so the index was never actually reset.
uniDf = uniDf[uniDf['reviewText'].str.len() <= 1000].reset_index(drop=True)

### Rating Histogram

In [None]:
# Pull the columns used below into plain Python lists (same row order).
ratings, votes, items, verified, reviews = (
    list(uniDf[column])
    for column in ('overall', 'vote', 'asin', 'verified', 'reviewText')
)

# Tallies indexed by star rating; slot 0 is unused so index == rating.
count_r, count_verified, count_non_verified, count_r_items = (
    [0] * 6 for _ in range(4)
)
# asin -> list of ratings for that item.
r_items = defaultdict(list)
# Seven review-length buckets.
count_review_length = [0] * 7

In [None]:
item_rating = []
# Group ratings by item. Ratings are cast to int because np.bincount refuses
# float input (the dataset stores stars as 1.0 ... 5.0).
# Both containers are rebuilt here so re-running the cell does not accumulate.
r_items = defaultdict(list)
for item, rating in zip(items, ratings):
    r_items[item].append(int(rating))

# For every item take its most frequent rating (ties go to the lower rating,
# per np.argmax) and count how many items fall on each star value.
count_r_items = [0] * 6
for item_ratings in r_items.values():
    modal_rating = int(np.argmax(np.bincount(item_ratings)))
    count_r_items[modal_rating] += 1
# Drop the unused slot 0 so the list lines up with ratings 1..5.
count_r_items = count_r_items[1:]

In [None]:
zero_votes, non_zero_votes = [], []
verified_rating, non_verified_rating = [], []
review_length = []

# One pass over all reviews: tally ratings overall and split by the
# 'verified' flag, and record each review's character length.
for i, rating in enumerate(ratings):
    star = int(rating)
    # Explicit comparison kept on purpose: only a value equal to True counts
    # as verified.
    if verified[i] == True:
        tally, bucket = count_verified, verified_rating
    else:
        tally, bucket = count_non_verified, non_verified_rating
    tally[star] += 1
    bucket.append(rating)
    count_r[star] += 1
    review_length.append(len(str(reviews[i])))

In [None]:
# Find the longest review, then turn it into the width of one of 7 equal
# length buckets (max_len is reused as the bucket width from here on).
max_len = 0
for review in reviews:
    if not isinstance(review, str):
        # Report unexpected non-text entries (e.g. NaN floats) and skip them;
        # previously they were printed and then crashed on len().
        print(review)
        continue
    max_len = max(max_len, len(review))
max_len = max_len / 7

In [None]:
# Histogram of review lengths over 7 buckets of width max_len.
# Each length is placed by its own bucket number rather than by the order in
# which non-empty groups appear, so an empty bucket no longer shifts later
# counts, and the longest review is clamped into the last bucket instead of
# indexing past the end of the list.
bin_width = max_len if max_len > 0 else 1  # avoid dividing by zero
count_review_length = [0] * 7
last_bucket = len(count_review_length) - 1
for n_chars in review_length:
    bucket = min(int(n_chars // bin_width), last_bucket)
    count_review_length[bucket] += 1
print(count_review_length)

In [None]:
# Show the per-rating tallies: all reviews, verified, non-verified.
for tally in (count_r, count_verified, count_non_verified):
    print(tally)

In [None]:
# Histogram of star ratings over all reviews; also saved to disk.
n, bins, patches = plt.hist(ratings, bins=50, facecolor='blue')
plt.title("rating histogram of all reviews ")
plt.xlabel("rating score")
plt.ylabel("number of reviews")
plt.xlim(0, 6)
plt.xticks(np.arange(6))
plt.savefig('rating_histogram')
plt.show()

In [None]:
# Histogram of star ratings for verified reviews only; also saved to disk.
n, bins, patches = plt.hist(verified_rating, bins=50, facecolor='blue')
plt.title("rating histogram of reviews, verified ")
plt.xlabel("rating score")
plt.ylabel("number of reviews")
plt.xlim(0, 6)
plt.xticks(np.arange(6))
plt.savefig('verified_rating_histogram')
plt.show()

In [None]:
# Histogram of star ratings for non-verified reviews only; also saved to disk.
n, bins, patches = plt.hist(non_verified_rating, bins=50, facecolor='blue')
plt.title("rating histogram of reviews, none_verified ")
plt.xlabel("rating score")
plt.ylabel("number of reviews")
plt.xlim(0, 6)
plt.xticks(np.arange(6))
plt.savefig('none_verified rating_histogram')
plt.show()

### Rating Percentage by Items

In [None]:
# Pie chart: share of items whose most common rating is 1..5 stars.
labels = [str(star) for star in range(1, 6)]
colors = ['gold', 'yellowgreen', 'lightcoral', 'orange', 'lightskyblue']
# Every slice is offset slightly; the last (5-star) slice is pulled out most.
explode = (0.1,) * 4 + (0.5,)

fig, ax = plt.subplots(figsize=[6, 6])
ax.pie(
    count_r_items,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
ax.set_title("rating percentage of items")

### Review Length Distribution

In [None]:
# Character count of every review, then one length histogram per star rating
# (5 bins spanning 0-1000 characters).
uniDf['reviewTextLength'] = uniDf['reviewText'].apply(len)
uniDf.groupby('overall')['reviewTextLength'].hist(bins=5, range=[0, 1000])

In [None]:
# Pie chart of the review-length buckets.
max_len = int(max_len)  # whole-number bucket width for readable labels
n_buckets = 7
labels = [
    '{}-{}'.format(max_len * k, max_len * (k + 1)) for k in range(n_buckets)
]
# One distinct colour per slice; with only five colours matplotlib cycled
# them, so slices 1/6 and 2/7 were indistinguishable.
colors = ['gold', 'yellowgreen', 'lightcoral', 'orange', 'lightskyblue',
          'plum', 'lightgray']
explode = (0.1,) * n_buckets  # offset every slice equally

fig = plt.figure(figsize=[6, 6])
ax = fig.add_subplot(111)
ax.pie(count_review_length, labels=labels, explode=explode, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=200)
ax.set_title("review length")

### Ratings vs Length of review

In [None]:
# Sample 500 rows (with replacement) and collect the ratings seen for each
# review length. The upper bound follows the actual frame size: the previous
# hard-coded 50000 raised IndexError on smaller frames and never sampled rows
# beyond that position on larger ones.
rating1 = {}
for i in np.random.randint(0, len(uniDf), size=500):
    row = uniDf.iloc[i]
    rating1.setdefault(len(row['reviewText']), []).append(row['overall'])

# Mean (integer) rating per review length.
averages = {
    n_chars: float(sum(int(r) for r in values)) / len(values)
    for n_chars, values in rating1.items()
}

In [None]:
# Draw a fresh 500-row sample (with replacement) of ratings keyed by review
# length. The bound is len(uniDf) rather than a hard-coded 50000, which could
# index past the end of the frame.
# NOTE(review): this repeats the sampling of the previous cell but does not
# recompute `averages` -- confirm whether the cell is still needed.
rating1 = {}
for i in np.random.randint(0, len(uniDf), size=500):
    row = uniDf.iloc[i]
    rating1.setdefault(len(row['reviewText']), []).append(row['overall'])

In [None]:
def sort_by_value(d):
    """Return the mapping's entries as ``[key, value]`` lists in sorted order.

    Despite the name, entries are ordered by key first (value only breaks
    ties), because the two-element lists are compared element by element.
    """
    pairs = [[key, value] for key, value in d.items()]
    return sorted(pairs)

# [length, average rating] pairs ordered by review length.
averages1 = sort_by_value(averages)

In [None]:
# Split the [length, average rating] pairs into x and y series.
length = [pair[0] for pair in averages1]
rating = [pair[1] for pair in averages1]

# Scatter plot of average rating against review length.
plt.style.use('ggplot')
plt.figure(figsize=(12, 6))
plt.scatter(length, rating)
plt.title('Ratings vs Length of review')
plt.xlabel('Length of review')
plt.ylabel('Rating')
plt.show()

In [None]:
lenAll = list(uniDf['reviewTextLength'])
# Rows: star rating (row 0 unused); columns: short / medium / long review.
count = np.zeros((6, 3), dtype=int)

for idx, stars in enumerate(ratings):
    n_chars = lenAll[idx]
    if n_chars <= 333:
        band = 0
    elif n_chars <= 666:
        band = 1
    else:
        band = 2
    count[int(stars), band] += 1

# Drop the unused row 0 so rows line up with ratings 1..5.
count = count[1:]

In [None]:
# Stacked bar chart: reviews per rating, split by length band.
# Stored under its own name: assigning this table to `df` replaced the raw
# review frame, so the later cell that reads df['style'] failed.
lengthCountDf = pd.DataFrame(
    count,
    index=['1', '2', '3', '4', '5'],
    columns=['0_333', '333_666', '666_1000'],
)
lengthCountDf.plot.bar(stacked=True)
plt.title('Ratings vs Count of long/medium/short of review')
plt.show()

### Vote Distribution on Rating

In [None]:
# Column types of the cleaned frame.
uniDf.dtypes

In [None]:
# Box plot of vote counts per star rating, ignoring reviews with no votes.
fig, ax = plt.subplots(figsize=(5, 5))
uniDf[uniDf['vote'] != 0].boxplot(column=['vote'], by='overall', ax=ax)
# Clear the automatic "Boxplot grouped by ..." super-title. This has to run
# after boxplot(), which sets that title; clearing it beforehand had no effect.
plt.suptitle('')

In [None]:
# Violin plot of vote counts per star rating, ignoring reviews with no votes.
voted = uniDf[uniDf['vote'] != 0]
sns.violinplot(x='overall', y='vote', data=voted)

### Extract and save each review and its rating

In [None]:
# Write (reviewerID, asin, overall) next to the input file as <base>.csv.
# str.strip('.json.gz') removes any of those *characters* from both ends
# rather than the suffix, which mangles names such as 'Video_Games.json.gz';
# cut the suffix explicitly instead.
suffix = '.json.gz'
base_path = path[:-len(suffix)] if path.endswith(suffix) else path
uniDf.to_csv(base_path + '.csv', columns=['reviewerID', 'asin', 'overall'], index=False)

### Obtain Styles and Save to JSON

In [None]:
# Collect, for every style attribute, the set of values seen in the data.
# Reads the 'style' column of df; entries that are not dicts are skipped.
styles = defaultdict(set)
for style in df['style'].values:
    if type(style) is dict:
        for key, value in style.items():
            # Attribute names lose any ':' characters.
            styles[key.replace(':', '')].add(value)

In [None]:
# Dump the collected style values to <base>_styles.json.
# The suffix is cut explicitly because str.strip('.json.gz') strips a set of
# characters, not the suffix. Values are sorted so the output is reproducible
# (set iteration order is arbitrary); key=str keeps sorting safe should the
# values ever be of mixed types.
suffix = '.json.gz'
base_path = path[:-len(suffix)] if path.endswith(suffix) else path
with open(base_path + '_styles.json', 'w') as fp:
    data = {key: sorted(styles[key], key=str) for key in styles}
    json.dump(data, fp, sort_keys=True, indent=4)

### Temporal Effects

Trend of number of reviews and average ratings over months

In [None]:
# A "month" here is a fixed 30-day span of seconds.
SECONDS_PER_MONTH = 30 * 24 * 60 * 60
# Month number of the earliest review; used as the origin.
minMonth = uniDf['unixReviewTime'].min() // SECONDS_PER_MONTH


def unixTimeToMonth(row):
    """Return months elapsed since the earliest review.

    Works for a single row as well as for a whole DataFrame, because it only
    indexes 'unixReviewTime' and applies arithmetic to the result.
    """
    return row['unixReviewTime'] // SECONDS_PER_MONTH - minMonth


# Apply to the whole column at once instead of row by row with
# DataFrame.apply(axis=1), which is much slower for the same result.
uniDf['normalizedMonth'] = unixTimeToMonth(uniDf)

print('Range of normalized months: {0} - {1}'.format(uniDf['normalizedMonth'].min(), uniDf['normalizedMonth'].max()))

In [None]:
# Histogram of review counts over normalized months.
ax = uniDf.hist(
    column='normalizedMonth',
    bins=25,
    grid=False,
    figsize=(8, 4),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
)
# hist() returns a grid of axes; label each one in the first row.
for month_ax in ax[0]:
    month_ax.set_title('Number of reviews versus months')
    month_ax.set_xlabel('Month normalized')
    month_ax.set_ylabel('Number of reviews')

In [None]:
# Mean rating per normalized month.
# Select 'overall' before aggregating: averaging the whole frame also tries
# the text columns, which raises TypeError on pandas >= 2.0 and is wasted
# work on older versions.
avgRatings = uniDf.groupby('normalizedMonth')['overall'].mean()
ax = avgRatings.plot(figsize=(8, 4))
ax.set_ylabel('Average rating')
ax.set_xlabel('Month normalized')
ax.set_title('Average rating versus months')

### Prepare Text for Sentiment Analysis

E.g.
Given fn.json with rating from 1 to 3,
generate fn_1.txt, fn_2.txt, fn_3.txt

In [None]:
import re
import string
import nltk

stemmer = nltk.stem.porter.PorterStemmer()
# Kept as a set so the per-token membership test in clean_str is O(1)
# instead of scanning a list for every word.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Cleaning options accepted by clean_str():
#   rm_punc:  remove punctuation
#   lower:    convert to lower case
#   stem:     apply stemming
#   rm_stop:  remove stop words
#
# Modify these options in extract_review_text()!
# Tokenisation rules in the style of Yoon Kim's CNN preprocessing, applied in
# order as (pattern, replacement). Replacements are plain text: the earlier
# " \( ", " \) " and " \? " forms wrote literal backslashes into the output.
_CLEAN_STR_RULES = (
    (r"[^A-Za-z0-9(),!?\'\`]", " "),
    (r"\'s", " 's"),
    (r"\'ve", " 've"),
    (r"n\'t", " n't"),
    (r"\'re", " 're"),
    (r"\'d", " 'd"),
    (r"\'ll", " 'll"),
    (r",", " , "),
    (r"!", " ! "),
    (r"\(", " ( "),
    (r"\)", " ) "),
    (r"\?", " ? "),
    (r"\s{2,}", " "),
)


def clean_str(s,
              rm_punc=False,
              lower=True,
              stem=False,
              rm_stop=False):
    """Normalise one review into space-separated tokens.

    Args:
        s: Raw review text.
        rm_punc: Remove every character in ``string.punctuation``.
        lower: Lower-case the result.
        stem: Stem each token with the module-level ``stemmer``.
        rm_stop: Drop tokens found in the module-level ``stopwords``.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    s = s.replace('\n', ' ')
    for pattern, replacement in _CLEAN_STR_RULES:
        s = re.sub(pattern, replacement, s)

    if rm_punc:
        # Python 3 form of the old str.translate(None, chars) call, which
        # raises TypeError on str objects.
        s = s.translate(str.maketrans('', '', string.punctuation))
        # Removing punctuation tokens leaves doubled spaces behind.
        s = re.sub(r"\s{2,}", " ", s)

    # Perform stemming / remove stop words
    if stem or rm_stop:
        s_clean = []
        for w in s.split():
            # Compare case-insensitively: lower-casing happens only at the
            # end, so "The" would otherwise slip past the stop-word list.
            if rm_stop and w.lower() in stopwords:
                continue
            s_clean.append(stemmer.stem(w) if stem else w)
        s = ' '.join(s_clean)

    return s.strip().lower() if lower else s.strip()


def extract_review_text(path, uniDf):
    """Write cleaned review texts to one file per rating value.

    For every distinct value ``r`` in ``uniDf['overall']`` a file named
    ``'<path>_<r>.txt'`` is (re)created, holding one ``clean_str``-processed
    review per line.

    Args:
        path: Prefix used to build the output file names.
        uniDf: DataFrame with 'overall' and 'reviewText' columns.
    """
    # [1.0, 2.0, 3.0, 4.0, 5.0]
    rating_vals = set(uniDf['overall'].tolist())
    print('Ratings are: {}'.format(rating_vals))

    # Process ratings in a fixed order, one file at a time; the context
    # manager closes each file even when cleaning or writing fails.
    for r in sorted(rating_vals):
        print('Extracting reviews for rating {}'.format(r))
        rated_reviews = uniDf.loc[uniDf['overall'] == r, 'reviewText']
        with open('%s_%s.txt' % (path, str(r)), 'w') as fp:
            fp.writelines(clean_str(review) + '\n' for review in rated_reviews)

    print('Finish extracting reviews!')

In [None]:
# Write the per-rating text files, using the input path as file-name prefix.
extract_review_text(path, uniDf)