## Classify Reviews

In [1]:
######importing libraries for data manipulation#######
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import math
from PIL import Image
import requests
from io import BytesIO
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
###############Function definitions###########

#########Function definitions for separating related feature########
def change_vals_new_col(s,value,new_cols):
    """Append one product's value for sub-field ``s`` to its accumulator list.

    Parameters
    ----------
    s : str
        Name of the sub-field to read from ``value``; also the key of the
        accumulator list inside ``new_cols``.
    value : dict
        The ``related`` entry of a single product.
    new_cols : dict
        Maps each sub-field name to the list being built; ``new_cols[s]``
        is extended in place by exactly one element.

    The element appended is ``value[s]`` when it is present and not NaN,
    otherwise ``np.nan``.
    """
    entry = value.get(s)

    # A missing key, an explicit None and a NaN placeholder all mean "no data".
    # The NaN case previously *replaced* the accumulator list with a bare float
    # (``new_cols[s] = np.nan``), discarding every value gathered so far and
    # making the next ``.append`` fail; it must append like the other cases so
    # all columns keep one element per product.
    if entry is None or (isinstance(entry, float) and np.isnan(entry)):
        new_cols[s].append(np.nan)
    else:
        new_cols[s].append(entry)
    

def generate_new_cols(related):
    """Build one list per ``related`` sub-field, with one element per entry.

    ``related`` is a mapping whose values are either a float NaN (no related
    information at all) or a dict of sub-fields.  The result is a dict with
    the keys 'also_bought', 'also_viewed', 'bought_together' and
    'buy_after_viewing'; every list receives one element for each entry of
    ``related``, in iteration order.
    """
    sub_fields = ('also_bought', 'also_viewed', 'bought_together', 'buy_after_viewing')
    new_cols = {field: [] for field in sub_fields}

    for entry in related.values():
        if type(entry) is float and np.isnan(entry):
            # Nothing recorded for this product: pad every column with NaN.
            for field in sub_fields:
                new_cols[field].append(np.nan)
        else:
            # Delegate the per-field lookup to the shared helper.
            for field in sub_fields:
                change_vals_new_col(field, entry, new_cols)

    return new_cols
#####Function definitions for separating related feature ends####

def plot_related_prods(related,which,final_metadata):
    """Draw a grid of product images for the related products found in the metadata.

    Parameters
    ----------
    related : sequence of str or None
        Product ids (ASINs) related to the current product.  ``None`` means
        no related products are recorded.
    which : str
        Relation label used in the printed messages and the figure title
        (the caller in this notebook passes 'bought' or 'viewed').
    final_metadata : pandas.DataFrame
        Product metadata indexed by ASIN; the 'imUrl' and 'title' columns
        are read for every product that is drawn.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; when there is nothing to
        draw a one-line message is printed instead.

    Notes
    -----
    Each image is downloaded with ``requests``, so this needs network access.
    """
    if related is None:
        # The message used to interpolate `related` (always None here), which
        # printed "People whoNonethis product ..."; report the relation instead.
        print('No "also ' + str(which) + '" products are listed for this product.')
        return

    # Only products that exist in the metadata have an image and a title.
    available = [asin for asin in related if asin in final_metadata.index]
    if not available:
        print('None of the "also ' + str(which) + '" products are present in the metadata.')
        return

    # Smallest near-square grid that holds every available product exactly
    # once.  The old round(n/2) x round(n/2) grid indexed `related[i+j]`,
    # which repeated some products, skipped others and failed for n < 3.
    n_cols = math.ceil(math.sqrt(len(available)))
    n_rows = math.ceil(len(available) / n_cols)

    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    f, axes = plt.subplots(n_rows, n_cols, figsize=(4,4), dpi=300, squeeze=False)
    f.suptitle('People also '+str(which))

    cells = axes.ravel()
    for ax in cells:
        # Hide ticks and frames on every cell, including unused trailing ones
        # (plt.axis('off') only affected the most recently created axes).
        ax.axis('off')

    for ax, curr_asin in zip(cells, available):
        product = final_metadata.loc[curr_asin]
        curr_title = product['title']
        curr_url = product['imUrl']

        # Titles and URLs can be missing (NaN) in the metadata.
        if isinstance(curr_title, str):
            ax.set_title(curr_title[0:30], size=3)
        if not isinstance(curr_url, str):
            continue

        # A timeout stops one unreachable image from hanging the whole cell.
        response = requests.get(curr_url, timeout=10)
        img = Image.open(BytesIO(response.content))
        ax.imshow(img)

    plt.show()

def Show_related_products(meta_data_row,final_metadata):
    """Display a product's image, then its 'also bought' and 'also viewed' grids.

    ``meta_data_row`` is one metadata row; its 'imUrl', 'title',
    'also_bought' and 'also_viewed' entries are read.  ``final_metadata`` is
    passed through unchanged to ``plot_related_prods``.  The product image is
    downloaded over the network.
    """
    image_url = meta_data_row['imUrl']
    product_title = meta_data_row['title']

    print('The current product is:',product_title)
    image_bytes = BytesIO(requests.get(image_url).content)
    plt.imshow(Image.open(image_bytes))
    plt.show()

    # (row entry to read, label handed to plot_related_prods), in display order.
    relations = (('also_bought', 'bought'), ('also_viewed', 'viewed'))
    for column, label in relations:
        related_ids = meta_data_row[column]
        if type(related_ids) == float and np.isnan(related_ids):
            # NaN marks "no related products"; the plotting helper expects None.
            related_ids = None
        elif len(related_ids) > 9:
            # Cap the list at nine products.
            related_ids = related_ids[0:9]
        plot_related_prods(related_ids, label, final_metadata)


    

### Read Data In

In [3]:
#####Reading Data#######
#####This is a smaller data for initial data exploration and model testing#####
######The data is about Health and Personal Care products on Amazon##########

def parse(path):
    """Yield one evaluated record for every non-blank line of the file at ``path``.

    Parameters
    ----------
    path : str
        File containing one Python-literal record per line.

    Yields
    ------
    object
        The result of evaluating each line (a dict per line for the dataset
        files loaded in this notebook).
    """
    # The context manager closes the file once the generator is exhausted or
    # closed; the handle was previously left open.
    with open(path, 'rb') as g:
        for l in g:
            # Skip blank lines (e.g. a trailing newline), which eval rejects.
            if not l.strip():
                continue
            # SECURITY: eval executes whatever the line contains.  Only use
            # this on trusted dataset dumps; never on files from an
            # untrusted source.
            yield eval(l)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their position in the file (0, 1, 2, ...), and that
    position becomes the row label of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Directory holding both dataset files.  It is machine-specific: change this
# single constant to run the notebook somewhere else.
DATA_DIR = '/Users/ruchinpatel/Desktop/USC_EVERYTHING/SPRINGBOARD/CAPSTONE'
data_path = DATA_DIR + '/Health_and_Personal_Care_5.json'
metadata_path = DATA_DIR + '/meta_Health_and_Personal_Care.json'

# One row per parsed record: `data` holds the reviews, `metadata` the products.
data = getDF(data_path)
metadata = getDF(metadata_path)

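* As seen from the metadata table, the `related` field is a nested dictionary rather than a single value. Its sub-fields include **`also_bought`** and **`also_viewed`** (the code below also extracts **`bought_together`** and **`buy_after_viewing`**), so we split it into a separate column for each sub-field.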

In [4]:
########## Generating separate columns for the related feature ######
# Series.to_dict() maps every row label of `metadata` to that row's `related` value.
related = metadata['related'].to_dict()
# generate_new_cols returns one list per sub-field; each list becomes a column here.
newly_created_columns = pd.DataFrame(generate_new_cols(related))

In [5]:
#########Final Metadata dataframe###########
# Attach the split-out columns, drop the nested original, and index by product id.
final_metadata = (
    pd.concat([metadata, newly_created_columns], axis=1)
    .drop(columns='related')
    .set_index('asin')
)

######converting the dates to date time format#####
# unixReviewTime is interpreted as seconds since the epoch; reviewTime is parsed as a date string.
data['unixReviewTime'] = pd.to_datetime(data['unixReviewTime'], unit='s')
data['reviewTime'] = pd.to_datetime(data['reviewTime'])

In [6]:
# Preview the first five rows of the reviews frame.
data.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5.0,Handy little gadget,2011-01-05,2011-01-05
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4.0,Small & may need to encourage battery,2012-02-18,2012-02-18
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4.0,Very good but not great,2010-06-08,2010-06-08
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4.0,great addition to your purse,2008-02-08,2008-02-08
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5.0,Very nice and convenient.,2011-08-16,2011-08-16


In [7]:
# Preview the first five rows of the product metadata, including the columns split out of `related`.
final_metadata.head(5)

Unnamed: 0_level_0,description,title,imUrl,salesRank,categories,price,brand,also_bought,also_viewed,bought_together,buy_after_viewing
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
77614992,This is an example product description.,Principles of Mgmt + Oper-CSUF Custom C,http://ecx.images-amazon.com/images/I/51G%2BRq...,{'Health & Personal Care': 168429},[[Health & Personal Care]],,,"[0471730726, 0132834871, 0471391905, B00000JZK...","[0073525057, 1133227295, 0324628676, 0073523224]",,
615208479,By now we all know the benefits of exercise fo...,Brain Fitness Exercises Software,http://ecx.images-amazon.com/images/I/41kbZB04...,{'Health & Personal Care': 1346973},"[[Health & Personal Care, Personal Care]]",,,,,,
615269990,What's wrong with your patient?Do all the symp...,Occam's Razor,http://ecx.images-amazon.com/images/I/51fH-ABe...,{'Toys & Games': 110575},"[[Health & Personal Care, Personal Care, Shavi...",34.99,,"[1935660152, 0071743979, 0071831428, 032308787...","[1594741476, B0069628EU, B009RTGX2Y, B000IQHSL...",,
615315860,,101 BlenderBottle Recipes Quick and Easy,http://ecx.images-amazon.com/images/I/21zOQu2Q...,{'Health & Personal Care': 254068},[[Health & Personal Care]],,,"[B006VT9RBM, B0010JLMO8, B001CXC69C, B0064QSHX...",[B0018G4ZEW],,
615406394,This is an example product description.,"Aphrodite Reborn - Women's Stories of Hope, Co...",http://ecx.images-amazon.com/images/I/51rJLgsi...,{'Health & Personal Care': 377936},[[Health & Personal Care]],,,"[0966035232, 1421407205]",,,


In [8]:
# Keep only the review text and the `overall` rating; the cells below use just these two columns.
reviews_data = data[['reviewText','overall']]
# Print (rows, columns) of the selection.
print(reviews_data.shape)

(346355, 2)


In [None]:
# Work on the first 1000 reviews only.
reviews_data_temp = reviews_data.head(1000)
# Plain-text dump of the first 100 rows of that sample.
print(reviews_data_temp.head(100))
# NOTE(review): the only non-key column here is `reviewText`, so .sum() would
# concatenate the review strings of each rating rather than count them.  If the
# goal is the number of reviews per rating, .size() or .count() is presumably
# what was meant; confirm the intent.  This cell has no execution count.
reviews_data_temp.groupby(by='overall').sum()

In [37]:
#####Tokenizing our text data####
# Word-level bag of words: English stop words removed, terms appearing in fewer
# than 1% of the reviews dropped (min_df=0.01), and binary=True records
# presence/absence instead of counts.
count_vect = CountVectorizer(analyzer = 'word',stop_words = 'english',min_df=0.01,binary=True)
# fit_transform returns a sparse document-term matrix (one row per review).
review_text_tokenized = count_vect.fit_transform(reviews_data['reviewText'])
print(review_text_tokenized.shape)
# NOTE(review): .toarray() materialises the whole matrix densely; at the
# recorded shape (346355, 625) that is a very large allocation.  The name is
# also rebound from the sparse matrix to a DataFrame whose columns are plain
# integer positions, not the vocabulary terms.
review_text_tokenized = pd.DataFrame(review_text_tokenized.toarray())

(346355, 625)


In [36]:
# Preview the first ten rows.  A negative n makes head() return every row
# except the last |n|, which on the full matrix is almost the entire frame.
review_text_tokenized.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,0,0,0,1,0,1,...,0,0,1,1,0,0,0,0,0,0
2,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,1,0,0,1,1,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,1,1,0,0,1,...,0,0,0,1,1,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
