#### DLS Project - Recommendation System using GNN
1. Data Processing Script
2. Read the raw datasets
3. Merge metadata and review data files for modeling

In [1]:
import os
import pandas as pd
import gzip
import json
import numpy as np

**Reading Meta Data File**

In [2]:
'''Step1 : Load and clean meta Data file'''
# Files in this notebook are opened by relative name, so move into the data folder first.
os.chdir(r'/N/project/APRS/') # Change path to data
metaRawData = []
# Each line of the gzip archive is decoded as its own JSON record.
with gzip.open('meta_Books.json.gz') as meta_file:
    metaRawData.extend(json.loads(raw_line.strip()) for raw_line in meta_file)

In [3]:
# Build the metadata frame: one row per parsed record, first occurrence of each
# 'asin' kept, rows with a null 'category' removed.
metaData = (
    pd.DataFrame(metaRawData)
    .drop_duplicates(subset='asin', keep='first')
    .dropna(subset=['category'])  # no nulls
)
# Cells equal to the string '[ ]' become NaN; the frame keeps its shape.
# NOTE(review): this masks individual cells and never drops rows. If the goal
# was to remove records with an empty category, this does not do it — confirm intent.
metaData = metaData.where(metaData != '[ ]')

In [5]:
# Extracting last value in category field.
# The category value is stringified and split on commas; the last piece still
# carries a closing bracket and quote characters, which are stripped afterwards.
# NOTE(review): splitting the string form on ',' also splits any category name
# that itself contains a comma, and the piece keeps whatever whitespace followed
# the comma — confirm that is acceptable for downstream use.
metaData['categoryLast'] = (
    metaData['category']
    .astype(str)
    .str.rsplit(',')
    .str[-1]
    # Raw string: '\]' in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    .replace(r'\]', '', regex=True)
    .replace("'", '', regex=True)
)

In [6]:
# Remove any opening bracket still present in categoryLast (e.g. when the
# stringified category contained no comma, so the split left it whole).
# Raw string: a plain '\[' literal is an invalid escape sequence and warns on current Python.
metaData['categoryLast']=metaData['categoryLast'].replace(r'\[', '',regex=True)

In [None]:
# Save file
# metaDataFilter is a second name for the same frame (no copy is made).
metaDataFilter = metaData
metaData.to_pickle('1_metaData.pkl') # for future use

##### Reading Review Data File

In [8]:
'''Step2 : Load and clean Review Data file and merge with meta data'''

# Upper bound on the number of input lines read, to avoid memory issues.
MAX_REVIEW_LINES = 40000000
# Only reviews whose 'reviewTime' ends with one of these suffixes are kept.
REVIEW_YEAR_SUFFIXES = (', 2016', ', 2017', ', 2018')

reviewRawData = []
with gzip.open('Books_5.json.gz') as review_file:
    # enumerate(start=1) gives the 1-based line number without a manual counter.
    for line_no, raw_line in enumerate(review_file, start=1):
        review = json.loads(raw_line.strip())
        if review['reviewTime'].endswith(REVIEW_YEAR_SUFFIXES):
            reviewRawData.append(review)
        if line_no == MAX_REVIEW_LINES:  # limit to avoid memory issue
            break
# Number of reviews kept after the year filter (one list entry per review).
print(len(reviewRawData))

# convert list into pandas dataframe and drop rows without an 'asin'
reviewData = pd.DataFrame.from_dict(reviewRawData)
reviewData = reviewData.dropna(subset=['asin'])

# Inner join on 'asin': reviews with no matching metadata row are discarded.
mergedData = pd.merge(reviewData, metaDataFilter, how='inner', on='asin')

print(len(mergedData))
mergedData.to_pickle('2_mergedData_final.pkl')

9824156
9823656


In [9]:
# Sample check: display the first 100 merged rows
mergedData.head(100)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,...,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,categoryLast
0,5.0,True,"06 20, 2016",AVP0HXC9FG790,0001713353,Amazon Customer,The kids loved it!,Five Stars,1466380800,,...,"1,461,315 in Books (","[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction
1,5.0,True,"01 24, 2016",A324TTUBKTN73A,0001713353,Tekla Borner,My students (3 & 4 year olds) loved this book!...,Five Stars,1453593600,{'Format:': ' Paperback'},...,"1,461,315 in Books (","[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction
2,5.0,True,"10 6, 2017",A13WB96L3AKB6U,0001713353,C. Morgan,This is a family favorite.\nIt has been one of...,ALL TIME FAVORITE!!,1507248000,{'Format:': ' Hardcover'},...,"1,461,315 in Books (","[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction
3,5.0,False,"06 2, 2017",A26ZH70F6R1RXI,0001713353,Kindle_mom,Very fun book!,Loved it!,1496361600,,...,"1,461,315 in Books (","[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction
4,5.0,True,"05 8, 2017",A1EE8B7JW9Q4LC,0001713353,Michelle A. Harris,The Book is in pristine condition. I had no i...,Fabulous! Better than Expected,1494201600,,...,"1,461,315 in Books (","[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,True,"07 17, 2017",A2L4BBNT6DXHWM,0001384198,L barb,BEST classic,Five Stars,1500249600,{'Format:': ' Hardcover'},...,"3,286,778 in Books (","[0448405202, B000OKY9X0, 0399244670]",Books,,,$16.04,[],[],,Literature & Fiction
96,3.0,True,"06 20, 2017",AGIEJEH5K1WMT,0001384198,Sara,"I didn't realize this is a mini booklet , when...",Interesting !!,1497916800,{'Format:': ' Hardcover'},...,"3,286,778 in Books (","[0448405202, B000OKY9X0, 0399244670]",Books,,,$16.04,[],[],,Literature & Fiction
97,5.0,True,"06 16, 2017",A3EY4VI8K6JLUU,0001384198,tres,What can one say bad about the little engine t...,"He could, you should!",1497571200,{'Format:': ' Hardcover'},...,"3,286,778 in Books (","[0448405202, B000OKY9X0, 0399244670]",Books,,,$16.04,[],[],,Literature & Fiction
98,5.0,False,"06 13, 2017",A34TPFLDV1ZG70,0001384198,Sandi,Perfect for our needs.,Five Stars,1497312000,{'Format:': ' Hardcover'},...,"3,286,778 in Books (","[0448405202, B000OKY9X0, 0399244670]",Books,,,$16.04,[],[],,Literature & Fiction


In [10]:
# Split into year for initial model building.
# One subset per year, selected by a case-insensitive match of the year text
# anywhere inside 'reviewTime'.
review_time = mergedData['reviewTime']
data_16, data_17, data_18 = (
    mergedData.loc[review_time.str.contains(year_text, case=False)]
    for year_text in ("2016", "2017", "2018")
)
print(len(data_16), len(data_17), len(data_18))

4494142 3933994 1395520


##### Data exploration

In [11]:
# Book Avg. Rating
# NOTE: mergedData1 is the same object as mergedData (no copy is made), so the
# 'finalreview' column added below also appears on mergedData.
mergedData1=mergedData
# Mean 'overall' per 'asin', broadcast back onto every review row and rounded
# to one decimal. Series.round is used instead of np.round_, which was removed
# in NumPy 2.0; the result is the same.
mergedData1['finalreview']=(
    mergedData1.groupby(['asin'])['overall'].transform('mean').round(decimals=1)
)
finalData=mergedData1[["asin","reviewerID","finalreview","reviewTime"]]
print(len(finalData))
mergedData1.to_pickle('total_data_avg.pkl')
# Preview rows 1 through 9 (the slice starts at 1, so row 0 is not shown).
mergedData1[1:10]

9823656


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,...,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,categoryLast,finalreview
1,5.0,True,"01 24, 2016",A324TTUBKTN73A,1713353,Tekla Borner,My students (3 & 4 year olds) loved this book!...,Five Stars,1453593600,{'Format:': ' Paperback'},...,"[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction,5.0
2,5.0,True,"10 6, 2017",A13WB96L3AKB6U,1713353,C. Morgan,This is a family favorite.\nIt has been one of...,ALL TIME FAVORITE!!,1507248000,{'Format:': ' Hardcover'},...,"[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction,5.0
3,5.0,False,"06 2, 2017",A26ZH70F6R1RXI,1713353,Kindle_mom,Very fun book!,Loved it!,1496361600,,...,"[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction,5.0
4,5.0,True,"05 8, 2017",A1EE8B7JW9Q4LC,1713353,Michelle A. Harris,The Book is in pristine condition. I had no i...,Fabulous! Better than Expected,1494201600,,...,"[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction,5.0
5,5.0,True,"12 10, 2016",ARBC5VQWIPRMR,1713353,Debbie Lampert,This book is eternal. I read it to my children...,An Eternal Story,1481328000,{'Format:': ' Hardcover'},...,"[0394800230, B01K8ZBL54, 0140564349, B000X6VEW...",Books,,,$12.33,[],[],,Literature & Fiction,5.0
6,5.0,True,"10 6, 2016",A2CVLIZ9ELU88,1061240,A.M.H.,I had this book as a child and loved it. The c...,A wonderful poetry collection with superior co...,1475712000,{'Format:': ' Hardcover'},...,"[B001HDIKYW, B000LTQDP6, 0307168514, 039485010...",Books,,,$23.98,[],[],,Literature & Fiction,5.0
7,5.0,False,"09 16, 2016",A2LGACKSC0MALY,1061240,Sharon Katz,My aunt bought me this book in 1962 - I was te...,The Best Poetry Books For Children And Adults,1473984000,{'Format:': ' Hardcover'},...,"[B001HDIKYW, B000LTQDP6, 0307168514, 039485010...",Books,,,$23.98,[],[],,Literature & Fiction,5.0
8,5.0,False,"01 7, 2016",A6EQG0P75KHJ,1061240,Bookworm 93103,"I agree with another reviewer, that every home...","Enchanting poetry and illustrations, a true cl...",1452124800,,...,"[B001HDIKYW, B000LTQDP6, 0307168514, 039485010...",Books,,,$23.98,[],[],,Literature & Fiction,5.0
9,5.0,True,"04 15, 2018",A37NYSAZ4SSO1,1061240,Amazon Customer,My mother gave me my copy of this when I was o...,Sure to become a family heirloom!,1523750400,{'Format:': ' Hardcover'},...,"[B001HDIKYW, B000LTQDP6, 0307168514, 039485010...",Books,,,$23.98,[],[],,Literature & Fiction,5.0
