In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Load the Reviews Data

In [None]:
# Load the raw reviews. A pickled copy is kept next to the notebook so the
# line-delimited JSON export does not have to be re-parsed on every run.
# NOTE(review): read_pickle can execute arbitrary code while loading; only load
# a 'reviews_raw.pkl' that was produced by this notebook.
try:
    reviews_df_raw = pd.read_pickle('reviews_raw.pkl')
except FileNotFoundError:
    # No cache yet (fresh checkout): parse the JSON once, then write the cache
    # so that a later Restart & Run All takes the fast path above.
    reviews_df_raw = pd.read_json("../Data/amazon_reviews.json", lines=True)
    reviews_df_raw.to_pickle('reviews_raw.pkl')


Look at NaNs in the Data

In [11]:
# Per-column missing-value summary: absolute NaN count and share of rows (in %).
missing_counts = reviews_df_raw.isna().sum()
missing_percentages = reviews_df_raw.isna().mean() * 100

# Build the table in one chain: keep only columns that actually contain NaNs
# and list the worst-affected columns first.
missing_df = (
    pd.DataFrame({'Missing Count': missing_counts,
                  'Missing Percentage': missing_percentages})
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values(by='Missing Percentage', ascending=False)
)

print("Missing Values Overview:", missing_df, sep="\n")

Missing Values Overview:
              Missing Count  Missing Percentage
image               7999253           97.537223
vote                7239598           88.274529
style               6211781           75.742056
reviewText             7117            0.086780
summary                2531            0.030861
reviewerName            502            0.006121


In [13]:
# Discard reviews that lack a summary, review text, or reviewer name.
required_text_columns = ['summary', 'reviewText', 'reviewerName']
reviews_df_raw = reviews_df_raw.dropna(subset=required_text_columns)

Drop Columns

In [14]:
# Columns removed from the working frame:
#   - image, vote, style: dropped based on the missing-value overview
#   - reviewTime: redundant with unixReviewTime, which (per the original note)
#     carries the same values in a format that is easier to work with
columns_to_drop = ['image', 'vote', 'style', 'reviewTime']
reviews_df_raw = reviews_df_raw.drop(columns=columns_to_drop)

Look at Duplicates in the Data
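- For now, we keep the duplicates. Note that the check below flags every row that shares reviewer, product, timestamp, and rating with another row; the review text is not compared.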

In [15]:
# Flag reviews that share reviewer, product, timestamp and rating with at least
# one other row. keep=False marks every member of such a group, and reviewText
# is not part of the key, so rows with different text can be flagged together.
duplicate_key = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
duplicates = reviews_df_raw.duplicated(subset=duplicate_key, keep=False)
print(f"Number of duplicate reviews: {duplicates.sum()}")

# Display the flagged rows: the key columns plus the review text for inspection.
duplicate_reviews = reviews_df_raw.loc[duplicates]
duplicate_reviews[duplicate_key + ['reviewText']]

Number of duplicate reviews: 391975


Unnamed: 0,reviewerID,asin,unixReviewTime,overall,reviewText
1144,ABWSQI9992Q29,0786955554,1474761600,5,nice
1145,ABWSQI9992Q29,0786955554,1474761600,5,nice
1154,A115TZEJ3U3AER,0786955554,1462924800,5,On-time and nice item.
1155,A115TZEJ3U3AER,0786955554,1462924800,5,On-time and nice item.
1266,AN82BJBT7QNT7,0786955554,1309046400,4,This set is slightly worse than earlier ones (...
...,...,...,...,...,...
8189354,A5FVEFQ55OHRS,B01H25XJ76,1500508800,5,She is happy
8189355,A5FVEFQ55OHRS,B01H25XJ76,1500508800,5,everyone is happpy
8189356,A5FVEFQ55OHRS,B01H25XJ76,1500508800,5,The 6 year old loves it
8199065,AY6VLC56K8CVX,B01HGKDE22,1480982400,5,very pleased with promptness and quality


Convert unixReviewTime column to Datetime Objects

In [16]:
# Interpret unixReviewTime as seconds since the epoch and store it as a datetime column.
review_epoch_seconds = reviews_df_raw['unixReviewTime']
reviews_df_raw['date'] = pd.to_datetime(review_epoch_seconds, unit='s')

Features Related to Overall

In [None]:
# Per-product rating aggregates, computed with simple groupby statistics.
# transform() broadcasts each product's statistic back onto every review row,
# so each review carries the aggregates of its own `asin`.
ratings_by_product = reviews_df_raw.groupby('asin')['overall']

reviews_df_raw['avg_rating'] = ratings_by_product.transform('mean')
reviews_df_raw['min_rating'] = ratings_by_product.transform('min')
# Fix: the maximum was previously written into 'min_rating', which overwrote
# the minimum and left the frame without a 'max_rating' column.
reviews_df_raw['max_rating'] = ratings_by_product.transform('max')
reviews_df_raw['num_of_rating'] = ratings_by_product.transform('count')

In [19]:
# Sentiment flags derived from the star rating:
#   positive -> rating >= 4
#   negative -> rating <= 2
reviews_df_raw['is_positive'] = reviews_df_raw['overall'].ge(4).astype(int)
reviews_df_raw['is_negative'] = reviews_df_raw['overall'].le(2).astype(int)

# Per-product mean of each 0/1 flag, broadcast back to the review rows.
# Note: the values are fractions in [0, 1], despite the 'percent_' prefix.
flag_share_by_product = (
    reviews_df_raw
    .groupby('asin')[['is_positive', 'is_negative']]
    .transform('mean')
)
reviews_df_raw['percent_positive'] = flag_share_by_product['is_positive']
reviews_df_raw['percent_negative'] = flag_share_by_product['is_negative']

Features Related to Verified
- This does not seem like a great feature to use because the `verified` flag is so imbalanced.

In [20]:
# Share of verified reviews per product, broadcast back onto every review row.
# Fix: a plain groupby(...).mean() returns one value per asin, indexed by asin.
# Column assignment aligns on the frame's row index, so that result does not
# land on the matching rows. transform('mean') returns a Series that keeps the
# original row index, consistent with the other per-product features.
reviews_df_raw['avg_verified_reviewers'] = (
    reviews_df_raw.groupby('asin')['verified'].transform('mean')
)

Features Related to UnixReviewTime