In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Load the dataset

In [2]:
# Load the raw reviews. The JSON-lines file is parsed only when no pickle cache exists yet;
# afterwards the (much faster) pickle is reused, so the cell works on a fresh checkout too.
# NOTE: only unpickle files you created yourself -- pickle can execute arbitrary code on load.
from pathlib import Path

raw_json_path = Path("../Data/amazon_reviews.json")
raw_pickle_path = Path("reviews_raw.pkl")

if raw_pickle_path.exists():
    reviews_df_raw = pd.read_pickle(raw_pickle_path)
else:
    reviews_df_raw = pd.read_json(raw_json_path, lines=True)
    reviews_df_raw.to_pickle(raw_pickle_path)
reviews_df_raw

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image,style
0,2,12,False,"09 22, 2016",A1IDMI31WEANAF,0020232233,Mackenzie Kent,"When it comes to a DM's screen, the space on t...",The fact that 50% of this space is wasted on a...,1474502400,,
1,1,21,False,"09 18, 2016",A4BCEVVZ4Y3V3,0020232233,Jonathan Christian,An Open Letter to GaleForce9*:\n\nYour unpaint...,Another worthless Dungeon Master's screen from...,1474156800,,
2,3,19,True,"09 12, 2016",A2EZ9PY1IHHBX0,0020232233,unpreparedtodie,"Nice art, nice printing. Why two panels are f...","pretty, but also pretty useless",1473638400,,
3,5,,True,"03 2, 2017",A139PXTTC2LGHZ,0020232233,Ashley,Amazing buy! Bought it as a gift for our new d...,Five Stars,1488412800,,
4,1,3,True,"02 8, 2017",A3IB33V29XIL8O,0020232233,Oghma_EM,As my review of GF9's previous screens these w...,Money trap,1486512000,,
...,...,...,...,...,...,...,...,...,...,...,...,...
8201226,5,,False,"12 7, 2017",A3OCDEVI6FGUWU,B01HJBAKIO,wilson,My son is happy!!!,Five Stars,1512604800,,
8201227,5,2,True,"01 23, 2016",A1KTVUVADLKWZO,B01HJHA7GI,Raven the Maven,E My ten month old has had this for a few mont...,Fun for ten month old,1453507200,[https://images-na.ssl-images-amazon.com/image...,
8201228,5,,True,"02 8, 2015",A2QCA9OE62IPZ4,B01HJHA7GI,crhug,perfect toy to keep baby amused,Five Stars,1423353600,,
8201229,5,,True,"12 31, 2014",A3N28JAZYS4L9O,B01HJHA7GI,Cindy Volk,Perfect,Five Stars,1419984000,,


Column Title Meanings

In [3]:
# list the column names of the raw reviews frame
reviews_df_raw.columns

Index(['overall', 'vote', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'image',
       'style'],
      dtype='object')

In [4]:
# preview the first rows of the raw reviews frame
reviews_df_raw.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image,style
0,2,12.0,False,"09 22, 2016",A1IDMI31WEANAF,20232233,Mackenzie Kent,"When it comes to a DM's screen, the space on t...",The fact that 50% of this space is wasted on a...,1474502400,,
1,1,21.0,False,"09 18, 2016",A4BCEVVZ4Y3V3,20232233,Jonathan Christian,An Open Letter to GaleForce9*:\n\nYour unpaint...,Another worthless Dungeon Master's screen from...,1474156800,,
2,3,19.0,True,"09 12, 2016",A2EZ9PY1IHHBX0,20232233,unpreparedtodie,"Nice art, nice printing. Why two panels are f...","pretty, but also pretty useless",1473638400,,
3,5,,True,"03 2, 2017",A139PXTTC2LGHZ,20232233,Ashley,Amazing buy! Bought it as a gift for our new d...,Five Stars,1488412800,,
4,1,3.0,True,"02 8, 2017",A3IB33V29XIL8O,20232233,Oghma_EM,As my review of GF9's previous screens these w...,Money trap,1486512000,,


- overall - rating of the product
- vote - helpful votes of the review (so people that gave a thumbs up)
- verified - a boolean of whether the data was verified or not
- reviewTime - time of the review (raw)
- reviewerID - ID of the reviewer
- asin - ID of the product
- reviewerName - name of the reviewer
- reviewText - text of the review
- summary - review title
- unixReviewTime - time of the review (unix time)
- image - images that users post after they have received the product
- style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
- summary - summary of the review

### Begin EDA

Look at the missing (NaN) values in the data

In [5]:
# Missing-value overview: per-column NaN count and percentage, limited to columns
# that actually have gaps and ordered from most to least affected
na_mask = reviews_df_raw.isna()
missing_counts = na_mask.sum()
missing_percentages = na_mask.mean() * 100
missing_df = pd.DataFrame(
    {'Missing Count': missing_counts, 'Missing Percentage': missing_percentages}
)
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='Missing Percentage', ascending=False)
print("Missing Values Overview:")
print(missing_df)

Missing Values Overview:
              Missing Count  Missing Percentage
image               7999253           97.537223
vote                7239598           88.274529
style               6211781           75.742056
reviewText             7117            0.086780
summary                2531            0.030861
reviewerName            502            0.006121


Duplicates in the dataset

In [None]:
# Reviews count as duplicates when reviewer, product, review date and rating all match
duplicate_key_cols = ['reviewerID', 'asin', 'reviewTime', 'overall']
# keep=False flags every member of a duplicate group, not only the later repeats
duplicates = reviews_df_raw.duplicated(subset=duplicate_key_cols, keep=False)
print(f"Number of duplicate reviews: {duplicates.sum()}")
duplicate_reviews = reviews_df_raw.loc[duplicates]
print("Duplicate Reviews:")
# Show the key columns next to the review text for side-by-side comparison
duplicate_reviews[duplicate_key_cols + ['reviewText']]


In [None]:
# review titles (summary column) of every review written by reviewer A5FVEFQ55OHRS
reviews_df_raw.loc[reviews_df_raw['reviewerID']=='A5FVEFQ55OHRS', 'summary']

Data type in each column

In [None]:
# per-column overview of the frame via DataFrame.info()
reviews_df_raw.info()

Distribution of Ratings

In [None]:
# Bar chart of how many reviews were given each rating value
ratings_ax = sns.countplot(x='overall', data=reviews_df_raw)
ratings_ax.set_title('Histogram of Ratings')
ratings_ax.set_xlabel('Rating')
ratings_ax.set_ylabel('Count')
plt.show()

In [None]:
# cast the ratings to integers (this overwrites the column in the raw frame),
# then show summary statistics of the ratings
reviews_df_raw['overall'] = reviews_df_raw['overall'].astype(int)
reviews_df_raw['overall'].describe()

Review Text Length Distribution

In [None]:
# Review text length in characters; missing (NaN) or non-string reviews get length 0.
# The vectorised .str accessor replaces a Python-level lambda call per row, which matters
# on a frame with millions of reviews.
# NOTE(review): assumes reviewText holds only strings/NaN (no lists or dicts) -- confirm.
reviews_df_raw['review_length'] = (
    reviews_df_raw['reviewText'].str.len().fillna(0).astype(int)
)
sns.histplot(reviews_df_raw['review_length'], bins=50)
plt.title('Distribution of Review Text Length')
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics of the review length (number of characters in reviewText)
reviews_df_raw['review_length'].describe()

Unique Number of Products and Reviews Per Product

In [None]:
# Count the distinct product IDs (asin) present in the reviews
n_unique_products = len(reviews_df_raw['asin'].unique())
print(f"There are {n_unique_products} unique products.")

In [None]:
# One row per product (asin) with the number of reviews it received
product_group_sizes = reviews_df_raw.groupby('asin').size()
reviews_per_product = product_group_sizes.reset_index(name='review_count')
# Summary statistics of the per-product review counts
review_count_stats = reviews_per_product.describe()
print(review_count_stats)
# Histogram of the per-product review counts
sns.histplot(reviews_per_product['review_count'], bins=100)



In [None]:
# overview of the number of reviews per product:
# bar chart of how many products have each review count, for the 20 smallest review counts
reviews_per_product['review_count'].value_counts().sort_index().head(20).plot(kind='bar')

In [None]:
# show the 20 products with the most reviews
reviews_per_product.sort_values(by='review_count', ascending=False).head(20)

Heatmap of Some of the Columns

In [None]:
# Scatter plot of overall rating against review length
sns.scatterplot(x='review_length', y='overall', data=reviews_df_raw)
plt.title('Overall Rating vs Review Length')
plt.xlabel('Review Length')
plt.ylabel('Overall Rating')
plt.show()

# Ensure 'vote' is numeric for the correlation calculation.
# The conversion is skipped when the column is already numeric: the .str accessor raises
# AttributeError on a numeric column, which made this cell fail when it was run a second time.
if 'vote' in reviews_df_raw.columns and not pd.api.types.is_numeric_dtype(reviews_df_raw['vote']):
    # presumably votes are strings with thousands separators such as "1,234" -- TODO confirm
    votes_without_commas = reviews_df_raw['vote'].str.replace(',', '', regex=False)
    reviews_df_raw['vote'] = pd.to_numeric(votes_without_commas, errors='coerce')

# Correlation matrix over the numeric columns only, drawn as an annotated heatmap
correlation_matrix = reviews_df_raw.select_dtypes(include=[np.number]).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Review volume over time, by month and by year, derived from reviewTime.
# Unparseable dates become NaT (errors='coerce').
reviews_df_raw['reviewTime'] = pd.to_datetime(reviews_df_raw['reviewTime'], errors='coerce')
reviews_df_raw['review_month'] = reviews_df_raw['reviewTime'].dt.to_period('M')
reviews_per_month = (
    reviews_df_raw.groupby('review_month')
    .size()
    .reset_index(name='review_count')
)
# The monthly periods are cast to strings before being handed to seaborn for the x axis
reviews_per_month['review_month_str'] = reviews_per_month['review_month'].astype(str)
monthly_ax = sns.lineplot(x='review_month_str', y='review_count', data=reviews_per_month)
monthly_ax.set_title('Number of Reviews per Month')
monthly_ax.set_xlabel('Month')
monthly_ax.set_ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.show()

# Same aggregation at yearly granularity, drawn as bars
reviews_df_raw['review_year'] = reviews_df_raw['reviewTime'].dt.year
reviews_per_year = (
    reviews_df_raw.groupby('review_year')
    .size()
    .reset_index(name='review_count')
)
yearly_ax = sns.barplot(x='review_year', y='review_count', data=reviews_per_year)
yearly_ax.set_title('Number of Reviews per Year')
yearly_ax.set_xlabel('Year')
yearly_ax.set_ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.show()


Copying the Article

In [None]:
# Safety-related keywords to search for in the review text and the review title (summary)
keywords = ['dangerous', 'hazardous', 'unsafe', 'risk', 'warning', 'alert']

# Add one boolean column per keyword: True when either field contains it.
# Matching is case-insensitive and missing text counts as "no match".
for keyword in keywords:
    found_in_text = reviews_df_raw['reviewText'].str.contains(keyword, case=False, na=False)
    found_in_summary = reviews_df_raw['summary'].str.contains(keyword, case=False, na=False)
    reviews_df_raw[f'contains_{keyword}'] = found_in_text | found_in_summary

# Number of flagged reviews per keyword
keyword_counts = {}
for keyword in keywords:
    keyword_counts[keyword] = reviews_df_raw[f'contains_{keyword}'].sum()
# Two-column frame (Keyword, Count) for plotting
keyword_counts_df = pd.DataFrame(list(keyword_counts.items()), columns=['Keyword', 'Count'])

# Horizontal bar chart of the counts
keyword_ax = sns.barplot(x='Count', y='Keyword', data=keyword_counts_df)
keyword_ax.set_title('Keyword Counts in Reviews')
keyword_ax.set_xlabel('Count')
keyword_ax.set_ylabel('Keyword')
plt.show()

In [None]:
# One figure per keyword: rating distribution among the reviews flagged for that keyword
for keyword in keywords:
    flagged_ratings = reviews_df_raw.loc[reviews_df_raw[f'contains_{keyword}'], 'overall']
    overall_counts = flagged_ratings.value_counts().sort_index()
    rating_ax = sns.barplot(x=overall_counts.index, y=overall_counts.values)
    rating_ax.set_title(f'Overall Ratings for Reviews Containing "{keyword}"')
    rating_ax.set_xlabel('Overall Rating')
    rating_ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# select all rows flagged as containing the keyword 'hazardous' in the review text or summary
# because we are in a toys and games context, some reviews mention 'hazardous' in a non-negative context
# NOTE(review): no filtering of those non-negative mentions happens in this cell -- the selection is unfiltered
hazardous_reviews = reviews_df_raw[reviews_df_raw['contains_hazardous']]
print(f"Number of reviews containing 'hazardous': {len(hazardous_reviews)}")
# copy the selected columns of ALL hazardous reviews to the system clipboard (nothing is displayed in the notebook)
hazardous_reviews[['reviewerID', 'asin', 'reviewTime', 'overall', 'reviewText', 'summary']].to_clipboard()

In [None]:
# Distinct products that have at least one review flagged for 'hazardous'
hazardous_asins = hazardous_reviews['asin'].unique()
n_hazardous_asins = len(hazardous_asins)
print(f"Unique ASINs for hazardous reviews: {n_hazardous_asins}")
# Print at most the first ten of them, one per line
print("Hazardous ASINs:")
asin_preview = hazardous_asins[:10]
for asin in asin_preview:
    print(asin)

In [None]:
# Select reviews whose text or summary contains 'recall' (case-insensitive; missing text never matches)
recall_mask = (
    reviews_df_raw['reviewText'].str.contains('recall', case=False, na=False)
    | reviews_df_raw['summary'].str.contains('recall', case=False, na=False)
)
recall_reviews = reviews_df_raw[recall_mask]
print(f"Number of reviews containing 'recall': {len(recall_reviews)}")

# Copy a random sample of at most 100 matching reviews to the clipboard for manual inspection.
# - the sample size is capped because sample(100) raises ValueError when fewer than 100 rows match
# - random_state is fixed so a re-run copies the same rows
recall_sample_size = min(100, len(recall_reviews))
recall_columns = ['reviewerID', 'asin', 'reviewTime', 'overall', 'reviewText', 'summary']
recall_reviews[recall_columns].sample(n=recall_sample_size, random_state=42).to_clipboard()

# some of these talk about an item recall but some are just using the word 'recall' in a different context

In [None]:
# Over how many distinct months do the 1-star reviews of each product spread?
# Work on a copy so the added columns do not touch reviews_df_raw.
one_star_reviews = reviews_df_raw[reviews_df_raw['overall'] == 1].copy()

# Month of each review; requires reviewTime to already be a datetime column
one_star_reviews['review_month'] = one_star_reviews['reviewTime'].dt.to_period('M')
# Per product: number of distinct months that contain at least one 1-star review
one_star_reviews_grouped = (
    one_star_reviews.groupby('asin')['review_month']
    .nunique()
    .reset_index()
    .rename(columns={'review_month': 'months_reviewed'})
)
# Attach the per-product month count to every 1-star review row
one_star_reviews = one_star_reviews.merge(one_star_reviews_grouped, on='asin', how='left')



In [None]:
# summary statistics of months_reviewed; computed over 1-star review rows (not over products),
# so products with many 1-star reviews contribute one value per review
one_star_reviews['months_reviewed'].describe()

In [None]:
# the two products with the most 1-star reviews
one_star_reviews['asin'].value_counts().head(2)

In [None]:
# Find the products in reviews_df_raw whose every review has overall == 1.
# A vectorised group-wise all() replaces groupby.filter with a Python lambda, which is
# invoked once per product and is slow when there are many products.
is_one_star = reviews_df_raw['overall'].eq(1)
all_one_star_by_asin = is_one_star.groupby(reviews_df_raw['asin']).all()
one_star_asins = all_one_star_by_asin[all_one_star_by_asin].index
# Keep all review rows of those products, in their original order
one_star_products = reviews_df_raw[reviews_df_raw['asin'].isin(one_star_asins)]
print(f"Number of products with all 1-star reviews: {len(one_star_asins)}")

In [None]:
# display the review rows of the products that received only 1-star ratings
one_star_products

In [None]:
# Group the ratings by product once and derive both statistics from the same grouping
ratings_by_product = reviews_df_raw.groupby('asin')['overall']

# Mean rating per product, as a two-column frame (asin, avg_overall)
avg_rating = (
    ratings_by_product.mean()
    .reset_index()
    .rename(columns={'overall': 'avg_overall'})
)

# Variance of the rating per product, as a two-column frame (asin, var_overall)
avg_var = (
    ratings_by_product.var()
    .reset_index()
    .rename(columns={'overall': 'var_overall'})
)



In [None]:
# products ordered from highest to lowest average rating
avg_rating.sort_values(by='avg_overall', ascending=False)

In [None]:
# display the per-product rating variance
avg_var

In [None]:
# combine average and variance per product into one frame, ordered by average rating (highest first)
avg_and_var_ratings = pd.merge(avg_rating, avg_var, on='asin').sort_values(by='avg_overall', ascending=False)

In [None]:
# Rating history of one product, with its average rating and rating variance as reference lines.
# The ASIN is named once instead of being repeated in every lookup and in the title.
product_asin = 'B01HJDFWDK'
product_reviews = reviews_df_raw.loc[reviews_df_raw['asin'] == product_asin].sort_values('reviewTime')
product_stats = avg_and_var_ratings.loc[avg_and_var_ratings['asin'] == product_asin]

if product_reviews.empty or product_stats.empty:
    # Without this guard an unknown ASIN fails with an IndexError from `.values[0]`
    print(f"No reviews found for product {product_asin}")
else:
    product_ax = product_reviews.plot(x='reviewTime', y='overall', kind='line')
    # on the same plot, show the average overall rating for the product
    product_ax.axhline(y=product_stats['avg_overall'].iloc[0], color='r', linestyle='--', label='Average Rating')
    # NOTE(review): the variance is drawn on the rating axis although it is not itself a rating
    product_ax.axhline(y=product_stats['var_overall'].iloc[0], color='g', linestyle='--', label='Variance of Rating')
    product_ax.set_title(f'Overall Ratings Over Time for Product {product_asin}')
    product_ax.set_xlabel('Review Time')
    product_ax.set_ylabel('Overall Rating')
    # Place the legend outside the axes, to the right of the plot
    product_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()



In [None]:
# histogram of the per-product average ratings
sns.histplot(avg_and_var_ratings['avg_overall'], bins=50)

WordCloud-ing

In [None]:
# word cloud of the most common words in the review text
!pip install wordcloud
from wordcloud import WordCloud
text = ' '.join(reviews_df_raw['reviewText'].dropna().astype(str).tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Review Text')
plt.show()

In [None]:
# Word cloud of the review titles (summary column), skipping missing titles
summary_strings = reviews_df_raw['summary'].dropna().astype(str).tolist()
text = ' '.join(summary_strings)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
# Hide the axes: the image has no meaningful coordinates
plt.axis('off')
plt.title('Word Cloud of Summary Text')
plt.show()

Combing Through the Review Text

In [None]:
# Word cloud restricted to bad reviews, i.e. those rated 1 or 2
low_ratings = [1, 2]
is_bad_review = reviews_df_raw['overall'].isin(low_ratings)
bad_reviews = reviews_df_raw[is_bad_review]
bad_review_strings = bad_reviews['reviewText'].dropna().astype(str).tolist()
text_bad = ' '.join(bad_review_strings)
wordcloud_bad = WordCloud(width=800, height=400, background_color='white').generate(text_bad)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_bad, interpolation='bilinear')
# Hide the axes: the image has no meaningful coordinates
plt.axis('off')
plt.title('Word Cloud of Bad Reviews (Rating 1 or 2)')
plt.show()

In [None]:
# copy the text and summary of the first 10 reviews of product B000YDDF6O to the system clipboard (without the index)
reviews_df_raw.loc[reviews_df_raw['asin']== 'B000YDDF6O', ['reviewText', 'summary']].head(10).to_clipboard(index=False)