In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Load the dataset

In [None]:
# Load the raw reviews. The JSON-lines source is parsed once and cached as a
# pickle; later runs read the pickle. This keeps Restart & Run All working on a
# fresh checkout where the pickle does not exist yet.
# NOTE: only unpickle files you created yourself - unpickling can execute code.
RAW_JSON_PATH = Path("../Data/amazon_reviews.json")
RAW_PICKLE_PATH = Path("reviews_raw.pkl")

if RAW_PICKLE_PATH.exists():
    reviews_df_raw = pd.read_pickle(RAW_PICKLE_PATH)
else:
    reviews_df_raw = pd.read_json(RAW_JSON_PATH, lines=True)
    reviews_df_raw.to_pickle(RAW_PICKLE_PATH)
reviews_df_raw

Column Title Meanings

In [None]:
# Column labels present in the raw reviews frame.
reviews_df_raw.columns

In [None]:
# Preview the first rows of the raw reviews frame.
reviews_df_raw.head()

- overall - rating of the product
- vote - number of helpful votes the review received (i.e., readers who gave it a thumbs up)
- verified - a boolean indicating whether the review was verified or not
- reviewTime - time of the review (raw)
- reviewerID - ID of the reviewer
- asin - ID of the product
- reviewerName - name of the reviewer
- reviewText - text of the review
- summary - review title
- unixReviewTime - time of the review (unix time)
- image - images that users post after they have received the product
- style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
- summary - summary of the review

### Begin EDA

Look at the NaN values in the data. Drop the following columns:
- drop vote
- drop image
- drop style
- drop reviewTime because the unix timestamp is easier to work with

In [None]:
# Per-column missing-value count and percentage, restricted to columns that
# actually have missing values and ordered from most to least missing.
missing_counts = reviews_df_raw.isna().sum()
missing_percentages = reviews_df_raw.isna().mean() * 100
missing_df = (
    pd.concat(
        [
            missing_counts.rename('Missing Count'),
            missing_percentages.rename('Missing Percentage'),
        ],
        axis=1,
    )
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values(by='Missing Percentage', ascending=False)
)
print("Missing Values Overview:")
print(missing_df)

Duplicates in the dataset

In [None]:
# A review counts as a duplicate when reviewer, product, raw review time and
# rating all match another row; keep=False flags every member of such a group.
duplicate_key_cols = ['reviewerID', 'asin', 'reviewTime', 'overall']
duplicates = reviews_df_raw.duplicated(subset=duplicate_key_cols, keep=False)
print(f"Number of duplicate reviews: {duplicates.sum()}")
# Show the flagged rows together with their review text.
duplicate_reviews = reviews_df_raw.loc[duplicates]
print("Duplicate Reviews:")
duplicate_reviews[duplicate_key_cols + ['reviewText']]


Data type in each column

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
reviews_df_raw.info()

In [None]:
# convert dates to datetime objects
# unixReviewTime is interpreted as seconds since the epoch (unit='s'); the
# result is stored in a new `date` column used by the later time-based cells.
reviews_df_raw['date'] = pd.to_datetime(reviews_df_raw['unixReviewTime'], unit='s')


In [None]:
# Display the frame again, now including the `date` column added above.
reviews_df_raw

EDA on Overall

In [None]:
# Bar chart of how many reviews carry each star rating.
fig, ax = plt.subplots()
sns.countplot(data=reviews_df_raw, x='overall', ax=ax)
ax.set_title('Histogram of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Per-product summary statistics of the star rating.
rating_stats = ['mean', 'std', 'min', 'max', 'median', 'count']
reviews_df_raw.groupby('asin').agg({'overall': rating_stats}).reset_index()

In [None]:
# Positive review: rating >= 4. Negative review: rating <= 2.
# Both flags are stored as 0/1 integers on the reviews frame.
reviews_df_raw['is_positive'] = reviews_df_raw['overall'].ge(4).astype(int)
reviews_df_raw['is_negative'] = reviews_df_raw['overall'].le(2).astype(int)

# Per-product mean of each flag, i.e. the share of positive / negative reviews
# expressed as a fraction between 0 and 1.
(
    reviews_df_raw
    .groupby('asin')[['is_positive', 'is_negative']]
    .mean()
    .rename(columns={'is_positive': 'pct_positive', 'is_negative': 'pct_negative'})
    .reset_index()
)

In [None]:
# Cast the rating column to integers in place, then summarise it.
# NOTE(review): astype(int) raises if `overall` contains NaN - presumably it
# has none; confirm against the missing-values overview.
reviews_df_raw['overall'] = reviews_df_raw['overall'].astype(int)
reviews_df_raw['overall'].describe()

EDA Verified Reviews

In [None]:
# Count of reviews (rows) per value of `verified`.
# Author's observation from the plot: the majority are verified.
sns.countplot(reviews_df_raw, x='verified')

In [None]:
# Per-product mean of `verified`; with a boolean column (as the column notes
# describe it) this is the fraction of that product's reviews that are verified.
reviews_df_raw.groupby('asin')['verified'].mean()

Time Features EDA

In [None]:
# Product lifespan = time between a product's first and last review.
# transform() broadcasts the per-product value back onto every review row.
review_dates_by_product = reviews_df_raw.groupby('asin')['date']
reviews_df_raw['min_date'] = review_dates_by_product.transform('min')
reviews_df_raw['max_date'] = review_dates_by_product.transform('max')
reviews_df_raw['product_lifespan'] = reviews_df_raw['max_date'].sub(reviews_df_raw['min_date'])

product lifespan distribution

In [None]:
# Summary statistics of product lifespan.
# The column holds one value per review row (it was broadcast with transform),
# so products with more reviews carry more weight in these statistics.
reviews_df_raw['product_lifespan'].describe()

Finding Suspicious Activity

In [None]:
# Per-reviewer counts: 5-star reviews, 1-star reviews and all reviews.
five_star_mask = reviews_df_raw['overall'] == 5
one_star_mask = reviews_df_raw['overall'] == 1

all_fives = (
    reviews_df_raw.loc[five_star_mask]
    .groupby('reviewerID')
    .size()
    .reset_index(name='five_count')
)
all_ones = (
    reviews_df_raw.loc[one_star_mask]
    .groupby('reviewerID')
    .size()
    .reset_index(name='one_count')
)
all_ratings = (
    reviews_df_raw
    .groupby('reviewerID')
    .size()
    .reset_index(name='total_ratings')
)



In [None]:
# Attach the 5-star and 1-star counts to the per-reviewer totals. Left joins
# keep reviewers that have no 5-star / 1-star reviews (their counts are NaN here).
# This reassigns all_ratings from itself, so re-running only this cell merges again.
all_ratings = (
    all_ratings
    .merge(all_fives, on='reviewerID', how='left')
    .merge(all_ones, on='reviewerID', how='left')
)

In [None]:
# Reviewers missing from a count table get 0 instead of NaN; restore integer dtype.
for count_col in ('five_count', 'one_count'):
    all_ratings[count_col] = all_ratings[count_col].fillna(0).astype(int)
all_ratings


In [None]:
# Share of each reviewer's reviews that are 5-star / 1-star.
all_ratings['prop_five'] = all_ratings['five_count'].div(all_ratings['total_ratings'])
all_ratings['prop_one'] = all_ratings['one_count'].div(all_ratings['total_ratings'])
all_ratings

In [None]:
# Summary statistics of the per-reviewer counts and proportions.
all_ratings.describe()

- It seems that most people wrote about 2 reviews. The max is suspicious; to flag such reviewers we can either use a cutoff of x standard deviations from the mean or a hard cutoff, depending on the threshold we are seeking.
- Most people leave 5-star reviews, so only prop_five == 1 is treated as a bot signal.
- Most people do not leave 1-star reviews, so a high prop_one (e.g., > 0.8) is treated as a bot signal.

In [None]:
# First ('min') and last ('max') review date for each reviewer.
reviewer_dates = reviews_df_raw.groupby('reviewerID')['date'].agg(['min', 'max']).reset_index()

In [None]:
# Time between each reviewer's first and last review.
reviewer_dates['review_span'] = reviewer_dates['max']-reviewer_dates['min']

In [None]:
# Summary statistics of first/last review dates and the review span.
reviewer_dates.describe()

In [None]:
# Start the per-reviewer feature table: rating counts/proportions plus review span.
suspicious_activity_df = all_ratings.merge(reviewer_dates[['reviewerID', 'review_span']], on='reviewerID', how='left')

In [None]:
# Number of distinct products (asin) each reviewer has reviewed.
products_reviewed = reviews_df_raw.groupby('reviewerID')['asin'].nunique().reset_index()

In [None]:
# Give the distinct-product count a descriptive column name before merging.
products_reviewed = products_reviewed.rename(columns={'asin':'num_unique_products'})

In [None]:
# Add the distinct-product count to the per-reviewer feature table.
# This reassigns suspicious_activity_df from itself, so re-running only this cell merges again.
suspicious_activity_df = suspicious_activity_df.merge(products_reviewed, on='reviewerID', how='left')

In [None]:
# Average reviews per active day. The +1 turns a zero-day span (all reviews on
# the same day) into one day, which also avoids dividing by zero.
active_days = suspicious_activity_df['review_span'].dt.days + 1
suspicious_activity_df['reviews_per_day'] = suspicious_activity_df['total_ratings'].div(active_days)


In [None]:
# Columns available in the per-reviewer feature table.
suspicious_activity_df.columns

In [None]:
# Summary statistics of reviews per day across reviewers.
suspicious_activity_df['reviews_per_day'].describe()

In [None]:
# Summary statistics of distinct products reviewed per reviewer.
suspicious_activity_df['num_unique_products'].describe()

In [None]:
# Heuristic bot filter: a reviewer is flagged only when every criterion below holds.
has_many_reviews = suspicious_activity_df['total_ratings'] > 10
# NOTE(review): this keeps reviewers with FEWER than 3 reviews per day; a bot
# heuristic would usually look for a high rate - confirm `<` is intended.
has_low_daily_rate = suspicious_activity_df['reviews_per_day'] < 3
has_many_products = suspicious_activity_df['num_unique_products'] > 10
# Ratings are one-sided: only 5-star reviews, or more than 80% 1-star reviews.
is_one_sided = (suspicious_activity_df['prop_five'] == 1) | (suspicious_activity_df['prop_one'] > 0.8)

bots_df = suspicious_activity_df[
    has_many_reviews & has_low_daily_rate & has_many_products & is_one_sided
]

In [None]:
# Reviewer IDs that matched the bot heuristic, as a plain list.
bots = list(bots_df['reviewerID'].unique())

In [None]:
# 1 when the review was written by a flagged reviewer, otherwise 0.
reviews_df_raw['is_bot'] = reviews_df_raw['reviewerID'].isin(bots).astype('int64')

In [None]:
# Per product, the sum of is_bot over its review rows, broadcast to every row.
# This counts bot-flagged reviews: a flagged reviewer with two reviews of the
# same product contributes 2.
reviews_df_raw['num_bots_per_asin'] = reviews_df_raw.groupby('asin')['is_bot'].transform('sum')

In [None]:
# Summary statistics of the per-product bot-review count (one value per review row).
reviews_df_raw['num_bots_per_asin'].describe()

Review Text Length Distribution

In [None]:
# Review length in characters; anything that is not a string (e.g. a missing
# review) gets length 0.
reviews_df_raw['review_length'] = [
    len(body) if isinstance(body, str) else 0
    for body in reviews_df_raw['reviewText']
]
fig, ax = plt.subplots()
sns.histplot(reviews_df_raw['review_length'], bins=50, ax=ax)
ax.set_title('Distribution of Review Text Length')
ax.set_xlabel('Review Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics of review length, measured in characters
# (review_length is computed with len() on the review string).
reviews_df_raw['review_length'].describe()

Unique Number of Products and Reviews Per Product

In [None]:
# Number of distinct product IDs (a missing asin, if any, counts as one value).
unique_product_count = reviews_df_raw['asin'].nunique(dropna=False)
print(f"There are {unique_product_count} unique products.")

In [None]:
# One row per product with its number of reviews.
reviews_per_product = (
    reviews_df_raw
    .groupby('asin')
    .size()
    .reset_index(name='review_count')
)
# Summary statistics, then the distribution of review counts across products.
print(reviews_per_product.describe())
sns.histplot(data=reviews_per_product, x='review_count', bins=100)



In [None]:
# overview of the number of reviews per product
# Bar chart of how many products have each review count, limited to the 20
# smallest review counts (sorted by count value, then head(20)).
reviews_per_product['review_count'].value_counts().sort_index().head(20).plot(kind='bar')

In [None]:
# Show the 20 products with the most reviews, highest count first.
reviews_per_product.sort_values(by='review_count', ascending=False).head(20)

In [None]:
# look at number of reviews per product and the distribution
# size() counts every row in the group, including rows whose reviewText is missing.
review_counts = reviews_df_raw.groupby('asin')['reviewText'].size()
review_counts.describe()

In [None]:
# Selected quantiles (5th through 99th percentile) of reviews per product.
review_counts.quantile([0.05,0.1,0.2,0.25,0.5,0.6,0.75,0.8,0.9,0.95,0.99])

Heatmap of Some of the Columns

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated heatmap.
numeric_columns = reviews_df_raw.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()

fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

Copying the Article

In [None]:
# Flag reviews whose text or summary mentions a safety-related keyword
# (case-insensitive substring match; missing text counts as no match).
keywords = ['dangerous', 'hazardous', 'unsafe', 'risk', 'warning', 'alert']

review_bodies = reviews_df_raw['reviewText']
review_summaries = reviews_df_raw['summary']
keyword_counts = {}
for keyword in keywords:
    flag_column = f'contains_{keyword}'
    found_in_body = review_bodies.str.contains(keyword, case=False, na=False)
    found_in_summary = review_summaries.str.contains(keyword, case=False, na=False)
    reviews_df_raw[flag_column] = found_in_body | found_in_summary
    # Number of reviews that mention this keyword.
    keyword_counts[keyword] = reviews_df_raw[flag_column].sum()

# One row per keyword for plotting.
keyword_counts_df = pd.DataFrame(list(keyword_counts.items()), columns=['Keyword', 'Count'])

fig, ax = plt.subplots()
sns.barplot(x='Count', y='Keyword', data=keyword_counts_df, ax=ax)
ax.set_title('Keyword Counts in Reviews')
ax.set_xlabel('Count')
ax.set_ylabel('Keyword')
plt.show()

In [None]:
# For each keyword, plot how the ratings of the reviews mentioning it are distributed.
for keyword in keywords:
    mentions_keyword = reviews_df_raw[f'contains_{keyword}']
    overall_counts = (
        reviews_df_raw.loc[mentions_keyword, 'overall']
        .value_counts()
        .sort_index()
    )
    sns.barplot(x=overall_counts.index, y=overall_counts.values)
    plt.title(f'Overall Ratings for Reviews Containing "{keyword}"')
    plt.xlabel('Overall Rating')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Select every row whose review text or summary contains the keyword 'hazardous'.
# Because this is a toys-and-games context, some reviews mention 'hazardous' in a
# non-negative way; those still need to be filtered out (no filtering happens in this cell).
hazardous_reviews = reviews_df_raw[reviews_df_raw['contains_hazardous']]
print(f"Number of reviews containing 'hazardous': {len(hazardous_reviews)}")
# Copy the selected columns of all hazardous reviews to the system clipboard for
# manual inspection; nothing is displayed in the notebook.
hazardous_reviews[['reviewerID', 'asin', 'reviewTime', 'overall', 'reviewText', 'summary']].to_clipboard()

In [None]:
# Distinct product IDs that have at least one review mentioning 'hazardous'.
hazardous_asins = hazardous_reviews['asin'].unique()
print(f"Unique ASINs for hazardous reviews: {len(hazardous_asins)}")
# Print up to the first 10 of those product IDs, one per line.
print("Hazardous ASINs:")
for asin in hazardous_asins[:10]:
    print(asin)

In [None]:
# Select reviews whose text or summary mentions 'recall'
# (case-insensitive substring match; missing text counts as no match).
mentions_recall = (
    reviews_df_raw['reviewText'].str.contains('recall', case=False, na=False)
    | reviews_df_raw['summary'].str.contains('recall', case=False, na=False)
)
recall_reviews = reviews_df_raw[mentions_recall]
print(f"Number of reviews containing 'recall': {len(recall_reviews)}")

# Copy a random sample of at most 100 matching reviews to the clipboard for
# manual inspection. The size is capped because DataFrame.sample raises a
# ValueError when asked for more rows than exist; the fixed random_state makes
# the sample identical on every run.
recall_sample_size = min(100, len(recall_reviews))
(
    recall_reviews[['reviewerID', 'asin', 'reviewTime', 'overall', 'reviewText', 'summary']]
    .sample(n=recall_sample_size, random_state=42)
    .to_clipboard()
)

# some of these talk about an item recall but some are just using the word 'recall' in a different context

In [None]:
# how long are people writing reviews for products that have 1 star ratings
# Work on a copy so the new columns do not touch the main reviews frame.
one_star_reviews = reviews_df_raw[reviews_df_raw['overall'] == 1].copy()

# Bucket every 1-star review into its calendar month. This uses the parsed
# `date` column: `reviewTime` is the raw string form of the date, so the `.dt`
# accessor is not available on it.
one_star_reviews['review_month'] = one_star_reviews['date'].dt.to_period('M')

# Per product, the number of distinct months that contain a 1-star review
# (months without any 1-star review are not counted).
one_star_reviews_grouped = (
    one_star_reviews
    .groupby('asin')['review_month']
    .nunique()
    .reset_index()
    .rename(columns={'review_month': 'months_reviewed'})
)
# merge this back to the one_star_reviews DataFrame
one_star_reviews = one_star_reviews.merge(one_star_reviews_grouped, on='asin', how='left')



In [None]:
# Summary statistics of months_reviewed (one value per 1-star review row).
one_star_reviews['months_reviewed'].describe()

In [None]:
# The two products with the most 1-star reviews.
one_star_reviews['asin'].value_counts().head(2)

In [None]:
def _only_one_star(product_reviews):
    """Return True when every review in this product's group has overall == 1."""
    return (product_reviews['overall'] == 1).all()


# Keep the review rows of products whose reviews are all 1-star.
one_star_products = reviews_df_raw.groupby('asin').filter(_only_one_star)
print(f"Number of products with all 1-star reviews: {one_star_products['asin'].nunique(dropna=False)}")

In [None]:
# Display the reviews of products that only received 1-star ratings.
one_star_products

In [None]:
# Per-product mean and variance of the star rating, each as its own two-column frame.
ratings_by_product = reviews_df_raw.groupby('asin')['overall']

avg_rating = ratings_by_product.mean().reset_index(name='avg_overall')

# Sample variance (pandas default); products with a single review get NaN.
avg_var = ratings_by_product.var().reset_index(name='var_overall')



In [None]:
# Products ordered from highest to lowest average rating.
avg_rating.sort_values(by='avg_overall', ascending=False)

In [None]:
# Display the per-product rating variance.
avg_var

In [None]:
# Combine mean and variance per product, ordered from highest to lowest average rating.
avg_and_var_ratings = pd.merge(avg_rating, avg_var, on='asin').sort_values(by='avg_overall', ascending=False)

In [None]:
# Rating history of a single product, with its average rating and rating
# variance drawn as horizontal reference lines.
product_id = 'B01HJDFWDK'

# Sort and plot on the parsed `date` column. `reviewTime` is the raw string form
# of the date, so sorting on it orders reviews alphabetically, not chronologically.
product_reviews = reviews_df_raw.loc[reviews_df_raw['asin'] == product_id].sort_values('date')
product_stats = avg_and_var_ratings.loc[avg_and_var_ratings['asin'] == product_id]
product_avg = product_stats['avg_overall'].values[0]
product_var = product_stats['var_overall'].values[0]

product_reviews.plot(x='date', y='overall', kind='line')
# on the same plot, show the average overall rating for the product
plt.axhline(y=product_avg, color='r', linestyle='--', label='Average Rating')
# The variance is drawn on the rating axis as well, as in the original cell.
plt.axhline(y=product_var, color='g', linestyle='--', label='Variance of Rating')
plt.title(f'Overall Ratings Over Time for Product {product_id}')
plt.xlabel('Review Time')
plt.ylabel('Overall Rating')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()



In [None]:
# Distribution of the per-product average rating.
sns.histplot(avg_and_var_ratings['avg_overall'], bins=50)

WordCloud-ing

In [None]:
# word cloud of the most common words in the review text
!pip install wordcloud
from wordcloud import WordCloud
text = ' '.join(reviews_df_raw['reviewText'].dropna().astype(str).tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Review Text')
plt.show()

In [None]:
# Word cloud of the review summaries (titles): join every non-missing summary
# into one string and render it without axes.
summary_strings = reviews_df_raw['summary'].dropna().astype(str).tolist()
text = ' '.join(summary_strings)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Summary Text')
plt.show()