In [None]:
# import relevant libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Load the Reviews Data

In [None]:
# Raw JSON-lines export of the reviews and the pickle cache built from it.
REVIEWS_JSON_PATH = Path("../Data/amazon_reviews.json")
REVIEWS_PICKLE_PATH = Path("../rebekah-idea-testing/reviews_raw.pkl")

if REVIEWS_PICKLE_PATH.exists():
    # Fast path: reuse the cached frame.
    # NOTE: only unpickle files produced by this project; unpickling can
    # execute arbitrary code.
    reviews_df_raw = pd.read_pickle(REVIEWS_PICKLE_PATH)
else:
    # Slow path: parse the raw JSON once and cache it at the same location the
    # fast path reads from, so a fresh checkout can still run top to bottom.
    reviews_df_raw = pd.read_json(REVIEWS_JSON_PATH, lines=True)
    REVIEWS_PICKLE_PATH.parent.mkdir(parents=True, exist_ok=True)
    reviews_df_raw.to_pickle(REVIEWS_PICKLE_PATH)


Look at NaNs in the Data

In [None]:
# Summarise missing data per column: absolute NaN count and share of rows (in %).
na_flags = reviews_df_raw.isna()
missing_counts = na_flags.sum()
missing_percentages = na_flags.mean() * 100
missing_df = (
    pd.DataFrame({
        'Missing Count': missing_counts,
        'Missing Percentage': missing_percentages,
    })
    # keep only the columns that actually contain NaNs ...
    .loc[lambda overview: overview['Missing Count'] > 0]
    # ... and list the most incomplete ones first
    .sort_values(by='Missing Percentage', ascending=False)
)
print("Missing Values Overview:")
print(missing_df)

In [None]:
# Discard reviews that are missing a summary, a review body, or a reviewer name.
required_columns = ['summary', 'reviewText', 'reviewerName']
reviews_df_raw = reviews_df_raw.dropna(subset=required_columns)

Drop Columns

In [None]:
# Based on the missing-value overview we drop image, vote and style.
# reviewTime is dropped as well: unixReviewTime carries the same values in a
# format that is easier to work with.
#
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
columns_to_drop = ['image', 'vote', 'style', 'reviewTime']
reviews_df_raw = reviews_df_raw.drop(columns=columns_to_drop, errors='ignore')

Look at Duplicates in the Data
- For now, duplicates are kept; the cell below only counts and displays them.

In [None]:
# Reviews sharing reviewer, product, timestamp and rating are treated as duplicates.
# keep=False flags every member of a duplicate group, not only the repeats.
duplicate_key = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
duplicates = reviews_df_raw.duplicated(subset=duplicate_key, keep=False)
print(f"Number of duplicate reviews: {duplicates.sum()}")
# display the flagged rows together with their review text
duplicate_reviews = reviews_df_raw.loc[duplicates]
duplicate_reviews[duplicate_key + ['reviewText']]

Convert unixReviewTime column to Datetime Objects

In [None]:
# Interpret unixReviewTime as seconds since the epoch and store it as datetimes.
review_epoch_seconds = reviews_df_raw['unixReviewTime']
reviews_df_raw['date'] = pd.to_datetime(review_epoch_seconds, unit='s')

Features Related to Overall

In [None]:
# Per-product (asin) summary statistics of the star rating, broadcast back onto
# every review row of that product via transform().
ratings_by_product = reviews_df_raw.groupby('asin')['overall']
reviews_df_raw['avg_rating'] = ratings_by_product.transform('mean')
reviews_df_raw['min_rating'] = ratings_by_product.transform('min')
# The maximum gets its own column. It used to be assigned to 'min_rating', which
# overwrote the minimum and left no max column at all. Add 'max_rating' to the
# final column selection if it should be part of the exported features.
reviews_df_raw['max_rating'] = ratings_by_product.transform('max')
reviews_df_raw['num_of_rating'] = ratings_by_product.transform('count')

In [None]:
# A review counts as positive when its rating is >= 4 and as negative when it is <= 2.
overall_rating = reviews_df_raw['overall']
reviews_df_raw['is_positive'] = overall_rating.ge(4).astype(int)
reviews_df_raw['is_negative'] = overall_rating.le(2).astype(int)

# The mean of a 0/1 flag within a product is the share of flagged reviews
# (a fraction between 0 and 1, despite the "percent" column names).
for flag_col, share_col in (
    ('is_positive', 'percent_positive'),
    ('is_negative', 'percent_negative'),
):
    reviews_df_raw[share_col] = reviews_df_raw.groupby('asin')[flag_col].transform('mean')

Features Related to Verified
- The `verified` flag does not seem like a great feature to use because its values are so imbalanced.

In [None]:
# Mean of the `verified` column within each product, repeated on every row of that product.
verified_by_product = reviews_df_raw.groupby('asin')['verified']
reviews_df_raw['avg_verified_reviewers'] = verified_by_product.transform('mean')

Features Related to UnixReviewTime

In [None]:
# Product lifespan = time between the first and the last review of a product.
dates_by_product = reviews_df_raw.groupby('asin')['date']
reviews_df_raw['min_date'] = dates_by_product.transform('min')
reviews_df_raw['max_date'] = dates_by_product.transform('max')
reviews_df_raw['product_lifespan'] = reviews_df_raw['max_date'].sub(reviews_df_raw['min_date'])

Features Related to ReviewerID

In [None]:
# Per-reviewer tallies: five-star reviews, one-star reviews, and all reviews.
# Reviewers without any five-star (or one-star) review are absent from the
# corresponding frame.
five_star_rows = reviews_df_raw.loc[reviews_df_raw['overall'].eq(5)]
all_fives = (
    five_star_rows
    .groupby('reviewerID')
    .size()
    .reset_index(name='count_of_five_star_ratings_by_reviewer')
)

one_star_rows = reviews_df_raw.loc[reviews_df_raw['overall'].eq(1)]
all_ones = (
    one_star_rows
    .groupby('reviewerID')
    .size()
    .reset_index(name='count_of_one_star_ratings_by_reviewer')
)

all_ratings = (
    reviews_df_raw
    .groupby('reviewerID')
    .size()
    .reset_index(name='count_of_ratings_by_reviewer')
)

In [None]:
# Combine the three per-reviewer tallies into one frame. Left joins keep every
# reviewer; missing five-/one-star counts show up as NaN.
all_ratings = (
    all_ratings
    .merge(all_fives, on='reviewerID', how='left')
    .merge(all_ones, on='reviewerID', how='left')
)

In [None]:
# A reviewer with no five-star (or one-star) review has NaN after the left
# joins; treat that as a count of 0 and restore the integer dtype.
for count_col in (
    'count_of_five_star_ratings_by_reviewer',
    'count_of_one_star_ratings_by_reviewer',
):
    all_ratings[count_col] = all_ratings[count_col].fillna(0).astype(int)


In [None]:
# Share of each reviewer's reviews that are five-star / one-star.
total_reviews = all_ratings['count_of_ratings_by_reviewer']
all_ratings['prop_five'] = all_ratings['count_of_five_star_ratings_by_reviewer'].div(total_reviews)
all_ratings['prop_one'] = all_ratings['count_of_one_star_ratings_by_reviewer'].div(total_reviews)
all_ratings

In [None]:
# First ('min') and last ('max') review date of every reviewer ...
dates_by_reviewer = reviews_df_raw.groupby('reviewerID')['date']
reviewer_dates = dates_by_reviewer.agg(['min', 'max']).reset_index()
# ... and the time elapsed between the two
reviewer_dates['review_span'] = reviewer_dates['max'].sub(reviewer_dates['min'])

In [None]:
# Assemble one row per reviewer with the signals used to spot potential bots.

# number of distinct products each reviewer has reviewed
products_reviewed = (
    reviews_df_raw
    .groupby('reviewerID')['asin']
    .nunique()
    .reset_index()
    .rename(columns={'asin': 'num_unique_products'})
)

suspicious_activity_df = (
    all_ratings
    .merge(reviewer_dates[['reviewerID', 'review_span']], on='reviewerID', how='left')
    .merge(products_reviewed, on='reviewerID', how='left')
)

# +1 so that a reviewer whose reviews all fall on one day (span of 0 days)
# does not cause a division by zero
active_days = suspicious_activity_df['review_span'].dt.days + 1
suspicious_activity_df['reviews_per_day'] = (
    suspicious_activity_df['count_of_ratings_by_reviewer'] / active_days
)


In [None]:
# Flag reviewers as bots when all volume criteria hold and their ratings are
# one-sided. Thresholds were chosen from the distribution of each feature.
is_prolific = suspicious_activity_df['count_of_ratings_by_reviewer'] > 10
is_rapid = suspicious_activity_df['reviews_per_day'] > 3
is_broad = suspicious_activity_df['num_unique_products'] > 10
# only five-star reviews, or more than 80% one-star reviews
is_one_sided = (suspicious_activity_df['prop_five'] == 1) | (suspicious_activity_df['prop_one'] > 0.8)

bots_df = suspicious_activity_df[is_prolific & is_rapid & is_broad & is_one_sided]

In [None]:
# Mark every review written by a flagged reviewer (1 = bot, 0 = not a bot).
bots = list(bots_df['reviewerID'].unique())
reviews_df_raw['is_bot'] = reviews_df_raw['reviewerID'].isin(bots).astype('int64')

# Number of distinct bot reviewers per product. Summing is_bot would count bot
# *reviews* instead, so a bot with several (or duplicated) reviews on the same
# product would be counted more than once. Non-bot rows are masked to NaN,
# which nunique() ignores.
bot_reviewer_ids = reviews_df_raw['reviewerID'].where(reviews_df_raw['is_bot'] == 1)
reviews_df_raw['num_bots_per_asin'] = (
    bot_reviewer_ids.groupby(reviews_df_raw['asin']).transform('nunique')
)

In [None]:
# Inspect the columns available on the per-reviewer frame.
suspicious_activity_df.columns

In [None]:
# Attach each reviewer's reviews_per_day to every review they wrote.
# - Dropping a pre-existing 'reviews_per_day' first keeps the cell re-runnable;
#   otherwise a second run yields reviews_per_day_x / reviews_per_day_y and the
#   later per-product aggregation fails with KeyError.
# - validate='many_to_one' makes pandas raise if reviewerID is not unique on the
#   right side, which would silently duplicate review rows.
reviews_df_raw = (
    reviews_df_raw
    .drop(columns=['reviews_per_day'], errors='ignore')
    .merge(
        suspicious_activity_df[['reviewerID', 'reviews_per_day']],
        on='reviewerID',
        how='left',
        validate='many_to_one',
    )
)

In [None]:
# Count of distinct reviewerIDs per product, repeated on every row of that product.
reviewers_by_product = reviews_df_raw.groupby('asin')['reviewerID']
reviews_df_raw['unique_reviewer_count'] = reviewers_by_product.transform('nunique')

In [None]:
# Mean of the reviewers' reviews_per_day over all review rows of a product.
pace_by_product = reviews_df_raw.groupby('asin')['reviews_per_day']
reviews_df_raw['avg_reviews_per_day'] = pace_by_product.transform('mean')

Features Related to ReviewText

In [None]:
# Number of non-null review texts per product.
text_by_product = reviews_df_raw.groupby('asin')['reviewText']
reviews_df_raw['reviews_per_product'] = text_by_product.transform('count')

In [None]:
review_text = reviews_df_raw['reviewText']

# length of each review in whitespace-separated words and in characters
reviews_df_raw['review_length_words'] = review_text.str.split().str.len()
reviews_df_raw['review_length_chars'] = review_text.str.len()

# per-product averages of both length measures
# (avg_review_length_words, avg_review_length_chars)
for length_col in ('review_length_words', 'review_length_chars'):
    reviews_df_raw[f'avg_{length_col}'] = (
        reviews_df_raw.groupby('asin')[length_col].transform('mean')
    )


Make Final Reviews DF grouped by ASIN

In [None]:
# Keep the product key plus the product-level features, then collapse the
# repeated per-review rows with drop_duplicates().
product_feature_columns = [
    'asin',
    'avg_rating',
    'min_rating',
    'num_of_rating',
    'percent_positive',
    'percent_negative',
    'avg_verified_reviewers',
    'min_date',
    'max_date',
    'product_lifespan',
    'num_bots_per_asin',
    'unique_reviewer_count',
    'avg_reviews_per_day',
    'reviews_per_product',
    'avg_review_length_words',
    'avg_review_length_chars',
]
final_reviews_df = reviews_df_raw.loc[:, product_feature_columns].drop_duplicates()

In [None]:
# Count the missing values in each column of the final frame.
final_reviews_df.isna().sum()

In [None]:
# Display the final frame.
final_reviews_df

In [None]:
# Shape of the frame stored in 'final_reviews.parquet'.
# NOTE(review): no visible cell writes this file -- presumably it was produced by
# a removed cell or another notebook; confirm it exists (and is up to date with
# final_reviews_df) before relying on this output.
pd.read_parquet('final_reviews.parquet').shape