In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Input CSVs are read from a "data" directory that sits next to the current
# working directory (i.e. one level up from wherever the kernel was started).
DATA = Path.cwd().parent.joinpath("data")

# Star ratings run from 1 up to MAX_RATING.
MAX_RATING = 5
# Histogram bin edges on the half-stars (0.5, 1.5, ..., MAX_RATING + 0.5) so
# that every whole-star value falls in the middle of its own bin.
RATING_BINS = [edge + 0.5 for edge in range(0, MAX_RATING + 1)]

# Explore Products

In [None]:
# Load the product catalogue from the data directory.
products = pd.read_csv(DATA.joinpath("products.csv"))

# Preview the first ten rows.
products.head(n=10)

In [None]:
# Number of products per brand, biggest brands first.
# dropna=False keeps products with a missing brand as a group of their own.
products_per_brand = products.groupby("product_brand", sort=False, dropna=False).size()
products_per_brand.sort_values(ascending=False)

In [None]:
# Number of products per department (missing departments included), ordered
# from the largest department to the smallest and drawn as a bar chart.
products_per_department = (
    products.groupby("product_department", sort=False, dropna=False)
    .size()
    .sort_values(ascending=False)
)
products_per_department.plot.bar()

plt.show()

# Explore Reviews

In [None]:
# Load the customer reviews from the data directory.
reviews = pd.read_csv(DATA.joinpath("reviews.csv"))

# Preview the first ten rows.
reviews.head(n=10)

In [None]:
# Distribution of individual review ratings, one bin per star value.
review_rating_frame = reviews[["review_rating"]]
review_rating_frame.plot.hist(bins=RATING_BINS)

plt.show()

In [None]:
# Collapse the reviews to one row per product (indexed by product_id) holding
# the mean rating and the number of ratings behind that mean.
review_aggregation_by_product = reviews.groupby("product_id", sort=False).agg(
    review_rating_mean=pd.NamedAgg(column="review_rating", aggfunc="mean"),
    review_rating_count=pd.NamedAgg(column="review_rating", aggfunc="count"),
)
# TODO: Maybe we don't aggregate count yet - let them figure out why count is useful?

review_aggregation_by_product.head(n=10)

In [None]:
# Per-product mean rating, binned on the same star-centred edges as the raw ratings.
mean_rating_frame = review_aggregation_by_product[["review_rating_mean"]]
mean_rating_frame.plot.hist(bins=RATING_BINS)

# Per-product review count, with a log-scaled frequency axis.
rating_count_frame = review_aggregation_by_product[["review_rating_count"]]
rating_count_frame.plot.hist(log=True)

plt.show()

# Join aggregated ratings to products

In [None]:
# Attach the per-product rating aggregates to the catalogue by matching the
# products' "product_id" column against the aggregation's index.
# NOTE: no `how` is given, so pandas' default join type applies (inner, per the
# pandas docs), which leaves out products that have no reviews.
products_with_ratings = products.merge(
    right=review_aggregation_by_product,
    left_on="product_id",
    right_index=True,
)
# Index the result by product_id.
product_ratings = products_with_ratings.set_index("product_id")

product_ratings.head(n=10)

# Generate Recommendations

## Top Rated Products (Overall)

In [None]:
# Ten products with the highest mean rating (ties are not broken in any particular way).
products_by_mean_rating = product_ratings.sort_values(
    by="review_rating_mean",
    ascending=False,
)
products_by_mean_rating.head(n=10)

In [None]:
# Same ranking, but products with equal mean ratings are now ordered by how
# many reviews they have (more reviews first).
ranking_columns = ["review_rating_mean", "review_rating_count"]
products_by_mean_then_count = product_ratings.sort_values(by=ranking_columns, ascending=False)
products_by_mean_then_count.head(n=10)

In [None]:
# TODO[IndabaX]: Ignore for now, we'll come back to this later

## Top Rated Products (By Brand)

In [None]:
# Ten best LEGO products: highest mean rating first, review count as the tie-breaker.
is_lego = product_ratings["product_brand"] == "LEGO"
lego_ratings = product_ratings[is_lego]
lego_ratings.sort_values(
    by=["review_rating_mean", "review_rating_count"],
    ascending=False,
).head(n=10)

In [None]:
# TODO[IndabaX]: Ignore for now, we'll come back to this later

## Top Rated Products (By Department)

In [None]:
# Ten best products in one department: highest mean rating first, review count
# as the tie-breaker.
in_department = product_ratings["product_department"] == "Cellphones & Wearables"
department_ratings = product_ratings[in_department]
department_ratings.sort_values(
    by=["review_rating_mean", "review_rating_count"],
    ascending=False,
).head(n=10)

In [None]:
# TODO[IndabaX]: Ignore for now, we'll come back to this later

# TODO: Cherry-pick an example that illustrates why a product with more reviews can deserve more trust than one with a higher mean rating

# Something Better

In [None]:
# Compute the expected rating using Laplace's Rule of Succession
# https://www.youtube.com/watch?v=8idr1WZ1A7Q
# TODO: Links, markdown, example
#
# The rule pretends every product received two extra reviews, one at each end
# of the rating scale. Products with few reviews are pulled towards the middle
# of the scale; products with many reviews keep (almost) their observed mean.
#
# Ratings run from 1 to MAX_RATING stars, so the pessimistic pseudo-review is
# worth MIN_RATING (1 star), not 0 stars. Adding only MAX_RATING to the
# numerator would centre the prior on 2.5, below the scale's midpoint of 3.
MIN_RATING = 1

# mean * count recovers the sum of all ratings a product actually received.
review_rating_sum = (
    product_ratings["review_rating_mean"] * product_ratings["review_rating_count"]
)
# (real ratings + one worst + one best pseudo-rating) / (real count + 2 pseudo-reviews)
product_ratings["review_rating_expected"] = (
    review_rating_sum + MIN_RATING + MAX_RATING
) / (product_ratings["review_rating_count"] + 2)

In [None]:
# Compare the distribution of raw mean ratings with the distribution of the
# smoothed 'expected' ratings, using identical bins for both histograms.
observed_mean_frame = product_ratings[["review_rating_mean"]]
observed_mean_frame.plot.hist(bins=RATING_BINS)

expected_rating_frame = product_ratings[["review_rating_expected"]]
expected_rating_frame.plot.hist(bins=RATING_BINS)

plt.show()

## Now go back and generate recommendations using this new 'expected' rating