In [None]:
import pandas as pd
import os
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme with all font sizes scaled by 1.5.
sns.set(font_scale = 1.5)

In [None]:
# Load the raw ratings CSV; the id columns listed in `cat_cols` are read as
# pandas categoricals instead of plain integers.
cat_cols = ["userId", "movieId"]
parsed_ratings_df = pd.read_csv(
    './../data/ml-25m/ratings.csv',
    dtype={col: "category" for col in cat_cols},
)

In [None]:
# Double the ratings so they become whole numbers (1 to 10 per the original
# note) and store them as uint8.
# Plain column assignment is used on purpose: with pandas >= 2.0,
# `df.loc[:, col] = values` writes into the existing column and tries to keep
# its current dtype, so the downcast would not take effect.
# NOTE: this mutates `parsed_ratings_df` in place; re-running the cell doubles
# the ratings again, so re-run the load cell first.
parsed_ratings_df["rating"] = (parsed_ratings_df["rating"] * 2).astype("uint8")

# Parse the timestamps (seconds since the epoch) as datetimes; assigned as a
# whole column for the same reason as above.
parsed_ratings_df["timestamp"] = pd.to_datetime(parsed_ratings_df["timestamp"], unit="s")

# Print column dtypes and memory usage to confirm the conversions took effect.
parsed_ratings_df.info()

# Whole Dataset

In [None]:
# generate pandas profiling report
#profile = ProfileReport(parsed_ratings_df, title="Pandas Profiling Report", minimal=True)
#profile.to_file("minimal_report.html")

## Number of ratings per user

In [None]:
# Number of ratings per user: one row per userId that occurs in the data,
# with the count stored in the "rating" column.
user_ratings_df = (
    parsed_ratings_df
    .groupby("userId", observed=True)[["rating"]]
    .count()
)

In [None]:
# Summary statistics of the ratings-per-user counts, kept in a variable and
# shown as the cell output.
user_ratings_desc = user_ratings_df.describe()
user_ratings_desc

In [None]:
# Show the per-user rating counts as the cell output.
user_ratings_df

In [None]:
    
# Histogram of the ratings-per-user counts, keeping only users with at most
# `upper_xlim` ratings.
upper_xlim = 150
sns.displot(
    user_ratings_df.loc[user_ratings_df["rating"] <= upper_xlim, "rating"],
    discrete=True,
    height=8,
    aspect=1.5,
)
# major x ticks every 10 ratings
plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(10))
plt.title("Distribution of number of ratings per user")
plt.xlabel("# of ratings")
plt.ylabel("# of users")
# x axis runs from 19 up to the clipping value; trailing `;` hides the repr
plt.xlim((19, upper_xlim));
#plt.savefig("img/ratings_per_user.pdf", bbox_inches="tight")

## Number of ratings per movie

In [None]:
# Number of ratings per movie: one row per movieId that occurs in the data,
# with the count stored in the "rating" column.
movie_ratings_df = (
    parsed_ratings_df
    .groupby("movieId", observed=True)[["rating"]]
    .count()
)

In [None]:
# Summary statistics of the ratings-per-movie counts.
movie_ratings_df.describe()

In [None]:
# Clipping value for the x axis: the 0.8 quantile of ratings-per-movie.
# `interpolation="lower"` returns a count that actually occurs in the data;
# the default linear interpolation can return a fractional value that no movie
# matches, which would make the "on N movies" figure in the title 0.
# The set of plotted movies (count <= clipping value) is the same either way.
quantile_xlim = int(movie_ratings_df["rating"].quantile(.8, interpolation="lower"))
# number of movies whose rating count equals the clipping value exactly
quantile_xlim_count = movie_ratings_df["rating"].eq(quantile_xlim).sum()

# plot only the movies at or below the clipping value
sns.displot(movie_ratings_df["rating"].loc[lambda srs: srs <= quantile_xlim],
            discrete=True, height=8, aspect=1.5,)
# major x ticks every 10 ratings
plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(10))
plt.title(f"Distribution of number of ratings per movie\nClipped at quantile .8: "
          f"{quantile_xlim} ratings on {quantile_xlim_count} movies")
plt.ylabel("# of movies")
plt.xlabel("# of ratings")
plt.xlim((0, quantile_xlim));
#plt.savefig("img/ratings_per_movie.pdf", bbox_inches="tight")

# ASMG subset

In [None]:
# Restrict the ratings to the window [start_date, end_date):
# `inclusive="left"` keeps start_date and excludes end_date.
start_date = "20140101"
end_date = "20190101"
asmg_ratings_df = parsed_ratings_df.loc[
    parsed_ratings_df["timestamp"].between(start_date, end_date, inclusive="left"),
    :,
]
# (rows, columns) of the subset
asmg_ratings_df.shape

In [None]:
# generate pandas profiling report
# profile = ProfileReport(asmg_ratings_df.reset_index(), 
# title="Pandas Profiling Report", minimal=True)
# profile.to_file("minimal_report-asmg_subset.html")

## Number of ratings per user

In [None]:
# Number of ratings per user within the subset: one row per userId that
# occurs in `asmg_ratings_df`, with the count stored in the "rating" column.
asmg_user_ratings_df = (
    asmg_ratings_df
    .groupby("userId", observed=True)[["rating"]]
    .count()
)

In [None]:
# Summary statistics of the ratings-per-user counts in the subset, kept in a
# variable and shown as the cell output.
asmg_user_ratings_desc = asmg_user_ratings_df.describe()
asmg_user_ratings_desc

In [None]:
# Show the subset's per-user rating counts as the cell output.
asmg_user_ratings_df

In [None]:
    
# Histogram of the ratings-per-user counts in the subset, keeping only users
# with at most `upper_xlim` ratings.
upper_xlim = 200
sns.displot(asmg_user_ratings_df["rating"].loc[lambda srs: srs <= upper_xlim],
            discrete=True, height=8, aspect=1.5,)
# major x ticks every 10 ratings
plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(10))
plt.title("Distribution of number of ratings per user")
plt.ylabel("# of users")
plt.xlabel("# of ratings")
# NOTE(review): the lower limit of 19 hides users with fewer ratings inside
# the subset's time window - confirm this is intended.
plt.xlim((19, upper_xlim));
# savefig does not create missing directories, so make sure "img/" exists;
# otherwise this cell fails on a fresh checkout.
os.makedirs("img", exist_ok=True)
plt.savefig("img/asmg_subset-ratings_per_user.pdf", bbox_inches="tight")

## Number of ratings per movie

In [None]:
# Number of ratings per movie within the subset, stored in the "rating" column.
# `observed=True` keeps only the movieId categories that actually occur in
# `asmg_ratings_df`; movieId is a categorical column, so without it pandas
# versions that default to `observed=False` also emit rows for every movie
# that has no rating in the subset.
# The built-in `count` replaces a second, Python-level `len` lambda
# aggregation that recomputed the same quantity and overwrote this result.
asmg_movie_ratings_df = asmg_ratings_df.groupby("movieId", observed=True)[["rating"]].count()

In [None]:
# Summary statistics of the ratings-per-movie counts in the subset.
asmg_movie_ratings_df.describe()

In [None]:
# Clipping value for the x axis: the `quantile_` quantile of ratings-per-movie
# in the subset.
# `interpolation="lower"` returns a count that actually occurs in the data;
# the default linear interpolation can return a fractional value that no movie
# matches, which would make the "on N movies" figure in the title 0.
# The set of plotted movies (count <= clipping value) is the same either way.
quantile_ = .85
quantile_xlim = int(
    asmg_movie_ratings_df["rating"].quantile(quantile_, interpolation="lower")
)
# number of movies whose rating count equals the clipping value exactly
quantile_xlim_count = asmg_movie_ratings_df["rating"].eq(quantile_xlim).sum()

# plot only the movies at or below the clipping value
sns.displot(asmg_movie_ratings_df["rating"].loc[lambda srs: srs <= quantile_xlim],
            discrete=True, height=8, aspect=1.5,)
# major x ticks every 10 ratings
plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(10))
plt.title(f"Distribution of number of ratings per movie\nClipped at quantile {quantile_}: "
          f"{quantile_xlim} ratings on {quantile_xlim_count} movies")
plt.ylabel("# of movies")
plt.xlabel("# of ratings")
plt.xlim((0, quantile_xlim));
# plt.savefig("img/asmg_subset-ratings_per_movie.pdf", bbox_inches="tight")