In [None]:
import pandas as pd
import os
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

%matplotlib inline
sns.set(font_scale = 1.5)

In [None]:
# raw data: load the ratings table, reading the id columns as categoricals
cat_cols = ["userId", "movieId"]
parsed_ratings_df = pd.read_csv(
    './../data/ml-25m/ratings.csv',
    # one "category" entry per column listed in cat_cols; built from the list
    # itself so the mapping cannot fall out of sync with a hard-coded length
    dtype=dict.fromkeys(cat_cols, "category"))

In [None]:
# NOTE: this cell rewrites columns of parsed_ratings_df; re-running it without
# reloading the CSV first would double the ratings a second time.

# downcast ratings to uints from 1 to 10 (ratings are doubled so they become
# whole numbers). Plain column assignment is used on purpose:
# `df.loc[:, col] = values` tries to write into the existing column in place,
# which keeps the old dtype and silently undoes the cast on recent pandas.
parsed_ratings_df["rating"] = (parsed_ratings_df["rating"] * 2).astype("uint8")

# parse timestamps (given in seconds) as datetimes; again replace the column
# outright so it takes the datetime dtype instead of being coerced
parsed_ratings_df["timestamp"] = pd.to_datetime(parsed_ratings_df["timestamp"], unit="s")

# print result of parsing
parsed_ratings_df.info()

In [None]:
# generate pandas profiling report
#profile = ProfileReport(parsed_ratings_df, title="Pandas Profiling Report", minimal=True)
#profile.to_file("minimal_report.html")

# Number of ratings per user

In [None]:
# Number of ratings submitted by each user (one row per userId, single "rating" column).
# userId is loaded as a categorical column; observed=True states the grouping
# behaviour explicitly (only categories present in the data) and avoids the
# pandas FutureWarning about the changing default.
user_ratings_df = parsed_ratings_df.groupby("userId", observed=True)[["rating"]].count()

In [None]:
# Summary statistics of the per-user rating counts, kept in a variable and
# displayed as the cell output.
user_ratings_desc = user_ratings_df.describe()
user_ratings_desc

In [None]:
# Display the per-user rating counts (bare expression, rendered as the cell output).
user_ratings_df

In [None]:
# Print the column/dtype summary of the parsed ratings frame again
# (same call as right after parsing).
parsed_ratings_df.info()

In [None]:
# Histogram of the number of ratings per user, limited to users with at most
# `upper_xlim` ratings.
upper_xlim = 150
ratings_per_user = user_ratings_df["rating"]
clipped_user_counts = ratings_per_user[ratings_per_user <= upper_xlim]

# discrete=True: one bar per integer rating count
sns.displot(clipped_user_counts, discrete=True, height=8, aspect=1.5)

# style the axes that displot just drew on
user_ax = plt.gca()
user_ax.xaxis.set_major_locator(ticker.MultipleLocator(10))  # major tick every 10 ratings
user_ax.set_title("Distribution of number of ratings per user")
user_ax.set_ylabel("# of users")
user_ax.set_xlabel("# of ratings")
user_ax.set_xlim(19, upper_xlim);
#plt.savefig("img/ratings_per_user.pdf", bbox_inches="tight")

# Number of ratings per movie

In [None]:
# Number of ratings received by each movie (one row per movieId, single "rating" column).
# movieId is loaded as a categorical column; observed=True states the grouping
# behaviour explicitly (only categories present in the data) and avoids the
# pandas FutureWarning about the changing default.
movie_ratings_df = parsed_ratings_df.groupby("movieId", observed=True)[["rating"]].count()

In [None]:
# Summary statistics of the number of ratings each movie received.
movie_ratings_df.describe()

In [None]:
# x-axis limit: the 0.8 quantile of the ratings-per-movie counts
ratings_per_movie = movie_ratings_df["rating"]
quantile_xlim = ratings_per_movie.quantile(.8)
# how many movies have a rating count exactly equal to that quantile value
quantile_xlim_count = (ratings_per_movie == quantile_xlim).sum()

# histogram restricted to movies at or below the quantile value
clipped_movie_counts = ratings_per_movie[ratings_per_movie <= quantile_xlim]
sns.displot(clipped_movie_counts, discrete=True, height=8, aspect=1.5)

# style the axes that displot just drew on
movie_ax = plt.gca()
movie_ax.xaxis.set_major_locator(ticker.MultipleLocator(10))  # major tick every 10 ratings
movie_ax.set_title(f"Distribution of number of ratings per movie\nClipped at quantile .8: "
                   f"{quantile_xlim} ratings on {quantile_xlim_count} movies")
movie_ax.set_ylabel("# of movies")
movie_ax.set_xlabel("# of ratings")
movie_ax.set_xlim(0, quantile_xlim);
#plt.savefig("img/ratings_per_movie.pdf", bbox_inches="tight")