In [None]:
import os
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from gym import utils
import gym
import pickle

### MovieLens Dataset Loading & Preparation

In [None]:
# Root directories of the two MovieLens dataset variants (paths are relative to this notebook)
_MOVIELENS_BASE_DIR = "../data/dt-datasets/movielens"
ML_LATEST_SMALL_DATA_ROOT_PATH = f"{_MOVIELENS_BASE_DIR}/ml-latest-small"
ML_LATEST_DATA_ROOT_PATH = f"{_MOVIELENS_BASE_DIR}/ml-latest"

##### Load the ML-latest-small dataset

In [None]:
def _read_small_csv(file_name):
    """Read one CSV file from the ML-latest-small directory into a DataFrame."""
    return pd.read_csv(os.path.join(ML_LATEST_SMALL_DATA_ROOT_PATH, file_name))


# One DataFrame per table shipped with the small dataset
links_df = _read_small_csv("links.csv")
movies_df = _read_small_csv("movies.csv")
ratings_df = _read_small_csv("ratings.csv")
tags_df = _read_small_csv("tags.csv")
print(f"links shape: {links_df.shape}\nmovies shape: {movies_df.shape}\nratings shape: {ratings_df.shape}\ntags shape: {tags_df.shape}")

In [None]:
# Distribution of the number of ratings per user in the small dataset
RATING_COUNT_THRESHOLD = 140  # cut-off for the head-count printed below

rating_count_per_user = ratings_df.groupby('userId')['movieId'].count().values
print(f'min no of rating: {np.min(rating_count_per_user)}, max no of rating: {np.max(rating_count_per_user)}')
print(f'average rating per user: {np.mean(rating_count_per_user)}, median rating per user: {np.median(rating_count_per_user)}')
# The comparison is inclusive (<=), so the message says "at most" rather than "less than".
print(f'number of users who rated at most {RATING_COUNT_THRESHOLD} movies: {(rating_count_per_user <= RATING_COUNT_THRESHOLD).sum()}')


# Visualize
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Helvetica']
})
sns.set_palette("rocket")
plt.figure(figsize=(10, 6), dpi=300)
sns.histplot(data=rating_count_per_user)
# The histogram bins the per-user rating counts along x; the bar height (y) is the
# number of users falling into each bin.
plt.xlabel("No of movies rated by users", fontsize=14)
plt.ylabel("No of users", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../Figures/Movielens", exist_ok=True)
plt.savefig("../Figures/Movielens/dist_of_no_of_rated_movies_for_users.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Every distinct genre label appearing in movies.csv ("|" separates the genres of one movie)
genres_per_movie = [genre_field.split("|") for genre_field in movies_df['genres']]
np.unique(np.concatenate(genres_per_movie))

In [None]:
# Histogram of the individual rating values, saved as a PDF figure.

plt.figure(figsize=(10, 6), dpi=300)
# Ratings are turned into strings so each distinct value gets its own category on
# the x-axis; sorting fixes the left-to-right order of those categories.
ratings = sorted(map(str, ratings_df['rating']))
sns.histplot(ratings, binwidth=1)
plt.xlabel("Ratings", fontsize = 14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.savefig("../Figures/Movielens/dist_of_movie_ratings.pdf", format="pdf", bbox_inches="tight")

In [None]:
# import matplotlib.pyplot as plt
# import pandas as pd

# # Assuming 'ratings' is a column containing string ratings in your DataFrame
# ratings = ratings_df['rating']

# # Count the occurrences of each rating
# rating_counts = ratings.value_counts()

# # Plotting a pie chart
# plt.figure(figsize=(10, 10), dpi=300)
# plt.pie(rating_counts, labels=rating_counts.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('pastel'), textprops={'fontsize': 14})
# plt.title('Distribution of Movie Ratings', fontsize=16)
# plt.show()


In [None]:
# Summary statistics of the rating column
rating_values = ratings_df['rating']
min_rating, max_rating = np.min(rating_values), np.max(rating_values)
avg_rating = np.mean(rating_values)
median_rating = np.median(rating_values)
# NOTE(review): np.std on a pandas Series may delegate to Series.std (sample SD, ddof=1)
# rather than NumPy's population SD — confirm which definition is intended.
sd = np.std(rating_values)
print(f"min: {min_rating}, max: {max_rating}, avg: {avg_rating}, median: {median_rating}, SD: {sd}")

##### Merge the movies and ratings dataframes together

In [None]:
# Join on movieId with the default inner join: one output row per rating whose
# movie exists in movies_df.
movies_and_ratings = movies_df.merge(ratings_df, on='movieId')
movies_and_ratings.shape

In [None]:
# Preview the first rows of the merged frame
movies_and_ratings.head()

In [None]:
# Number of rows that are exact duplicates of an earlier row
fully_duplicated_rows = movies_and_ratings.duplicated()
print(fully_duplicated_rows.sum())

# Missing values per column
print(movies_and_ratings.isna().sum())
# Show every row whose (movieId, userId) pair occurs more than once
repeated_pair_mask = movies_and_ratings.duplicated(subset=['movieId', 'userId'], keep=False)
movies_and_ratings.loc[repeated_pair_mask]

In [None]:
# Column dtypes of the merged frame
movies_and_ratings.dtypes

In [None]:
# Count missing values in the two id columns; displayed as (userId nulls, movieId nulls)
id_null_counts = movies_and_ratings[['userId', 'movieId']].isna().sum()
id_null_counts['userId'], id_null_counts['movieId']

In [None]:
# True when the merge kept every user that appears in ratings_df
users_before_merge = set(ratings_df.userId.unique())
users_after_merge = set(movies_and_ratings.userId.unique())
users_before_merge == users_after_merge

In [None]:
# Order each user's ratings chronologically (ascending userId, then ascending timestamp)
sort_keys = ['userId', 'timestamp']
movies_and_ratings = movies_and_ratings.sort_values(by=sort_keys)
movies_and_ratings.head()

In [None]:
# Find out the different rating values (displayed in ascending order)
sorted(movies_and_ratings['rating'].unique())

In [None]:
# Reward scheme: squared closeness of a rating to the top rating.
#   reward = (1 - |rating - highest_rating| / rating_span) ** 2
# A rating of 5.0 gives reward 1; a rating of 0.5 gives reward 0.
# NOTE(review): assumes ratings lie in [0.5, 5.0] — confirm against the data.
highest_rating = 5.0
lowest_rating = 0.5
rating_span = highest_rating - lowest_rating  # 4.5
distance_from_top = (movies_and_ratings['rating'] - highest_rating).abs()
movies_and_ratings['reward'] = (1 - distance_from_top / rating_span) ** 2

In [None]:
# Group the merged ratings by user. This only builds the GroupBy object; no
# per-user count is computed in this cell.
# NOTE(review): `grouped` is not referenced in the visible part of the notebook.
grouped = movies_and_ratings.groupby('userId')

In [None]:
# Number of rating rows per userId (value_counts orders by descending count)
no_of_ratings_per_user = movies_and_ratings['userId'].value_counts()

In [None]:
# (min, max, mean, median) of the per-user rating counts
no_of_ratings_per_user.min(), no_of_ratings_per_user.max(), no_of_ratings_per_user.mean(), no_of_ratings_per_user.median()

In [None]:
# Range of rating timestamps, displayed as (earliest, latest)
rating_timestamps = movies_and_ratings['timestamp']
rating_timestamps.min(), rating_timestamps.max()

In [None]:
# Genre string of every row in the merged frame (one entry per rating, not per movie)
all_genres = movies_and_ratings['genres']

In [None]:
# Flatten the "|"-separated genre strings into one long list of single genres
genres_list = [
    single_genre
    for genre_field in all_genres
    for single_genre in genre_field.split("|")
]
genres_arr = np.array(genres_list)
genres_arr.shape

In [None]:
# Distinct genres and how many times each occurs in genres_arr
unique_elements, counts = np.unique(genres_arr, return_counts=True)

In [None]:
# Tabulate the genre frequencies: one row per genre
data = {"genre": unique_elements, "count": counts}
genre_count_df = pd.DataFrame(data)
genre_count_df

In [None]:
# Bar chart of genre frequencies on a log-scaled y-axis, saved as a PDF figure
sns.set_palette("rocket")
plt.figure(figsize=(10, 6), dpi=300)
# NOTE(review): newer seaborn releases may warn when `palette` is passed without
# `hue` — confirm against the installed version.
genre_axes = sns.barplot(x="genre", y="count", data=genre_count_df, palette="rocket")

genre_axes.set_xlabel("Genres", fontsize=14)
genre_axes.set_ylabel("Count (log scaled)", fontsize=14)
genre_axes.set_yscale("log")
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.savefig("../Figures/Movielens/dist_of_movie_genres.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Spot-check: all rating rows for movieId 804
# NOTE(review): the reason for picking 804 is not visible here.
movies_and_ratings[movies_and_ratings['movieId'] == 804]

In [None]:
# Number of "|"-separated genre labels attached to each row's movie
split_genres = movies_and_ratings['genres'].str.split('|')
movies_and_ratings['genre_count_per_movie'] = split_genres.str.len()

In [None]:
# Largest number of genres attached to a single movie
movies_and_ratings['genre_count_per_movie'].max()

In [None]:
# Group the tag rows by (userId, movieId) pair.
# NOTE(review): `grouped_tags` is not referenced in the visible part of the notebook.
grouped_tags = tags_df.groupby(['userId', 'movieId'])

In [None]:
# Number of tag rows for every (userId, movieId) pair, as a plain list in
# descending order. Despite the variable name, the counts are per user-movie
# pair, not per user.
tag_rows_per_pair = tags_df.value_counts(subset=['userId', 'movieId'])
tag_counts_per_user = tag_rows_per_pair.tolist()

In [None]:
# Average number of tags applied per (userId, movieId) pair
np.mean(tag_counts_per_user)

In [None]:
# How often each tag string is used (case-sensitive)
tags_df['tag'].value_counts()

In [None]:
# Number of distinct tags when case is ignored.
# The vectorised .str accessor propagates missing tags as NaN instead of raising
# AttributeError like a Python-level x.lower() would; nunique() then skips them.
tags_df['tag'].str.lower().nunique()

In [None]:
# Outlier analysis: quick log-scaled boxplot of the number of genres per movie.
# The plotted quantity is genre_count_per_movie, so the y-axis is labelled with
# that; the single box needs no x-axis label.

sns.boxplot(data = movies_and_ratings['genre_count_per_movie'])
plt.ylabel("Genre count per movie (log scaled)", fontsize=14)
plt.yscale("log")
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)

In [None]:
# Boxplot (linear scale) of the genre count column, saved as a PDF figure.
# NOTE(review): movies_and_ratings holds one row per rating, so a movie with many
# ratings contributes many points — confirm this weighting is intended for a
# "per movie" plot.
sns.set_palette("rocket")
genre_box_fig, genre_box_ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.boxplot(data=movies_and_ratings['genre_count_per_movie'], width=0.1, ax=genre_box_ax)
genre_box_ax.set_ylabel("Genre count per movie", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
genre_box_fig.savefig("../Figures/Movielens/boxplot_genre_count_per_movie.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Log-scaled boxplot of how many ratings each user has given, saved as a PDF figure
ratings_per_user = ratings_df['userId'].value_counts().tolist()
user_box_fig, user_box_ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.boxplot(data=ratings_per_user, width=0.1, color="teal", ax=user_box_ax)
user_box_ax.set_ylabel("Rating count per user (log scaled)", fontsize=14)
user_box_ax.set_yscale("log")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
user_box_fig.savefig("../Figures/Movielens/boxplot_per_user_rating_count.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Log-scaled boxplot of how often each distinct tag string is used, saved as a PDF figure
plt.figure(figsize=(10,6), dpi=300)
sns.boxplot(data = tags_df['tag'].value_counts(), width=0.1, color='maroon')
# Axis label spelling fixed ("sclaed" -> "scaled"); this text is rendered into the saved figure.
plt.ylabel("Tag usage count (log scaled)", fontsize=14)
plt.yscale("log")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig("../Figures/Movielens/boxplot_tag_usage_count.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Log-scaled boxplot of the tag counts held in tag_counts_per_user, saved as a PDF figure
tag_box_fig, tag_box_ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.boxplot(data=tag_counts_per_user, width=0.1, color="blue", ax=tag_box_ax)
tag_box_ax.set_ylabel("Tag count (log scaled)", fontsize=14)
tag_box_ax.set_yscale("log")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
tag_box_fig.savefig("../Figures/Movielens/boxplot_tag_counts_per_user.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Number of distinct movies that received at least one tag
tags_df['movieId'].nunique()

In [None]:
# Difference between the number of distinct rated movies and distinct tagged movies.
# NOTE(review): this equals "rated movies without tags" only if every tagged movie
# was also rated — confirm.
movies_and_ratings['movieId'].nunique() - tags_df['movieId'].nunique()