In [1]:
import pandas as pd

# Dataset directory. File names are appended with plain string
# concatenation, so the trailing slash is required.
path = 'data/ml-latest-small/'
movies_path = path + "movies.csv"
ratings_path = path + "ratings.csv"

# Read each CSV file into its own DataFrame.
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

# Attach the movie columns to every rating row, matching on movieId.
# With pandas' default inner join, only ratings whose movieId is also
# present in the movies table are kept.
movie_ratings = pd.merge(ratings, movies, on="movieId")

# Metric 1: Most Rated Movies
# Count the rating rows for each title, then keep the ten largest counts.
print("\nCalculating Most Rated Movies...")
ratings_per_title = movie_ratings.groupby("title").size()
most_rated = ratings_per_title.sort_values(ascending=False).head(10)
print("Most Rated Movies:")
print(most_rated)

# Metric 2: Top-Rated Movies (Simple Average)
# A title with only a few ratings can have an extreme average, so only
# titles with at least `min_ratings` ratings are ranked. The threshold is
# defined once so the banner text and the filter cannot drift apart.
min_ratings = 50  # Minimum number of ratings a title needs to be ranked
print(f"\nCalculating Top-Rated Movies (Minimum {min_ratings} Ratings)...")

# One row per title: mean rating and number of ratings.
average_ratings = (
    movie_ratings.groupby("title")
    .agg(average_rating=("rating", "mean"), num_ratings=("rating", "size"))
)

# Drop titles below the threshold, then rank the rest by mean rating.
top_rated_simple = (
    average_ratings[average_ratings["num_ratings"] >= min_ratings]
    .sort_values(by="average_rating", ascending=False)
    .head(10)
)
print("Top-Rated Movies (Simple Average):")
print(top_rated_simple)

# Metric 3: Top-Rated Movies (Weighted Score)
# weighted_score = average_rating * n / (n + lambda_value), where n is the
# number of ratings. The extra lambda_value in the denominator pulls the
# score of rarely rated titles toward zero, while heavily rated titles keep
# a score close to their plain average.
print("\nCalculating Top-Rated Movies (Weighted by Popularity)...")
lambda_value = 50  # Damping factor
rating_counts = average_ratings["num_ratings"]
rating_totals = average_ratings["average_rating"] * rating_counts
# Stored as a new column on average_ratings itself (modifies it in place).
average_ratings["weighted_score"] = rating_totals / (
    rating_counts + lambda_value
)

# Rank every title by the damped score and keep the ten best.
ranked_by_score = average_ratings.sort_values(
    by="weighted_score", ascending=False
)
top_rated_weighted = ranked_by_score.head(10)
print("Top-Rated Movies (Weighted):")
print(top_rated_weighted[["average_rating", "num_ratings", "weighted_score"]])



Calculating Most Rated Movies...
Most Rated Movies:
title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
Star Wars: Episode IV - A New Hope (1977)    251
Jurassic Park (1993)                         238
Braveheart (1995)                            237
Terminator 2: Judgment Day (1991)            224
Schindler's List (1993)                      220
dtype: int64

Calculating Top-Rated Movies (Minimum 50 Ratings)...
Top-Rated Movies (Simple Average):
                                                    average_rating  \
title                                                                
Shawshank Redemption, The (1994)                          4.429022   
Godfather, The (1972)                                     4.289062   
Fight Club (1999)                                         4.272936   