In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [3]:
links = pd.read_csv('../data/ml-latest-small/links.csv')
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
tags = pd.read_csv('../data/ml-latest-small/tags.csv')

### Genres

In [4]:
# 19 Genres as listed in readme.txt
unique_genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)",
]

In [4]:
genre_counts = {}

for genre in unique_genres:
    genre_counts[genre] = 0
    for movie in movies.values:
        if genre in movie[2].split():
            genre_counts[genre] += 1

In [10]:
df_gc = pd.Series(genre_counts)
df_gc.sort_values(ascending=False, inplace=True)

In [11]:
px.bar(df_gc, x=df_gc.index, y=df_gc.values)

In [14]:
movies_ratings = pd.merge(movies, ratings, 'outer', on='movieId')

In [32]:
avg_rating = movies_ratings.groupby(['movieId', 'title', 'genres'])[['rating']].mean().sort_values(by='rating', ascending=False)
avg_rating.rename(columns={'rating': 'avg_rating'}, inplace=True)
avg_rating

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avg_rating
movieId,title,genres,Unnamed: 3_level_1
88448,Paper Birds (Pájaros de papel) (2010),Comedy|Drama,5.0
100556,"Act of Killing, The (2012)",Documentary,5.0
143031,Jump In! (2007),Comedy|Drama|Romance,5.0
143511,Human (2015),Documentary,5.0
143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,5.0
...,...,...,...
30892,In the Realms of the Unreal (2004),Animation|Documentary,
32160,Twentieth Century (1934),Comedy,
32371,Call Northside 777 (1948),Crime|Drama|Film-Noir,
34482,"Browning Version, The (1951)",Drama,


In [33]:
total_ratings = movies_ratings.groupby(['movieId', 'title', 'genres'])[['rating']].count().sort_values(by='rating', ascending=False)
total_ratings.rename(columns={'rating': 'total_ratings'}, inplace=True)
total_ratings

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_ratings
movieId,title,genres,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329
318,"Shawshank Redemption, The (1994)",Crime|Drama,317
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278
...,...,...,...
1076,"Innocents, The (1961)",Drama|Horror|Thriller,0
34482,"Browning Version, The (1951)",Drama,0
25855,"Roaring Twenties, The (1939)",Crime|Drama|Thriller,0
85565,Chalet Girl (2011),Comedy|Romance,0


In [34]:
concat = pd.concat([avg_rating, total_ratings], axis=1)

In [42]:
top_rated = concat[concat.total_ratings >= 10].sort_values(by=['avg_rating'], ascending=False)
top_rated.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avg_rating,total_ratings
movieId,title,genres,Unnamed: 3_level_1,Unnamed: 4_level_1
1041,Secrets & Lies (1996),Drama,4.590909,11
3451,Guess Who's Coming to Dinner (1967),Drama,4.545455,11
1178,Paths of Glory (1957),Drama|War,4.541667,12
1104,"Streetcar Named Desire, A (1951)",Drama,4.475,20
2360,"Celebration, The (Festen) (1998)",Drama,4.458333,12
1217,Ran (1985),Drama|War,4.433333,15
318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
951,His Girl Friday (1940),Comedy|Romance,4.392857,14
1927,All Quiet on the Western Front (1930),Action|Drama|War,4.35,10
3468,"Hustler, The (1961)",Drama,4.333333,18


In [70]:
genre_top_rated = {}
for genre in unique_genres:
    for movie in top_rated.index:
        if genre in movie[2].split():
            genre_top_rated[genre] = (movie[0], movie[1])
            break
df_tp = pd.Series(genre_top_rated)
df_tp

Action                                  (9, Sudden Death (1995))
Comedy                          (176, Living in Oblivion (1995))
Documentary                            (246, Hoop Dreams (1994))
Drama                              (1041, Secrets & Lies (1996))
Fantasy        (135143, Fantastic Beasts and Where to Find Th...
Horror                               (1258, Shining, The (1980))
Musical                       (918, Meet Me in St. Louis (1944))
Mystery        (8228, Maltese Falcon, The (a.k.a. Dangerous F...
Romance                            (2340, Meet Joe Black (1998))
Sci-Fi                                  (164179, Arrival (2016))
Thriller                              (142488, Spotlight (2015))
Western                       (2921, High Plains Drifter (1973))
dtype: object

To get down to 10 movies, we could drop 'Western' and 'Romance' since they are the least represented genres in this group