## Import Libraries

In [21]:
import numpy as np
import pandas as pd
import ast

In [22]:
# Read the movie metadata CSV (the path is relative to the notebook's working
# directory) and preview the first five rows.
movies = pd.read_csv(filepath_or_buffer='../data/NewMoviesMetadata.csv')
movies.head(n=5)

Unnamed: 0,adult,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,status,tagline,title,video,vote_average,vote_count,name_belongs_to_collection,id_belongs_to_collection,poster_path_belongs_to_collection,backdrop_path_belongs_to_collection
0,False,30000000,"['animation', 'comedy', 'family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,Released,,Toy Story,False,7.7,5415.0,Toy Story Collection,10194.0,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
1,False,65000000,"['adventure', 'fantasy', 'family']",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,,,,
2,False,0,"['romance', 'comedy']",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Grumpy Old Men Collection,119050.0,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg
3,False,16000000,"['comedy', 'drama', 'romance']",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,,,,
4,False,0,[],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Father of the Bride Collection,96871.0,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg


In [39]:
# Rebind `movies` to a fresh frame that carries a numeric `year` column.
# Release dates that cannot be parsed are coerced to NaT, so their year is missing.
movies = movies.copy().assign(
    year=pd.to_datetime(movies['release_date'], errors='coerce').dt.year
)

def parse_genres(x):
    """Normalise a raw ``genres`` cell into a list of lower-cased genre names.

    Parameters
    ----------
    x : object
        Either an actual list of genres, a string holding the Python literal
        of such a list (e.g. ``"['drama', 'crime']"``), a plain genre name,
        or anything else (NaN, None, ...).

    Returns
    -------
    list of str
        Stripped, lower-cased genre names. Blank entries are dropped, so an
        empty or whitespace-only string yields ``[]`` instead of ``['']``.
        Values that are neither a list nor a string also yield ``[]``.
    """
    def _clean(items):
        # Strip and lower-case every entry, discarding those that end up empty
        # so they cannot turn into a phantom '' genre downstream.
        cleaned = (str(item).strip().lower() for item in items)
        return [name for name in cleaned if name]

    if isinstance(x, list):
        return _clean(x)

    if isinstance(x, str):
        text = x.strip()
        if not text:
            return []
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal: fall back to treating the text as one genre name.
            parsed = None
        if isinstance(parsed, list):
            return _clean(parsed)
        return _clean([text])

    return []

# Turn every genres cell into a list, then expand the frame to one row per
# (movie, genre) pair and expose the value under a singular `genre` column.
movies['genres'] = movies['genres'].map(parse_genres)

gen_md = (
    movies
    .explode(column='genres')
    .rename({'genres': 'genre'}, axis='columns')
)


In [25]:
# Non-null vote counts and vote averages, both cast to integers.
# NOTE(review): the integer cast drops the fractional part of every vote_average
# before the mean is taken, whereas build_chart keeps floats — confirm this is intended.
vote_counts = movies['vote_count'].dropna().astype('int')
vote_averages = movies['vote_average'].dropna().astype('int')

# Mean vote over the whole catalogue; weighted_rating pulls low-count movies towards it.
C = vote_averages.mean()
C

5.244896612406511

In [26]:
# Vote-count cut-off for the chart: the 95th percentile of all vote counts.
treshold = vote_counts.quantile(q=0.95)
treshold

434.0

In [27]:
# Release year as a four-digit string; rows whose release_date is missing or
# cannot be parsed get NaN. A per-element `x != np.nan` test is always True,
# so it would stringify missing dates as 'NaT' instead of leaving them missing.
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').dt.strftime('%Y')

In [28]:
# Movies with at least `treshold` votes and a known vote average, reduced to the
# columns the chart needs. A single .loc selection followed by .copy() gives an
# independent frame, so the dtype casts below do not write into a slice of `movies`.
qualified = movies.loc[
    (movies['vote_count'] >= treshold)
    & movies['vote_count'].notnull()
    & movies['vote_average'].notnull(),
    ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
# NOTE(review): this cast truncates vote_average to a whole number (build_chart
# keeps floats) — confirm the loss of precision is intended.
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [29]:
def weighted_rating(x):
    """Blend a movie's own average with the catalogue mean, weighted by votes.

    ``x`` is a row (or any mapping) with ``'vote_count'`` and ``'vote_average'``.
    The notebook-level ``treshold`` and ``C`` are read at call time: the more
    votes a movie has relative to ``treshold``, the closer the result is to its
    own ``vote_average``; with few votes it stays close to ``C``.
    """
    votes, rating = x['vote_count'], x['vote_average']
    total = votes + treshold
    own_part = votes / total * rating
    prior_part = treshold / total * C
    return own_part + prior_part

In [30]:
# Score every qualified movie row by row with weighted_rating.
qualified['wr'] = qualified.apply(weighted_rating, axis='columns')

In [31]:
# Keep the 250 best-scored movies, highest weighted rating first.
qualified = qualified.sort_values(by='wr', ascending=False).iloc[:250]

## Top Movies

In [32]:
# Show the first 15 rows of the overall chart.
qualified.head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[action, thriller, sciencefiction, mystery, ad...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[drama, action, crime, thriller]",7.905871
22877,Interstellar,2014,11187,8,32.213481,"[adventure, drama, sciencefiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[adventure, fantasy, action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[thriller, crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[drama, crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[adventure, fantasy, action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[comedy, drama, romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[adventure, fantasy, action]",7.851924


## Top Movies by Genre

In [None]:
def build_chart(genre, percentile=0.85, data=None):
    """Build the top-250 chart for a single genre.

    Parameters
    ----------
    genre : str
        Genre name; it is stripped and lower-cased before matching.
    percentile : float, default 0.85
        Quantile of the genre's vote counts used as the minimum number of
        votes ``m`` a movie needs to qualify.
    data : pandas.DataFrame, optional
        One-row-per-(movie, genre) frame with the columns ``genre``, ``title``,
        ``year``, ``vote_count``, ``vote_average`` and ``popularity``.
        Defaults to the notebook-level ``gen_md``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, year, vote_count, vote_average, popularity, wr``,
        sorted by ``wr`` descending and limited to 250 rows. The frame is
        empty when the genre is unknown, has no usable votes, or no movie
        qualifies.
    """
    columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
    source = gen_md if data is None else data

    g = str(genre).strip().lower()
    df = source[source['genre'] == g]

    vote_counts = df['vote_count'].dropna().astype(int)
    vote_averages = df['vote_average'].dropna().astype(float)

    # Also covers an unknown genre: an empty selection has no votes at all.
    if vote_counts.empty or vote_averages.empty:
        return pd.DataFrame(columns=columns + ['wr'])

    C = vote_averages.mean()              # mean vote within the genre
    m = vote_counts.quantile(percentile)  # minimum votes required to qualify

    qualifies = (
        df['vote_count'].notna()
        & df['vote_average'].notna()
        & (df['vote_count'] >= m)
    )
    qualified = df.loc[qualifies, columns].copy()
    qualified['vote_count'] = qualified['vote_count'].astype(int)
    qualified['vote_average'] = qualified['vote_average'].astype(float)

    # Vectorised weighted rating: same formula as the row-wise version, but it
    # also works when no row qualified (a row-wise apply on an empty frame does
    # not return a single column to assign).
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(250)


#### Top Drama Movies

In [38]:
# Show the first 15 rows of the drama chart.
build_chart(genre='drama').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9.1,34.457024,8.671923
314,The Shawshank Redemption,1994,8358,8.5,51.645403,8.46865
834,The Godfather,1972,6024,8.5,41.109264,8.456708
12481,The Dark Knight,2008,12269,8.3,123.167259,8.280225
40245,Your Name.,2016,1030,8.5,34.461252,8.265886
522,Schindler's List,1993,4436,8.3,41.725123,8.2461
2211,Life Is Beautiful,1997,3643,8.3,39.39497,8.23469
1178,The Godfather: Part II,1974,3418,8.3,36.629307,8.230517
1176,Psycho,1960,2405,8.3,36.826309,8.202452
351,Forrest Gump,1994,8147,8.2,48.307194,8.171594
