In [1]:
import pandas as pd
import numpy as np

<h2>Load Cleaned Movies Dataset</h2>

In [2]:
# Read the cleaned movies table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
movies_pickle_path = "../datasets/clean/movies_df.pkl"
df = pd.read_pickle(movies_pickle_path)

In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(31516, 28)

In [4]:
# Bare expression: the notebook renders a truncated preview of the frame
# (first and last rows, with the middle columns elided).
df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,cast,director,producer,text_corpus
0,False,Toy Story Collection,30000000,"Animation, Comedy, Family",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",John Lasseter,"Bonnie Arnold, Ralph Guggenheim","Toy Story, Animation, Comedy, Family, Tom Hank..."
1,False,,65000000,"Adventure, Fantasy, Family",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston,"Scott Kroopf, William Teitler","Jumanji, Adventure, Fantasy, Family, Robin Wil..."
2,False,Grumpy Old Men Collection,0,"Romance, Comedy",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",Howard Deutch,,"Grumpier Old Men, Romance, Comedy, Walter Matt..."
3,False,,16000000,"Comedy, Drama, Romance",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker,"Ronald Bass, Ezra Swerdlow, Deborah Schindler,...","Waiting to Exhale, Comedy, Drama, Romance, Whi..."
4,False,Father of the Bride Collection,0,Comedy,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer,Nancy Meyers,"Father of the Bride Part II, Comedy, Steve Mar..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45531,False,,0,Science Fiction,,222848,tt0112613,en,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,...,Released,,Caged Heat 3000,False,3.5,1.0,"Lisa Boyle, Kena Land, Zaneta Polard, Don Yana...",Aaron Osborne,Mike Upton,"Caged Heat 3000, Science Fiction, Lisa Boyle, ..."
45532,False,,0,"Drama, Action, Romance",,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",...,Released,,Robin Hood,False,5.7,26.0,"Patrick Bergin, Uma Thurman, David Morrissey, ...",John Irvin,Sarah Radclyffe,"Robin Hood, Drama, Action, Romance, Patrick Be..."
45535,False,,0,"Action, Drama, Thriller",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,Released,A deadly game of wits.,Betrayal,False,3.8,6.0,"Erika Eleniak, Adam Baldwin, Julie du Page, Ja...",Mark L. Lester,,"Betrayal, Action, Drama, Thriller, Erika Eleni..."
45536,False,,0,,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,Released,,Satan Triumphant,False,0.0,0.0,"Iwan Mosschuchin, Nathalie Lissenko, Pavel Pav...",Yakov Protazanov,Joseph N. Ermolieff,"Satan Triumphant, , Iwan Mosschuchin, Nathalie..."


<h2>Top Movies Based on Popularity Score</h2>

In [5]:
# Cast popularity to float64 before ranking
# (presumably it is not stored as float64 in the pickle — TODO confirm).
df['popularity'] = df['popularity'].astype(np.float64)

# Rank every movie from most to least popular and preview the ten highest.
popular_movies = df.sort_values(by='popularity', ascending=False)
popular_movies.loc[:, ['title', 'popularity', 'vote_average', 'vote_count']].head(10)

Unnamed: 0,title,popularity,vote_average,vote_count
30764,Minions,547.488298,6.4,4729.0
33421,Wonder Woman,294.337037,7.2,5025.0
42292,Beauty and the Beast,287.253654,6.8,5530.0
43713,Baby Driver,228.032744,7.2,2083.0
24501,Big Hero 6,213.849907,7.8,6289.0
26625,Deadpool,187.860492,7.4,11444.0
26627,Guardians of the Galaxy Vol. 2,185.330992,7.6,4858.0
14580,Avatar,185.070892,7.2,12114.0
24397,John Wick,183.870374,7.0,5499.0
23720,Gone Girl,154.801009,7.9,6023.0


<h2>Vote-Weighted Movie Score</h2>

In [6]:
# Thresholds used by the weighted score:
#   m — 90th-percentile vote count, used as the minimum number of votes required
#   C — mean of vote_average over every movie in df
m = df.loc[:, 'vote_count'].quantile(q=0.9)
C = df.loc[:, 'vote_average'].mean()
(m, C)

(250.0, 5.5328119050640945)

In [7]:
# Defining the Weighted Average Function
def weight_average(x, min_votes=None, mean_vote=None):
    """Return the vote-weighted score for a movie (or a whole frame of movies).

    The score blends the movie's own rating with a reference mean rating,
    weighting each by how many votes the movie has relative to a threshold:

        score = v / (v + m) * R  +  m / (v + m) * C

    where ``v`` is ``x['vote_count']`` and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by ``'vote_count'`` and ``'vote_average'`` — a
        DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``), a dict,
        or a whole DataFrame, in which case the result is computed column-wise.
    min_votes : number, optional
        Vote threshold ``m``. Defaults to the notebook-level ``m``.
    mean_vote : number, optional
        Reference mean rating ``C``. Defaults to the notebook-level ``C``.

    Returns
    -------
    The weighted score, with the same shape as ``x['vote_count']``.
    """
    # Fall back to the notebook-level thresholds only when no override is given,
    # so existing calls such as q_movies.apply(weight_average, axis=1) are unchanged.
    votes_needed = m if min_votes is None else min_votes
    reference_mean = C if mean_vote is None else mean_vote

    v = x['vote_count']
    R = x['vote_average']
    total = v + votes_needed
    # Same operation order as the original expression, so results are bit-identical.
    return (v / total * R) + (votes_needed / total * reference_mean)

In [8]:
# Filtering Dataframe: keep only the movies with at least m votes.
# Filter first and copy afterwards, so only the qualifying rows are duplicated
# and q_movies is an independent frame that can be modified without touching df.
q_movies = df.loc[df['vote_count'] >= m].copy()
q_movies.shape

(3156, 28)

In [9]:
# Adding the 'score' Feature
# weight_average only does column lookups and arithmetic, so it can be called
# once on the whole frame instead of once per row through apply(axis=1).
q_movies['score'] = weight_average(q_movies)
q_movies.shape


(3156, 29)

In [10]:
# Order the qualifying movies from highest to lowest weighted score,
# then show the ten best together with the inputs to that score.
q_movies = q_movies.sort_values(by='score', ascending=False)
q_movies.loc[:, ['title', 'score', 'vote_average', 'vote_count']].head(10)

Unnamed: 0,title,score,vote_average,vote_count
314,The Shawshank Redemption,8.413825,8.5,8358.0
835,The Godfather,8.381766,8.5,6024.0
12501,The Dark Knight,8.24474,8.3,12269.0
2848,Fight Club,8.230319,8.3,9678.0
292,Pulp Fiction,8.222444,8.3,8670.0
522,Schindler's List,8.152369,8.3,4436.0
23718,Whiplash,8.150455,8.3,4376.0
351,Forrest Gump,8.120591,8.2,8147.0
1181,The Godfather: Part II,8.111397,8.3,3418.0
1157,The Empire Strikes Back,8.093278,8.2,5998.0
