# Actor and Director Classification
### Ranking each actor and director based on two metrics: number of films and budget of those films

In [1]:
import pandas as pd
import ast

In [2]:
# import dataset 
df = pd.read_csv('tmdb_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,year,release_date,cast,crew,rating,genres,overview,budget,...,studio,total_gross,total_theaters,openining_gross,opening_gross,opening_date,production_companies,runtime,in_collection,imdb_id
0,0,Harry Potter and the Half-Blood Prince,2009,"July 8, 2009","['Daniel Radcliffe', 'Rupert Grint', 'Emma Wat...","{'Director': ['David Yates'], 'Novel': ['J.K. ...",PG,"['Adventure', 'Fantasy', 'Family']","As Harry begins his sixth year at Hogwarts, he...",250000000,...,WB,"$301,959,197",4455,"$77,835,727",4325,7/15,"['Warner Bros. Pictures', 'Heyday Films']",153.0,1.0,tt0417741
1,1,Terminator Salvation,2009,"May 20, 2009","['Christian Bale', 'Sam Worthington', 'Anton Y...","{'Director': ['McG'], 'Writer': ['Michael Ferr...",PG-13,"['Action', 'Science Fiction', 'Thriller']","All grown up in post-apocalyptic 2018, John Co...",200000000,...,WB,"$125,322,469",3602,"$42,558,390",3530,5/21,"['Columbia Pictures', 'The Halcyon Company', '...",115.0,1.0,tt0438488
2,2,Indiana Jones and the Kingdom of the Crystal S...,2008,"May 22, 2008","['Harrison Ford', 'Cate Blanchett', 'Shia LaBe...","{'Characters': ['George Lucas', 'Philip Kaufma...",PG-13,"['Adventure', 'Action']","Set during the Cold War, the Soviets – led by ...",185000000,...,Par.,"$317,101,119",4264,"$100,137,835",4260,5/22,"['Paramount', 'Lucasfilm']",122.0,1.0,tt0367882
3,3,The Incredible Hulk,2008,"June 12, 2008","['Edward Norton', 'Liv Tyler', 'Tim Roth', 'Wi...","{'Characters': ['Stan Lee', 'Jack Kirby'], 'Di...",PG-13,"['Science Fiction', 'Action', 'Adventure']",Scientist Bruce Banner scours the planet for a...,150000000,...,Uni.,"$134,806,913",3508,"$55,414,050",3505,6/13,['Marvel Studios'],114.0,0.0,tt0800080
4,6,The Dark Knight,2008,"July 14, 2008","['Christian Bale', 'Michael Caine', 'Heath Led...","{'Director': ['Christopher Nolan'], 'Screenpla...",PG-13,"['Drama', 'Action', 'Crime', 'Thriller']",Batman raises the stakes in his war on crime. ...,185000000,...,WB,"$533,345,358",4366,"$158,411,483",4366,7/18,"['DC Comics', 'Legendary Entertainment', 'Sync...",152.0,1.0,tt0468569


#### First, create a dictionary for just the budget

In [4]:
actor_budget = {}
for i in range(len(df)):
    cast = df.iloc[i,4]
    # cast string into list
    cast_list = ast.literal_eval(cast)
    # adding to new dictionary
    for actor in cast_list:
        if actor_budget.get(actor) == None:
            actor_budget[actor] = float(df.iloc[i, 10])
        else:
            actor_budget[actor] += float(df.iloc[i, 10])

#### Next, create a dictionary for the number of films each actor is in

In [5]:
actor_freq = {}
for i in range(len(df)):
    cast = df.iloc[i,4]
    # cast string into list
    cast_list = ast.literal_eval(cast)
    for actor in cast_list:
        if actor_freq.get(actor) == None:
            actor_freq[actor] = 1
        else:
            actor_freq[actor] += 1

#### Lastly, compile both results into one dictionary and sort

In [6]:
actor_score = {}
actor_list = actor_budget.keys()
for actor in actor_list:
    if actor_score.get(actor) == None:
            actor_score[actor] = actor_freq.get(actor) * actor_budget.get(actor)
    else:
        actor_score[actor] += 1
#sort dictionary in DESCENDING order
sorted_actors = sorted(actor_score.items(), key=lambda x: x[1], reverse=True)

In [7]:
sorted_actors

[('Robert Downey Jr.', 169139593826.0),
 ('Dwayne Johnson', 161013873182.0),
 ('Scarlett Johansson', 117302316480.0),
 ('Chris Hemsworth', 109135204368.0),
 ('Mark Wahlberg', 85456451020.0),
 ('Bradley Cooper', 84138040058.0),
 ('Chris Evans', 68591147848.0),
 ('Jennifer Lawrence', 66855489972.0),
 ('Woody Harrelson', 64581646454.0),
 ('Mark Ruffalo', 63513375360.0),
 ('Samuel L. Jackson', 62057192223.0),
 ('Vin Diesel', 48337524280.0),
 ('Simon Pegg', 47968524750.0),
 ('Emma Stone', 47656149646.0),
 ('Ryan Reynolds', 46632812472.0),
 ('Chris Pratt', 45675195507.0),
 ('Steve Carell', 44954567568.0),
 ('Johnny Depp', 44578158944.0),
 ('Tom Cruise', 41651902457.0),
 ('Kevin Hart', 40057676282.0),
 ('Gary Oldman', 37270846647.0),
 ('Christian Bale', 35172078290.0),
 ('Jason Bateman', 35152790148.0),
 ('John Goodman', 34953003862.0),
 ('Harrison Ford', 34560900396.0),
 ('Owen Wilson', 34424744705.0),
 ('Chris Pine', 33970068011.0),
 ('Cate Blanchett', 33568533297.0),
 ('Tom Hanks', 3344893

In [8]:
actor_ranking = {}
i = 1
for info in sorted_actors:
    if actor_ranking.get(info[0]) == None:
        actor_ranking[info[0]] = i
    else:
        actor_ranking[info[0]] += 1
    i = i+1

In [9]:
actor_ranking

{'Robert Downey Jr.': 1,
 'Dwayne Johnson': 2,
 'Scarlett Johansson': 3,
 'Chris Hemsworth': 4,
 'Mark Wahlberg': 5,
 'Bradley Cooper': 6,
 'Chris Evans': 7,
 'Jennifer Lawrence': 8,
 'Woody Harrelson': 9,
 'Mark Ruffalo': 10,
 'Samuel L. Jackson': 11,
 'Vin Diesel': 12,
 'Simon Pegg': 13,
 'Emma Stone': 14,
 'Ryan Reynolds': 15,
 'Chris Pratt': 16,
 'Steve Carell': 17,
 'Johnny Depp': 18,
 'Tom Cruise': 19,
 'Kevin Hart': 20,
 'Gary Oldman': 21,
 'Christian Bale': 22,
 'Jason Bateman': 23,
 'John Goodman': 24,
 'Harrison Ford': 25,
 'Owen Wilson': 26,
 'Chris Pine': 27,
 'Cate Blanchett': 28,
 'Tom Hanks': 29,
 'Matt Damon': 30,
 'Zoe Saldana': 31,
 'Kristen Wiig': 32,
 'Ben Stiller': 33,
 'Elizabeth Banks': 34,
 'Jonah Hill': 35,
 'Adam Sandler': 36,
 'Michael Fassbender': 37,
 'Amy Adams': 38,
 'Anne Hathaway': 39,
 'Angelina Jolie': 40,
 'Jason Statham': 41,
 'Christoph Waltz': 42,
 'Jack Black': 43,
 'Gerard Butler': 44,
 'Liam Neeson': 45,
 'Hugh Jackman': 46,
 'Jessica Chastain'

#### sorted_actors holds the actual scores while actor_ranking holds the ranking for each actor

### Repeat the same steps for directors

In [21]:
director_budget = {}
for i in range(len(df)):
    crew = df.iloc[i,5]
    # cast string into list
    crew_list = ast.literal_eval(crew)
    director = crew_list.get('Director')[0]
    # adding to new dictionary
    if director_budget.get(director) == None:
        director_budget[director] = float(df.iloc[i, 10])
    else:
        director_budget[director] += float(df.iloc[i, 10])

In [22]:
director_freq = {}
for i in range(len(df)):
    crew = df.iloc[i,5]
    # cast string into list
    crew_list = ast.literal_eval(crew)
    director = crew_list.get('Director')[0]
    # adding to new dictionary
    if director_freq.get(director) == None:
        director_freq[director] = 1
    else:
        director_freq[director] += 1

In [24]:
director_score = {}
director_list = director_budget.keys()
for director in director_list:
    if director_score.get(director) == None:
            director_score[director] = director_budget.get(director) * director_freq.get(director)
    else:
        director_score[director] += 1
#sort dictionary in DESCENDING order
sorted_directors = sorted(director_score.items(), key=lambda x: x[1], reverse=True)

In [25]:
sorted_directors

[('Steven Spielberg', 21741376000.0),
 ('Christopher Nolan', 20578617425.0),
 ('Michael Bay', 18631429240.0),
 ('Clint Eastwood', 17705959520.0),
 ('Zack Snyder', 15686129292.0),
 ('Ridley Scott', 14102830315.0),
 ('James Wan', 13158412796.0),
 ('Bryan Singer', 12918943540.0),
 ('J.J. Abrams', 12725461212.0),
 ('Francis Lawrence', 12575481820.0),
 ('Peter Jackson', 12116195768.0),
 ('Joe Russo', 9688080402.0),
 ('Carlos Saldanha', 8666198080.0),
 ('Todd Phillips', 6868710120.0),
 ('Brad Bird', 6437277474.0),
 ('Shawn Levy', 6290314200.0),
 ('Peter Berg', 6243444910.0),
 ('Marc Webb', 6235997876.0),
 ('David Yates', 5832174477.0),
 ('Jon Favreau', 5719591252.0),
 ('Ron Howard', 5652602465.0),
 ('M. Night Shyamalan', 5355709380.0),
 ('Justin Lin', 5274868023.0),
 ('Christopher McQuarrie', 5075064558.0),
 ('Dean DeBlois', 4864586046.0),
 ('Nicholas Stoller', 4850446326.0),
 ('Rob Marshall', 4822269792.0),
 ('Chris Renaud', 4722373416.0),
 ('F. Gary Gray', 4701271446.0),
 ('David Leitch', 

In [27]:
director_ranking = {}
i = 1
for info in sorted_directors:
    if director_ranking.get(info[0]) == None:
        director_ranking[info[0]] = i
    else:
        director_ranking[info[0]] += 1
    i = i+1

In [28]:
director_ranking

{'Steven Spielberg': 1,
 'Christopher Nolan': 2,
 'Michael Bay': 3,
 'Clint Eastwood': 4,
 'Zack Snyder': 5,
 'Ridley Scott': 6,
 'James Wan': 7,
 'Bryan Singer': 8,
 'J.J. Abrams': 9,
 'Francis Lawrence': 10,
 'Peter Jackson': 11,
 'Joe Russo': 12,
 'Carlos Saldanha': 13,
 'Todd Phillips': 14,
 'Brad Bird': 15,
 'Shawn Levy': 16,
 'Peter Berg': 17,
 'Marc Webb': 18,
 'David Yates': 19,
 'Jon Favreau': 20,
 'Ron Howard': 21,
 'M. Night Shyamalan': 22,
 'Justin Lin': 23,
 'Christopher McQuarrie': 24,
 'Dean DeBlois': 25,
 'Nicholas Stoller': 26,
 'Rob Marshall': 27,
 'Chris Renaud': 28,
 'F. Gary Gray': 29,
 'David Leitch': 30,
 'Tom McGrath': 31,
 'Jaume Collet-Serra': 32,
 'Jon M. Chu': 33,
 'Matt Reeves': 34,
 'Peyton Reed': 35,
 'Roland Emmerich': 36,
 'Jon Watts': 37,
 'Pierre Coffin': 38,
 'Sam Mendes': 39,
 'Jake Kasdan': 40,
 'Anthony Russo': 41,
 'Lee Unkrich': 42,
 'Dennis Dugan': 43,
 'Adam McKay': 44,
 'Guillermo del Toro': 45,
 'James Gunn': 46,
 'Pete Docter': 47,
 'Raja G