# Actor and Director Classification
### Ranking each actor and director based on two metrics: number of films and budget of those films

In [1]:
import pandas as pd
import ast

In [2]:
# Load the movie dataset from a CSV file in the current working directory
df = pd.read_csv('tmdb_data.csv')

In [3]:
# Preview the first five rows to check the layout of the loaded data
df.head()

Unnamed: 0,title,year,release_date,cast,crew,rating,overview,budget,total_revenue,tmdb_id,...,History,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western,keywords
0,Harry Potter and the Half-Blood Prince,2009,"July 8, 2009","['Daniel Radcliffe', 'Rupert Grint', 'Emma Wat...","{'Director': ['David Yates'], 'Novel': ['J.K. ...",PG,"As Harry begins his sixth year at Hogwarts, he...",250000000,933959197,767,...,0,0,0,0,0,0,0,0,0,"['saving the world', 'riddle', 'whip', 'treasu..."
1,Terminator Salvation,2009,"May 20, 2009","['Christian Bale', 'Sam Worthington', 'Anton Y...","{'Director': ['McG'], 'Writer': ['Michael Ferr...",PG-13,"All grown up in post-apocalyptic 2018, John Co...",200000000,371353001,534,...,0,0,0,0,0,1,1,0,0,"['new york', 'rio de janeiro', 'superhero', 'b..."
2,Indiana Jones and the Kingdom of the Crystal S...,2008,"May 22, 2008","['Harrison Ford', 'Cate Blanchett', 'Shia LaBe...","{'Characters': ['George Lucas', 'Philip Kaufma...",PG-13,"Set during the Cold War, the Soviets – led by ...",185000000,786636033,217,...,0,0,0,0,0,0,0,0,0,"['tempel', 'shaolin', 'teenager', 'urination',..."
3,The Incredible Hulk,2008,"June 12, 2008","['Edward Norton', 'Liv Tyler', 'Tim Roth', 'Wi...","{'Characters': ['Stan Lee', 'Jack Kirby'], 'Di...",PG-13,Scientist Bruce Banner scours the planet for a...,150000000,163712074,1724,...,0,0,0,0,0,1,0,0,0,"['dc comics', 'crime fighter', 'secret identit..."
4,The Dark Knight,2008,"July 14, 2008","['Christian Bale', 'Michael Caine', 'Heath Led...","{'Director': ['Christopher Nolan'], 'Screenpla...",PG-13,Batman raises the stakes in his war on crime. ...,185000000,1004558444,155,...,0,0,0,0,0,0,1,0,0,"['saving the world', 'artificial intelligence'..."


In [4]:
# List every column name; the cells below rely on 'cast', 'crew' and 'budget'
df.columns

Index(['title', 'year', 'release_date', 'cast', 'crew', 'rating', 'overview',
       'budget', 'total_revenue', 'tmdb_id', 'studio', 'total_gross',
       'total_theaters', 'opening_gross', 'opening_theaters', 'opening_date',
       'production_companies', 'runtime', 'in_collection', 'imdb_id', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
       'Science Fiction', 'Thriller', 'War', 'Western', 'keywords'],
      dtype='object')

#### First, create a dictionary for just the budget

In [5]:
# Map each actor name to the summed budget of every film they are cast in.
# Columns are addressed by name (not by position) so the loop keeps working
# if the column order of the CSV ever changes.
actor_budget = {}
for cast, budget in zip(df['cast'], df['budget']):
    # The 'cast' column stores a stringified Python list; parse it back into a list
    cast_list = ast.literal_eval(cast)
    film_budget = float(budget)
    for actor in cast_list:
        # Start from 0.0 the first time an actor is seen, then accumulate
        actor_budget[actor] = actor_budget.get(actor, 0.0) + film_budget

#### Next, create a dictionary for the number of films each actor is in

In [6]:
# Map each actor name to the number of films in the dataset they are cast in.
# The 'cast' column is addressed by name rather than by position.
actor_freq = {}
for cast in df['cast']:
    # The 'cast' column stores a stringified Python list; parse it back into a list
    cast_list = ast.literal_eval(cast)
    for actor in cast_list:
        # Start from 0 the first time an actor is seen, then count up
        actor_freq[actor] = actor_freq.get(actor, 0) + 1

#### Lastly, compile the results into one score dictionary and sort (the score is currently the total budget only; the film count is not factored in)

In [7]:
# Score every actor. The score is the actor's total budget; the film count in
# actor_freq is not factored in (that multiplication was previously disabled).
# Keys of actor_budget are unique, so a plain copy is all that is needed.
actor_list = actor_budget.keys()
actor_score = dict(actor_budget)
# Sort (actor, score) pairs in DESCENDING order of score
sorted_actors = sorted(actor_score.items(), key=lambda x: x[1], reverse=True)

In [8]:
# Show the (actor, score) pairs, highest score first
sorted_actors

[('Robert Downey Jr.', 2373000000.0),
 ('Dwayne Johnson', 1997000000.0),
 ('Chris Hemsworth', 1917000000.0),
 ('Johnny Depp', 1812000000.0),
 ('Scarlett Johansson', 1764000000.0),
 ('Chris Evans', 1531000000.0),
 ('Jennifer Lawrence', 1468000000.0),
 ('Mark Wahlberg', 1408000000.0),
 ('Mark Ruffalo', 1381000000.0),
 ('Samuel L. Jackson', 1339000000.0),
 ('Simon Pegg', 1283000000.0),
 ('Woody Harrelson', 1238600000.0),
 ('Michael Fassbender', 1230000000.0),
 ('Henry Cavill', 1228000000.0),
 ('Vin Diesel', 1223000000.0),
 ('Chris Pine', 1186000000.0),
 ('Hugh Jackman', 1185000000.0),
 ('Tom Cruise', 1183000000.0),
 ('Bradley Cooper', 1168800000.0),
 ('Owen Wilson', 1156000000.0),
 ('Ryan Reynolds', 1136000000.0),
 ('Cate Blanchett', 1107000000.0),
 ('Christoph Waltz', 1091000000.0),
 ('Gal Gadot', 1074000000.0),
 ('Christian Bale', 1063000000.0),
 ('Harrison Ford', 1048000000.0),
 ('Ben Stiller', 1047000000.0),
 ('Ben Affleck', 1036500000.0),
 ('Zoe Saldana', 1022000000.0),
 ('Gary Oldma

In [9]:
# Map each actor name to its 1-based position in sorted_actors (1 = highest score).
# Names in sorted_actors are unique, so each actor is assigned exactly once.
# Actors with equal scores get consecutive ranks in the order they appear in the list.
actor_ranking = {
    name: rank
    for rank, (name, _score) in enumerate(sorted_actors, start=1)
}

In [10]:
# Show the actor -> rank mapping (rank 1 is the highest score)
actor_ranking

{'Robert Downey Jr.': 1,
 'Dwayne Johnson': 2,
 'Chris Hemsworth': 3,
 'Johnny Depp': 4,
 'Scarlett Johansson': 5,
 'Chris Evans': 6,
 'Jennifer Lawrence': 7,
 'Mark Wahlberg': 8,
 'Mark Ruffalo': 9,
 'Samuel L. Jackson': 10,
 'Simon Pegg': 11,
 'Woody Harrelson': 12,
 'Michael Fassbender': 13,
 'Henry Cavill': 14,
 'Vin Diesel': 15,
 'Chris Pine': 16,
 'Hugh Jackman': 17,
 'Tom Cruise': 18,
 'Bradley Cooper': 19,
 'Owen Wilson': 20,
 'Ryan Reynolds': 21,
 'Cate Blanchett': 22,
 'Christoph Waltz': 23,
 'Gal Gadot': 24,
 'Christian Bale': 25,
 'Harrison Ford': 26,
 'Ben Stiller': 27,
 'Ben Affleck': 28,
 'Zoe Saldana': 29,
 'Gary Oldman': 30,
 'Daniel Craig': 31,
 'James McAvoy': 32,
 'Martin Freeman': 33,
 'Angelina Jolie': 34,
 'Jason Statham': 35,
 'Adam Sandler': 36,
 'Anne Hathaway': 37,
 'Matt Damon': 38,
 'Tom Hanks': 39,
 'Amy Adams': 40,
 'Gerard Butler': 41,
 'Emma Stone': 42,
 'Jonah Hill': 43,
 'Charlize Theron': 44,
 'Evangeline Lilly': 45,
 'Michael Caine': 46,
 'John Good

#### `sorted_actors` holds the actual scores (as a list of `(actor, score)` pairs, highest first), while `actor_ranking` maps each actor to their rank (1 = highest score)

### Repeat the same steps for directors

In [11]:
# Map each director name to the summed budget of the films they directed.
# Columns are addressed by name (not by position) so the loop keeps working
# if the column order of the CSV ever changes.
director_budget = {}
for crew, budget in zip(df['crew'], df['budget']):
    # The 'crew' column stores a stringified dict of role -> list of names
    crew_dict = ast.literal_eval(crew)
    # Only the first listed director is credited with the film.
    # A row without a 'Director' entry raises TypeError here.
    director = crew_dict.get('Director')[0]
    # Start from 0.0 the first time a director is seen, then accumulate
    director_budget[director] = director_budget.get(director, 0.0) + float(budget)

In [12]:
# Map each director name to the number of films in the dataset they directed.
# The 'crew' column is addressed by name rather than by position.
director_freq = {}
for crew in df['crew']:
    # The 'crew' column stores a stringified dict of role -> list of names
    crew_dict = ast.literal_eval(crew)
    # Only the first listed director is credited with the film.
    # A row without a 'Director' entry raises TypeError here.
    director = crew_dict.get('Director')[0]
    # Start from 0 the first time a director is seen, then count up
    director_freq[director] = director_freq.get(director, 0) + 1

In [13]:
# Score every director. The score is the director's total budget; the film count
# in director_freq is not factored in.
# Keys of director_budget are unique, so a plain copy is all that is needed.
director_list = director_budget.keys()
director_score = dict(director_budget)
# Sort (director, score) pairs in DESCENDING order of score
sorted_directors = sorted(director_score.items(), key=lambda x: x[1], reverse=True)

In [14]:
# Show the (director, score) pairs, highest score first
sorted_directors

[('Zack Snyder', 1267000000.0),
 ('Christopher Nolan', 860000000.0),
 ('Steven Spielberg', 851000000.0),
 ('Michael Bay', 822000000.0),
 ('Peter Jackson', 815000000.0),
 ('Ridley Scott', 795000000.0),
 ('Bryan Singer', 750000000.0),
 ('Joe Russo', 656000000.0),
 ('Ron Howard', 645000000.0),
 ('J.J. Abrams', 635000000.0),
 ('David Yates', 630000000.0),
 ('Rob Marshall', 560000000.0),
 ('Peter Berg', 554000000.0),
 ('Brad Bird', 535000000.0),
 ('Francis Lawrence', 522000000.0),
 ('Roland Emmerich', 515000000.0),
 ('Jon Favreau', 514000000.0),
 ('Shawn Levy', 500000000.0),
 ('Clint Eastwood', 471800000.0),
 ('Justin Lin', 470000000.0),
 ('Andrew Stanton', 450000000.0),
 ('Sam Mendes', 445000000.0),
 ('Dean DeBlois', 439000000.0),
 ('Marc Webb', 429500000.0),
 ('Anthony Russo', 410000000.0),
 ('Tom McGrath', 405000000.0),
 ('James Wan', 403000000.0),
 ('Marc Forster', 400000000.0),
 ('Carlos Saldanha', 394000000.0),
 ('Guy Ritchie', 390000000.0),
 ('Gore Verbinski', 390000000.0),
 ('Christ

In [15]:
# Map each director name to its 1-based position in sorted_directors (1 = highest score).
# Names in sorted_directors are unique, so each director is assigned exactly once.
# Directors with equal scores get consecutive ranks in the order they appear in the list.
director_ranking = {
    name: rank
    for rank, (name, _score) in enumerate(sorted_directors, start=1)
}

In [16]:
# Show the director -> rank mapping (rank 1 is the highest score)
director_ranking

{'Zack Snyder': 1,
 'Christopher Nolan': 2,
 'Steven Spielberg': 3,
 'Michael Bay': 4,
 'Peter Jackson': 5,
 'Ridley Scott': 6,
 'Bryan Singer': 7,
 'Joe Russo': 8,
 'Ron Howard': 9,
 'J.J. Abrams': 10,
 'David Yates': 11,
 'Rob Marshall': 12,
 'Peter Berg': 13,
 'Brad Bird': 14,
 'Francis Lawrence': 15,
 'Roland Emmerich': 16,
 'Jon Favreau': 17,
 'Shawn Levy': 18,
 'Clint Eastwood': 19,
 'Justin Lin': 20,
 'Andrew Stanton': 21,
 'Sam Mendes': 22,
 'Dean DeBlois': 23,
 'Marc Webb': 24,
 'Anthony Russo': 25,
 'Tom McGrath': 26,
 'James Wan': 27,
 'Marc Forster': 28,
 'Carlos Saldanha': 29,
 'Guy Ritchie': 30,
 'Gore Verbinski': 31,
 'Christopher McQuarrie': 32,
 'Lee Unkrich': 33,
 'James Gunn': 34,
 'M. Night Shyamalan': 35,
 'Martin Scorsese': 36,
 'Pete Docter': 37,
 'Matt Reeves': 38,
 'Peyton Reed': 39,
 'David Leitch': 40,
 'Guillermo del Toro': 41,
 'Jon Watts': 42,
 'F. Gary Gray': 43,
 'Dennis Dugan': 44,
 'Jon Turteltaub': 45,
 'Todd Phillips': 46,
 'Martin Campbell': 47,
 'D

In [17]:
def assign_actor_score (actors):
    """Return the best (numerically lowest) rank among the given actors.

    Parameters
    ----------
    actors : iterable of str
        Actor names, looked up in the module-level ``actor_ranking`` mapping.

    Returns
    -------
    int
        The smallest rank found for any listed actor. If ``actors`` is empty,
        or none of the names has a ranking entry, the worst possible rank
        (``len(sorted_actors)``) is returned.
    """
    worst_rank = len(sorted_actors)
    # Skip names without a ranking entry instead of comparing None with an int,
    # which raises TypeError in Python 3.
    known_ranks = [actor_ranking[actor] for actor in actors if actor in actor_ranking]
    # Including worst_rank caps the result exactly as the running minimum did.
    return min([worst_rank, *known_ranks])

In [18]:
def actor_assign(df):
    """Attach an ``actor_score`` value to one row and return that row.

    The function is applied with ``axis=1``, so the ``df`` argument it
    receives is a single row rather than the whole frame.
    """
    # The 'cast' value is a stringified list; parse it before scoring
    cast_members = ast.literal_eval(df['cast'])
    df['actor_score'] = assign_actor_score(cast_members)
    return df
# Rebuild the frame row by row so every film carries its actor_score
df = df.apply(actor_assign, axis=1)

In [19]:
def assign_director_score (director):
    """Look up a director's rank in ``director_ranking``.

    Returns the stored rank, or ``None`` when the director has no entry.
    """
    if director in director_ranking:
        return director_ranking[director]
    return None

In [20]:
def director_assign(df):
    """Attach a ``director_score`` value to one row and return that row.

    The function is applied with ``axis=1``, so the ``df`` argument it
    receives is a single row rather than the whole frame.
    """
    # The 'crew' value is a stringified dict; only the first listed director is used
    crew_roles = ast.literal_eval(df['crew'])
    first_director = crew_roles.get('Director')[0]
    df['director_score'] = assign_director_score(first_director)
    return df
# Rebuild the frame row by row so every film carries its director_score
df = df.apply(director_assign, axis=1)

In [21]:
# Preview the first five rows, now including actor_score and director_score
df.head()

Unnamed: 0,title,year,release_date,cast,crew,rating,overview,budget,total_revenue,tmdb_id,...,Music,Mystery,Romance,Science Fiction,Thriller,War,Western,keywords,actor_score,director_score
0,Harry Potter and the Half-Blood Prince,2009,"July 8, 2009","['Daniel Radcliffe', 'Rupert Grint', 'Emma Wat...","{'Director': ['David Yates'], 'Novel': ['J.K. ...",PG,"As Harry begins his sixth year at Hogwarts, he...",250000000,933959197,767,...,0,0,0,0,0,0,0,"['saving the world', 'riddle', 'whip', 'treasu...",176,11
1,Terminator Salvation,2009,"May 20, 2009","['Christian Bale', 'Sam Worthington', 'Anton Y...","{'Director': ['McG'], 'Writer': ['Michael Ferr...",PG-13,"All grown up in post-apocalyptic 2018, John Co...",200000000,371353001,534,...,0,0,0,1,1,0,0,"['new york', 'rio de janeiro', 'superhero', 'b...",25,52
2,Indiana Jones and the Kingdom of the Crystal S...,2008,"May 22, 2008","['Harrison Ford', 'Cate Blanchett', 'Shia LaBe...","{'Characters': ['George Lucas', 'Philip Kaufma...",PG-13,"Set during the Cold War, the Soviets – led by ...",185000000,786636033,217,...,0,0,0,0,0,0,0,"['tempel', 'shaolin', 'teenager', 'urination',...",22,3
3,The Incredible Hulk,2008,"June 12, 2008","['Edward Norton', 'Liv Tyler', 'Tim Roth', 'Wi...","{'Characters': ['Stan Lee', 'Jack Kirby'], 'Di...",PG-13,Scientist Bruce Banner scours the planet for a...,150000000,163712074,1724,...,0,0,0,1,0,0,0,"['dc comics', 'crime fighter', 'secret identit...",214,76
4,The Dark Knight,2008,"July 14, 2008","['Christian Bale', 'Michael Caine', 'Heath Led...","{'Director': ['Christopher Nolan'], 'Screenpla...",PG-13,Batman raises the stakes in his war on crime. ...,185000000,1004558444,155,...,0,0,0,0,1,0,0,"['saving the world', 'artificial intelligence'...",25,2


In [23]:
# Write the frame, including the new score columns, to CSV without the index column.
# NOTE(review): this is an absolute, machine-specific path, so the cell will not run
# unchanged on another machine. It has the same file name as the CSV read at the top
# of the notebook, so it may overwrite the input data — confirm that is intended.
df.to_csv(r'/Users/atenkumar/QMI_Movie_Analytics/tmdb_data.csv', index = False)