# The Movie DataBase (TMDB)

In [1]:
import datetime
import pandas as pd
import numpy as np
import json
import re

In [2]:
movies= pd.read_csv('tmdb_5000_movies.csv')
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [3]:
credits= pd.read_csv('tmdb_5000_credits.csv')
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


1- For each movie, compute the number of cast members

In [4]:
def cast_members(data):
    item_dict = json.loads(data)
    return(len(item_dict))
credits['cast_members'] = credits['cast'].apply(cast_members)
credits[['movie_id','cast_members']].head()

Unnamed: 0,movie_id,cast_members
0,19995,83
1,285,34
2,206647,83
3,49026,158
4,49529,27


2- How many movies do not have a homepage?

In [5]:
nohomepage=movies[movies['homepage'].isnull()]
len(nohomepage.index)

3091

3- For each year, how many movies do not have a homepage?

In [6]:
movies['years']= pd.to_datetime(movies['release_date'],format="%Y-%m-%d").dt.year
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,years
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009.0
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007.0


In [7]:
movies['years'] = movies['years'].replace(np.nan, 'Missing')
nohomepage=movies[movies['homepage'].isnull()]
nohomepage[['id', 'years']].groupby('years').count().tail()

Unnamed: 0_level_0,id
years,Unnamed: 1_level_1
2013.0,127
2014.0,157
2015.0,110
2016.0,31
Missing,1


4- Extract the domain of each homepage.

In [8]:
def extract_dom(url):
    if (pd.isnull(url)):
        return(np.nan)
    else:
        result=re.findall(r'[\w+.]+',url)
        return (result[1])
movies['domain']=movies['homepage'].apply(extract_dom)
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,years,domain
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009,www.avatarmovie.com
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007,disney.go.com


5- Extract a set of normalized tables. That is, each entry of a normalized table must contain exactly one value (not a list or a dictionary).

#### Creo tabella movie_id

In [9]:
movie_id = movies[['id', 'budget', 'homepage', 'domain', 'original_language', 'original_title', 'overview', 'popularity', 'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title', 'vote_average', 'vote_count' ]]
movie_id = movie_id.set_index(['id'])
movie_id.head(2)

Unnamed: 0_level_0,budget,homepage,domain,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
19995,237000000,http://www.avatarmovie.com/,www.avatarmovie.com,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,2787965087,162.0,Released,Enter the World of Pandora.,Avatar,7.2,11800
285,300000000,http://disney.go.com/disneypictures/pirates/,disney.go.com,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,2007-05-19,961000000,169.0,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


#### Creo tabella genere

In [10]:
genere=pd.DataFrame()
for film in list(range(0,len(movies['genres']))):
    tabjson=pd.read_json(movies['genres'][film])
    genere=pd.concat([genere, tabjson])
genere = genere.set_index(['id'])
genere= genere.drop_duplicates() #per eliminare le righe duplicate
genere.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
28,Action
12,Adventure
14,Fantasy
878,Science Fiction
80,Crime


#### Creo tabella genere_id

In [11]:
genere_id=pd.DataFrame()
for film in list(range(0,len(movies['genres']))):
    tabjson=pd.read_json(movies['genres'][film])
    idfilm=movies['id'][film]
    tabjson['movie_id']=idfilm
    genere_id=pd.concat([genere_id, tabjson])
del genere_id['name']
genere_id = genere_id.set_index(['movie_id', 'id'])
genere_id.head()

movie_id,id
19995,28.0
19995,12.0
19995,14.0
19995,878.0
285,12.0


#### Creo tabella keywords

In [12]:
keywords=pd.DataFrame()
for film in list(range(0,len(movies['keywords']))):
    tabjson=pd.read_json(movies['keywords'][film])
    keywords=pd.concat([keywords, tabjson])
keywords = keywords.set_index(['id'])
keywords = keywords.drop_duplicates()
keywords.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
1463,culture clash
2964,future
3386,space war
3388,space colony
3679,society


#### Creo tabella keywords_id

In [13]:
keywords_id=pd.DataFrame()
for film in list(range(0,len(movies['keywords']))):
    tabjson=pd.read_json(movies['keywords'][film])
    idfilm=movies['id'][film]
    tabjson['movie_id']=idfilm
    keywords_id=pd.concat([keywords_id, tabjson])
del keywords_id['name']
keywords_id = keywords_id.set_index(['movie_id', 'id'])
keywords_id.head()

movie_id,id
19995,1463.0
19995,2964.0
19995,3386.0
19995,3388.0
19995,3679.0


#### Creo tabella prod_companies

In [14]:
prod_companies=pd.DataFrame()
for film in list(range(0,len(movies['production_companies']))):
    tabjson=pd.read_json(movies['production_companies'][film])
    prod_companies=pd.concat([prod_companies, tabjson])
prod_companies = prod_companies.set_index(['id'])
prod_companies = prod_companies.drop_duplicates()
prod_companies.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
289,Ingenious Film Partners
306,Twentieth Century Fox Film Corporation
444,Dune Entertainment
574,Lightstorm Entertainment
2,Walt Disney Pictures


#### Creo tabella prod_companies_id

In [15]:
prod_companies_id=pd.DataFrame()
for film in list(range(0,len(movies['production_companies']))):
    tabjson=pd.read_json(movies['production_companies'][film])
    idfilm=movies['id'][film]
    tabjson['movie_id']=idfilm
    prod_companies_id=pd.concat([prod_companies_id, tabjson])
del prod_companies_id['name']
prod_companies_id = prod_companies_id.set_index(['movie_id', 'id'])
prod_companies_id.head()

movie_id,id
19995,289.0
19995,306.0
19995,444.0
19995,574.0
285,2.0


#### Creo tabella prod_countries

In [16]:
prod_countries=pd.DataFrame()
for film in list(range(0,len(movies['production_countries']))):
    tabjson=pd.read_json(movies['production_countries'][film])
    prod_countries=pd.concat([prod_countries, tabjson])
prod_countries = prod_countries.set_index(['iso_3166_1'])
prod_countries = prod_countries.drop_duplicates()
prod_countries.head()

Unnamed: 0_level_0,name
iso_3166_1,Unnamed: 1_level_1
US,United States of America
GB,United Kingdom
JM,Jamaica
BS,Bahamas
DM,Dominica


#### Creo tabella prod_countries_id

In [17]:
prod_countries_id=pd.DataFrame()
for film in list(range(0,len(movies['production_countries']))):
    tabjson=pd.read_json(movies['production_countries'][film])
    idfilm=movies['id'][film]
    tabjson['movie_id']=idfilm
    prod_countries_id=pd.concat([prod_countries_id, tabjson])
del prod_countries_id['name']
prod_countries_id = prod_countries_id.set_index(['movie_id', 'iso_3166_1'])
prod_countries_id.head()

movie_id,iso_3166_1
19995,US
19995,GB
285,US
206647,GB
206647,US


#### Creo tabella spoken_languages

In [18]:
spoken_languages=pd.DataFrame()
for film in list(range(0,len(movies['spoken_languages']))):
    tabjson=pd.read_json(movies['spoken_languages'][film])
    spoken_languages=pd.concat([spoken_languages, tabjson])
spoken_languages = spoken_languages.set_index(['iso_639_1'])
spoken_languages = spoken_languages.drop_duplicates()
spoken_languages.head()

Unnamed: 0_level_0,name
iso_639_1,Unnamed: 1_level_1
en,English
es,Español
fr,Français
it,Italiano
de,Deutsch


#### Creo tabella spoken_languages_id

In [19]:
spoken_languages_id=pd.DataFrame()
for film in list(range(0,len(movies['spoken_languages']))):
    tabjson=pd.read_json(movies['spoken_languages'][film])
    idfilm=movies['id'][film]
    tabjson['movie_id']=idfilm
    spoken_languages_id=pd.concat([spoken_languages_id, tabjson])
del spoken_languages_id['name']
spoken_languages_id = spoken_languages_id.set_index(['movie_id', 'iso_639_1'])
spoken_languages_id.head()

movie_id,iso_639_1
19995,en
19995,es
285,en
206647,fr
206647,en


#### Creo tabella credits_id

In [60]:
credits_id=credits[['movie_id', 'cast_members']]
credits_id = credits_id.set_index(['movie_id'])
credits_id.head()

Unnamed: 0_level_0,cast_members
movie_id,Unnamed: 1_level_1
19995,83
285,34
206647,83
49026,158
49529,27


#### Creo tabella cast_id

In [36]:
cast_id=pd.DataFrame()
for film in list(range(0,len(credits['cast']))):
    tabjson=pd.read_json(credits['cast'][film])
    idfilm=credits['movie_id'][film]
    tabjson['movie_id']=idfilm
    cast_id=pd.concat([cast_id, tabjson])
cast_id = cast_id.set_index(['movie_id','cast_id'])
cast_id = cast_id[['movie_id', 'cast_id', 'id', 'order', 'credit_id']]
cast_id.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,character,credit_id,gender,name,order
movie_id,id,cast_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19995,65731.0,242.0,Jake Sully,5602a8a7c3a3685532001c9a,2.0,Sam Worthington,0.0
19995,8691.0,3.0,Neytiri,52fe48009251416c750ac9cb,1.0,Zoe Saldana,1.0
19995,10205.0,25.0,Dr. Grace Augustine,52fe48009251416c750aca39,1.0,Sigourney Weaver,2.0
19995,32747.0,4.0,Col. Quaritch,52fe48009251416c750ac9cf,2.0,Stephen Lang,3.0
19995,17647.0,5.0,Trudy Chacon,52fe48009251416c750ac9d3,1.0,Michelle Rodriguez,4.0


#### Creo tabella attori

In [101]:
attori_crew=crew_id.reset_index()
attori_crew=attori_crew[['id', 'gender', 'name']]
attori_cast=cast_id.reset_index()
attori_cast=attori_cast[['id', 'gender', 'name']]
attori=pd.concat([attori_crew, attori_cast])
attori=attori.sort_values('id').drop_duplicates().set_index('id')
attori.head()

Unnamed: 0_level_0,gender,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2.0,George Lucas
2.0,2.0,Mark Hamill
3.0,2.0,Harrison Ford
4.0,1.0,Carrie Fisher
5.0,2.0,Peter Cushing


#### Creo tabella crew_id

In [41]:
crew_id=pd.DataFrame()
for film in list(range(0,len(credits['crew']))):
    tabjson=pd.read_json(credits['crew'][film])
    idfilm=credits['movie_id'][film]
    tabjson['movie_id']=idfilm
    crew_id=pd.concat([crew_id, tabjson])
crew_id = crew_id.set_index(['movie_id', 'credit_id'])
crew_id = crew_id[['movie_id', 'credit_id', 'id', 'department', 'job']]
crew_id.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,department,gender,job,name
movie_id,id,credit_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19995,1721.0,52fe48009251416c750aca23,Editing,0.0,Editor,Stephen E. Rivkin
19995,496.0,539c47ecc3a36810e3001f87,Art,2.0,Production Design,Rick Carter
19995,900.0,54491c89c3a3680fb4001cf7,Sound,0.0,Sound Designer,Christopher Boyes
19995,900.0,54491cb70e0a267480001bd0,Sound,0.0,Supervising Sound Editor,Christopher Boyes
19995,1262.0,539c4a4cc3a36810c9002101,Production,1.0,Casting,Mali Finn


6- For each movie, compute the gross margin (difference between revenue and budget)

In [25]:
movies['gross_margin'] = movies['revenue'] - movies['budget']
movies[['id','title','gross_margin']].head()

Unnamed: 0,id,title,gross_margin
0,19995,Avatar,2550965087
1,285,Pirates of the Caribbean: At World's End,661000000
2,206647,Spectre,635674609
3,49026,The Dark Knight Rises,834939099
4,49529,John Carter,24139100


7- For each movie, compute the number of crew members

In [26]:
crew_id = crew_id.reset_index() 
crew=crew.reset_index()

df=pd.merge(crew_id, crew, on=['id', 'credit_id'])
members=df[['movie_id','id']].groupby('movie_id').count()
members.rename(index=str, columns={"id" : "n_CrewMembers"}).head()

Unnamed: 0_level_0,n_CrewMembers
movie_id,Unnamed: 1_level_1
5,45
11,2
12,61
13,43
14,41


8- For each movie, compute the number of directors

In [27]:
directors= crew.loc[crew.job == 'Director']
df1= pd.merge(crew_id, directors, on=['id', 'credit_id'])
num_directors= df1[['movie_id', 'id']].groupby('movie_id').count()
num_directors.rename(index=str, columns={'id': 'num_Directors'}).head()

Unnamed: 0_level_0,num_Directors
movie_id,Unnamed: 1_level_1
5,2
18,1
19,1
35,1
152,1


9- For each language, compute the number of movies where such language is spoken.

In [28]:
spoken_languages_id= spoken_languages_id.reset_index()
spoken_languages= spoken_languages.reset_index()
df2= pd.merge(spoken_languages_id, spoken_languages, on = 'iso_639_1')
df2.head()

Unnamed: 0,movie_id,iso_639_1,name
0,19995,en,English
1,285,en,English
2,206647,en,English
3,49026,en,English
4,49529,en,English


In [29]:
lang= df2.groupby(['name', 'iso_639_1']).count()
lang.rename(index=str, columns={'movie_id': 'count_movie'})
lang= lang.iloc[2:] #vengono tolte le prime due righe del dataset, non classificabili in nessuna lingua
lang.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,movie_id
name,iso_639_1,Unnamed: 2_level_1
Afrikaans,af,7
Bahasa indonesia,id,2
Bamanankan,bm,1
Bosanski,bs,2
Català,ca,1


10- For each company and each decade, compute the overall revenue

In [30]:
pc= prod_companies_id.reset_index()
pc1= movies_1.reset_index()
pc1= pc1[['id', 'revenue', 'years']]
pc1['decade']= pd.to_datetime(movies['release_date'],format="%Y-%m-%d").dt.year//10
pc1['decade'] = pc1.decade.replace(np.nan, 'Missing')
pc2=pd.merge(pc, pc1, left_on= 'movie_id', right_on='id')
pc2.head()

NameError: name 'movies_1' is not defined

In [None]:
pc2[['id_x','decade','revenue']].groupby(['id_x', 'decade']).sum().head()

11- For each decade, compute the company with maximum revenue

12- In each year, how many movies have revenue smaller than the budget?

In [None]:
movies_unsuccess= movies.loc[movies.gross_margin < 0, ['years','gross_margin']]
movies_unsuccess= movies_unsuccess.groupby('years').count()
movies_unsuccess.rename(index=str, columns={'gross_margin': 'count_unsuccess'}).head(10)

1- Distribute the revenue according to the order of appearance in a movie. Assume that the i-th actor contributes twice as much as the (i+1)-th actor to the revenue.

In [None]:
cast.head()

In [None]:
cast_id.head()

In [None]:
cast=cast.reset_index()
cast_id=cast_id.reset_index()

In [None]:
prova2= pd.merge(pd.merge(cast, cast_id, on=['id', 'cast_id']), movies[['id', 'revenue']], left_on='movie_id', right_on='id' )
prova2.head()


In [None]:
numero=prova2.groupby('movie_id').count()['order']
numero=pd.DataFrame(numero)
numero=numero.reset_index()
numero=numero.rename(index=str, columns={'order': 'numero_cast'})
numero.head()

In [213]:
prova3= pd.merge(prova2, numero, on='movie_id')
prova3

Unnamed: 0,index,id_x,cast_id,character,credit_id,gender,name,order,movie_id,id_y,revenue,numero_cast
0,0,65731,242,Jake Sully,5602a8a7c3a3685532001c9a,2,Sam Worthington,0,19995,19995,2787965087,89
1,1,8691,3,Neytiri,52fe48009251416c750ac9cb,1,Zoe Saldana,1,19995,19995,2787965087,89
2,30452,8691,3,Maria (voice),52fe4ec79251416c7516217d,1,Zoe Saldana,2,19995,19995,2787965087,89
3,38097,8691,3,Cataleya Restrepo,52fe468fc3a368484e097593,1,Zoe Saldana,0,19995,19995,2787965087,89
4,41028,8691,3,Theresa Jones,52fe446e9251416c75034723,1,Zoe Saldana,2,19995,19995,2787965087,89
5,2,10205,25,Dr. Grace Augustine,52fe48009251416c750aca39,1,Sigourney Weaver,2,19995,19995,2787965087,89
6,3,32747,4,Col. Quaritch,52fe48009251416c750ac9cf,2,Stephen Lang,3,19995,19995,2787965087,89
7,4,17647,5,Trudy Chacon,52fe48009251416c750ac9d3,1,Michelle Rodriguez,4,19995,19995,2787965087,89
8,55022,17647,5,Katarin,52fe4cd4c3a36847f8240803,1,Michelle Rodriguez,2,19995,19995,2787965087,89
9,74274,17647,5,Luz,52fe4471c3a368484e023641,1,Michelle Rodriguez,1,19995,19995,2787965087,89


In [221]:
ordine=3
posizione=1
for i in list(range(0,ordine-1)):
    posizione=posizione*2
posizione        

4

In [225]:
def parti(persone, guadagno, ordine):
    if (pd.isnull(guadagno)):
        return(np.nan)
    else:
        somma=0
        posizione=0
        for i in list(range(0,persone)):
            somma=somma + 2**(i)
        for i in list(range(0,ordine)):
            posizione=posizione*2
        parte=guadagno/somma
        parte=parte*posizione
        return(parte)
prova3['parti']=list(map(parti,prova3['numero_cast'], prova3['revenue'], prova3['order']))
prova3

Unnamed: 0,index,id_x,cast_id,character,credit_id,gender,name,order,movie_id,id_y,revenue,numero_cast,parti
0,0,65731,242,Jake Sully,5602a8a7c3a3685532001c9a,2,Sam Worthington,0,19995,19995,2787965087,89,0.0
1,1,8691,3,Neytiri,52fe48009251416c750ac9cb,1,Zoe Saldana,1,19995,19995,2787965087,89,0.0
2,30452,8691,3,Maria (voice),52fe4ec79251416c7516217d,1,Zoe Saldana,2,19995,19995,2787965087,89,0.0
3,38097,8691,3,Cataleya Restrepo,52fe468fc3a368484e097593,1,Zoe Saldana,0,19995,19995,2787965087,89,0.0
4,41028,8691,3,Theresa Jones,52fe446e9251416c75034723,1,Zoe Saldana,2,19995,19995,2787965087,89,0.0
5,2,10205,25,Dr. Grace Augustine,52fe48009251416c750aca39,1,Sigourney Weaver,2,19995,19995,2787965087,89,0.0
6,3,32747,4,Col. Quaritch,52fe48009251416c750ac9cf,2,Stephen Lang,3,19995,19995,2787965087,89,0.0
7,4,17647,5,Trudy Chacon,52fe48009251416c750ac9d3,1,Michelle Rodriguez,4,19995,19995,2787965087,89,0.0
8,55022,17647,5,Katarin,52fe4cd4c3a36847f8240803,1,Michelle Rodriguez,2,19995,19995,2787965087,89,0.0
9,74274,17647,5,Luz,52fe4471c3a368484e023641,1,Michelle Rodriguez,1,19995,19995,2787965087,89,0.0


In [226]:
prova3.max()

index                            106256
id_x                            1893207
cast_id                            1119
character           최민식 (Choi Man-shik)
credit_id      59c576cdc3a3681403033598
gender                                2
name                                 徐帆
order                               224
movie_id                         459488
id_y                             459488
revenue                      2787965087
numero_cast                         232
parti                                 0
dtype: object



2- For each actor find the total revenue attributed to him/her.

3- Find the actor that is responsible for the most overall revenue.



1- For each movie, compute the ratio between males and females in the cast

2- For each movie, compute the ratio between the attributed revenue of males and females in the cast

3- Find the director that has the highest average ratio computed in the previous point.
