# The Movie DataBase (TMDB)

In [1]:
import datetime
import pandas as pd
import numpy as np
import json
import re

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [3]:
credits = pd.read_csv('tmdb_5000_credits.csv')
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


1- For each movie, compute the number of cast members

In [4]:
def cast_members(data):
    item_dict = json.loads(data)
    return(len(item_dict))
credits['cast_members'] = credits['cast'].apply(cast_members)
credits[['movie_id','title','cast_members']].head()

Unnamed: 0,movie_id,title,cast_members
0,19995,Avatar,83
1,285,Pirates of the Caribbean: At World's End,34
2,206647,Spectre,83
3,49026,The Dark Knight Rises,158
4,49529,John Carter,27


2- How many movies do not have a homepage?

In [6]:
nohomepage = movies[movies['homepage'].isnull()]
conteggio = len(nohomepage.index)
print('I film che non hanno una homepage sono',conteggio)

I film che non hanno una homepage sono 3091


3- For each year, how many movies do not have a homepage?

In [7]:
movies['years'] = pd.to_datetime(movies['release_date'],format="%Y-%m-%d").dt.year   #creazione var anno
movies['years'] = movies['years'].replace(np.nan, 'Missing')
nohomepage = movies[movies['homepage'].isnull()]#selezione film senza homepage
nohomepage[['id', 'years']].groupby('years').count().tail().rename(index=str, columns={'id':'count_noHomepage'})

Unnamed: 0_level_0,count_noHomepage
years,Unnamed: 1_level_1
2013.0,127
2014.0,157
2015.0,110
2016.0,31
Missing,1


4- Extract the domain of each homepage.

In [8]:
def extract_dom(url):
    dom = []
    if (pd.isnull(url)):
        return(np.nan)
    else:
        homepage = url.split(' ')                     #per poter gestire i film con due siti
        for i in range(len(homepage)):
            result=re.findall(r'[\w\-+.]+',homepage[i])
            dom.append(result[1])
        return str(dom)[1:-1].replace("'", "") 
movies['domain']=movies['homepage'].apply(extract_dom)
movies[['title', 'homepage', 'domain']][3730:3740]

Unnamed: 0,title,homepage,domain
3730,Cargo,http://www.cargoderfilm.ch http://cargothemovi...,"www.cargoderfilm.ch, cargothemovie.com"
3731,High School Musical,http://tv.disney.go.com/disneychannel/original...,tv.disney.go.com
3732,Love and Death on Long Island,,
3733,Night Watch,,
3734,The Crying Game,http://www.miramax.com/movie/the-crying-game/,www.miramax.com
3735,Porky's,,
3736,Survival of the Dead,http://magnetreleasing.com/survivalofthedead/,magnetreleasing.com
3737,Night of the Living Dead,,
3738,Lost in Translation,,
3739,Annie Hall,,


5- Extract a set of normalized tables. That is, each entry of a normalized table must contain exactly one value (not a list or a dictionary).

#### Creo tabella movie_id

In [9]:
movie_id = movies[['id', 'budget', 'homepage', 'domain', 'original_language', 'original_title', 'overview', 'popularity', 'release_date','years', 'revenue', 'runtime', 'status', 'tagline', 'title', 'vote_average', 'vote_count' ]]
movie_id = movie_id.rename(index=str, columns={'id': 'movie_id'})
movie_id = movie_id.set_index(['movie_id'])
movie_id.head(2)

Unnamed: 0_level_0,budget,homepage,domain,original_language,original_title,overview,popularity,release_date,years,revenue,runtime,status,tagline,title,vote_average,vote_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
19995,237000000,http://www.avatarmovie.com/,www.avatarmovie.com,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,2009,2787965087,162.0,Released,Enter the World of Pandora.,Avatar,7.2,11800
285,300000000,http://disney.go.com/disneypictures/pirates/,disney.go.com,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,2007-05-19,2007,961000000,169.0,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


#### Creo tabella genere_id

In [10]:
genere_id = pd.DataFrame()
for film in list(range(0,len(movies['genres']))):
    tabjson = pd.read_json(movies['genres'][film])
    idfilm = movies['id'][film]
    tabjson['movie_id'] = idfilm
    genere_id = pd.concat([genere_id, tabjson])
genere_id = genere_id.rename(index=str, columns={'id': 'gender_id'})
genere = genere_id[['gender_id', 'name']]
genere_id = genere_id[['movie_id','gender_id']].set_index(['movie_id','gender_id'])
genere_id.head()

movie_id,gender_id
19995,28.0
19995,12.0
19995,14.0
19995,878.0
285,12.0


#### Creo tabella genere

In [11]:
genere = genere.set_index(['gender_id'])
genere = genere.drop_duplicates() #per eliminare le righe duplicate
genere.head()

Unnamed: 0_level_0,name
gender_id,Unnamed: 1_level_1
28.0,Action
12.0,Adventure
14.0,Fantasy
878.0,Science Fiction
80.0,Crime


#### Creo tabella keywords_id

In [12]:
keywords_id = pd.DataFrame()
for film in list(range(0,len(movies['keywords']))):
    tabjson = pd.read_json(movies['keywords'][film])
    idfilm = movies['id'][film]
    tabjson['movie_id'] = idfilm
    keywords_id = pd.concat([keywords_id, tabjson])
keywords_id = keywords_id.rename(index=str, columns={'id': 'key_id'})
keywords = keywords_id[['key_id', 'name']]
keywords_id = keywords_id[['movie_id', 'key_id']].set_index(['movie_id', 'key_id'])
keywords_id.head()

movie_id,key_id
19995,1463.0
19995,2964.0
19995,3386.0
19995,3388.0
19995,3679.0


#### Creo tabella keywords

In [13]:
keywords = keywords.set_index(['key_id'])
keywords = keywords.drop_duplicates()
keywords.head()

Unnamed: 0_level_0,name
key_id,Unnamed: 1_level_1
1463.0,culture clash
2964.0,future
3386.0,space war
3388.0,space colony
3679.0,society


#### Creo tabella prod_companies_id

In [14]:
prod_companies_id = pd.DataFrame()
for film in list(range(0,len(movies['production_companies']))):
    tabjson = pd.read_json(movies['production_companies'][film])
    idfilm = movies['id'][film]
    tabjson['movie_id'] = idfilm
    prod_companies_id = pd.concat([prod_companies_id, tabjson])
prod_companies_id = prod_companies_id.rename(index=str, columns={'id': 'company_id'})
prod_companies = prod_companies_id[['company_id', 'name']]
prod_companies_id = prod_companies_id[['movie_id', 'company_id']].set_index(['movie_id', 'company_id'])
prod_companies_id.head()

movie_id,company_id
19995,289.0
19995,306.0
19995,444.0
19995,574.0
285,2.0


#### Creo tabella prod_companies

In [15]:
prod_companies = prod_companies.set_index(['company_id'])
prod_companies = prod_companies.drop_duplicates()
prod_companies.head()

Unnamed: 0_level_0,name
company_id,Unnamed: 1_level_1
289.0,Ingenious Film Partners
306.0,Twentieth Century Fox Film Corporation
444.0,Dune Entertainment
574.0,Lightstorm Entertainment
2.0,Walt Disney Pictures


#### Creo tabella prod_countries_id

In [16]:
prod_countries_id = pd.DataFrame()
for film in list(range(0,len(movies['production_countries']))):
    tabjson = pd.read_json(movies['production_countries'][film])
    idfilm = movies['id'][film]
    tabjson['movie_id'] = idfilm
    prod_countries_id = pd.concat([prod_countries_id, tabjson])
prod_countries_id = prod_countries_id.rename(index=str, columns={'iso_3166_1': 'country_id'})
prod_countries = prod_countries_id[['country_id', 'name']]
prod_countries_id = prod_countries_id[['movie_id', 'country_id']].set_index(['movie_id', 'country_id'])
prod_countries_id.head()

movie_id,country_id
19995,US
19995,GB
285,US
206647,GB
206647,US


#### Creo tabella prod_countries

In [17]:
prod_countries = prod_countries.set_index(['country_id'])
prod_countries = prod_countries.drop_duplicates()
prod_countries.head()

Unnamed: 0_level_0,name
country_id,Unnamed: 1_level_1
US,United States of America
GB,United Kingdom
JM,Jamaica
BS,Bahamas
DM,Dominica


#### Creo tabella spoken_languages_id

In [18]:
spoken_languages_id = pd.DataFrame()
for film in list(range(0,len(movies['spoken_languages']))):
    tabjson = pd.read_json(movies['spoken_languages'][film])
    idfilm = movies['id'][film]
    tabjson['movie_id'] = idfilm
    spoken_languages_id = pd.concat([spoken_languages_id, tabjson])
spoken_languages_id = spoken_languages_id.rename(index=str, columns={'iso_639_1': 'language_id'})
spoken_languages = spoken_languages_id[['language_id', 'name']]
spoken_languages_id = spoken_languages_id[['movie_id', 'language_id']].set_index(['movie_id', 'language_id'])
spoken_languages_id.head()

movie_id,language_id
19995,en
19995,es
285,en
206647,fr
206647,en


#### Creo tabella spoken_languages

In [19]:
spoken_languages = spoken_languages.set_index(['language_id'])
spoken_languages = spoken_languages.drop_duplicates()
spoken_languages.head()

Unnamed: 0_level_0,name
language_id,Unnamed: 1_level_1
en,English
es,Español
fr,Français
it,Italiano
de,Deutsch


#### Creo tabella credits_id

In [20]:
credits_id = credits[['movie_id', 'cast_members']]
credits_id = credits_id.set_index(['movie_id'])
credits_id.head()

Unnamed: 0_level_0,cast_members
movie_id,Unnamed: 1_level_1
19995,83
285,34
206647,83
49026,158
49529,27


#### Creo tabella cast_id

In [23]:
cast = pd.DataFrame()
for film in list(range(0,len(credits['cast']))):
    tabjson = pd.read_json(credits['cast'][film])
    idfilm = credits['movie_id'][film]
    tabjson['movie_id'] = idfilm
    cast = pd.concat([cast, tabjson])
cast = cast.rename(index=str, columns={'id': 'pers_id'})
cast_id = cast[['movie_id', 'cast_id', 'pers_id', 'order', 'credit_id', 'character']]
cast_id = cast_id.set_index(['credit_id'])
cast_id.head()

Unnamed: 0_level_0,movie_id,cast_id,pers_id,order,character
credit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5602a8a7c3a3685532001c9a,19995,242.0,65731.0,0.0,Jake Sully
52fe48009251416c750ac9cb,19995,3.0,8691.0,1.0,Neytiri
52fe48009251416c750aca39,19995,25.0,10205.0,2.0,Dr. Grace Augustine
52fe48009251416c750ac9cf,19995,4.0,32747.0,3.0,Col. Quaritch
52fe48009251416c750ac9d3,19995,5.0,17647.0,4.0,Trudy Chacon


#### Creo tabella crew_id

In [24]:
crew = pd.DataFrame()
for film in list(range(0,len(credits['crew']))):
    tabjson = pd.read_json(credits['crew'][film])
    idfilm = credits['movie_id'][film]
    tabjson['movie_id'] = idfilm
    crew = pd.concat([crew, tabjson])
crew = crew.rename(index=str, columns={'id': 'pers_id'})
crew_id = crew[['movie_id', 'credit_id', 'pers_id', 'department', 'job']]
crew_id = crew_id.set_index(['credit_id'])
crew_id.head()

Unnamed: 0_level_0,movie_id,pers_id,department,job
credit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
52fe48009251416c750aca23,19995,1721.0,Editing,Editor
539c47ecc3a36810e3001f87,19995,496.0,Art,Production Design
54491c89c3a3680fb4001cf7,19995,900.0,Sound,Sound Designer
54491cb70e0a267480001bd0,19995,900.0,Sound,Supervising Sound Editor
539c4a4cc3a36810c9002101,19995,1262.0,Production,Casting


#### Creo tabella attori

In [25]:
attori_crew = crew[['pers_id', 'gender', 'name']]
attori_cast = cast[['pers_id', 'gender', 'name']]
attori = pd.concat([attori_crew, attori_cast])
attori = attori.sort_values('pers_id').drop_duplicates().set_index('pers_id')
attori.head()

Unnamed: 0_level_0,gender,name
pers_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2.0,George Lucas
2.0,2.0,Mark Hamill
3.0,2.0,Harrison Ford
4.0,1.0,Carrie Fisher
5.0,2.0,Peter Cushing


6- For each movie, compute the gross margin (difference between revenue and budget)

In [26]:
movie_id['gross_margin'] = movie_id['revenue'] - movie_id['budget']
movie_id[['title','gross_margin']].head()

Unnamed: 0_level_0,title,gross_margin
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
19995,Avatar,2550965087
285,Pirates of the Caribbean: At World's End,661000000
206647,Spectre,635674609
49026,The Dark Knight Rises,834939099
49529,John Carter,24139100


7- For each movie, compute the number of crew members

In [27]:
members = crew[['movie_id','pers_id']].groupby('movie_id').count() #si calcolano i membri per ciascun film
#per una miglior visualizzazione dei risultati si aggiunge la colonna 'title' e si rinomina la var pers_id
members = pd.merge(members.reset_index(), movie_id.reset_index(), on='movie_id') 
members = members.rename(index=str, columns={"pers_id" : "n_CrewMembers"})
members[['movie_id', 'title', 'n_CrewMembers']].set_index('movie_id').head()

Unnamed: 0_level_0,title,n_CrewMembers
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,Four Rooms,88
11,Star Wars,20
12,Finding Nemo,104
13,Forrest Gump,93
14,American Beauty,109


8- For each movie, compute the number of directors

In [29]:
directors = crew.loc[crew.job == 'Director']
num_directors = directors[['movie_id', 'pers_id']].groupby('movie_id').count()
#per una miglior visualizzazione dei risultati si aggiunge la colonna 'title' e si rinomina la var pers_id
num_directors = pd.merge(num_directors.reset_index(), movie_id.reset_index(), on='movie_id') 
num_directors = num_directors.rename(index=str, columns={'pers_id': 'n_Directors'}).head()
num_directors[['movie_id', 'title', 'n_Directors']].set_index('movie_id').head()

Unnamed: 0_level_0,title,n_Directors
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,Four Rooms,4
11,Star Wars,1
12,Finding Nemo,1
13,Forrest Gump,1
14,American Beauty,1


9- For each language, compute the number of movies where such language is spoken.

In [30]:
lang = pd.merge(spoken_languages_id.reset_index(), spoken_languages.reset_index(), on = 'language_id')
lang = lang.groupby(['name', 'language_id']).count()
lang = lang.rename(index=str, columns={'movie_id': 'n_Movie'})
lang.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_Movie
name,language_id,Unnamed: 2_level_1
,bo,4
??????,ky,1
Afrikaans,af,7
Bahasa indonesia,id,2
Bamanankan,bm,1


In [31]:
lang = lang.iloc[2:] #vengono tolte le prime due righe del dataset, non classificabili in nessuna lingua
lang.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_Movie
name,language_id,Unnamed: 2_level_1
Afrikaans,af,7
Bahasa indonesia,id,2
Bamanankan,bm,1
Bosanski,bs,2
Català,ca,1


10- For each company and each decade, compute the overall revenue

In [115]:
#si calcola la decade
movie_id['decade'] = pd.to_datetime(movie_id['release_date'], format="%Y-%m-%d").dt.year//10
movie_id['decade'] = movie_id.decade.replace(np.nan, 'Missing')

#si calcolano il numero di compagnie per film
pc_id = prod_companies_id.reset_index()
pc_counter = pc_id.groupby('movie_id').count()  

#si prepara il data set 
pc = pd.merge(pc_counter.reset_index(), movie_id.reset_index(), on='movie_id')
pc = pc[['movie_id', 'company_id', 'revenue', 'decade']].rename(index=str, columns={'company_id':'count_companies'})

#ogni compagnia ottiene una frazone del guadagno del film
pc['revenue/companies'] = pc['revenue']/pc['count_companies']

#si visualizza il guadagno per ogni compagnia per ogni film
ALL = pd.merge(pc_id, pc[['movie_id', 'decade', 'revenue', 'count_companies', 'revenue/companies']], on='movie_id')
ALL.head()

Unnamed: 0,movie_id,company_id,decade,revenue,count_companies,revenue/companies
0,19995,289.0,200,2787965087,4,696991300.0
1,19995,306.0,200,2787965087,4,696991300.0
2,19995,444.0,200,2787965087,4,696991300.0
3,19995,574.0,200,2787965087,4,696991300.0
4,285,2.0,200,961000000,3,320333300.0


In [86]:
#si sommano i guadagni per ottenere il guadagno totale per ogni compagnia in una decade
ALL_grouped = ALL[['company_id', 'decade', 'revenue/companies']].groupby(['company_id','decade']).sum().rename(index=str, columns={'revenue/companies':'overall_revenue'})
ALL_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,overall_revenue
company_id,decade,Unnamed: 2_level_1
1.0,197.0,434365700.0
1.0,198.0,1187157000.0
1.0,199.0,924317600.0
1.0,200.0,1892716000.0
1.0,201.0,16788460.0


11- For each decade, compute the company with maximum revenue

In [99]:
ALL_grouped2 = ALL[['company_id', 'decade','revenue/companies']].groupby(['decade', 'company_id']).sum().rename(index=str, columns={'revenue/companies':'overall_revenue'})
ALL_grouped2 = ALL_grouped2.groupby('decade').max() #per ogni decade si calcola il guadagno massimo
ALL_grouped2 = pd.merge(ALL_grouped2.reset_index(), ALL_grouped.reset_index(), on=['decade', 'overall_revenue'])  #a cui si associa la o le compagnie che hanno ottenuto tale guadagno
#per una migliore visualizzazione si associa il nome della compagnia
pd.merge(ALL_grouped2, prod_companies.reset_index(), on='company_id')

Unnamed: 0,decade,overall_revenue,company_id,name


In [100]:
ALL_grouped2  ###?????????????????

Unnamed: 0,decade,overall_revenue,company_id
0,191.0,4197376.0,1307.0
1,191.0,4197376.0,1308.0
2,192.0,26358000.0,8411.0
3,193.0,200088200.0,1553.0
4,193.0,200088200.0,8411.0
5,194.0,309597200.0,3166.0
6,195.0,68500000.0,306.0
7,196.0,307234100.0,7576.0
8,197.0,767330300.0,33.0
9,198.0,3082731000.0,4.0


12- In each year, how many movies have revenue smaller than the budget?

In [101]:
movie_id1 = movie_id.reset_index()
movies_unsuccess = movie_id1.loc[movie_id1.gross_margin < 0, ['years','gross_margin']]
movies_unsuccess = movies_unsuccess.groupby('years').count()
movies_unsuccess.rename(index=str, columns={'gross_margin': 'count_unsuccess'}).tail()

Unnamed: 0_level_0,count_unsuccess
years,Unnamed: 1_level_1
2012.0,52
2013.0,62
2014.0,59
2015.0,67
2016.0,26


1- Distribute the revenue according to the order of appearance in a movie. Assume that the i-th actor contributes twice as much as the (i+1)-th actor to the revenue.

In [103]:
data = pd.merge(cast_id.reset_index(), attori.reset_index(), on='pers_id')
data = pd.merge(data, credits_id.reset_index(), on='movie_id')   #preparazione data set
data = pd.merge(data, movie_id.reset_index(), on='movie_id')
data = data[['movie_id', 'pers_id', 'order', 'cast_members', 'revenue', 'title', 'name', 'character']]
data.head()

Unnamed: 0,movie_id,pers_id,order,cast_members,revenue,title,name,character
0,19995,65731.0,0.0,83,2787965087,Avatar,Sam Worthington,Jake Sully
1,19995,8691.0,1.0,83,2787965087,Avatar,Zoe Saldana,Neytiri
2,19995,10205.0,2.0,83,2787965087,Avatar,Sigourney Weaver,Dr. Grace Augustine
3,19995,32747.0,3.0,83,2787965087,Avatar,Stephen Lang,Col. Quaritch
4,19995,17647.0,4.0,83,2787965087,Avatar,Michelle Rodriguez,Trudy Chacon


In [120]:
def parti(persone, incasso, ordine):
    somma = 0
    massimo = 1
    for i in list(range(0,persone)):    #calcola le parti per cui dividere il totale
        somma = somma + 2**(i)
    parte = (incasso/somma)*(2**(persone-ordine-1))   
    return parte
data['guadagno'] = list(map(parti,data['cast_members'], data['revenue'], data['order']))
#per una migliore presentazione dei dati
guadagno = data[['movie_id', 'title', 'revenue', 'pers_id', 'name', 'character', 'order', 'guadagno']]
guadagno.head()

Unnamed: 0,movie_id,title,revenue,pers_id,name,character,order,guadagno
0,19995,Avatar,2787965087,65731.0,Sam Worthington,Jake Sully,0.0,1393983000.0
1,19995,Avatar,2787965087,8691.0,Zoe Saldana,Neytiri,1.0,696991300.0
2,19995,Avatar,2787965087,10205.0,Sigourney Weaver,Dr. Grace Augustine,2.0,348495600.0
3,19995,Avatar,2787965087,32747.0,Stephen Lang,Col. Quaritch,3.0,174247800.0
4,19995,Avatar,2787965087,17647.0,Michelle Rodriguez,Trudy Chacon,4.0,87123910.0


2- For each actor find the total revenue attributed to him/her

In [121]:
df = guadagno[['pers_id', 'guadagno']].groupby('pers_id').sum()    #sommo incassi per attore
df = pd.merge(df.reset_index(), attori.reset_index(), on='pers_id')   #aggiungo il nome dell'attore
df = df[['pers_id', 'name', 'guadagno']].set_index('pers_id')
df.head()

Unnamed: 0_level_0,name,guadagno
pers_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,George Lucas,54.87439
2.0,Mark Hamill,943249300.0
3.0,Harrison Ford,2509048000.0
4.0,Carrie Fisher,248315600.0
5.0,Peter Cushing,48462380.0


3- Find the actor that is responsible for the most overall revenue

In [122]:
df.loc[df['guadagno'].idxmax()]

name         Tom Cruise
guadagno    3.97612e+09
Name: 500.0, dtype: object

1- For each movie, compute the ratio between males and females in the cast

In [123]:
merged = pd.merge(attori.reset_index(), cast_id.reset_index(), on='pers_id')
merged = merged[merged.gender != 0.0] #si eliminano i valori mancanti per genere (0.0)
grouped = merged.groupby(['movie_id', 'gender']).count()
counter = grouped.reset_index()
counter = counter[['movie_id', 'gender', 'pers_id']]
counter.head()

Unnamed: 0,movie_id,gender,pers_id
0,5,1.0,14
1,5,2.0,8
2,11,1.0,3
3,11,2.0,46
4,12,1.0,5


In [124]:
femmine = counter[counter.gender == 1.0]
maschi = counter[counter.gender == 2.0]
femmine = femmine.rename(index=str, columns={"pers_id" : "femmine"})
maschi = maschi.rename(index=str, columns={"pers_id" : "maschi"})
totale = pd.merge(femmine[['movie_id', 'femmine']], maschi[['movie_id', 'maschi']], on='movie_id')
totale['ratio'] = totale['maschi']/totale['femmine']
#si aggiunge l titolo del film
totale = pd.merge(totale, movie_id.reset_index(), on='movie_id')
totale[['movie_id', 'title', 'ratio']].set_index('movie_id').head()

Unnamed: 0_level_0,title,ratio
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,Four Rooms,0.571429
11,Star Wars,15.333333
12,Finding Nemo,3.2
13,Forrest Gump,5.0
14,American Beauty,0.681818


2- For each movie, compute the ratio between the attributed revenue of males and females in the cast

In [128]:
new_table = pd.merge(guadagno, attori.reset_index(), on='pers_id')
new_table = new_table[new_table['gender'] != 0.0]
new_table = new_table.groupby(['movie_id','title', 'gender']).sum()
new_table = new_table.reset_index()[['movie_id','title', 'gender', 'guadagno']]
new_table.head()

Unnamed: 0,movie_id,title,gender,guadagno
0,5,Four Rooms,1.0,973027.5
1,5,Four Rooms,2.0,3326971.0
2,11,Star Wars,1.0,97114060.0
3,11,Star Wars,2.0,677526700.0
4,12,Finding Nemo,1.0,251728900.0


In [140]:
#si calcola il guadagno dei maschi
guadagno_maschi = new_table[new_table['gender'] == 1.0]
guadagno_maschi = guadagno_maschi[['movie_id', 'guadagno']].rename(index=str, columns={"guadagno" : "guadagno_maschi"})

#si calcola il guadagno delle femmine
guadagno_femmine = new_table[new_table['gender'] == 2.0]
guadagno_femmine = guadagno_femmine[['movie_id', 'guadagno']].rename(index=str, columns={"guadagno" : "guadagno_femmine"})

tot_guadagni = pd.merge(guadagno_maschi, guadagno_femmine, on = 'movie_id')
tot_guadagni['ratio_guadagno'] = tot_guadagni['guadagno_maschi'] / tot_guadagni['guadagno_femmine']
#si aggiunge l titolo del film per una migliore visualizzazione
tot_guadagni = pd.merge(tot_guadagni, totale, on='movie_id')
tot_guadagni = tot_guadagni[['movie_id', 'title', 'ratio_guadagno']].set_index('movie_id')
tot_guadagni.head()

Unnamed: 0_level_0,title,ratio_guadagno
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,Four Rooms,0.292466
11,Star Wars,0.143336
12,Finding Nemo,0.366051
13,Forrest Gump,0.410491
14,American Beauty,0.68994


3- For each director, compute the average (among all movies he/she has directed) of the ratio found in the previous point.

In [171]:
registi = crew_id[crew_id.job == 'Director']
#si ottiene il nome 
nomi_registi = pd.merge(registi.reset_index(), attori.reset_index(), on = 'pers_id')
nomi_registi = nomi_registi[['movie_id', 'pers_id', 'name']]
#si prepara il data set per le analisi
registi = pd.merge(tot_guadagni.reset_index(), nomi_registi, on='movie_id')
registi = registi.rename(index=str, columns={ 'pers_id' : 'director_id', 'name' : 'directors'})
registi.head()

Unnamed: 0,movie_id,title,ratio_guadagno,director_id,directors
0,5,Four Rooms,0.292466,138.0,Quentin Tarantino
1,5,Four Rooms,0.292466,2294.0,Robert Rodriguez
2,5,Four Rooms,0.292466,3110.0,Allison Anders
3,5,Four Rooms,0.292466,3111.0,Alexandre Rockwell
4,11,Star Wars,0.143336,1.0,George Lucas


In [172]:
average_ratio = registi[['director_id', 'directors', 'ratio_guadagno']].groupby(['director_id', 'directors']).mean().reset_index()
average_ratio.set_index('director_id').head()

Unnamed: 0_level_0,directors,ratio_guadagno
director_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,George Lucas,0.203418
7.0,Andrew Stanton,0.48133
8.0,Lee Unkrich,0.094819
13.0,Albert Brooks,
24.0,Robert Zemeckis,0.494193


4- Find the director that has the highest average computed in the previous point

In [175]:
max_average = average_ratio['ratio_guadagno'].max()
result = average_ratio[average_ratio['ratio_guadagno'] == max_average]
result

Unnamed: 0,director_id,directors,ratio_guadagno
1379,66728.0,Jonathan Glazer,524288.0


In [176]:
# informazione in più: film di Jonathan Glazer e ratio_guadagno associato
merged_table.loc[merged_table.directors== 'Jonathan Glazer']

Unnamed: 0,movie_id,title,ratio_guadagno,director_id,directors
1856,10740,Birth,,66728.0,Jonathan Glazer
2121,11826,Sexy Beast,,66728.0,Jonathan Glazer
4039,97370,Under the Skin,524288.0,66728.0,Jonathan Glazer
