In [20]:
import pandas as pd
import numpy as np

### Download the data and load it into pandas

You can find the data files (`titles.csv` and `cast.csv`) [here](https://drive.google.com/file/d/1NY6cmF9Shjw-dD7BD6bNmfcIVz-kQcFR/view?usp=sharing).

In [2]:
# Read the film titles table; pandas assigns a default integer index.
titles = pd.read_csv('data/titles.csv')
# Preview the first few rows.
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [3]:
# Read the cast table (one row per credited role); pandas assigns a default
# integer index.
cast = pd.read_csv('data/cast.csv')
# Preview the first few rows.
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


### Define a "Superman year" as a year whose films feature more Superman characters than Batman characters. How many years in film history have been Superman years?

In [10]:
# Per-year tally of non-null `character` entries in films whose title contains
# the literal text "Superman".
# NOTE(review): the question asks about Superman *characters*, but this filter
# is on the film title, so every role in a Superman-titled film is counted.
# Confirm whether cast['character'] should be matched instead (and change the
# Batman tally the same way so the two stay comparable).
# na=False keeps rows with a missing title out of the mask (a NaN mask entry
# would make the boolean indexing fail); regex=False matches the text literally.
superman = (
    cast[cast['title'].str.contains('Superman', na=False, regex=False)]
    .groupby('year')['character']
    .count()
    .reset_index(name='superman')
)

In [11]:
# Per-year tally of non-null `character` entries in films whose title contains
# the literal text "Batman".
# NOTE(review): the question asks about Batman *characters*, but this filter
# is on the film title, so every role in a Batman-titled film is counted.
# Confirm whether cast['character'] should be matched instead (and change the
# Superman tally the same way so the two stay comparable).
# na=False keeps rows with a missing title out of the mask (a NaN mask entry
# would make the boolean indexing fail); regex=False matches the text literally.
batman = (
    cast[cast['title'].str.contains('Batman', na=False, regex=False)]
    .groupby('year')['character']
    .count()
    .reset_index(name='batman')
)

In [35]:
# Left join on year: every year present in `superman` is kept, and years with
# no matching `batman` row get NaN in the `batman` column.
super_bat_man = superman.merge(batman, how='left', on='year')

In [36]:
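# Abandoned attempt, kept for reference. It fails because np.where needs either
# both the "true" and "false" values or neither; the fallback value must be
# passed as np.where's third argument rather than via a chained `.otherwise`.
# super_bat_man['superman_years'] = np.where(super_bat_man['superman'] > super_bat_man['batman'], 'superman_year')\
#                                     .otherwise(None)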

ValueError: either both or neither of x and y should be given

In [24]:
# Start with every year unflagged, then mark a year when its Superman tally
# exceeds the Batman tally or when the left join found no Batman row at all
# (a comparison against NaN is False, so that case needs its own test).
super_bat_man['Superman_year'] = None
super_bat_man.loc[
    (super_bat_man['superman'] > super_bat_man['batman'])
    | super_bat_man['batman'].isna(),
    'Superman_year',
] = 'Superman_year'

In [28]:
# Number of years carrying a (non-null) Superman-year flag.
super_bat_man.loc[super_bat_man['Superman_year'].notna(), 'year'].count()

15

### How many years have been "Batman years", with more Batman characters than Superman characters?

In [29]:
# Right join on year: every year present in `batman` is kept, and years with
# no matching `superman` row get NaN in the `superman` column.
bat_super_man = superman.merge(batman, how='right', on='year')

In [37]:
# Start with every year unflagged, then mark a year when its Batman tally
# exceeds the Superman tally or when the right join found no Superman row at
# all (a comparison against NaN is False, so that case needs its own test).
bat_super_man['batman_year'] = None
bat_super_man.loc[
    (bat_super_man['batman'] > bat_super_man['superman'])
    | bat_super_man['superman'].isna(),
    'batman_year',
] = 'batman_year'

In [39]:
# Number of years carrying a (non-null) Batman-year flag.
bat_super_man.loc[bat_super_man['batman_year'].notna(), 'year'].count()

17

### Count the number of actor roles each year and the number of actress roles each year over the history of film.

In [92]:
# For each (year, type) pair, tally the non-null `name` entries and flatten the
# result into a frame with the tally in a `cnt` column.
counts = (
    cast
    .groupby(['year', 'type'])['name']
    .count()
    .rename('cnt')
    .reset_index()
)
counts

Unnamed: 0,year,type,cnt
0,1894,actor,2
1,1894,actress,1
2,1900,actor,2
3,1905,actor,1
4,1906,actor,14
...,...,...,...
235,2022,actress,11
236,2023,actor,6
237,2023,actress,5
238,2025,actor,2


### Find the difference between the number of actor roles each year and the number of actress roles each year over the history of film.

In [100]:
# Reshape `counts` to one row per year with an `actor` and an `actress` column.
# `counts` holds one row per (year, type), so pivot_table's default mean
# aggregation simply passes each tally through.
diff = pd.pivot_table(counts, values='cnt', columns=['type'], index=['year'])

# A year with no rows of one type comes out of the pivot as NaN. Treat that as
# zero roles so the arithmetic below is defined for every year; the raw
# `actor` / `actress` columns are left untouched.
diff['actor_clean'] = diff['actor'].fillna(0)
diff['actress_clean'] = diff['actress'].fillna(0)

# Positive values mean more actor roles than actress roles that year.
diff['difference'] = diff['actor_clean'] - diff['actress_clean']

In [101]:
# Display the per-year actor/actress table built above.
diff

type,actor,actress,actor_clean,actress_clean,difference
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1894,2.0,1.0,2.0,1.0,1.0
1900,2.0,,2.0,0.0,2.0
1905,1.0,,1.0,0.0,1.0
1906,14.0,3.0,14.0,3.0,11.0
1907,5.0,,5.0,0.0,5.0
...,...,...,...,...,...
2021,9.0,4.0,9.0,4.0,5.0
2022,18.0,11.0,18.0,11.0,7.0
2023,6.0,5.0,6.0,5.0,1.0
2025,2.0,,2.0,0.0,2.0


### What fraction of roles have been 'actor' roles each year in the history of film?

In [102]:
# Total roles per year: actor plus actress tallies (NaN already replaced by 0).
diff['total'] = diff['actor_clean'].add(diff['actress_clean'])

In [104]:
# Share of each year's roles that are actor roles. Despite the `_perc` name
# this is a 0-1 fraction, not a percentage.
diff['actor_perc'] = diff['actor_clean'].div(diff['total'])

In [105]:
# Display the per-year table, now including `total` and `actor_perc`.
diff

type,actor,actress,actor_clean,actress_clean,difference,total,actor_perc
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1894,2.0,1.0,2.0,1.0,1.0,3.0,0.666667
1900,2.0,,2.0,0.0,2.0,2.0,1.000000
1905,1.0,,1.0,0.0,1.0,1.0,1.000000
1906,14.0,3.0,14.0,3.0,11.0,17.0,0.823529
1907,5.0,,5.0,0.0,5.0,5.0,1.000000
...,...,...,...,...,...,...,...
2021,9.0,4.0,9.0,4.0,5.0,13.0,0.692308
2022,18.0,11.0,18.0,11.0,7.0,29.0,0.620690
2023,6.0,5.0,6.0,5.0,1.0,11.0,0.545455
2025,2.0,,2.0,0.0,2.0,2.0,1.000000


### What fraction of supporting (n=2) roles have been 'actor' roles each year in the history of film?

In [106]:
# Restrict to roles with n == 2, then tally the non-null `name` entries for each
# (year, type) pair into a `cnt` column.
counts_2 = (
    cast.loc[cast['n'].eq(2)]
    .groupby(['year', 'type'])['name']
    .count()
    .rename('cnt')
    .reset_index()
)

In [107]:
# Reshape `counts_2` to one row per year with an `actor` and an `actress`
# column. `counts_2` holds one row per (year, type), so pivot_table's default
# mean aggregation simply passes each tally through.
diff_2 = pd.pivot_table(counts_2, values='cnt', columns=['type'], index=['year'])

# A year with no n == 2 rows of one type comes out of the pivot as NaN. Treat
# that as zero roles so the arithmetic below is defined for every year; the
# raw `actor` / `actress` columns are left untouched.
diff_2['actor_clean'] = diff_2['actor'].fillna(0)
diff_2['actress_clean'] = diff_2['actress'].fillna(0)

# Positive values mean more actor than actress n == 2 roles that year.
diff_2['difference'] = diff_2['actor_clean'] - diff_2['actress_clean']

In [108]:
# Total n == 2 roles per year: actor plus actress tallies (NaN already replaced by 0).
diff_2['total'] = diff_2['actor_clean'].add(diff_2['actress_clean'])

In [109]:
# Share of each year's n == 2 roles that are actor roles. Despite the `_perc`
# name this is a 0-1 fraction, not a percentage.
diff_2['actor_perc'] = diff_2['actor_clean'].div(diff_2['total'])

In [111]:
# Display the per-year table for n == 2 roles, including `total` and `actor_perc`.
diff_2

type,actor,actress,actor_clean,actress_clean,difference,total,actor_perc
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1906,2.0,1.0,2.0,1.0,1.0,3.0,0.666667
1907,1.0,,1.0,0.0,1.0,1.0,1.000000
1908,2.0,,2.0,0.0,2.0,2.0,1.000000
1910,2.0,2.0,2.0,2.0,0.0,4.0,0.500000
1911,14.0,5.0,14.0,5.0,9.0,19.0,0.736842
...,...,...,...,...,...,...,...
2016,2337.0,1823.0,2337.0,1823.0,514.0,4160.0,0.561779
2017,883.0,643.0,883.0,643.0,240.0,1526.0,0.578637
2018,41.0,30.0,41.0,30.0,11.0,71.0,0.577465
2019,7.0,2.0,7.0,2.0,5.0,9.0,0.777778


In [118]:
# Spot-check the earliest year in the n == 2 table: list every 1906 cast row
# ordered by `n` (rows with a missing `n` sort to the end).
cast.loc[cast['year'].eq(1906)].sort_values(by='n')

Unnamed: 0,title,year,name,type,character,n
771162,The Joe Gans-Battling Nelson Fight,1906,Joe Gans,actor,Himself,1.0
1234324,Lika mot lika,1906,Knut Lambert,actor,Coq-H?ron,1.0
3492076,The Story of the Kelly Gang,1906,Elizabeth Tait,actress,Kate Kelly,1.0
1597101,The Joe Gans-Battling Nelson Fight,1906,Battling Nelson,actor,Himself,2.0
2160407,The Story of the Kelly Gang,1906,John Tait,actor,School Master,2.0
3041085,Lika mot lika,1906,Helfrid Lambert,actress,Suzanne,2.0
3627826,Lika mot lika,1906,Tollie Zellman,actress,"Clara, kammarjungfru",3.0
325926,The Story of the Kelly Gang,1906,Norman (II) Campbell,actor,Steve Hart,
471862,The Story of the Kelly Gang,1906,Will Coyne,actor,Joe Byrne,
478233,The Story of the Kelly Gang,1906,Sam Crewes,actor,Dan Kelly,
