In [101]:
import pandas as pd
import numpy as np

In [102]:
# Read the three raw CSV inputs from the current working directory.
# The order of the paths matches the order of the names being bound.
raw_paths = ("./bechdeltestdata.csv", "./castinfo.csv", "./crewinfo.csv")
bechdel, cast, crew = (pd.read_csv(raw_path) for raw_path in raw_paths)

### Clean Bechdel dataframe

In [103]:
# Preview the first five rows of the raw Bechdel frame.
bechdel.head(n=5)

Unnamed: 0,tconst,id,title,bechdel_rating,year,averageRating,numVotes,runtimeMinutes,genres,primaryTitle
0,,7889,Agora,0,2009,,,,,
1,,7444,Batman v Superman: Dawn of Justice Ultimate Ed...,3,2016,,,,,
2,,7771,On the Milky Road,0,2016,,,,,
3,3.0,5433,Pauvre Pierrot,0,1892,6.6,947.0,4.0,"Animation,Comedy,Romance",Pauvre Pierrot
4,12.0,6199,"Arrival of a Train, The",0,1896,7.4,8160.0,1.0,"Documentary,Short",The Arrival of a Train


In [104]:
# Number of Bechdel rows with no tconst value.
bechdel['tconst'].isnull().sum()

3

In [105]:
# Keep only the rows that actually carry a tconst value.
has_tconst = bechdel['tconst'].notnull()
bechdel = bechdel[has_tconst]

### Clean Cast dataframe

In [106]:
# Preview the first three rows of the raw cast frame.
cast.head(n=3)

Unnamed: 0,nconst,tconst,ordering,category,primaryName,primaryProfession
0,,,,,,
1,,,,,,
2,,,,,,


In [107]:
# Number of cast rows with no nconst value.
cast['nconst'].isnull().sum()

60

In [108]:
# Keep only the rows that actually carry an nconst value.
has_nconst = cast['nconst'].notnull()
cast = cast[has_nconst]

In [109]:
# Frequency of each credit category, most common first.
cast['category'].value_counts()

actor                  8957
writer                 6541
actress                6035
producer               5055
director               3881
composer               2465
cinematographer        1938
editor                 1134
production_designer     308
self                    115
archive_footage          12
archive_sound             1
Name: category, dtype: int64

In [110]:
# assign gender if possible
#
# Missing professions are filled with the sentinel 'unknown' so the string
# tests below always see text. `assign` + `fillna` builds a fresh frame, so the
# later `.loc` writes never target a slice of the originally loaded data.
cast = cast.assign(primaryProfession=cast['primaryProfession'].fillna('unknown'))

# Two passes, in this order: first the credited category, then the person's
# primaryProfession. The later pass overwrites the earlier one, so
# primaryProfession wins whenever the two disagree. Within a pass 'actress' is
# applied after 'actor', so a value containing both ends up 'Female'.
# regex=False makes these plain substring tests; na=False keeps the mask
# strictly boolean even if a value is missing or not a string.
for gender_source in ('category', 'primaryProfession'):
    cast_text = cast[gender_source]
    cast.loc[cast_text.str.contains('actor', regex=False, na=False), 'gender'] = 'Male'
    cast.loc[cast_text.str.contains('actress', regex=False, na=False), 'gender'] = 'Female'

In [111]:
# Shape of the subset of cast rows that still have no gender assigned.
cast_gender_missing = cast['gender'].isnull()
cast.loc[cast_gender_missing].shape

(16802, 7)

### Clean Crew dataframe

In [112]:
# Preview the first five rows of the raw crew frame.
crew.head(n=5)

Unnamed: 0,nconst,tconst,crew.type,primaryName,primaryProfession
0,\N,3,writ1,,
1,\N,12,writ1,,
2,\N,14,writ1,,
3,\N,131,writ1,,
4,\N,7342204,writ1,,


In [113]:
# Count the crew rows whose nconst contains the literal placeholder \N
# (backslash followed by capital N, as seen in the preview of `crew`).
# The raw string is required: a plain '\N' is a malformed unicode escape and
# a SyntaxError on Python 3. regex=False makes this a plain substring test;
# treated as a regular expression, \N would not denote a literal backslash.
crew['nconst'].str.contains(r'\N', regex=False).sum()

107

In [114]:
# Drop the crew rows whose nconst contains the literal placeholder \N.
# The raw string avoids the Python 3 SyntaxError caused by a plain '\N', and
# regex=False makes the match a plain substring test.
# na=True marks rows with a missing nconst as placeholders too, so they are
# removed as well -- the same outcome as the previous `== False` comparison.
nconst_is_placeholder = crew['nconst'].str.contains(r'\N', regex=False, na=True)
# .copy() yields an independent frame so later column assignments on `crew`
# do not operate on a slice of the originally loaded data.
crew = crew[~nconst_is_placeholder].copy()

In [115]:
# assign gender if possible
#
# Missing professions are filled with the sentinel 'unknown' so the string
# tests below always see text. `assign` + `fillna` builds a fresh frame, so the
# later `.loc` writes never target a slice of the originally loaded data.
crew = crew.assign(primaryProfession=crew['primaryProfession'].fillna('unknown'))

# 'actress' is applied after 'actor', so a profession list containing both
# ends up 'Female'. regex=False makes these plain substring tests; na=False
# keeps the mask strictly boolean even if a value is not a string.
crew_profession = crew['primaryProfession']
crew.loc[crew_profession.str.contains('actor', regex=False, na=False), 'gender'] = 'Male'
crew.loc[crew_profession.str.contains('actress', regex=False, na=False), 'gender'] = 'Female'

In [116]:
# Sanity check: number of crew rows whose primaryProfession is still null.
crew_profession_missing = crew['primaryProfession'].isnull()
crew_profession_missing.sum()

0

In [117]:
# Shape of the subset of crew rows that still have no gender assigned.
crew_gender_missing = crew['gender'].isnull()
crew.loc[crew_gender_missing].shape

(18966, 6)

In [118]:
# Replace specific crew.type values (e.g. 'writ1' in the preview) with one
# generic label per role. The pairs are applied in order, and each mask is
# computed after the previous replacement has been written.
for type_fragment, generic_label in (('writ', 'writer'), ('dir', 'director')):
    type_matches = crew['crew.type'].str.contains(type_fragment)
    crew.loc[type_matches, 'crew.type'] = generic_label

### Export

In [119]:
# Write each cleaned frame to the current working directory.
# to_csv is called with its defaults, so the row index is written as well.
for clean_frame, destination in (
    (bechdel, "./bechdel_clean.csv"),
    (crew, "./crew_clean.csv"),
    (cast, "./cast_clean.csv"),
):
    clean_frame.to_csv(destination)