In [1]:
import pandas as pd
import ast
from collections import defaultdict
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from scipy.stats import norm
import gender_guesser.detector as gender
predictor = gender.Detector()

In [2]:
# load bechdel data
detailed_b = pd.read_csv('Bechdel_detailed.csv', index_col=0)
detailed_b.rename(columns={'rating': 'bt_score'}, inplace=True)
detailed_b = detailed_b.dropna().reset_index(drop=True)
detailed_b.drop_duplicates(inplace=True)
detailed_b = detailed_b.reset_index(drop=True)

In [3]:
# -- This file contains the link to the various csv files --
link_b=pd.read_csv('links.csv', index_col=0)
link_b

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,114709,862.0
2,113497,8844.0
3,113228,15602.0
4,114885,31357.0
5,113041,11862.0
...,...,...
176269,6209470,439050.0
176271,2028550,111109.0
176273,303758,67758.0
176275,8536,227506.0


In [4]:
#I -- t has the cast and crew details --
credit=pd.read_csv('credits.csv')
credit

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [5]:
# -- Contains all meta data for movies -- 
details_movie=pd.read_csv("movies_metadata.csv")
details_movie['imdb_id'] = details_movie['imdb_id'].str.replace('tt', '')
details_movie.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

Since these columns have the maximum number of null values and have no relevance to the project, We can remove these columns.

In [6]:
details_movie = details_movie.drop(columns=['homepage', 'belongs_to_collection', 'overview', 'poster_path', 'tagline', 'video', 'vote_average', 'vote_count', 'runtime', 'status', 'original_language',
                                           'production_companies', 'production_countries', 'release_date'])

In [7]:
details_movie.isna().sum()

adult                0
budget               0
genres               0
id                   0
imdb_id             17
original_title       0
popularity           5
revenue              6
spoken_languages     6
title                6
dtype: int64

In [8]:
all_b_values = pd.merge(detailed_b, link_b, left_on='imdbid', right_on='imdbId', how='inner').drop(columns=['imdbId'])

In [9]:
all_b_values.drop(columns=['date'], inplace=True)

In [10]:
all_b_values=pd.merge(all_b_values, credit, left_on='tmdbId', right_on='id', how='inner')
all_b_values.drop(columns=['id_x', 'id_y', 'submitterid', 'visible'], inplace=True)
all_b_values

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew
0,Passage de Venus,1874.0,0.0,0.0,3155794.0,315946.0,[],"[{'credit_id': '55325dc09251417ae30009c3', 'de..."
1,Sallie Gardner at a Gallop,1878.0,0.0,0.0,2221420.0,194079.0,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de..."
2,Buffalo Running,1883.0,0.0,0.0,5459794.0,426903.0,[],"[{'credit_id': '5831b973c3a3685b960216b5', 'de..."
3,Man Walking Around the Corner,1887.0,0.0,0.0,2075247.0,159897.0,[],"[{'credit_id': '52fe4c239251416c910f132b', 'de..."
4,Accordion Player,1888.0,0.0,0.0,1758563.0,96882.0,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de..."
...,...,...,...,...,...,...,...,...
7320,Landline,2017.0,3.0,0.0,5737862.0,419459.0,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de..."
7321,Patient Zero,2018.0,1.0,0.0,3458254.0,295011.0,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de..."
7322,Iron Sky: The Coming Race,2019.0,3.0,0.0,3038708.0,302349.0,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de..."
7323,Ip Man 4: The Finale,2019.0,3.0,0.0,2076298.0,324017.0,"[{'cast_id': 0, 'character': '', 'credit_id': ...",[]


In [11]:
#Checking which coloumn can be used to merge the two dataframes
print(all_b_values[all_b_values['title'] == 'Buffalo Running']['imdbid'])
print(details_movie[details_movie['title']=='Buffalo Running']['imdb_id'])

2    5459794.0
Name: imdbid, dtype: float64
41602    5459794
Name: imdb_id, dtype: object


In [12]:
details_movie['imdb_id'] = details_movie['imdb_id'].astype(float)
all_b_values = pd.merge(all_b_values, details_movie, left_on='imdbid', right_on='imdb_id', how='inner')
all_b_values

Unnamed: 0,title_x,year,bt_score,dubious,imdbid,tmdbId,cast,crew,adult,budget,genres,id,imdb_id,original_title,popularity,revenue,spoken_languages,title_y
0,Passage de Venus,1874.0,0.0,0.0,3155794.0,315946.0,[],"[{'credit_id': '55325dc09251417ae30009c3', 'de...",False,0,"[{'id': 99, 'name': 'Documentary'}]",315946,3155794.0,Passage de Venus,0.480371,0.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Passage of Venus
1,Sallie Gardner at a Gallop,1878.0,0.0,0.0,2221420.0,194079.0,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de...",False,0,"[{'id': 99, 'name': 'Documentary'}]",194079,2221420.0,Sallie Gardner at a Gallop,0.327841,0.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Sallie Gardner at a Gallop
2,Buffalo Running,1883.0,0.0,0.0,5459794.0,426903.0,[],"[{'credit_id': '5831b973c3a3685b960216b5', 'de...",False,0,"[{'id': 99, 'name': 'Documentary'}]",426903,5459794.0,Buffalo Running,0.229221,0.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Buffalo Running
3,Man Walking Around the Corner,1887.0,0.0,0.0,2075247.0,159897.0,[],"[{'credit_id': '52fe4c239251416c910f132b', 'de...",False,0,"[{'id': 99, 'name': 'Documentary'}]",159897,2075247.0,Man Walking Around a Corner,1.184891,0.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Man Walking Around a Corner
4,Accordion Player,1888.0,0.0,0.0,1758563.0,96882.0,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de...",False,0,"[{'id': 99, 'name': 'Documentary'}]",96882,1758563.0,Accordion Player,0.212768,0.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Accordion Player
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7327,12 feet deep,2017.0,3.0,0.0,5143226.0,459928.0,"[{'cast_id': 0, 'character': 'McGradey', 'cred...","[{'credit_id': '592ec68d92514130da00d716', 'de...",False,0,"[{'id': 53, 'name': 'Thriller'}]",459928,5143226.0,12 Feet Deep,4.479536,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",12 Feet Deep
7328,Landline,2017.0,3.0,0.0,5737862.0,419459.0,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de...",False,0,"[{'id': 35, 'name': 'Comedy'}]",419459,5737862.0,Landline,1.811506,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Landline
7329,Patient Zero,2018.0,1.0,0.0,3458254.0,295011.0,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de...",False,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",295011,3458254.0,Patient Zero,1.083454,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Patient Zero
7330,Iron Sky: The Coming Race,2019.0,3.0,0.0,3038708.0,302349.0,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de...",False,18000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,3038708.0,Iron Sky: The Coming Race,1.917649,0.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Iron Sky: The Coming Race


In [13]:
all_b_values.drop(columns=['title_y','imdb_id', 'spoken_languages', 'original_title', 'adult'], inplace=True)
all_b_values.rename(columns={'title_x': 'title'}, inplace=True)
all_b_values

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew,budget,genres,id,popularity,revenue
0,Passage de Venus,1874.0,0.0,0.0,3155794.0,315946.0,[],"[{'credit_id': '55325dc09251417ae30009c3', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",315946,0.480371,0.0
1,Sallie Gardner at a Gallop,1878.0,0.0,0.0,2221420.0,194079.0,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",194079,0.327841,0.0
2,Buffalo Running,1883.0,0.0,0.0,5459794.0,426903.0,[],"[{'credit_id': '5831b973c3a3685b960216b5', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",426903,0.229221,0.0
3,Man Walking Around the Corner,1887.0,0.0,0.0,2075247.0,159897.0,[],"[{'credit_id': '52fe4c239251416c910f132b', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",159897,1.184891,0.0
4,Accordion Player,1888.0,0.0,0.0,1758563.0,96882.0,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",96882,0.212768,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7327,12 feet deep,2017.0,3.0,0.0,5143226.0,459928.0,"[{'cast_id': 0, 'character': 'McGradey', 'cred...","[{'credit_id': '592ec68d92514130da00d716', 'de...",0,"[{'id': 53, 'name': 'Thriller'}]",459928,4.479536,0.0
7328,Landline,2017.0,3.0,0.0,5737862.0,419459.0,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de...",0,"[{'id': 35, 'name': 'Comedy'}]",419459,1.811506,0.0
7329,Patient Zero,2018.0,1.0,0.0,3458254.0,295011.0,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de...",0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",295011,1.083454,0.0
7330,Iron Sky: The Coming Race,2019.0,3.0,0.0,3038708.0,302349.0,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de...",18000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,1.917649,0.0


In [14]:
all_b_values.duplicated().sum()

14

In [15]:
all_b_values[all_b_values.duplicated()]

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew,budget,genres,id,popularity,revenue
756,Sleeping Beauty,1959.0,3.0,0.0,53285.0,10882.0,"[{'cast_id': 1, 'character': 'Princess Aurora ...","[{'credit_id': '52fe43c99251416c7501e219', 'de...",6000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 16, 'na...",10882,14.276169,51000000.0
1043,Le Samourai,1967.0,1.0,0.0,62229.0,5511.0,"[{'cast_id': 11, 'character': 'Jef Costello', ...","[{'credit_id': '52fe440ac3a36847f807ee01', 'de...",0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",5511,9.091288,39481.0
1044,Le Samourai,1967.0,1.0,0.0,62229.0,5511.0,"[{'cast_id': 11, 'character': 'Jef Costello', ...","[{'credit_id': '52fe440ac3a36847f807ee01', 'de...",0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",5511,9.091288,39481.0
1045,Le Samourai,1967.0,1.0,0.0,62229.0,5511.0,"[{'cast_id': 11, 'character': 'Jef Costello', ...","[{'credit_id': '52fe440ac3a36847f807ee01', 'de...",0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",5511,9.091288,39481.0
1567,Rich and Famous,1981.0,3.0,0.0,82992.0,109962.0,"[{'cast_id': 1, 'character': 'Liz Hamilton', '...","[{'credit_id': '52fe4ac8c3a36847f81e1445', 'de...",0,"[{'id': 18, 'name': 'Drama'}]",109962,12.180836,0.0
1568,Rich and Famous,1981.0,3.0,0.0,82992.0,109962.0,"[{'cast_id': 1, 'character': 'Liz Hamilton', '...","[{'credit_id': '52fe4ac8c3a36847f81e1445', 'de...",0,"[{'id': 18, 'name': 'Drama'}]",109962,10.396878,0.0
4061,Offside,2006.0,3.0,0.0,499537.0,13209.0,"[{'cast_id': 4, 'character': '', 'credit_id': ...","[{'credit_id': '52fe454f9251416c75052209', 'de...",2500,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",13209,1.52896,0.0
4062,Offside,2006.0,3.0,0.0,499537.0,13209.0,"[{'cast_id': 4, 'character': '', 'credit_id': ...","[{'credit_id': '52fe454f9251416c75052209', 'de...",2500,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",13209,1.529879,0.0
5470,Black Gold,2011.0,0.0,0.0,1701210.0,77221.0,"[{'cast_id': 2, 'character': 'Sultan Amar', 'c...","[{'credit_id': '52fe4962c3a368484e1289fd', 'de...",40000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",77221,6.652197,5446000.0
5471,Black Gold,2011.0,0.0,0.0,1701210.0,77221.0,"[{'cast_id': 2, 'character': 'Sultan Amar', 'c...","[{'credit_id': '52fe4962c3a368484e1289fd', 'de...",40000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",77221,6.475665,5446000.0


In [16]:
all_b_values.drop_duplicates(inplace=True)
all_b_values = all_b_values.reset_index(drop=True)
all_b_values

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew,budget,genres,id,popularity,revenue
0,Passage de Venus,1874.0,0.0,0.0,3155794.0,315946.0,[],"[{'credit_id': '55325dc09251417ae30009c3', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",315946,0.480371,0.0
1,Sallie Gardner at a Gallop,1878.0,0.0,0.0,2221420.0,194079.0,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",194079,0.327841,0.0
2,Buffalo Running,1883.0,0.0,0.0,5459794.0,426903.0,[],"[{'credit_id': '5831b973c3a3685b960216b5', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",426903,0.229221,0.0
3,Man Walking Around the Corner,1887.0,0.0,0.0,2075247.0,159897.0,[],"[{'credit_id': '52fe4c239251416c910f132b', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",159897,1.184891,0.0
4,Accordion Player,1888.0,0.0,0.0,1758563.0,96882.0,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de...",0,"[{'id': 99, 'name': 'Documentary'}]",96882,0.212768,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7313,12 feet deep,2017.0,3.0,0.0,5143226.0,459928.0,"[{'cast_id': 0, 'character': 'McGradey', 'cred...","[{'credit_id': '592ec68d92514130da00d716', 'de...",0,"[{'id': 53, 'name': 'Thriller'}]",459928,4.479536,0.0
7314,Landline,2017.0,3.0,0.0,5737862.0,419459.0,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de...",0,"[{'id': 35, 'name': 'Comedy'}]",419459,1.811506,0.0
7315,Patient Zero,2018.0,1.0,0.0,3458254.0,295011.0,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de...",0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",295011,1.083454,0.0
7316,Iron Sky: The Coming Race,2019.0,3.0,0.0,3038708.0,302349.0,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de...",18000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,1.917649,0.0


In [17]:
all_b_values['year']= all_b_values['year'].astype('int')
all_b_values['imdbid']= all_b_values['imdbid'].astype('int')
all_b_values['tmdbId']= all_b_values['tmdbId'].astype('int')
all_b_values['bt_score']= all_b_values['bt_score'].astype('int')
all_b_values['dubious'] = all_b_values['dubious'].astype('int')
all_b_values['budget']= all_b_values['budget'].astype('float')
all_b_values['revenue']= all_b_values['revenue'].astype('float')
all_b_values

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew,budget,genres,id,popularity,revenue
0,Passage de Venus,1874,0,0,3155794,315946,[],"[{'credit_id': '55325dc09251417ae30009c3', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",315946,0.480371,0.0
1,Sallie Gardner at a Gallop,1878,0,0,2221420,194079,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",194079,0.327841,0.0
2,Buffalo Running,1883,0,0,5459794,426903,[],"[{'credit_id': '5831b973c3a3685b960216b5', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",426903,0.229221,0.0
3,Man Walking Around the Corner,1887,0,0,2075247,159897,[],"[{'credit_id': '52fe4c239251416c910f132b', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",159897,1.184891,0.0
4,Accordion Player,1888,0,0,1758563,96882,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",96882,0.212768,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7313,12 feet deep,2017,3,0,5143226,459928,"[{'cast_id': 0, 'character': 'McGradey', 'cred...","[{'credit_id': '592ec68d92514130da00d716', 'de...",0.0,"[{'id': 53, 'name': 'Thriller'}]",459928,4.479536,0.0
7314,Landline,2017,3,0,5737862,419459,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de...",0.0,"[{'id': 35, 'name': 'Comedy'}]",419459,1.811506,0.0
7315,Patient Zero,2018,1,0,3458254,295011,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de...",0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",295011,1.083454,0.0
7316,Iron Sky: The Coming Race,2019,3,0,3038708,302349,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de...",18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,1.917649,0.0


In [18]:
all_b_values = all_b_values[(all_b_values['cast'] != '[]') & (all_b_values['crew'] != '[]') 
                            & (all_b_values['genres'] !='[]')].reset_index(drop=True)

Now there are no Null Values, No duplicates and all information we have for visualizing is here. We need to convert the data into digestable form.
If Dubious Value is 1 -- Consider it uncertain. Else if it is 0, we must mark the 'BeschdelTestScore' as according to whether it passes the three stages of Beschdel Test or not
They are as follows:
1. There must be atleast two female character that are named
2. They both must have a dialogue with each other
3. They must not talk about boys

If the test passes all these three conditions, the movie is considered to pass the beschdel test

In [19]:
for ind, dubious in all_b_values['dubious'].items():
    if dubious ==1:
        all_b_values.at[ind, 'Category']='Dubious'
    else:
        if all_b_values.at[ind, 'bt_score']==0:
            all_b_values.at[ind, 'Category']='Two named female character not present'
        elif all_b_values.at[ind, 'bt_score']==1:
            all_b_values.at[ind, 'Category']='Two named female character have no dialogue'
        elif all_b_values.at[ind, 'bt_score']==2:
            all_b_values.at[ind, 'Category']='Two named female character talk only about boys'
        else:
            all_b_values.at[ind, 'Category']='Beschdel test passed!'
all_b_values

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew,budget,genres,id,popularity,revenue,Category
0,Sallie Gardner at a Gallop,1878,0,0,2221420,194079,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",194079,0.327841,0.0,Two named female character not present
1,Accordion Player,1888,0,0,1758563,96882,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",96882,0.212768,0.0,Two named female character not present
2,"Monkeyshines, No. 1",1890,0,0,361921,32571,"[{'cast_id': 10, 'character': 'Himself', 'cred...","[{'credit_id': '52fe44d99251416c9101ef9f', 'de...",0.0,"[{'id': 35, 'name': 'Comedy'}]",32571,1.163672,0.0,Two named female character not present
3,Dickson Greeting,1891,0,0,241373,33229,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe45069251416c91024e8f', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",33229,1.041504,0.0,Two named female character not present
4,Je vous aime,1891,0,0,3201916,336380,"[{'cast_id': 0, 'character': 'Himself', 'credi...","[{'credit_id': '553266c4925141631000227d', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",336380,0.212894,0.0,Two named female character not present
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7258,12 feet deep,2017,3,0,5143226,459928,"[{'cast_id': 0, 'character': 'McGradey', 'cred...","[{'credit_id': '592ec68d92514130da00d716', 'de...",0.0,"[{'id': 53, 'name': 'Thriller'}]",459928,4.479536,0.0,Beschdel test passed!
7259,Landline,2017,3,0,5737862,419459,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de...",0.0,"[{'id': 35, 'name': 'Comedy'}]",419459,1.811506,0.0,Beschdel test passed!
7260,Patient Zero,2018,1,0,3458254,295011,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de...",0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",295011,1.083454,0.0,Two named female character have no dialogue
7261,Iron Sky: The Coming Race,2019,3,0,3038708,302349,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de...",18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,1.917649,0.0,Beschdel test passed!


In [20]:
for index, row in all_b_values.iterrows():
    cast_dict, crew_dict, crew_department = defaultdict(int), defaultdict(int), defaultdict(int)
    castList = ast.literal_eval(row['cast'])
    crewList = ast.literal_eval(row['crew'])
    
    castM = 0
    castF = 0
    castU = 0
    
    dirM=0
    dirF=0
    dirU=0
    
    
    for cast in castList:
        if cast['gender'] == 0:
            first_name = cast['name'].split(' ')[0]
            predicted_gender = predictor.get_gender(first_name)
            if predicted_gender == 'male' or predicted_gender == 'mostly_male' or cast['character'].lower() == 'son' or cast['character'].lower() == 'himself':
                castM += 1
            elif predicted_gender == 'female' or predicted_gender == 'mostly_female' or cast['character'].lower() == 'daughter' or cast['character'].lower() == 'herself':
                castF += 1
            elif predicted_gender == 'unknown' or predicted_gender == 'andy':
                castU += 1
        elif cast['gender'] == 1:
            castM += 1
        elif cast['gender'] == 2:
            castF += 1
    # Update the counts outside the loop
    all_b_values.at[index, 'castMale'] = castM
    all_b_values.at[index, 'castFemale'] = castF
    all_b_values.at[index, 'castUnknown'] = castU
            
    for crew in crewList:
        if crew['gender'] == 0:
            first_name = crew['name'].split(' ')[0]
            predicted_gender = predictor.get_gender(first_name)
            if predicted_gender == 'male' or predicted_gender == 'mostly_male':
                dirM += 1
            elif predicted_gender == 'female' or predicted_gender == 'mostly_female':
                dirF += 1
            elif predicted_gender == 'unknown' or predicted_gender == 'andy':
                dirU += 1
        elif crew['gender'] == 1:
            dirM += 1
        elif crew['gender'] == 2:
            dirF += 1
    
    all_b_values.at[index, 'CrewMale'] = dirM
    all_b_values.at[index, 'CrewFemale'] = dirF
    all_b_values.at[index, 'CrewUnknown'] = dirU
    
    
            
all_b_values


Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,cast,crew,budget,genres,id,popularity,revenue,Category,castMale,castFemale,castUnknown,CrewMale,CrewFemale,CrewUnknown
0,Sallie Gardner at a Gallop,1878,0,0,2221420,194079,"[{'cast_id': 5, 'character': 'Herself - Sallie...","[{'credit_id': '52fe4ce29251416c91103327', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",194079,0.327841,0.0,Two named female character not present,2.0,0.0,0.0,1.0,0.0,2.0
1,Accordion Player,1888,0,0,1758563,96882,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe49c39251416c750d2a8b', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",96882,0.212768,0.0,Two named female character not present,1.0,0.0,0.0,0.0,1.0,0.0
2,"Monkeyshines, No. 1",1890,0,0,361921,32571,"[{'cast_id': 10, 'character': 'Himself', 'cred...","[{'credit_id': '52fe44d99251416c9101ef9f', 'de...",0.0,"[{'id': 35, 'name': 'Comedy'}]",32571,1.163672,0.0,Two named female character not present,1.0,0.0,0.0,5.0,0.0,0.0
3,Dickson Greeting,1891,0,0,241373,33229,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '52fe45069251416c91024e8f', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",33229,1.041504,0.0,Two named female character not present,1.0,0.0,0.0,3.0,0.0,0.0
4,Je vous aime,1891,0,0,3201916,336380,"[{'cast_id': 0, 'character': 'Himself', 'credi...","[{'credit_id': '553266c4925141631000227d', 'de...",0.0,"[{'id': 99, 'name': 'Documentary'}]",336380,0.212894,0.0,Two named female character not present,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7258,12 feet deep,2017,3,0,5143226,459928,"[{'cast_id': 0, 'character': 'McGradey', 'cred...","[{'credit_id': '592ec68d92514130da00d716', 'de...",0.0,"[{'id': 53, 'name': 'Thriller'}]",459928,4.479536,0.0,Beschdel test passed!,6.0,1.0,1.0,50.0,19.0,2.0
7259,Landline,2017,3,0,5737862,419459,"[{'cast_id': 0, 'character': 'Dana Jacobs', 'c...","[{'credit_id': '5863a7fcc3a3680ab600716c', 'de...",0.0,"[{'id': 35, 'name': 'Comedy'}]",419459,1.811506,0.0,Beschdel test passed!,13.0,4.0,1.0,13.0,12.0,1.0
7260,Patient Zero,2018,1,0,3458254,295011,"[{'cast_id': 1, 'character': 'Dr. Gina Rose', ...","[{'credit_id': '55c8fd47c3a36849b40002ec', 'de...",0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",295011,1.083454,0.0,Two named female character have no dialogue,8.0,8.0,0.0,2.0,4.0,0.0
7261,Iron Sky: The Coming Race,2019,3,0,3038708,302349,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de...",18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,1.917649,0.0,Beschdel test passed!,16.0,12.0,2.0,1.0,3.0,1.0


Too many gender unknown. If we drop all these values the dataset will be reduced to very less. hence we can use gender guessers.

In [21]:
#pip install gender-guesser

In [22]:
budget_unknown = (all_b_values['budget'] == 0.0).sum()
print(f"Where budget of movie is not known: {budget_unknown}")
revenue_unknown = (all_b_values['revenue'] == 0.0).sum()
print(f"Where revenue of movie is not known: {revenue_unknown}")


Where budget of movie is not known: 3473
Where revenue of movie is not known: 3484


In [23]:
min_year = all_b_values['year'].min()
print("Minimum year in the dataset:", min_year)
max_year = all_b_values['year'].max()
print("Max Year in dataset:", max_year)

Minimum year in the dataset: 1878
Max Year in dataset: 2020


In [24]:
for index, row in all_b_values.iterrows():
    genre_list = []
    genreList = ast.literal_eval(row['genres'])
    
    for genre in genreList:
        genre_list.append(genre['name'])
    
    all_b_values.at[index, 'genre'] = ', '.join(genre_list)


In [25]:
all_b_values.drop(columns=['genres', 'cast', 'crew'], inplace=True)
all_b_values

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,budget,id,popularity,revenue,Category,castMale,castFemale,castUnknown,CrewMale,CrewFemale,CrewUnknown,genre
0,Sallie Gardner at a Gallop,1878,0,0,2221420,194079,0.0,194079,0.327841,0.0,Two named female character not present,2.0,0.0,0.0,1.0,0.0,2.0,Documentary
1,Accordion Player,1888,0,0,1758563,96882,0.0,96882,0.212768,0.0,Two named female character not present,1.0,0.0,0.0,0.0,1.0,0.0,Documentary
2,"Monkeyshines, No. 1",1890,0,0,361921,32571,0.0,32571,1.163672,0.0,Two named female character not present,1.0,0.0,0.0,5.0,0.0,0.0,Comedy
3,Dickson Greeting,1891,0,0,241373,33229,0.0,33229,1.041504,0.0,Two named female character not present,1.0,0.0,0.0,3.0,0.0,0.0,Documentary
4,Je vous aime,1891,0,0,3201916,336380,0.0,336380,0.212894,0.0,Two named female character not present,1.0,0.0,0.0,1.0,0.0,1.0,Documentary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7258,12 feet deep,2017,3,0,5143226,459928,0.0,459928,4.479536,0.0,Beschdel test passed!,6.0,1.0,1.0,50.0,19.0,2.0,Thriller
7259,Landline,2017,3,0,5737862,419459,0.0,419459,1.811506,0.0,Beschdel test passed!,13.0,4.0,1.0,13.0,12.0,1.0,Comedy
7260,Patient Zero,2018,1,0,3458254,295011,0.0,295011,1.083454,0.0,Two named female character have no dialogue,8.0,8.0,0.0,2.0,4.0,0.0,"Action, Drama, Horror, Thriller"
7261,Iron Sky: The Coming Race,2019,3,0,3038708,302349,18000000.0,302349,1.917649,0.0,Beschdel test passed!,16.0,12.0,2.0,1.0,3.0,1.0,"Action, Comedy, Fantasy, Science Fiction"


In [26]:
# Assuming you want to save the DataFrame to a file named 'bdel_test.csv'
all_b_values.to_csv('ScoredData.csv', index=False)