In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import requests
import json



In [2]:
cleaned_race_name_df = pd.read_csv("cleaned_race_name.csv")
movies_df = pd.read_csv("movies_cast_2.csv")
pc_df = pd.read_csv("principals.csv")


In [3]:
cleaned_race_name_df = cleaned_race_name_df.drop(["Unnamed: 0"], axis=1)

In [4]:
cleaned_race_name_df

Unnamed: 0,first_name,last_name,NEW_ACTOR_RACE,race,nconst
0,Lauren,Bacall,w,NL+W,nm0000002
1,Marlon,Brando,w,HL+W,nm0000008
2,Gong,Li,a,NL+A,nm0000084
3,Armin,Mueller-Stahl,w,NL+W,nm0000090
4,John,Cleese,i,NL+I,nm0000092
...,...,...,...,...,...
5267,Eric,Borsuk,a,NL+A,nm9858133
5268,Vilhelm,Blomgren,w,NL+W,nm9859585
5269,Roman,Davis,b,NL+B,nm9877392
5270,Gabriel,Sky,NL+O,NL+O,nm9982380


In [5]:
movies_df_cpy = movies_df.copy()
movies_df_cpy.nconst = movies_df_cpy.nconst.apply(lambda x: x[1:-1].split(","))
expanded_movies_nconst = movies_df_cpy.explode("nconst")
expanded_movies_nconst.nconst = expanded_movies_nconst.nconst.apply(lambda x: x.strip()[1:-1])
expanded_movies_nconst

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,tmdb_id,og_language,budget,revenue,ratio,ROI,averageRating,numVotes,Unnamed: 18,nconst
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,en,230000,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2209370
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,en,230000,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2913790
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,en,230000,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2104166
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,en,230000,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2910808
1,4907,tt2309260,movie,The Gallows,The Gallows,0,2015,,81,"Horror,Mystery,Thriller",299245,en,100000,42664410,0.002344,426.644100,4.2,20611,,nm3790547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3538,3185,tt0254703,movie,The Point Men,The Point Men,0,2001,,100,"Action,Drama",46943,en,6100000,714,8543.417367,0.000117,4.3,1161,,nm0531229
3539,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",2,en,400000,2339,200000.000000,0.000005,5.7,2751,,nm0000514
3539,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",2,en,400000,2339,200000.000000,0.000005,5.7,2751,,nm0674782
3539,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",2,en,400000,2339,200000.000000,0.000005,5.7,2751,,nm0072435


In [6]:
merged = pd.merge(expanded_movies_nconst, cleaned_race_name_df, left_on='nconst', right_on='nconst', how='left')
merged = merged.drop("race", axis=1)

In [7]:
merged.columns

Index(['index', 'tconst', 'titleType', 'primaryTitle', 'originalTitle',
       'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres',
       'tmdb_id', 'og_language', 'budget', 'revenue', 'ratio', 'ROI',
       'averageRating', 'numVotes', 'Unnamed: 18', 'nconst', 'first_name',
       'last_name', 'NEW_ACTOR_RACE'],
      dtype='object')

In [8]:
merged[merged.averageRating == merged.averageRating.max()]

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,...,revenue,ratio,ROI,averageRating,numVotes,Unnamed: 18,nconst,first_name,last_name,NEW_ACTOR_RACE
528,402,tt0167260,movie,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,0,2003,,201,"Action,Adventure,Drama",...,1118888979,0.084012,11.903074,9.0,1816228,,nm0000704,Elijah,Wood,w
529,402,tt0167260,movie,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,0,2003,,201,"Action,Adventure,Drama",...,1118888979,0.084012,11.903074,9.0,1816228,,nm0001557,Viggo,Mortensen,w
530,402,tt0167260,movie,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,0,2003,,201,"Action,Adventure,Drama",...,1118888979,0.084012,11.903074,9.0,1816228,,nm0005212,Ian,McKellen,w
531,402,tt0167260,movie,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,0,2003,,201,"Action,Adventure,Drama",...,1118888979,0.084012,11.903074,9.0,1816228,,nm0089217,Orlando,Bloom,NL+O
1878,557,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",...,1004558444,0.184161,5.430046,9.0,2612004,,nm0000288,Christian,Bale,w
1879,557,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",...,1004558444,0.184161,5.430046,9.0,2612004,,nm0005132,Heath,Ledger,w
1880,557,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",...,1004558444,0.184161,5.430046,9.0,2612004,,nm0001173,Aaron,Eckhart,w
1881,557,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",...,1004558444,0.184161,5.430046,9.0,2612004,,nm0000323,Michael,Caine,w


In [9]:
merged.rename(columns = {'NEW_ACTOR_RACE':'race'}, inplace = True)

In [10]:
merged

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,...,revenue,ratio,ROI,averageRating,numVotes,Unnamed: 18,nconst,first_name,last_name,race
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2209370,Katie,Featherston,w
1,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2913790,Micah,Sloat,b
2,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2104166,Mark,Fredrichs,w
3,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,194183034,0.001184,844.274061,6.3,238846,Info says budget is $15k,nm2910808,Amber,Armstrong,NL+O
4,4907,tt2309260,movie,The Gallows,The Gallows,0,2015,,81,"Horror,Mystery,Thriller",...,42664410,0.002344,426.644100,4.2,20611,,nm3790547,Reese,Mishler,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14117,3185,tt0254703,movie,The Point Men,The Point Men,0,2001,,100,"Action,Drama",...,714,8543.417367,0.000117,4.3,1161,,nm0531229,Cal,MacAninch,HL+M
14118,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,2339,200000.000000,0.000005,5.7,2751,,nm0000514,Michael,Madsen,w
14119,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,2339,200000.000000,0.000005,5.7,2751,,nm0674782,Harold,Perrineau,b
14120,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,2339,200000.000000,0.000005,5.7,2751,,nm0072435,Amber,Benson,w


In [11]:
merged = merged.drop("Unnamed: 18", axis=1)
merged

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,...,budget,revenue,ratio,ROI,averageRating,numVotes,nconst,first_name,last_name,race
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2209370,Katie,Featherston,w
1,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2913790,Micah,Sloat,b
2,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2104166,Mark,Fredrichs,w
3,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2910808,Amber,Armstrong,NL+O
4,4907,tt2309260,movie,The Gallows,The Gallows,0,2015,,81,"Horror,Mystery,Thriller",...,100000,42664410,0.002344,426.644100,4.2,20611,nm3790547,Reese,Mishler,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14117,3185,tt0254703,movie,The Point Men,The Point Men,0,2001,,100,"Action,Drama",...,6100000,714,8543.417367,0.000117,4.3,1161,nm0531229,Cal,MacAninch,HL+M
14118,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,400000,2339,200000.000000,0.000005,5.7,2751,nm0000514,Michael,Madsen,w
14119,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,400000,2339,200000.000000,0.000005,5.7,2751,nm0674782,Harold,Perrineau,b
14120,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,400000,2339,200000.000000,0.000005,5.7,2751,nm0072435,Amber,Benson,w


In [12]:
df = (merged.groupby(['tconst'])
      .agg({'nconst': lambda x: x.tolist(),'race': lambda x: x.tolist() })
      .reset_index())

In [14]:
df

Unnamed: 0,tconst,nconst,race
0,tt0035423,"[nm0000212, nm0413168, nm0000630, nm0005227]","[w, w, w, w]"
1,tt0118589,"[nm0001014, nm0073160, nm0066586, nm0004771]","[b, a, NL+O, w]"
2,tt0120467,"[nm0641168, nm0424682, nm0507915, nm0839486]","[w, NL+M, w, w]"
3,tt0120667,"[nm0344435, nm0004821, nm0262635, nm0004695]","[w, w, w, l]"
4,tt0120679,"[nm0000161, nm0000547, nm0001691, nm0535502]","[l, w, w, NL+O]"
...,...,...,...
3535,tt8772262,"[nm6073955, nm2930503, nm9859585, nm2860379]","[w, NL+O, w, w]"
3536,tt8946378,"[nm0185819, nm0262635, nm1869101, nm0000130]","[w, w, w, w]"
3537,tt9134216,"[nm1078479, nm1221047, nm0000491, nm0541932]","[a, NL+M, w, w]"
3538,tt9285882,"[nm7906702, nm1951953, nm8780185, nm1080139]","[NL+M, b, NL+O, NL+O]"


In [13]:
merged

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,...,budget,revenue,ratio,ROI,averageRating,numVotes,nconst,first_name,last_name,race
0,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2209370,Katie,Featherston,w
1,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2913790,Micah,Sloat,b
2,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2104166,Mark,Fredrichs,w
3,7363,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",...,230000,194183034,0.001184,844.274061,6.3,238846,nm2910808,Amber,Armstrong,NL+O
4,4907,tt2309260,movie,The Gallows,The Gallows,0,2015,,81,"Horror,Mystery,Thriller",...,100000,42664410,0.002344,426.644100,4.2,20611,nm3790547,Reese,Mishler,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14117,3185,tt0254703,movie,The Point Men,The Point Men,0,2001,,100,"Action,Drama",...,6100000,714,8543.417367,0.000117,4.3,1161,nm0531229,Cal,MacAninch,HL+M
14118,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,400000,2339,200000.000000,0.000005,5.7,2751,nm0000514,Michael,Madsen,w
14119,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,400000,2339,200000.000000,0.000005,5.7,2751,nm0674782,Harold,Perrineau,b
14120,2493,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",...,400000,2339,200000.000000,0.000005,5.7,2751,nm0072435,Amber,Benson,w


In [14]:
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder(
    categories='auto',  # Categories per feature
    drop=None, # Whether to drop one of the features
    sparse=True, # Will return sparse matrix if set True
    handle_unknown='error' # Whether to raise an error 
)         

OneHotEncoder()

In [15]:
ohe = OneHotEncoder()
transformed = ohe.fit_transform(merged[['race']])

In [16]:
transformed.toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [17]:
print(ohe.categories_)

[array(['HL+M', 'HL+O', 'NL+M', 'NL+O', 'a', 'b', 'i', 'l', 'w', nan],
      dtype=object)]


In [18]:
merged[ohe.categories_[0]] = transformed.toarray()

In [19]:
merged.columns

Index([         'index',         'tconst',      'titleType',   'primaryTitle',
        'originalTitle',        'isAdult',      'startYear',        'endYear',
       'runtimeMinutes',         'genres',        'tmdb_id',    'og_language',
               'budget',        'revenue',          'ratio',            'ROI',
        'averageRating',       'numVotes',         'nconst',     'first_name',
            'last_name',           'race',           'HL+M',           'HL+O',
                 'NL+M',           'NL+O',              'a',              'b',
                    'i',              'l',              'w',              nan],
      dtype='object')

In [20]:
merged = merged.drop([np.nan, "index"], axis=1)

In [21]:
merged

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,tmdb_id,...,race,HL+M,HL+O,NL+M,NL+O,a,b,i,l,w
0,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,...,w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,...,b,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,...,w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,tt1179904,movie,Paranormal Activity,Paranormal Activity,0,2007,,86,"Horror,Mystery",23827,...,NL+O,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,tt2309260,movie,The Gallows,The Gallows,0,2015,,81,"Horror,Mystery,Thriller",299245,...,w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14117,tt0254703,movie,The Point Men,The Point Men,0,2001,,100,"Action,Drama",46943,...,HL+M,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14118,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",2,...,w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14119,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",2,...,b,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14120,tt1270296,movie,The Killing Jar,The Killing Jar,0,2010,,92,"Crime,Mystery,Thriller",2,...,w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
movie_race_df= merged.groupby("tconst").agg({'HL+M': sum,'HL+O': sum, "NL+M":sum, "NL+O":sum,"a":sum, "b":sum, "i":sum, "l":sum,"w":sum})


In [25]:
movie_race_df

Unnamed: 0_level_0,HL+M,HL+O,NL+M,NL+O,a,b,i,l,w
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
tt0035423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
tt0118589,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
tt0120467,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0
tt0120667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
tt0120679,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
tt8772262,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
tt8946378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
tt9134216,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0
tt9285882,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0


In [26]:
movie_race_df["Total PC"] = movie_race_df.sum(axis=1)

In [27]:
movie_race_df

Unnamed: 0_level_0,HL+M,HL+O,NL+M,NL+O,a,b,i,l,w,Total PC
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0035423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0
tt0118589,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,4.0
tt0120467,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0
tt0120667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0
tt0120679,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...
tt8772262,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,4.0
tt8946378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0
tt9134216,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,4.0
tt9285882,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,4.0


In [28]:
movie_race_df["Total PC"].value_counts()

4.0     3490
3.0       18
1.0       13
2.0        9
7.0        4
8.0        3
10.0       1
5.0        1
9.0        1
Name: Total PC, dtype: int64

In [29]:
one = movie_race_df.drop(['Total PC'],1).values
two = movie_race_df['Total PC'].values[:,None]
movie_race_df['Impurity Index'] = 1 - pd.np.sum((one/two)**2, axis=1)
movie_race_df

  one = movie_race_df.drop(['Total PC'],1).values
  movie_race_df['Impurity Index'] = 1 - pd.np.sum((one/two)**2, axis=1)


Unnamed: 0_level_0,HL+M,HL+O,NL+M,NL+O,a,b,i,l,w,Total PC,Impurity Index
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0035423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.000
tt0118589,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,4.0,0.750
tt0120467,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.375
tt0120667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,0.375
tt0120679,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0,0.625
...,...,...,...,...,...,...,...,...,...,...,...
tt8772262,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,4.0,0.375
tt8946378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.000
tt9134216,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,4.0,0.625
tt9285882,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,4.0,0.625


In [30]:
movie_race_df.to_csv("impurity_index.csv")

In [31]:
movie_race_df.a.value_counts()

0.0    2843
1.0     593
2.0      87
3.0      12
4.0       3
5.0       1
8.0       1
Name: a, dtype: int64

In [40]:
movie_race_df[movie_race_df["Total PC"] > 5]

Unnamed: 0,tconst,HL+M,HL+O,NL+M,NL+O,a,b,i,l,w,Total PC,Impurity Index
400,tt0277895,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,4.0,7.0,0.571429
630,tt0330099,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,7.0,10.0,0.46
761,tt0363589,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,3.0,7.0,0.693878
1254,tt0460829,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,6.0,8.0,0.40625
2573,tt1801113,1.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,2.0,8.0,0.75
2919,tt2525596,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,1.0,7.0,0.44898
3002,tt2910300,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,8.0,0.0
3242,tt4415068,0.0,0.0,0.0,1.0,4.0,0.0,1.0,0.0,1.0,7.0,0.612245
3461,tt6774196,1.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,4.0,9.0,0.716049


In [33]:
movie_race_df.reset_index(inplace=True)

In [34]:
movie_race_df[movie_race_df.tconst == "tt1179904"]

Unnamed: 0,tconst,HL+M,HL+O,NL+M,NL+O,a,b,i,l,w,Total PC,Impurity Index
1927,tt1179904,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,4.0,0.625


In [35]:
merged[merged.tconst == "tt1179904"][['first_name', 'last_name', 'race']]

Unnamed: 0,first_name,last_name,race
0,Katie,Featherston,w
1,Micah,Sloat,b
2,Mark,Fredrichs,w
3,Amber,Armstrong,NL+O


In [36]:
cleaned_race_name_df

Unnamed: 0,first_name,last_name,NEW_ACTOR_RACE,race,nconst
0,Lauren,Bacall,w,NL+W,nm0000002
1,Marlon,Brando,w,HL+W,nm0000008
2,Gong,Li,a,NL+A,nm0000084
3,Armin,Mueller-Stahl,w,NL+W,nm0000090
4,John,Cleese,i,NL+I,nm0000092
...,...,...,...,...,...
5267,Eric,Borsuk,a,NL+A,nm9858133
5268,Vilhelm,Blomgren,w,NL+W,nm9859585
5269,Roman,Davis,b,NL+B,nm9877392
5270,Gabriel,Sky,NL+O,NL+O,nm9982380


In [None]:
# for i in genres:
#     df["is_" + i ] = df[df["gen"] == i]