## I) Preprocess the raw data

### Loading and cleaning the dataset

In [1]:
%matplotlib inline
import os
import pandas as pd
import requests
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
characters = pd.read_csv('./data/raw/CMU_movies/character.metadata.tsv', sep='\t', 
                         names=['wiki_movie_id', 'free_movie_id', 'release', 'char_name', 'actor_birth', 'actor_gender', 'actor_height',
                                 'actor_ethnicity', 'actor_name', 'age_at_release', 'free_map_id', 'free_char_id', 'free_actor_id'] )
movies = pd.read_csv('./data/raw/CMU_movies/movie.metadata.tsv', sep='\t', names=['wiki_movie_id', 'free_movie_id', 'movie_name', 'release', 
                                                                   'box_office', 'runtime', 'languages', 'countries', 'genres'])
summaries = df = pd.read_csv('./data/raw/CMU_movies/plot_summaries.txt', delimiter='\t', header=None, names=['movie_id', 'plot_summary'], encoding='utf-8')

FileNotFoundError: [Errno 2] No such file or directory: './data/raw/CMU_movies/character.metadata.tsv'

In [4]:
print(characters.shape)
print(movies['movie_name'].nunique())
print(summaries.shape)

(450669, 13)
75478
(42303, 2)


In [5]:
characters.head(2)

Unnamed: 0,wiki_movie_id,free_movie_id,release,char_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,age_at_release,free_map_id,free_char_id,free_actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4


In [6]:
movies.head(2)

Unnamed: 0,wiki_movie_id,free_movie_id,movie_name,release,box_office,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [7]:
summaries.head(2)

Unnamed: 0,movie_id,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...


In [8]:
print(characters.isnull().sum())

wiki_movie_id           0
free_movie_id           0
release              9995
char_name          257875
actor_birth        106145
actor_gender        45609
actor_height       295845
actor_ethnicity    344611
actor_name           1228
age_at_release     158113
free_map_id             0
free_char_id       257865
free_actor_id         815
dtype: int64


In [9]:
print(movies.isnull().sum())

wiki_movie_id        0
free_movie_id        0
movie_name           0
release           6902
box_office       73340
runtime          20450
languages            0
countries            0
genres               0
dtype: int64


In [10]:
print(summaries.isnull().sum())

movie_id        0
plot_summary    0
dtype: int64


### Cleaning the Dataset

In [11]:
# Standardize 'release' column to extract the correct year
def extract_year(release_date):
    try:
        # Attempt to convert to datetime and extract the year
        year = pd.to_datetime(release_date, errors='coerce').year
        if year is not pd.NaT:  # Check if the year is valid
            return year
        # If conversion fails, try extracting just the first 4 digits as year
        return int(str(release_date)[:4])
    except (ValueError, TypeError):
        return None  # Return None if extraction fails

# Apply the function to the 'release' column
movies['release'] = movies['release'].apply(extract_year)

movies.head(2)

Unnamed: 0,wiki_movie_id,free_movie_id,movie_name,release,box_office,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [12]:
print(movies['movie_name'].nunique())

75478


In [13]:
# We are checking whether an inner join is desirable or if we should use an outer join instead
missing_in_summaries = movies[~movies['wiki_movie_id'].isin(summaries['movie_id'])]
print(f"Movies missing in summaries: {len(missing_in_summaries)}")

missing_in_movies = summaries[~summaries['movie_id'].isin(movies['wiki_movie_id'])]
print(f"Summaries missing in movies: {len(missing_in_movies)}")

missing_in_characters = movies[~movies['wiki_movie_id'].isin(characters['wiki_movie_id'])]
print(f"Movies missing in characters: {len(missing_in_characters)}")


Movies missing in summaries: 39537
Summaries missing in movies: 99
Movies missing in characters: 17411


In [14]:
print(movies['movie_name'].nunique())
print(movies['movie_name'].isnull().sum())
print(summaries.shape)

75478
0
(42303, 2)


In [15]:
# Merge movies and summaries 
full_dataset = movies.merge(summaries, how='outer', left_on='wiki_movie_id', right_on='movie_id')
# Arbitrarily drop one of the 2 columns with same ids for movie
full_dataset.drop(columns=['movie_id'], inplace=True)

# Merge our movies-summary dataset with the characters one
full_dataset = full_dataset.merge(characters, how='outer', on='wiki_movie_id')
full_dataset.head(2)

Unnamed: 0,wiki_movie_id,free_movie_id_x,movie_name,release_x,box_office,runtime,languages,countries,genres,plot_summary,...,char_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,age_at_release,free_map_id,free_char_id,free_actor_id
0,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,...,,1941-07-30,F,,/m/03ttfc,Rosa Maria Sardà,54.0,/m/02vbt4w,,/m/0gh6sw
1,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,...,,1966,F,,,Mercè Pons,29.0,/m/02vb4j6,,/m/0267qhz


In [16]:
# Making sure we haven't lost any information due to the merge (we used to as we performed inner merge before)
print(full_dataset['movie_name'].nunique())
print(full_dataset['movie_name'].isnull().sum())

75478
99


In [17]:
# We realize the outer merge has created an empty row with only the plot summary for each of the summaries 
# without corresponding ids, they are unexploitable => remove them
full_dataset = full_dataset.dropna(subset=['movie_name'])

In [18]:
full_dataset.tail()

Unnamed: 0,wiki_movie_id,free_movie_id_x,movie_name,release_x,box_office,runtime,languages,countries,genres,plot_summary,...,char_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,age_at_release,free_map_id,free_char_id,free_actor_id
468075,37492363.0,/m/0ds7zbt,Cherries and Clover,2011.0,,86.0,{},"{""/m/0d060g"": ""Canada""}","{""/m/05p553"": ""Comedy film"", ""/m/07s9rl0"": ""Dr...","When Clover's ' childhood friend, Cherries ', ...",...,,,,,,Molly Cera,,/m/0g4tzm6,,/m/0g4tzm9
468076,37492363.0,/m/0ds7zbt,Cherries and Clover,2011.0,,86.0,{},"{""/m/0d060g"": ""Canada""}","{""/m/05p553"": ""Comedy film"", ""/m/07s9rl0"": ""Dr...","When Clover's ' childhood friend, Cherries ', ...",...,,,,,,Taylor Marie Milton,,/m/0g4tzmk,,/m/0g4tzmn
468077,37492363.0,/m/0ds7zbt,Cherries and Clover,2011.0,,86.0,{},"{""/m/0d060g"": ""Canada""}","{""/m/05p553"": ""Comedy film"", ""/m/07s9rl0"": ""Dr...","When Clover's ' childhood friend, Cherries ', ...",...,,,,,,Spencer Jenkins,,/m/0g4tzmx,,/m/0g4tzm_
468078,37501922.0,/m/0c0m5vt,Terminal Bliss,1992.0,,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",Two adolescent children of wealthy parents dea...,...,John Hunter,1966-10-11,M,1.765,,Luke Perry,25.0,/m/0gyqn_q,/m/0gyqn_s,/m/01g65g
468079,37501922.0,/m/0c0m5vt,Terminal Bliss,1992.0,,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",Two adolescent children of wealthy parents dea...,...,Craig Murphy,1969-07-28,F,1.72,/m/041rx,Alexis Arquette,22.0,/m/0h35_7c,/m/0h35_7g,/m/02zjrf


In [19]:
full_dataset['actor_birth'] = full_dataset['actor_birth'].apply(extract_year)

In [20]:
# Drop the columns that were in both dataframes but not specified by lefton and righton in the merge function call
# Also drop ids that have become irrelevant
full_dataset.drop(columns=['free_movie_id_y', 'release_y'], inplace=True)

# Rename two columns in a DataFrame
full_dataset.rename(columns={'free_movie_id_x': 'free_movie_id', 'release_x': 'release'}, inplace=True)

In [21]:
full_dataset.head(5)

Unnamed: 0,wiki_movie_id,free_movie_id,movie_name,release,box_office,runtime,languages,countries,genres,plot_summary,char_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,age_at_release,free_map_id,free_char_id,free_actor_id
0,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,1941.0,F,,/m/03ttfc,Rosa Maria Sardà,54.0,/m/02vbt4w,,/m/0gh6sw
1,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,1966.0,F,,,Mercè Pons,29.0,/m/02vb4j6,,/m/0267qhz
2,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,1944.0,F,,,Anna Lizaran,51.0,/m/02vc7_7,,/m/0263499
3,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,1935.0,F,,,Núria Espert,60.0,/m/02vbd74,,/m/0263yvy
4,3217.0,/m/014hr,Army of Darkness,1992.0,21502796.0,81.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01q03"": ""Cult"", ""/m/03npn"": ""Horror"", ""/m...","After being pulled through a time portal, Ash ...",S-Mart Clerk,1965.0,M,1.87,,Ted Raimi,26.0,/m/0hgcnkm,/m/0hgcnkq,/m/07qn0


In [22]:
full_dataset.isnull().sum()

wiki_movie_id           0
free_movie_id           0
movie_name              0
release             13205
box_office         366874
runtime             57941
languages               0
countries               0
genres                  0
plot_summary       155170
char_name          275286
actor_birth        123579
actor_gender        63020
actor_height       313256
actor_ethnicity    362022
actor_name          18639
age_at_release     175524
free_map_id         17411
free_char_id       275276
free_actor_id       18226
dtype: int64

In [23]:
# We see we can compute some of the NaNs in age at release due to having the birth year of the actor and the release date of the movie 
def compute_age_at_release(row):
    if np.isnan(row['age_at_release']) and pd.notna(row['release']) and pd.notna(row['actor_birth']):
        return row['release'] - row['actor_birth']
    return row['age_at_release']

# Apply the function to each row to fill missing 'age_at_release' values
full_dataset['age_at_release'] = full_dataset.apply(compute_age_at_release, axis=1)

In [24]:
full_dataset.isnull().sum()

wiki_movie_id           0
free_movie_id           0
movie_name              0
release             13205
box_office         366874
runtime             57941
languages               0
countries               0
genres                  0
plot_summary       155170
char_name          275286
actor_birth        123579
actor_gender        63020
actor_height       313256
actor_ethnicity    362022
actor_name          18639
age_at_release     132683
free_map_id         17411
free_char_id       275276
free_actor_id       18226
dtype: int64

In [25]:
# Creating a separate actor dataframe, since some of the actors information don't depend on the movie
rep_info = ['actor_name', 'free_actor_id', 'actor_birth', 'actor_gender', 'actor_ethnicity', 'actor_height']
actor = full_dataset[rep_info].drop_duplicates()

In [26]:
actor.head()

Unnamed: 0,actor_name,free_actor_id,actor_birth,actor_gender,actor_ethnicity,actor_height
0,Rosa Maria Sardà,/m/0gh6sw,1941.0,F,/m/03ttfc,
1,Mercè Pons,/m/0267qhz,1966.0,F,,
2,Anna Lizaran,/m/0263499,1944.0,F,,
3,Núria Espert,/m/0263yvy,1935.0,F,,
4,Ted Raimi,/m/07qn0,1965.0,M,,1.87


In [27]:
# Simplifying the final dataset by keeping a link between actor and itself via 'actor_name'
final_dataset = full_dataset.drop(columns=rep_info[1:])

In [28]:
final_dataset.head(4)

Unnamed: 0,wiki_movie_id,free_movie_id,movie_name,release,box_office,runtime,languages,countries,genres,plot_summary,char_name,actor_name,age_at_release,free_map_id,free_char_id
0,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,Rosa Maria Sardà,54.0,/m/02vbt4w,
1,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,Mercè Pons,29.0,/m/02vb4j6,
2,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,Anna Lizaran,51.0,/m/02vc7_7,
3,330.0,/m/0ktn59,Actrius,1996.0,,90.0,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","{""/m/07s9rl0"": ""Drama"", ""/m/01t_vv"": ""Comedy-d...",In order to prepare the role of an important o...,,Núria Espert,60.0,/m/02vbd74,


In [29]:
# Creating a separate movies_info dataframe, since some of the movie information don't vary for a single same movie
rep_info = ['movie_name', 'wiki_movie_id', 'free_movie_id', 'release', 'box_office', 'runtime', 'languages', 'countries', 'genres', 'plot_summary']
movies_info = full_dataset[rep_info].drop_duplicates()

In [30]:
# Simplifying the final dataset by keeping a link between movies_info and itself via 'movie_name'
final_dataset = final_dataset.drop(columns=rep_info[1:])

In [31]:
final_dataset.head()

Unnamed: 0,movie_name,char_name,actor_name,age_at_release,free_map_id,free_char_id
0,Actrius,,Rosa Maria Sardà,54.0,/m/02vbt4w,
1,Actrius,,Mercè Pons,29.0,/m/02vb4j6,
2,Actrius,,Anna Lizaran,51.0,/m/02vc7_7,
3,Actrius,,Núria Espert,60.0,/m/02vbd74,
4,Army of Darkness,S-Mart Clerk,Ted Raimi,26.0,/m/0hgcnkm,/m/0hgcnkq


In [32]:
# Set hierarchical indexing with 'movie_name' as the primary index and character-specific columns as secondary
final_dataset.set_index(['movie_name', 'char_name', 'actor_name'], inplace=True)

# Display the DataFrame with the hierarchical index
final_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age_at_release,free_map_id,free_char_id
movie_name,char_name,actor_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Actrius,,Rosa Maria Sardà,54.0,/m/02vbt4w,
Actrius,,Mercè Pons,29.0,/m/02vb4j6,
Actrius,,Anna Lizaran,51.0,/m/02vc7_7,
Actrius,,Núria Espert,60.0,/m/02vbd74,
Army of Darkness,S-Mart Clerk,Ted Raimi,26.0,/m/0hgcnkm,/m/0hgcnkq


In [33]:
# Let us remove the ids for languages countries and genres as they are no longer needed
import ast

def extract_values_if_str_dict(value):
    """Parses string as a dictionary and extracts values if possible, otherwise returns the original value."""
    try:
        # Attempt to parse the string as a dictionary
        parsed_value = ast.literal_eval(value)
        if isinstance(parsed_value, dict):
            return list(parsed_value.values())
    except (ValueError, SyntaxError):
        # Return the original value if parsing fails
        return value

# Apply the function to each relevant column
movies_info['languages'] = movies_info['languages'].apply(extract_values_if_str_dict)
movies_info['countries'] = movies_info['countries'].apply(extract_values_if_str_dict)
movies_info['genres'] = movies_info['genres'].apply(extract_values_if_str_dict)

# Display the modified DataFrame
movies_info.head()

Unnamed: 0,movie_name,wiki_movie_id,free_movie_id,release,box_office,runtime,languages,countries,genres,plot_summary
0,Actrius,330.0,/m/0ktn59,1996.0,,90.0,"[Catalan language, Spanish Language]",[Spain],"[Drama, Comedy-drama]",In order to prepare the role of an important o...
4,Army of Darkness,3217.0,/m/014hr,1992.0,21502796.0,81.0,[English Language],[United States of America],"[Cult, Horror, Stop motion, Costume drama, Act...","After being pulled through a time portal, Ash ..."
18,The Birth of a Nation,3333.0,/m/0151l,1915.0,50000000.0,190.0,"[Silent film, English Language]",[United States of America],"[Silent film, Indie, Costume drama, Epic, Blac...",The film follows two juxtaposed families: the...
32,Blade Runner,3746.0,/m/017n9,1982.0,33139618.0,116.0,"[Japanese Language, Cantonese, English Languag...","[United States of America, Hong Kong]","[Thriller, Cyberpunk, Science Fiction, Future ...","{{Hatnote}} In Los Angeles, November 2019, ret..."
47,Blazing Saddles,3837.0,/m/018f8,1974.0,119500000.0,93.0,"[Yiddish Language, English Language]",[United States of America],"[Western, Satire, Comedy]","In the American Old West of 1874, construction..."


In [34]:
# We will map the ethnicity id to its corresponding value
unique_ethnicities = actor['actor_ethnicity'].unique()

# Function to split the list into batches
def split_into_batches(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]

# Create an empty dictionary to store the mappings
freebase_to_wikidata_mapping = {}

# Iterate over batches of Freebase IDs
batch_size = 50  # Set batch size to 50 to avoid long URL issues
for batch in split_into_batches(unique_ethnicities, batch_size):
    # Create a batch SPARQL query
    query = """
    SELECT ?freebase_id ?item ?itemLabel WHERE {
      VALUES ?freebase_id {""" + " ".join([f'"{fb_id}"' for fb_id in batch]) + """}
      ?item wdt:P646 ?freebase_id.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    """

    # Endpoint for Wikidata SPARQL
    url = "https://query.wikidata.org/sparql"
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json"
    }

    # Send the request to Wikidata
    response = requests.get(url, headers=headers, params={"query": query, "format": "json"})

    # Check the status and response
    if response.status_code == 200:
        data = response.json()
        if 'results' in data and 'bindings' in data['results']:
            for result in data['results']['bindings']:
                freebase_id = result['freebase_id']['value']
                wikidata_id = result['item']['value'].split('/')[-1]
                label = result['itemLabel']['value']
                freebase_to_wikidata_mapping[freebase_id] = {
                    "wikidata_id": wikidata_id,
                    "label": label
                }
        else:
            print("No valid data found in response for this batch.")
    else:
        print(f"Error: Received status code {response.status_code} for batch starting with {batch[0]}")

# Assign labels for ethnicity using the batch lookup dictionary
actor['actor_ethnicity_label'] = actor['actor_ethnicity'].map(
    lambda x: freebase_to_wikidata_mapping.get(x, {}).get('label', 'Unknown')
)

In [35]:
males = actor['actor_gender'][actor['actor_gender']=='M'].count()
females = actor['actor_gender'][actor['actor_gender']=='F'].count()
print(males, females, actor['actor_gender'].isna().sum())
total = males + females
male_perc = males/total
female_perc = females/total

print(actor['actor_gender'].isnull().sum())


actor['actor_gender'] = actor['actor_gender'].fillna(
    lambda: np.random.choice(['M', 'F'], p=[male_perc, female_perc])
)



print(actor['actor_gender'].isnull().sum())

61519 35864 38378
38378
0


In [36]:
actor.head()

Unnamed: 0,actor_name,free_actor_id,actor_birth,actor_gender,actor_ethnicity,actor_height,actor_ethnicity_label
0,Rosa Maria Sardà,/m/0gh6sw,1941.0,F,/m/03ttfc,,Spaniards
1,Mercè Pons,/m/0267qhz,1966.0,F,,,Unknown
2,Anna Lizaran,/m/0263499,1944.0,F,,,Unknown
3,Núria Espert,/m/0263yvy,1935.0,F,,,Unknown
4,Ted Raimi,/m/07qn0,1965.0,M,,1.87,Unknown


In [37]:
print(actor['actor_name'].isnull().sum())

actor['actor_name'] = actor['actor_name'].fillna(
    lambda: "Unknown"
)


print(actor['actor_name'].isnull().sum())

339
0


In [38]:
actor.head()

Unnamed: 0,actor_name,free_actor_id,actor_birth,actor_gender,actor_ethnicity,actor_height,actor_ethnicity_label
0,Rosa Maria Sardà,/m/0gh6sw,1941.0,F,/m/03ttfc,,Spaniards
1,Mercè Pons,/m/0267qhz,1966.0,F,,,Unknown
2,Anna Lizaran,/m/0263499,1944.0,F,,,Unknown
3,Núria Espert,/m/0263yvy,1935.0,F,,,Unknown
4,Ted Raimi,/m/07qn0,1965.0,M,,1.87,Unknown


In [39]:
movies_info.head()

Unnamed: 0,movie_name,wiki_movie_id,free_movie_id,release,box_office,runtime,languages,countries,genres,plot_summary
0,Actrius,330.0,/m/0ktn59,1996.0,,90.0,"[Catalan language, Spanish Language]",[Spain],"[Drama, Comedy-drama]",In order to prepare the role of an important o...
4,Army of Darkness,3217.0,/m/014hr,1992.0,21502796.0,81.0,[English Language],[United States of America],"[Cult, Horror, Stop motion, Costume drama, Act...","After being pulled through a time portal, Ash ..."
18,The Birth of a Nation,3333.0,/m/0151l,1915.0,50000000.0,190.0,"[Silent film, English Language]",[United States of America],"[Silent film, Indie, Costume drama, Epic, Blac...",The film follows two juxtaposed families: the...
32,Blade Runner,3746.0,/m/017n9,1982.0,33139618.0,116.0,"[Japanese Language, Cantonese, English Languag...","[United States of America, Hong Kong]","[Thriller, Cyberpunk, Science Fiction, Future ...","{{Hatnote}} In Los Angeles, November 2019, ret..."
47,Blazing Saddles,3837.0,/m/018f8,1974.0,119500000.0,93.0,"[Yiddish Language, English Language]",[United States of America],"[Western, Satire, Comedy]","In the American Old West of 1874, construction..."


In [40]:
final_dataset.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age_at_release,free_map_id,free_char_id
movie_name,char_name,actor_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Actrius,,Rosa Maria Sardà,54.0,/m/02vbt4w,
Actrius,,Mercè Pons,29.0,/m/02vb4j6,
Actrius,,Anna Lizaran,51.0,/m/02vc7_7,
Actrius,,Núria Espert,60.0,/m/02vbd74,
Army of Darkness,S-Mart Clerk,Ted Raimi,26.0,/m/0hgcnkm,/m/0hgcnkq


In [41]:
useful_actors = actor[['actor_name', 'actor_gender', 'actor_ethnicity_label', 'actor_height']]
useful_movies = movies_info[['movie_name', 'release', 'box_office', 'runtime', 'languages', 'countries', 'genres', 'plot_summary']]
useful_final = final_dataset.reset_index()[['movie_name', 'actor_name', 'age_at_release']]

merge1 = useful_final.merge(useful_actors, on='actor_name', how='inner')

final_merged = merge1.merge(useful_movies, on='movie_name', how='inner')

print(useful_actors.shape, useful_movies.shape, useful_final.shape)
print(final_merged.shape)

(135761, 4) (81741, 8) (468080, 3)
(592933, 13)


In [42]:
import ast

# Convert the 'countries' column from a string representation to actual lists
final_merged['countries'] = final_merged['countries'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
final_merged['languages'] = final_merged['languages'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
final_merged['genres'] = final_merged['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [43]:
# Extracting the subdatasets based on the clusters defined below in order to make sure the global distr makes sense
oceanic_cluster = ["Australia", "New-Zealand"]
hollywood_cluster = ["United States of America", "Canada"]
indian_cluster = ["India"]
east_asian_cluster = ["Japan", "China", "South Korea", "Hong-Kong", "Taïwan"]
european_cluster = ["France", "Spain", "Italy", "united Kingdom"]


# Get a boolean mask for rows where the condition is met, handling non-iterable types
oceanic_mask = final_merged['countries'].apply(
    lambda x: any(country in oceanic_cluster for country in x) if isinstance(x, (list, np.ndarray)) else x
)

hollywood_mask = final_merged['countries'].apply(
    lambda x: any(country in hollywood_cluster for country in x) if isinstance(x, (list, np.ndarray)) else x
)

indian_mask = final_merged['countries'].apply(
    lambda x: any(country in indian_cluster for country in x) if isinstance(x, (list, np.ndarray)) else x
)

east_asian_mask = final_merged['countries'].apply(
    lambda x: any(country in east_asian_cluster for country in x) if isinstance(x, (list, np.ndarray)) else x
)

european_mask = final_merged['countries'].apply(
    lambda x: any(country in european_cluster for country in x) if isinstance(x, (list, np.ndarray)) else x
)

# Get the integer locations of these rows
oceanic_ilocs = np.where(oceanic_mask)[0]
hollywood_ilocs = np.where(hollywood_mask)[0]
indian_ilocs = np.where(indian_mask)[0]
east_asian_ilocs = np.where(east_asian_mask)[0]
european_ilocs = np.where(european_mask)[0]

oceanic_main = final_merged.iloc[oceanic_ilocs]
hollywood_main = final_merged.iloc[hollywood_ilocs]
indian_main = final_merged.iloc[indian_ilocs]
east_asian_main = final_merged.iloc[east_asian_ilocs]
european_main = final_merged.iloc[european_ilocs]

In [44]:
actor.isnull().sum()

actor_name                    0
free_actor_id                 1
actor_birth               77202
actor_gender                  0
actor_ethnicity          127610
actor_height             122905
actor_ethnicity_label         0
dtype: int64

We see that we are still missing a lot of ethnicities, birthdates and gender which is inherently due to our database, we will try later to remediate to this by using scrapping

### Saving the new cleaned Datasets

In [45]:
import os

# Specify the folder name
folder_names = ["data/cleaned_datasets", "data/clustered_clean"]


# Create the folder if it doesn't already exist
for folder_name in folder_names:
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f"Folder '{folder_name}' created successfully.")
    else:
        print(f"Folder '{folder_name}' already exists.")


Folder 'data/cleaned_datasets' already exists.
Folder 'data/clustered_clean' already exists.


In [46]:
# List of DataFrames and their respective filenames

flat_df = final_dataset.reset_index()

dataframes = {
    "movies_info.csv": movies_info,
    "main_df.csv": flat_df,
    "actor.csv": actor
}

dataframes_clustered = {
    "oceanic_df.csv": oceanic_main,
    "hollywood_df.csv": hollywood_main,
    "indian_df.csv": indian_main,
    "east_asian_df.csv": east_asian_main,
    "europe_df.csv": european_main
}

dfs = [dataframes, dataframes_clustered]

count = 0
for df in dfs:
    # Save each DataFrame to a CSV file in the new folder
    for filename, df in df.items():
        file_path = os.path.join(folder_names[count], filename)
        df.to_csv(file_path, index=False)
        print(f"DataFrame saved to {file_path}")
    count +=1


DataFrame saved to data/cleaned_datasets/movies_info.csv
DataFrame saved to data/cleaned_datasets/main_df.csv
DataFrame saved to data/cleaned_datasets/actor.csv
DataFrame saved to data/clustered_clean/oceanic_df.csv
DataFrame saved to data/clustered_clean/hollywood_df.csv
DataFrame saved to data/clustered_clean/indian_df.csv
DataFrame saved to data/clustered_clean/east_asian_df.csv
DataFrame saved to data/clustered_clean/europe_df.csv


In [47]:
print(movies_info.shape)

(81741, 10)


## II) Preprocess the regions subsets (Europe, East Asia, Bollywood, Hollywood, Oceanic)

**Genre categorisation : Movie Genre Dataset formation with 10 final main genres without NaN values for 'main genre' category**

In [48]:
# Define the genre mappings globally 
GENRE_MAPPING = {
    "Action/Adventure": [
        "Action", "Adventure", "Supernatural", "Space western", "Action/Adventure", 
        "War film", "Epic", "Period piece", "Wuxia", "Martial Arts Film", 
        "Western", "Adventure Comedy", "Historical Epic", "Action Comedy", 
        "Gangster Film", "Epic Western", "Action Thrillers", "Sword and sorcery", 
        "Heist", "Survival", "Sword and Sandal", "Spy", "Superhero movie", "Combat Films", "Fantasy Adventure", 
        "Sword and sorcery", "Time travel", "Doomsday film", "Escape Film", 
        "Prison", "Hybrid Western", "Costume Adventure", "Roadshow theatrical release",
        "Spaghetti Western", "Women in prison films", "Road movie", "Heist", "Biker Film", 
        "Swashbuckler films", "Cavalry Film", "Space opera", "Tokusatsu", 
        "Extreme Sports", "Apocalyptic and post-apocalyptic fiction", 
        "Chase Movie", "Revisionist Western", "Caper story", "Jungle Film", 
        "B-Western", "Travel", "Auto racing", "Roadshow/Carny", "Exploitation", "Sword and Sandal", "Star vehicle", 
        "Alien invasion", "Revenge", "Foreign legion", "Indian Western", 
        "Road-Horror", "Outlaw biker film", "Prison escape", "Acid western", 
        "War effort", "Horse racing", "Movies About Gladiators", "Beach Film",
        "Outlaw", "Ninja movie", "Buddy Picture", "Singing cowboy", 
        "Beach Party film", "Adventure", "Action/Adventure", "Epic Western", "War film", "Epic", "Period piece",
        "Disaster", "Ensemble Film", "War film", "Samurai cinema", "Live action"
    ],
    "Comedy": [
        "Comedy", "Romantic comedy", "Satire", "Slapstick", "Parody", "Black comedy", 
        "Mockumentary", "Adventure Comedy", "Comedy-drama", "Comedy film", 
        "Comedy horror", "Screwball comedy", "Comedy of Errors", "Romantic comedy", 
        "Domestic Comedy", "Musical comedy", "Crime Comedy", "Fantasy Comedy", "Comedy of Errors", "Comedy Western", 
        "Screwball comedy", "Domestic Comedy", "Musical comedy", "Comedy horror", 
        "Buddy film", "Sex comedy", "Parody", "Media Satire", "Gross-out film", 
        "Gross out", "Dogme 95", "Backstage Musical", "Heavenly Comedy", "Stand-up comedy", "Comedy Thriller", 
        "Workplace Comedy", "Humour", "Camp", "Mondo film", "Bloopers & Candid Camera", "Comdedy", 
        "Ealing Comedies", "Female buddy film", "Breakdance", "Kafkaesque", "Buddy Picture", "Chick flick",
        "Comedy", "Comedy-drama", "Parody", "Satire", "Musical", "Slapstick", "Comedy ", "Comedy film",
        "Comedy of manners", "Courtroom Comedy", " Comedy"
    ],
    "Drama": [
        "Drama", "Crime Drama", "Crime Fiction", "Biographical film", 
        "Historical fiction", "Family Drama", "Romantic drama", "Costume drama", 
        "Marriage Drama", "Political drama", "Courtroom Drama", "Legal drama", 
        "Historical drama", "Coming of age", "Culture & Society", "History", 
        "Social issues", "Tragicomedy", "Avant-garde", "Experimental film", 
        "New Hollywood", "Childhood Drama", "Melodrama", "Art film", "Political cinema", 
        "Tragedy", "Feminist Film", "Juvenile Delinquency Film", "Christian film",
        "Educational", "Language & Literature", "Linguistics", "Film à clef", 
        "Rockumentary", "Medical fiction", "Buddy cop", "Docudrama", 
        "Anthology", "Existentialism", "Social problem film", 
        "Slice of life story", "Kitchen sink realism", "British New Wave", 
        "Addiction Drama", "Inspirational Drama", "Illnesses & Disabilities", 
        "Interpersonal Relationships", "Expressionism", "Early Black Cinema", 
        "British Empire Film", "Northern", "Filmed Play", "Nature", "Mumblecore", "Boxing", "Business", "Journalism", 
        "Conspiracy fiction", "Crime", "Master Criminal Films", 
        "Feature film", "Cold War", "World History", "School story", "Patriotic film", "Statutory rape", "New Queer Cinema", "Neorealism", 
        "The Netherlands in World War II", "Homoeroticism", "Drama ", "Romantic drama", "Historical Epic", "Romance Film", "Family Film", "Biopic [feature]",
        "Social issues", "Bollywood", "Crime Fiction", "Television movie", "Filipino Movies", "Film adaptation", "Teen",
        "Blaxploitation", "Sexploitation", "Tollywood", "Erotic Drama", "Pre-Code", "Anti-war", "Anti-war film", "Surrealism", 
        "Bengali Cinema", "Race movie", "Hip hop movies", "Czechoslovak New Wave"
    ],
    "Thriller/Suspense": [
        "Thriller", "Mystery", "Psychological thriller", "Erotic thriller", 
        "Suspense", "Crime Thriller", "Psychological horror", "Noir", "Film noir", 
        "Future noir", "Crime Comedy", "Political thriller", "Detective fiction", "Detective", "Film", 
        "Film & Television History", "Propaganda film", "Political satire", 
        "Natural disaster", "Remake", "Plague", "Giallo", "Whodunit", "Demonic child",
        "Neo-noir", "Private military company", "Psycho-biddy", 
        "Psychological horror", "Sci-Fi Thriller", "Z movie", "Romantic thriller", "Point of view shot",
        "Thriller", "Psychological thriller", "Crime Thriller", "Crime Fiction"
    ],
    "Horror": [
        "Horror", "Zombie Film", "Slasher", "Monster movie", "Supernatural", 
        "Sci-Fi Horror", "Erotic Horror", "Gothic Film", "Natural horror films", 
        "Haunted House Film", "Horror Comedy", "Monster movie", "Natural horror films", "Creature Film", 
        "Gothic Film", "Costume Horror", "Haunted House Film", "Monster", "Demonic child",
        "Splatter film", "Werewolf fiction", "Period Horror", 
        "Albino bias", "Vampire movies", "Revisionist Fairy Tale", "Goat gland",
        "Horror", "Sci-Fi Horror", "Erotica", "Sexploitation"
    ],
    "Science Fiction (Sci-Fi)": [
        "Science Fiction", "Sci-Fi", "Space western", "Alien", "Cyberpunk", 
        "Dystopia", "Sci-Fi Horror", "Apocalyptic and post-apocalyptic fiction", 
        "Alien invasion", "Time travel", "Dystopia", "Time travel", "Sci-Fi Adventure", "Alien Film", 
        "Apocalyptic and post-apocalyptic fiction", "Science fiction Western", "Cyberpunk", "Steampunk",
        "Sci Fi Pictures original films", "Therimin music", "Science Fiction", "Sci-Fi", "Reboot" 
    ],
    "Fantasy": [
        "Fantasy", "Magical", "Mythical", "Urban Fantasy", "Children's Fantasy", 
        "Sword and sorcery", "Fairy tale", "Mythological Fantasy", "Fantasy Adventure",
        "Romantic fantasy", "Fairy tale", "Sword and sorcery films", "Mythological Fantasy", 
        "Fantasy Drama", "Heaven-Can-Wait Fantasies", "Revisionist Fairy Tale",
        "Fantasy", "Magical"
    ],
    "Romance": [
        "Romantic", "Romance Film", "Romantic comedy", "Romantic drama", "Gay", 
        "Gay Interest", "Gay Themed", "Romantic fantasy", "Interpersonal Relationships",
        "Family & Personal Relationships", "Romantic thriller", "Romance Film", "Romantic drama",
        "Music", "LGBT", "Gay pornography", "Romance Film", "Punk rock", "Pornographic movie", "Adult",
        "Pornography", "Hardcore pornography", "Dance"
    ],
    "Documentary": [
        "Documentary", "Biography", "Biographical film", "Historical Account", 
        "Docudrama", "History", "Educational", "Inspirational Drama",
        "Educational", "Language & Literature", "Rockumentary", "Anthropology", 
        "Religious Film", "Environmental Science", "Essay Film", "Graphic & Applied Arts", 
        "Libraries and librarians", "Historical Documentaries", "Political Documetary", "Education", 
        "World History", "News", "Archives and records", "Media Studies", 
        "Inventions & Innovations", "Instrumental Music", "The Netherlands in World War II",
        "Documentary", "Social issues", "Documentary", "Music", "Concert film", "Sports",
        "Sponsored film", "Docudrama "
    ],
    "Animation/Family": [
        "Animation", "Family Film", "Children's/Family", "Children's Fantasy", 
        "Cartoon", "Animated", "Musical comedy", "Family-Oriented Adventure",
        "Anime", "Children's", "Computer Animation", "Stop motion", "Animated Musical", 
        "Animated cartoon", "Tamil cinema", "Animals", "Anima l Picture", "Pinku eiga",
        "Children's Entertainment", "Children's Issues", "Clay animation", 
        "Supermarionation", "Silhouette animation", "Jukebox musical", 
        "Operetta", "Parkour in popular culture", "Family Film", "Children's/Family",
        "Chinese Movies", "Christmas movie", "Animation "
    ]
}

FALLBACK_MAPPING = {
     "Short Film": "Drama",
    "Indie": "Drama",
    "Black-and-white": "Drama",
    "Silent film": "Drama",
    "Fan film": "Fantasy",
    "Cult": "Thriller/Suspense",
    "Experimental film": "Drama",
    "Satire": "Comedy",
    "Erotic": "Drama",
    "Softcore Porn": "Romance",
    "Historical Epic": "Action/Adventure",
    "Educational": "Documentary",
    "Japanese Movies": "Animation/Family",
    "World cinema": "Drama",
    "Musical": "Animation/Family",
    "Bollywood": "Drama",
    "Epic Western": "Action/Adventure",
}

EXTENDED_FALLBACK_MAPPING = {
    **FALLBACK_MAPPING,
    "Anthology": "Drama",
    "Existentialism": "Drama",
    "Social problem film": "Documentary",
    "Sci-Fi Adventure": "Science Fiction (Sci-Fi)",
    "Fantasy Adventure": "Fantasy",
    "Horror Comedy": "Horror",
    "Romantic thriller": "Romance",
    "Comedy Thriller": "Thriller/Suspense",
    "Steampunk": "Science Fiction (Sci-Fi)",
    "Historical Documentaries": "Documentary",
    "Art film": "Drama",
    "Music": "Documentary",
}

def map_main_genre(dataset, genre_column='genres', genre_mapping=GENRE_MAPPING, fallback_mappings=None):
    
    # Create reverse mapping from sub-genres to main genres
    sub_genre_to_main = {sub_genre: main_genre for main_genre, sub_genres in genre_mapping.items() for sub_genre in sub_genres}

    # Function to assign main genre based on sub-genres
    def assign_main_genre(genres):
        if not isinstance(genres, list) or len(genres) == 0:
            return "Other"  # Handle invalid or empty lists

        main_genre_counts = {}
        for sub_genre in genres:
            main_genre = sub_genre_to_main.get(sub_genre, "Other")
            main_genre_counts[main_genre] = main_genre_counts.get(main_genre, 0) + 1

        max_count = max(main_genre_counts.values())
        tied_genres = [genre for genre, count in main_genre_counts.items() if count == max_count]
        return sorted(tied_genres)[0] if tied_genres else "Other"

    # Apply main genre assignment to the specified column
    dataset.loc[:, 'main_genre'] = dataset[genre_column].apply(assign_main_genre)

    # Apply fallback mappings if provided
    if fallback_mappings:
        for mapping in fallback_mappings:
            dataset.loc[dataset['main_genre'] == "Other", 'main_genre'] = (
                dataset.loc[dataset['main_genre'] == "Other", genre_column]
                .apply(lambda x: reassign_genres(x, mapping))
            )

    return dataset

def reassign_genres(genres, mapping):
    """Helper function to reassign genres using a specified mapping."""
    if not isinstance(genres, list):
        return "Other"

    for sub_genre in genres:
        if sub_genre in mapping:
            return mapping[sub_genre]
    return "Other"


In [49]:
european_main = map_main_genre(european_main, genre_column='genres', fallback_mappings=[FALLBACK_MAPPING, EXTENDED_FALLBACK_MAPPING])
oceanic_main = map_main_genre(oceanic_main, genre_column='genres', fallback_mappings=[FALLBACK_MAPPING, EXTENDED_FALLBACK_MAPPING])
indian_main = map_main_genre(indian_main, genre_column='genres', fallback_mappings=[FALLBACK_MAPPING, EXTENDED_FALLBACK_MAPPING])
east_asian_main = map_main_genre(east_asian_main, genre_column='genres', fallback_mappings=[FALLBACK_MAPPING, EXTENDED_FALLBACK_MAPPING])
oceanic_main.head()

Unnamed: 0,movie_name,actor_name,age_at_release,actor_gender,actor_ethnicity_label,actor_height,release,box_office,runtime,languages,countries,genres,plot_summary,main_genre
955,Miss Congeniality,Benjamin Bratt,36.0,M,English Americans,1.88,2000.0,,109.0,"[French Language, Spanish Language, Russian La...","[United States of America, Australia]","[Romantic comedy, Thriller, Action/Adventure, ...",The film opens at a school where a boy is pick...,Action/Adventure
956,Miss Congeniality,Michael Caine,67.0,M,Unknown,1.88,2000.0,,109.0,"[French Language, Spanish Language, Russian La...","[United States of America, Australia]","[Romantic comedy, Thriller, Action/Adventure, ...",The film opens at a school where a boy is pick...,Action/Adventure
957,Miss Congeniality,Candice Bergen,54.0,F,White Americans,1.71,2000.0,,109.0,"[French Language, Spanish Language, Russian La...","[United States of America, Australia]","[Romantic comedy, Thriller, Action/Adventure, ...",The film opens at a school where a boy is pick...,Action/Adventure
958,Miss Congeniality,Ernie Hudson,54.0,M,African Americans,1.829,2000.0,,109.0,"[French Language, Spanish Language, Russian La...","[United States of America, Australia]","[Romantic comedy, Thriller, Action/Adventure, ...",The film opens at a school where a boy is pick...,Action/Adventure
959,Miss Congeniality,Heather Burns,25.0,F,Unknown,1.74,2000.0,,109.0,"[French Language, Spanish Language, Russian La...","[United States of America, Australia]","[Romantic comedy, Thriller, Action/Adventure, ...",The film opens at a school where a boy is pick...,Action/Adventure


In [50]:
def plot_top_ethnicities(dataframe, ethnicity_column='actor_ethnicity_label', region_name='Region'):
    # Count the occurrences of each ethnicity, excluding 'Unknown'
    ethnicity_counts = dataframe[dataframe[ethnicity_column] != 'Unknown'][ethnicity_column].value_counts()

    # Select the top 10 most frequent ethnicities
    top_ethnicities = ethnicity_counts.head(10)

    # Create a DataFrame to use 'hue'
    top_ethnicities_df = top_ethnicities.reset_index()
    top_ethnicities_df.columns = ['Ethnicity', 'Count']

    # Plotting the distribution using a bar chart
    plt.figure(figsize=(14, 8))
    sns.barplot(data=top_ethnicities_df, x='Ethnicity', y='Count', palette='viridis', hue='Ethnicity', dodge=False, legend=False)
    plt.title(f'Top 10 Most Frequent Ethnicities in Characters Dataset ({region_name})', fontsize=18)
    plt.xlabel('Ethnicity', fontsize=14)
    plt.ylabel('Number of Actors', fontsize=14)
    plt.xticks(rotation=30, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    sns.despine()
    plt.tight_layout()
    plt.show()


**1. Europe**

**Classify Europe into sub-regions**

In [51]:
# define european countries
european_countries = ['Slovakia', 'Estonia', 'Bulgaria', 'Scotland', 'England', 'Slovak Republic', 
                      'Luxembourg', 'Netherlands', 'Ukraine', 'Monaco', 'Switzerland', 'Italy',
                      'Kingdom of Great Britain', 'Isle of Man', 'Northern Ireland', 'Ireland',
                      'Sweden', 'Albania', 'France', 'Poland', 'Slovenia', 'Romania', 'Serbia',
                      'Croatia', 'United Kingdom', 'Republic of Macedonia', 'Denmark', 
                      'Czech Republic', 'Austria', 'Spain', 'Russia', 'Bosnia and Herzegovina', 
                      'Czechoslovakia', 'Portugal', 'Iceland', 'Yugoslavia', 'Malta', 'Wales', 
                      'Georgia', 'Cyprus', 'Lithuania', 'Greece', 'Belgium', 'Hungary', 'Germany', 
                      'Norway', 'Finland', 'Montenegro']

In [52]:
# Region mapping
region_mapping = {
    'east_europe': [
        'Slovakia', 'Slovak Republic', 'Ukraine', 'Estonia', 'Poland', 
        'Bulgaria', 'Romania', 'Serbia', 'Croatia', 'Albania', 
        'Republic of Macedonia', 'Czech Republic', 'Russia', 
        'Bosnia and Herzegovina', 'Czechoslovakia', 'Yugoslavia', 
        'Georgia', 'Lithuania', 'Hungary', 'Montenegro'
    ],
    'west_europe': [
        'Scotland', 'England', 'Luxembourg', 'Netherlands', 'Monaco', 
        'Switzerland', 'Italy', 'Kingdom of Great Britain', 'Isle of Man', 
        'Northern Ireland', 'Ireland', 'France', 'Slovenia', 
        'United Kingdom', 'Austria', 'Spain', 'Malta', 'Wales', 
        'Cyprus', 'Greece', 'Belgium', 'Germany', 'Portugal'
    ],
    'nordic_europe': [
        'Sweden', 'Denmark', 'Norway', 'Finland', 'Iceland'
    ]
}

# Reverse mapping 
country_to_region = {country: region for region, countries in region_mapping.items() for country in countries}

# Function to count regions and assign the biggest one
def assign_region(countries):
    # Ensure input is a list
    if not isinstance(countries, list):
        print(f"Warning: Expected list but got {type(countries)}. Value: {countries}")
        return "Unknown"  # Handle unexpected data types
    
    # Initialize counts for each region
    region_counts = {'east_europe': 0, 'west_europe': 0, 'nordic_europe': 0}
    
    # Count the occurrence of each region
    for country in countries:
        region = country_to_region.get(country)
        if region:
            region_counts[region] += 1
    
    # Find the region with the maximum count
    max_count = max(region_counts.values())
    # Get all regions that have the maximum count
    regions_with_max_count = [region for region, count in region_counts.items() if count == max_count]
    
    # If there's a tie, pick the first in alphabetical order
    return sorted(regions_with_max_count)[0] if max_count > 0 else "Unknown"

# Ensure 'countries' column is properly formatted
if isinstance(european_main['countries'].iloc[0], str):
    european_main['countries'] = european_main['countries'].apply(lambda x: x.split(',') if isinstance(x, str) else x)

# Apply the function to assign regions using .loc
european_main['region'] = european_main['countries'].apply(assign_region)


**Ethnicity categorization**

In [53]:
europe_data_ethnicity = european_main[european_main['actor_ethnicity_label'] != "Unknown"]

# Caucasian/white
caucasian_ethnicities = [
    'Spaniards', 'Anglo-Celtic Australians', 'White Americans', 'Irish Americans',
    'Italian Americans', 'Slovak Americans', 'European Americans', 'Swedes',
    'Czech Americans', 'Dutch Americans', 'English Americans', 'Italians',
    'Hungarian Americans', 'White people', 'French', 'Australians',
    'Polish Americans', 'Germans', 'English people', 'Scottish Americans',
    'Spanish Americans', 'British', 'Croats', 'German Americans',
    'French Americans', 'Danes', 'Welsh people', 'Scottish people',
    'Australian Americans', 'Swiss', 'White British', 'Swedish Americans',
    'Danish Americans', 'Russian Americans', 'Lithuanian Americans',
    'Austrians', 'Bosnians', 'Norwegians', 'French Canadians',
    'Scottish Canadians', 'Croatian Americans', 'Icelanders',
    'Slovene Americans', 'Sicilian Americans', 'Finns', 'Dutch',
    'Austrian Americans', 'Ukrainian Americans', 'Swedish-speaking population of Finland',
    'Uruguayans', 'Anglo-Irish people', 'Portuguese', 'Scandinavian Americans',
    'Bulgarians', 'Greek Canadians', 'German Canadians', 'Greek Americans',
    'Norwegian Americans', 'Irish Canadians', 'Serbian Canadians', 'Galicians',
    'White Africans of European ancestry', 'Irish Australians', 'Italian Canadians',
    'Belarusians', 'Poles', 'Czechs', 'Welsh Americans', 'Latvians',
    'Serbs of Bosnia and Herzegovina', 'Serbs of Croatia', 'Austrians in the United Kingdom',
    'Corsicans', 'Greek Cypriots', 'Welsh Italians', 'Bulgarian Canadians',
    'Belgians', 'Serbian Australians', 'Albanians', 'Polish Canadians',
    'Basque people', 'Slavs', 'Aromanians', 'Transylvanian Saxons', 'Rusyn American',
    'Catalans', 'Italian Australians', 'Bolivian Americans', 'White Latin American',
    'Portuguese Americans', 'Ukrainians', 'Dalmatian Italians', 'Scotch-Irish Americans',
    'English Australian', 'Scottish Australians', 'Russians',
    'Canadians in the United Kingdom', 'British Americans', 'Kiwi', 'Serbs in the United Kingdom',
    'Croatian Australians', 'names of the Greeks', 'Bosniaks', 'Serbian Americans', 'Americans', 
    'Irish migration to Great Britain', 'Albanian Americans', 'Romanichal', 'Cajun'
]


# Arab/Middle Eastern
arab_ethnicities = [
    'Israelis', 'Lebanese people', 'Lebanese Americans', 'Sudanese Arabs',
    'Syrian Americans', 'Palestinians in the United States', 'Arab Americans',
    'Arabs in Bulgaria', 'Moroccans', 'Kurds', 'Azerbaijanis', 'Israeli Americans',
    'Lebanese people in the United Kingdom', 'Iranians in the United Kingdom',
    'Iranian peoples', 'Armenians of Russia', 'Assyrian people', 'Turkish Americans',
    'Armenians', 'Armenians in Italy', 'Armenian Americans', 'Copts'
]


# African/Black
african_ethnicities = [
    'African Americans', 'Ghanaian Americans', 'Sudanese Arabs', 'Somalis',
    'Afro Trinidadians and Tobagonians', 'Berber', 'Ghanaian', 'Black people',
    'Kabyle people', 'Mandinka people', 'Moroccans', 'Black Canadians',
    'Wolof people', 'African people', 'Xhosa people', 'Buryats', 'Malagasy people',
    'Haitian Americans', 'Yoruba people', 'Black Irish' , 'British Nigerian', 
    'Guyanese Americans', 'British Jamaicans', 'South African Americans', 
    'Louisiana Creole people', 'Chinese Jamaicans'
]


# South Asian
south_asian_ethnicities = [
    'Indian Americans', 'Indians', 'Sindhis', 'Tamil', 'British Indian',
    'Punjabis', 'Punjabi diaspora', 'Tamil Brahmin', 'Gujarati people',
    'Bengali', 'Bengali Brahmins', 'Indian diaspora in France',
    'Telugu people', 'Malayali', 'Pathani', 'Afghans in India',
    'Sri Lankan Tamil', 'Kayastha', 'Jaat', 'Kashmiri Pandit', 'Marathi people',
    'Hindu', 'Rohilla', 'Nair', 'Ezhava', 'Mudaliar', 'Kanyakubja Brahmins',
    'Chitrapur Saraswat Brahmin', 'Niyogi', 'Bunt (RAJPUT)', 'Sikh', 'Parsi',
    'Indo-Canadians', 'Marwari people'
]


# East Asian
east_asian_ethnicities = [
    'Japanese Americans', 'Chinese Americans', 'Manchu', 'British Chinese',
    'Chinese Singaporeans', 'Hongkongers', 'Taiwanese people', 'Koreans',
    'Korean Americans', 'Taiwanese Americans', 'Chinese Canadians',
    'Chinese Filipino', 'Ryukyuan people', 'Tibetan people'
]


# Southeast Asian
southeast_asian_ethnicities = [
    'Malaysian Chinese', 'Singaporeans', 'Thai Americans', 'Thai Chinese',
    'Vietnamese Americans', 'Vietnamese people', 'Filipino people',
    'Filipino Americans', 'Filipino Australians', 'Cambodian Americans',
    'Samoan Americans', 'Pacific Islander Americans'
]


# Latino/Hispanic
latino_ethnicities = [
    'Mexican Americans', 'Puerto Ricans', 'Latin American British', 'Venezuelan Americans',
    'Cuban Americans', 'Uruguayans', 'Mexicans', 'Hispanic and Latino Americans',
    'Argentines', 'Dominican Americans', 'Venezuelans', 'Hispanic',
    'Panamanian Americans', 'Castilians', 'Chileans', 'Chileans in the United Kingdom',
    'Peruvians in the United Kingdom', 'Latino', 'French Chilean', 'Italian Brazilians',
    'Criollo people', 'Colombian Americans', 'Colombian Australian', 'Stateside Puerto Ricans',
    'Cubans', 'Hondurans'
]


# Indigenous Peoples
indigenous_ethnicities = [
    'Apache', 'Mohawk', 'Blackfoot Confederacy', 'Cherokee',
    'Indigenous peoples of the Americas', 'Native Americans in the United States',
    'First Nations', 'Native Hawaiians', 'Aboriginal Australians', 'Māori',
    'Ojibwe', 'Dene', 'Malagasy people', 'Gin people', 'Sámi people'
]


# Jewish
jewish_ethnicities = [
    'Jewish people', 'British Jews', 'American Jews', 'Ashkenazi Jews',
    'Israeli Jews', 'Sephardi Jews', 'Moroccan Jews', 'Lithuanian Jews'
]


# Ethnicities that are not categorized due to being too broad
other_ethnicities = [
    'multiracial American', 'multiracial people', 'Eurasian', 'Q31340083'
]

In [54]:
def classify_actor_ethnicity(df):  
    df = df.copy()  # Create a copy to avoid modifying the original DataFrame
    df.loc[:, "actor_ethnicity_classification"] = df["actor_ethnicity_label"].apply(
        lambda x: "Caucasians" if x in caucasian_ethnicities else (
            "Arabs / Middle Easterns" if x in arab_ethnicities else (
                "Africans" if x in african_ethnicities else (
                    "South Asians" if x in south_asian_ethnicities else (
                        "East Asians" if x in east_asian_ethnicities else (
                            "Southeast Asian" if x in southeast_asian_ethnicities else (
                                "Latinos" if x in latino_ethnicities else (
                                    "Indigenous People" if x in indigenous_ethnicities else (
                                        "Jewish People" if x in jewish_ethnicities else 'Unknown'
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
    return df

europe_data = classify_actor_ethnicity(european_main)
display(europe_data[['actor_name', 'actor_ethnicity_classification']].head(20))


Unnamed: 0,actor_name,actor_ethnicity_classification
0,Rosa Maria Sardà,Caucasians
1,Mercè Pons,Unkown
2,Anna Lizaran,Unkown
3,Núria Espert,Unkown
907,Jacques Branchu,Unkown
908,Jean Négroni,Unkown
909,Hélène Chatelain,Unkown
910,Davos Hanich,Unkown
911,Philbert von Lifchitz,Unkown
912,Ligia Branice,Unkown


In [55]:
europe_data.dropna(subset=['actor_ethnicity_classification'], inplace=True)
actor_name_unique_df = europe_data.drop_duplicates(subset='actor_name')
actor_name_unique_df.head()

Unnamed: 0,movie_name,actor_name,age_at_release,actor_gender,actor_ethnicity_label,actor_height,release,box_office,runtime,languages,countries,genres,plot_summary,main_genre,region,actor_ethnicity_classification
0,Actrius,Rosa Maria Sardà,54.0,F,Spaniards,,1996.0,,90.0,"[Catalan language, Spanish Language]",[Spain],"[Drama, Comedy-drama]",In order to prepare the role of an important o...,Comedy,west_europe,Caucasians
1,Actrius,Mercè Pons,29.0,F,Unknown,,1996.0,,90.0,"[Catalan language, Spanish Language]",[Spain],"[Drama, Comedy-drama]",In order to prepare the role of an important o...,Comedy,west_europe,Unkown
2,Actrius,Anna Lizaran,51.0,F,Unknown,,1996.0,,90.0,"[Catalan language, Spanish Language]",[Spain],"[Drama, Comedy-drama]",In order to prepare the role of an important o...,Comedy,west_europe,Unkown
3,Actrius,Núria Espert,60.0,F,Unknown,,1996.0,,90.0,"[Catalan language, Spanish Language]",[Spain],"[Drama, Comedy-drama]",In order to prepare the role of an important o...,Comedy,west_europe,Unkown
907,La Jetée,Jacques Branchu,,M,Unknown,,1962.0,,28.0,"[French Language, German Language]",[France],"[Science Fiction, World cinema, Experimental f...",A man is a prisoner in the aftermath of the ...,Drama,west_europe,Unkown


In [56]:
# Classification of ethnicity + separation into sub-regions
west_europe_data = europe_data[europe_data['region']== 'west_europe']
east_europe_data = europe_data[europe_data['region']== 'east_europe']
nordic_europe_data = europe_data[europe_data['region']== 'nordic_europe']

# Export to a CSV file
europe_data.to_csv("data/final/europe/CMU/"+"europe_data.csv", index=False) 
west_europe_data.to_csv("data/final/europe/CMU/"+"west_europe_data.csv", index=False) 
east_europe_data.to_csv("data/final/europe/CMU/"+"east_europe_data.csv", index=False) 
nordic_europe_data.to_csv("data/final/europe/CMU/"+"nordic_europe_data.csv", index=False) 

nordic_europe_data.head()

Unnamed: 0,movie_name,actor_name,age_at_release,actor_gender,actor_ethnicity_label,actor_height,release,box_office,runtime,languages,countries,genres,plot_summary,main_genre,region,actor_ethnicity_classification
29474,Dogville,Blair Brown,56.0,F,Unknown,1.73,2003.0,16680836.0,138.0,[English Language],"[Denmark, Sweden, Norway, France, United Kingd...","[Thriller, Mystery, Drama]",,Thriller/Suspense,nordic_europe,Unkown
29475,Dogville,Jeremy Davies,33.0,M,Unknown,1.75,2003.0,16680836.0,138.0,[English Language],"[Denmark, Sweden, Norway, France, United Kingd...","[Thriller, Mystery, Drama]",,Thriller/Suspense,nordic_europe,Unkown
29476,Dogville,Ben Gazzara,72.0,M,Italian Americans,1.79,2003.0,16680836.0,138.0,[English Language],"[Denmark, Sweden, Norway, France, United Kingd...","[Thriller, Mystery, Drama]",,Thriller/Suspense,nordic_europe,Caucasians
29477,Dogville,Philip Baker Hall,71.0,M,Unknown,1.68,2003.0,16680836.0,138.0,[English Language],"[Denmark, Sweden, Norway, France, United Kingd...","[Thriller, Mystery, Drama]",,Thriller/Suspense,nordic_europe,Unkown
29478,Dogville,Siobhan Fallon,42.0,F,Unknown,,2003.0,16680836.0,138.0,[English Language],"[Denmark, Sweden, Norway, France, United Kingd...","[Thriller, Mystery, Drama]",,Thriller/Suspense,nordic_europe,Unkown


#### Preprocessing for Real-World dataset

In [57]:
## Real-world loading 
DATA_FOLDER = './data/raw/real_world/'
REALWORLD_ETHNIC_DATASET = DATA_FOLDER+"ethnic_power_relations.csv"
REALWORLD_MALE_DATASET = DATA_FOLDER+"male_population_data.xlsx"
REALWORLD_FEMALE_DATASET = DATA_FOLDER+"female_population_data.xlsx"
REALWORLD_BOTHSEXES_DATASET = DATA_FOLDER+"bothsexes_population_data.xlsx"

ethnic_realworld = pd.read_csv(REALWORLD_ETHNIC_DATASET)
pop_male_realworld = pd.read_excel(REALWORLD_MALE_DATASET, skiprows=16, dtype=str)
pop_female_realworld = pd.read_excel(REALWORLD_FEMALE_DATASET, skiprows=16, dtype=str)
pop_bothsexes_realworld = pd.read_excel(REALWORLD_BOTHSEXES_DATASET, skiprows=16, dtype=str)

In [58]:
columns_to_convert_male = pop_male_realworld.columns[11:] # only the number columns
columns_to_convert_female = pop_female_realworld.columns[11:] # only the number columns
columns_to_convert_both = pop_bothsexes_realworld.columns[11:] # only the number columns

for col in columns_to_convert_male:
    pop_male_realworld[col] = (
        pop_male_realworld[col]
        .replace('...', np.nan)  
        .astype(float)  
        .round() 
        .fillna(0)  
        .astype(int) 
    )
for col in columns_to_convert_female:
    pop_female_realworld[col] = (
        pop_female_realworld[col]
        .replace('...', np.nan)  
        .astype(float)  
        .round() 
        .fillna(0) 
        .astype(int) 
    )
for col in columns_to_convert_both:
    pop_bothsexes_realworld[col] = (
        pop_bothsexes_realworld[col]
        .replace('...', np.nan)  
        .astype(float)  
        .round()
        .fillna(0)  
        .astype(int) 
    )

In [59]:
def process_ethnic_group_data(df):
    
    # Combine 'from' and 'to' columns into a new column 'from to'
    df.loc[:, "from to"] = df["from"].astype(str) + "-" + df["to"].astype(str)
    
    # Group by 'from to', 'group', and 'size', then count occurrences
    grouped_data = df.groupby(['from to', 'group', 'size']).size()
    
    # Reset index to convert groupings to columns
    df = grouped_data.reset_index(name='counts')   
    return df


In [60]:
def process_grouped_averages_by_columns(df, group_col, drop_cols=None, decimals=2):
    
    # Drop specified columns if provided
    if drop_cols:
        drop_cols = [drop_cols] if isinstance(drop_cols, str) else drop_cols
        df = df.drop(columns=drop_cols, errors='ignore')

    # Select numeric columns only
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    
    # Group by the specified column and calculate the mean for numeric columns
    grouped_averages = (
        df.groupby(group_col)[numeric_cols]
        .mean()
        .reset_index()
    )

    # Round the averages to the specified number of decimals
    grouped_averages = grouped_averages.round(decimals)

    return grouped_averages


**Europe**

**We processed the real-world European ethnicity data this way**

The European film industry is often linked to the diverse population across Europe. We aimed to verify whether it accurately reflects the demographics of its population, specifically ethnicity, age, and gender distribution.

For the ethnicity analysis, we extracted real-world European ethnicity data from this source, seeking a dataset that included information about the timeframe (ideally from approximately 1950 to 2012) of ethnic proportions across different European countries: https://icr.ethz.ch/data/epr/core/

In [61]:
europe_ethnic_realworld = ethnic_realworld[
    ethnic_realworld["statename"].isin(['Slovakia', 'Estonia', 'Bulgaria', 'Scotland', 'England', 'Slovak Republic', 
                      'Luxembourg', 'Netherlands', 'Ukraine', 'Monaco', 'Switzerland', 'Italy',
                      'Kingdom of Great Britain', 'Isle of Man', 'Northern Ireland', 'Ireland',
                      'Sweden', 'Albania', 'France', 'Poland', 'Slovenia', 'Romania', 'Serbia',
                      'Croatia', 'United Kingdom', 'Republic of Macedonia', 'Denmark', 
                      'Czech Republic', 'Austria', 'Spain', 'Russia', 'Bosnia and Herzegovina', 
                      'Czechoslovakia', 'Portugal', 'Iceland', 'Yugoslavia', 'Malta', 'Wales', 
                      'Georgia', 'Cyprus', 'Lithuania', 'Greece', 'Belgium', 'Hungary', 'Germany', 
                      'Norway', 'Finland', 'Montenegro'])
]

In [62]:
# Function to count regions and assign the biggest one
def assign_region(countries):
    # Ensure input is a list; if a string, convert it to a list
    if isinstance(countries, str):
        countries = [countries]
    
    # Initialize counts for each region
    region_counts = {'east_europe': 0, 'west_europe': 0, 'nordic_europe': 0}
    
    # Count the occurrence of each region
    for country in countries:
        region = country_to_region.get(country)
        if region:
            region_counts[region] += 1
    
    # Find the region with the maximum count
    max_count = max(region_counts.values())
    # Get all regions that have the maximum count
    regions_with_max_count = [region for region, count in region_counts.items() if count == max_count]
    
    # If there's a tie, pick the first in alphabetical order
    return sorted(regions_with_max_count)[0] if max_count > 0 else "Unknown"

# Work on a copy of the DataFrame to avoid modifying the original
europe_ethnic_realworld = europe_ethnic_realworld.copy()

# Ensure 'statename' column is properly formatted to a list if necessary
europe_ethnic_realworld.loc[:, 'region'] = europe_ethnic_realworld['statename'].apply(assign_region)

# Display the results
europe_ethnic_realworld[['statename', 'region']]


Unnamed: 0,statename,region
362,United Kingdom,west_europe
363,United Kingdom,west_europe
364,United Kingdom,west_europe
365,United Kingdom,west_europe
366,United Kingdom,west_europe
...,...,...
1226,Finland,nordic_europe
1227,Sweden,nordic_europe
1228,Norway,nordic_europe
1229,Denmark,nordic_europe


In [63]:
process_ethnic_group_data(europe_ethnic_realworld)

Unnamed: 0,from to,group,size,counts
0,1946-1956,Byelorussians,0.0090,1
1,1946-1956,European and American Jews,0.0040,1
2,1946-1956,Germans,0.1400,1
3,1946-1956,Poles,0.8100,1
4,1946-1956,Roma,0.0005,1
...,...,...,...,...
365,2021-2021,Serbs,0.0430,1
366,2021-2021,Serbs,0.2870,1
367,2021-2021,Serbs,0.8380,1
368,2021-2021,Slovaks,0.8060,1


### Creation of the dataframe for the Male real-world population representative of the European movie industry, according to Time Period and age gaps of 5 years

- Here, we extracted the dataset of the European population statistics from 1950 to 2012 (year of the latest movie registered in our Movies CMU dataset) from this source: https://population.un.org/wpp/



In [64]:
pop_male_realworld_europe = pop_male_realworld[pop_male_realworld['Region, subregion, country or area *'].isin(european_countries)]
pop_male_realworld_europe.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
10290,10291,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10291,10292,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10292,10293,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10293,10294,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10294,10295,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0


In [65]:
def get_main_region(country):
    for region, countries in region_mapping.items():
        if country in countries:
            return region
    return 'unknown'

# Work on a copy of the DataFrame to avoid modifying the original
pop_male_realworld_europe = pop_male_realworld_europe.copy()

# Apply the function to assign regions using .loc
pop_male_realworld_europe.loc[:, 'region'] = pop_male_realworld_europe['Region, subregion, country or area *'].apply(get_main_region)

# Display the results
pop_male_realworld_europe.head()


Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,92,93,94,95,96,97,98,99,100+,region
10290,10291,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10291,10292,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10292,10293,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10293,10294,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10294,10295,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe


In [66]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'region', 'Year'] + [col for col in pop_male_realworld_europe.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_male_realworld_europe = pop_male_realworld_europe[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_male_realworld_europe.head()

Unnamed: 0,"Region, subregion, country or area *",region,Year,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,100+
10290,Cyprus,west_europe,1950,8,7,6,6,6,6,5,...,0,0,0,0,0,0,0,0,0,0
10291,Cyprus,west_europe,1951,7,8,7,6,6,6,6,...,0,0,0,0,0,0,0,0,0,0
10292,Cyprus,west_europe,1952,7,7,7,7,6,6,6,...,0,0,0,0,0,0,0,0,0,0
10293,Cyprus,west_europe,1953,7,7,7,7,7,6,6,...,0,0,0,0,0,0,0,0,0,0
10294,Cyprus,west_europe,1954,7,7,7,7,7,7,7,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Ensure the 'Year' column is of integer type
pop_male_realworld_europe['Year'] = pop_male_realworld_europe['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_male_realworld_europe['Time Period'] = pop_male_realworld_europe['Year'].apply(assign_time_period)

# create subregions dataset
pop_male_realworld_west_europe = pop_male_realworld_europe[pop_male_realworld_europe['region']=='west_europe']
pop_male_realworld_east_europe = pop_male_realworld_europe[pop_male_realworld_europe['region']=='east_europe']
pop_male_realworld_nordic_europe = pop_male_realworld_europe[pop_male_realworld_europe['region']=='nordic_europe']

pop_male_realworld_west_europe.head()

Unnamed: 0,"Region, subregion, country or area *",region,Year,0,1,2,3,4,5,6,...,92,93,94,95,96,97,98,99,100+,Time Period
10290,Cyprus,west_europe,1950,8,7,6,6,6,6,5,...,0,0,0,0,0,0,0,0,0,1950-1965
10291,Cyprus,west_europe,1951,7,8,7,6,6,6,6,...,0,0,0,0,0,0,0,0,0,1950-1965
10292,Cyprus,west_europe,1952,7,7,7,7,6,6,6,...,0,0,0,0,0,0,0,0,0,1950-1965
10293,Cyprus,west_europe,1953,7,7,7,7,7,6,6,...,0,0,0,0,0,0,0,0,0,1950-1965
10294,Cyprus,west_europe,1954,7,7,7,7,7,7,7,...,0,0,0,0,0,0,0,0,0,1950-1965


In [68]:
# Apply the function to your datasets
male_europe_realworld_averages = process_grouped_averages_by_columns(pop_male_realworld_europe, 'Time Period', drop_cols='Year', decimals=2)
male_west_europe_realworld_averages = process_grouped_averages_by_columns(pop_male_realworld_west_europe, 'Time Period', drop_cols='Year', decimals=2)
male_east_europe_realworld_averages = process_grouped_averages_by_columns(pop_male_realworld_east_europe, 'Time Period', drop_cols='Year', decimals=2)
male_nordic_europe_realworld_averages = process_grouped_averages_by_columns(pop_male_realworld_nordic_europe, 'Time Period', drop_cols='Year', decimals=2)

# Display the resulting DataFrame
male_west_europe_realworld_averages


Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,153.13,149.83,148.18,146.78,144.93,142.52,140.58,138.72,136.72,...,1.08,0.67,0.4,0.26,0.15,0.02,0.0,0.0,0.0,0.0
1,1966-1980,142.54,144.31,146.86,149.29,151.56,153.53,155.06,156.13,156.74,...,2.01,1.42,0.93,0.6,0.38,0.24,0.14,0.0,0.0,0.0
2,1981-1995,118.92,119.53,120.61,121.44,122.29,123.19,124.16,125.42,127.18,...,3.24,2.36,1.65,1.1,0.7,0.43,0.27,0.19,0.01,0.09
3,1996-2012,114.08,114.13,114.21,114.39,114.74,115.18,115.72,116.27,116.95,...,6.51,4.84,3.51,2.54,1.83,1.24,0.74,0.43,0.28,0.35


In [69]:
# Export to a CSV file
male_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"male_europe_realworld_averages.csv", index=False) 
male_west_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"male_west_europe_realworld_averages.csv", index=False) 
male_east_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"male_east_europe_realworld_averages.csv", index=False) 
male_nordic_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"male_nordic_europe_realworld_averages.csv", index=False) 


In [70]:
def calculate_proportions(realworld_averages, bothsexes_averages):
    """
    Calculate proportions of a specific gender compared to both sexes.

    Parameters:
    - realworld_averages: DataFrame containing averages for a specific gender.
    - bothsexes_averages: DataFrame containing averages for both sexes.

    Returns:
    - DataFrame with calculated proportions.
    """
    proportions = realworld_averages.copy()
    proportions.iloc[:, 1:] = (
        realworld_averages.iloc[:, 1:].values /
        bothsexes_averages.iloc[:, 1:].values
    )
    return proportions

**Female**

In [71]:
pop_female_realworld_europe = pop_female_realworld[pop_female_realworld['Region, subregion, country or area *'].isin(european_countries)]
pop_female_realworld_europe.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
10290,10291,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10291,10292,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10292,10293,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10293,10294,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10294,10295,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# Work on a copy of the DataFrame to avoid modifying the original
pop_female_realworld_europe = pop_female_realworld_europe.copy()

# Apply the function to assign regions using .loc
pop_female_realworld_europe.loc[:, 'region'] = pop_female_realworld_europe['Region, subregion, country or area *'].apply(get_main_region)

# Display the results
pop_female_realworld_europe.head()


Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,92,93,94,95,96,97,98,99,100+,region
10290,10291,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10291,10292,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10292,10293,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10293,10294,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10294,10295,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe


In [73]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'region', 'Year'] + [col for col in pop_female_realworld_europe.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_female_realworld_europe = pop_female_realworld_europe[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_female_realworld_europe.head()

Unnamed: 0,"Region, subregion, country or area *",region,Year,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,100+
10290,Cyprus,west_europe,1950,7,7,6,6,6,5,5,...,0,0,0,0,0,0,0,0,0,0
10291,Cyprus,west_europe,1951,7,7,7,6,6,6,5,...,0,0,0,0,0,0,0,0,0,0
10292,Cyprus,west_europe,1952,7,7,7,7,6,6,6,...,0,0,0,0,0,0,0,0,0,0
10293,Cyprus,west_europe,1953,7,7,7,7,7,6,6,...,0,0,0,0,0,0,0,0,0,0
10294,Cyprus,west_europe,1954,7,7,7,7,7,7,6,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# Ensure the 'Year' column is of integer type
pop_female_realworld_europe['Year'] = pop_female_realworld_europe['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_female_realworld_europe['Time Period'] = pop_female_realworld_europe['Year'].apply(assign_time_period)

# create subregions dataset
pop_female_realworld_west_europe = pop_female_realworld_europe[pop_female_realworld_europe['region']=='west_europe']
pop_female_realworld_east_europe = pop_female_realworld_europe[pop_female_realworld_europe['region']=='east_europe']
pop_female_realworld_nordic_europe = pop_female_realworld_europe[pop_female_realworld_europe['region']=='nordic_europe']

pop_female_realworld_west_europe.head()

Unnamed: 0,"Region, subregion, country or area *",region,Year,0,1,2,3,4,5,6,...,92,93,94,95,96,97,98,99,100+,Time Period
10290,Cyprus,west_europe,1950,7,7,6,6,6,5,5,...,0,0,0,0,0,0,0,0,0,1950-1965
10291,Cyprus,west_europe,1951,7,7,7,6,6,6,5,...,0,0,0,0,0,0,0,0,0,1950-1965
10292,Cyprus,west_europe,1952,7,7,7,7,6,6,6,...,0,0,0,0,0,0,0,0,0,1950-1965
10293,Cyprus,west_europe,1953,7,7,7,7,7,6,6,...,0,0,0,0,0,0,0,0,0,1950-1965
10294,Cyprus,west_europe,1954,7,7,7,7,7,7,6,...,0,0,0,0,0,0,0,0,0,1950-1965


In [75]:
# Apply the function to your datasets
female_europe_realworld_averages = process_grouped_averages_by_columns(pop_female_realworld_europe, 'Time Period', drop_cols='Year', decimals=2)
female_west_europe_realworld_averages = process_grouped_averages_by_columns(pop_female_realworld_west_europe, 'Time Period', drop_cols='Year', decimals=2)
female_east_europe_realworld_averages = process_grouped_averages_by_columns(pop_female_realworld_east_europe, 'Time Period', drop_cols='Year', decimals=2)
female_nordic_europe_realworld_averages = process_grouped_averages_by_columns(pop_female_realworld_nordic_europe, 'Time Period', drop_cols='Year', decimals=2)

# Export to a CSV file
female_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"female_europe_realworld_averages.csv", index=False) 
female_west_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"female_west_europe_realworld_averages.csv", index=False) 
female_east_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"female_east_europe_realworld_averages.csv", index=False) 
female_nordic_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"female_nordic_europe_realworld_averages.csv", index=False) 


**Both Sexes**

In [76]:
pop_bothsexes_realworld_europe = pop_bothsexes_realworld[pop_bothsexes_realworld['Region, subregion, country or area *'].isin(european_countries)]
pop_bothsexes_realworld_europe.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
10290,10291,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10291,10292,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10292,10293,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10293,10294,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0
10294,10295,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# Work on a copy of the DataFrame to avoid modifying the original
pop_bothsexes_realworld_europe = pop_bothsexes_realworld_europe.copy()

# Apply the function to assign regions using .loc
pop_bothsexes_realworld_europe.loc[:, 'region'] = pop_bothsexes_realworld_europe['Region, subregion, country or area *'].apply(get_main_region)

# Display the results
pop_bothsexes_realworld_europe.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,92,93,94,95,96,97,98,99,100+,region
10290,10291,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10291,10292,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10292,10293,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10293,10294,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe
10294,10295,Estimates,Cyprus,11,196,CYP,CY,196,Country/Area,922,...,0,0,0,0,0,0,0,0,0,west_europe


In [78]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'region', 'Year'] + [col for col in pop_bothsexes_realworld_europe.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_bothsexes_realworld_europe = pop_bothsexes_realworld_europe[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_bothsexes_realworld_europe.head()

Unnamed: 0,"Region, subregion, country or area *",region,Year,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,100+
10290,Cyprus,west_europe,1950,15,14,12,12,12,11,11,...,0,0,0,0,0,0,0,0,0,0
10291,Cyprus,west_europe,1951,14,15,14,12,12,12,11,...,0,0,0,0,0,0,0,0,0,0
10292,Cyprus,west_europe,1952,14,13,14,14,13,12,12,...,0,0,0,0,0,0,0,0,0,0
10293,Cyprus,west_europe,1953,14,14,13,14,14,13,12,...,0,0,0,0,0,0,0,0,0,0
10294,Cyprus,west_europe,1954,14,14,13,13,14,14,13,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# Ensure the 'Year' column is of integer type
pop_bothsexes_realworld_europe['Year'] = pop_bothsexes_realworld_europe['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_bothsexes_realworld_europe['Time Period'] = pop_bothsexes_realworld_europe['Year'].apply(assign_time_period)

# create subregions dataset
pop_bothsexes_realworld_west_europe = pop_bothsexes_realworld_europe[pop_bothsexes_realworld_europe['region']=='west_europe']
pop_bothsexes_realworld_east_europe = pop_bothsexes_realworld_europe[pop_bothsexes_realworld_europe['region']=='east_europe']
pop_bothsexes_realworld_nordic_europe = pop_bothsexes_realworld_europe[pop_bothsexes_realworld_europe['region']=='nordic_europe']

pop_bothsexes_realworld_west_europe.head()

Unnamed: 0,"Region, subregion, country or area *",region,Year,0,1,2,3,4,5,6,...,92,93,94,95,96,97,98,99,100+,Time Period
10290,Cyprus,west_europe,1950,15,14,12,12,12,11,11,...,0,0,0,0,0,0,0,0,0,1950-1965
10291,Cyprus,west_europe,1951,14,15,14,12,12,12,11,...,0,0,0,0,0,0,0,0,0,1950-1965
10292,Cyprus,west_europe,1952,14,13,14,14,13,12,12,...,0,0,0,0,0,0,0,0,0,1950-1965
10293,Cyprus,west_europe,1953,14,14,13,14,14,13,12,...,0,0,0,0,0,0,0,0,0,1950-1965
10294,Cyprus,west_europe,1954,14,14,13,13,14,14,13,...,0,0,0,0,0,0,0,0,0,1950-1965


In [80]:
# Apply the function to your datasets
bothsexes_europe_realworld_averages = process_grouped_averages_by_columns(pop_bothsexes_realworld_europe, 'Time Period', drop_cols='Year', decimals=2)
bothsexes_west_europe_realworld_averages = process_grouped_averages_by_columns(pop_bothsexes_realworld_west_europe, 'Time Period', drop_cols='Year', decimals=2)
bothsexes_east_europe_realworld_averages = process_grouped_averages_by_columns(pop_bothsexes_realworld_east_europe, 'Time Period', drop_cols='Year', decimals=2)
bothsexes_nordic_europe_realworld_averages = process_grouped_averages_by_columns(pop_bothsexes_realworld_nordic_europe, 'Time Period', drop_cols='Year', decimals=2)

# Export to a CSV file
bothsexes_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"bothsexes_europe_realworld_averages.csv", index=False) 
bothsexes_west_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"bothsexes_west_europe_realworld_averages.csv", index=False) 
bothsexes_east_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"bothsexes_east_europe_realworld_averages.csv", index=False) 
bothsexes_nordic_europe_realworld_averages.to_csv("data/final/europe/real_world/"+"bothsexes_nordic_europe_realworld_averages.csv", index=False) 


In [81]:
male_europe_realworld_proportions = calculate_proportions(male_europe_realworld_averages, bothsexes_europe_realworld_averages)
female_europe_realworld_proportions = calculate_proportions(female_europe_realworld_averages, bothsexes_europe_realworld_averages)

male_west_europe_realworld_proportions = calculate_proportions(male_west_europe_realworld_averages, bothsexes_west_europe_realworld_averages)
female_west_europe_realworld_proportions = calculate_proportions(female_west_europe_realworld_averages, bothsexes_west_europe_realworld_averages)

male_east_europe_realworld_proportions = calculate_proportions(male_east_europe_realworld_averages, bothsexes_east_europe_realworld_averages)
female_east_europe_realworld_proportions = calculate_proportions(female_east_europe_realworld_averages, bothsexes_east_europe_realworld_averages)

male_nordic_europe_realworld_proportions = calculate_proportions(male_nordic_europe_realworld_averages, bothsexes_nordic_europe_realworld_averages)
female_nordic_europe_realworld_proportions = calculate_proportions(female_nordic_europe_realworld_averages, bothsexes_nordic_europe_realworld_averages)

In [82]:
male_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"male_europe_realworld_proportions.csv", index=False) 
male_west_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"male_west_europe_realworld_proportions.csv", index=False) 
male_east_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"male_east_europe_realworld_proportions.csv", index=False) 
male_nordic_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"male_nordic_europe_realworld_proportions.csv", index=False) 


In [83]:
# Export to a CSV file
female_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"female_europe_realworld_proportions.csv", index=False) 
female_west_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"female_west_europe_realworld_proportions.csv", index=False) 
female_east_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"female_east_europe_realworld_proportions.csv", index=False) 
female_nordic_europe_realworld_proportions.to_csv("data/final/europe/real_world/"+"female_nordic_europe_realworld_proportions.csv", index=False) 


female_west_europe_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.487592,0.488028,0.488089,0.488185,0.488339,0.488539,0.488844,0.489066,0.489305,...,0.675978,0.689516,0.715976,0.705357,0.671642,0.789474,0.68,0.6,0.5,0.6
1,1966-1980,0.487156,0.487554,0.48767,0.487635,0.487837,0.487845,0.487963,0.487948,0.488013,...,0.71831,0.724806,0.726027,0.756,0.717647,0.704762,0.754098,0.756757,0.619048,0.740741
2,1981-1995,0.486731,0.48711,0.487163,0.487401,0.487198,0.487231,0.487393,0.487442,0.487471,...,0.76187,0.770508,0.777778,0.78424,0.80274,0.803213,0.796296,0.79,0.79661,0.8
3,1996-2012,0.486963,0.487019,0.48707,0.487138,0.487154,0.487338,0.487244,0.487443,0.487602,...,0.746142,0.759268,0.772698,0.784446,0.793884,0.800628,0.812217,0.825939,0.826316,0.851145


**2. Hollywood**

In [84]:
# Filter rows where 'actor_ethnicity_label' is not "Unknown"
hollywood_data_ethnicity = hollywood_main[hollywood_main['actor_ethnicity_label'] != "Unknown"]

# Define the African American ethnicities
african_american_ethnicities = [
    "African Americans", "Black people", "British Nigerian", "Yoruba people",
    "African-American Jews", "Black Canadians", "Afro Trinidadians and Tobagonians",
    "Afro-Cuban", "Black Britons", "Blackfoot Confederacy", "African people",
    "Bahamian Americans", "British Jamaicans", "Haitian Americans", "Ghanaian Americans",
    "Afro-Asians", "Afro-Guyanese", "Black Hispanic and Latino Americans", 
    "Mandinka people", "Barbadian Americans", "Wolof people", "multiracial American",
    "Akan people", "Xhosa people", "South African Americans",
    "Sierra Leoneans in the United Kingdom", "Kabyle people", "Berber",
    "Louisiana Creole people", "Nigerian Americans", "Dinka people",
    "Ghanaian", "Somalis"
]

# Define the American Indians ethnicities
american_indian_ethnicities = [
    "American Indians", "Omaha Tribe of Nebraska", "Cherokee", "Aboriginal Australians",
    "Native Hawaiians", "First Nations", "Indigenous peoples of the Americas",
    "Native Americans in the United States", "Mohawk", "Sioux", "Ojibwe", "Lumbee",
    "Cree", "Choctaw", "Five Nations", "Cheyennes", "Oneida", "Dene", "Nez Perce", "Ho-Chunk",
    "Samoan Americans", "Pacific Islander Americans", "Māori", "Inuit", "Apache",
    "Métis", "Aymara", "Iñupiaq people",
]

# Define the Arab Americans ethnicities
arab_american_ethnicities = [
    "Arab Americans", "Iranian peoples", "Afghans in India", "Muslim", "Pashtuns",
    "Lebanese Americans", "Moroccan Americans", "Syrian Americans", "Pathani",
    "Arabs in Bulgaria", "Sudanese Arabs", "Persians", "Lebanese people",
    "Moroccans", "Palestinians in the United States", "Arab Mexican",
    "Lebanese people in the United Kingdom", "Arabs", "culture of Palestine", "مسح",
    "Iranians in the United Kingdom", "Iraqi Americans", "Egyptians", "Iranian Americans",
    "Iranian Canadians"
    
]

# Define the Asian Americans ethnicities
asian_american_ethnicities = [
    "Asian Americans", "Asian people", "Indian Americans", "Japanese Americans", "Filipino Americans",
    "Tamil", "Punjabis", "Sindhis", "Telugu people", "Koreans", "Bengali", "Chinese Americans",
    "Filipino people", "Indonesian Americans", "Sri Lankan Tamils", "Tamil Americans", "Taiwanese Americans",
    "Kashmiri Pandit", "Telugu Brahmins", "Jatt Sikh", "Kannada people", "Brahmin", "Chinese Filipino",
    "Pakistani Canadians", "Sri Lankan Tamil diaspora", "Filipino Australians", "Chinese Singaporeans",
    "Nepali Indian", "Sikh", "Chaliyan", "Malaysian Chinese", "Hmong Americans", "Koryo-saram", 
    "Burmese Americans", "Vietnamese Americans", "Thai Chinese", "Cambodian Americans", "Chinese Indonesians",
    "Pakistani Americans", "Indian diaspora in France", "Indo-Canadians", "Kashmiri people", "Bengali Brahmins",
    "Rohilla", "Sinhalese", "Hindu", "Ryukyuan people", "Bangladeshi Americans", "Thai Americans", "Thai people",
    "Indian Australian", "Indian diaspora", "Punjabi diaspora", "Filipino mestizo", "Japanese Brazilians",
    "Tibetan people", "Hazaras", "Zhuang people", "Dogra", "Kurds", "Goans", "Gujarati people", "Indians",
    "Bihari people", "Hongkongers", "British Indian", "Bengali Hindus", "Korean Americans", "Kiwi", "British Chinese",
    "British Asians", "Vietnamese people", "Chinese Jamaicans", "Taiwanese people", "Sherpa", "Tamil Brahmin",
    "Lao people", "Manchu", "Jaat", "Bhutia", "Marathi people", "Kanyakubja Brahmins",
    "Gin people", "Pakistanis", "Dalit"
]

# Define the Latinos ethnicities
latino_ethnicities = [
    "Hispanic and Latino Americans", "Latinos", "Puerto Ricans", "Mexican Americans", 
    "Cuban Americans", "Stateside Puerto Ricans", "Colombian Americans", "Chilean Americans", 
    "Cajun", "Criollo people", "Portuguese Americans", "Bolivian Americans", "Cubans", 
    "Brazilian Americans", "Brazilians", "Ecuadorian Americans", "Galicians", 
    "White Latin American", "Colombians", "Chileans", "Chileans in the United Kingdom", 
    "Peruvians in the United Kingdom", "Venezuelans", "Hondurans", "Honduran Americans", 
    "Acadians", "Salvadoran Americans", "Panamanian Americans", "Indo Caribbeans", 
    "Tejano", "Spaniards in Mexico", "Spanish people of Filipino ancestry", "Spanish Americans",
    "Uruguayans", "Mexicans", "Guyanese Americans", "Dominican Americans", "Spaniards",
    "Hispanic", "Colombian Australian", "Chinese Canadians", "Portuguese", "Latino",
    "Latin American British", "Venezuelan Americans", 
    
]

# Define the Jewish Americans ethnicities
jewish_american_ethnicities = [
    "Jewish people", "American Jews", "African-American Jews", "Mizrahi Jews", 
    "Ashkenazi Jews", "Sephardi Jews", "British Jews", "Israeli Americans", 
    "history of the Jews in India", "Moroccan Jews", "Lithuanian Jews", 
    "Israeli Jews", "Assyrian people", "Israelis"
]

# Define the Caucasian Americans ethnicities
caucasian_american_ethnicities = [
    "Whites", "White people", "White Americans", "White British", "Italian Americans",
    "Irish Americans", "Scottish Americans", "German Americans", "Russian Americans",
    "French", "English Americans", "European Americans", "Scandinavian Americans",
    "Swedish Americans", "Finnish Americans", "Canadian Americans", "Dutch Americans",
    "Hungarian Americans", "Lithuanian Americans", "Austrians", "French Canadians",
    "English people", "Irish people", "Norwegian Americans", "Austrian Americans",
    "Albanian Americans", "Romanichal", "Parsi", "Swiss", "Latvians", "Belgians",
    "Italian Australians", "Australian Americans", "English Canadians", "English Australian",
    "French Chilean", "Hungarians", "Greek Americans", "Greeks in South Africa", 
    "Sicilian Americans", "Slovaks", "Slovak Americans", "Serbs of Croatia", 
    "White South Africans", "Dutch", "Dutch Australian", "Russian Canadians", 
    "German Canadians", "Romanian Americans", "Polish Canadians", "Czechs", 
    "Belarusians", "Serbs in the United Kingdom", "Serbian Canadians", "Greek Canadians",
    "Greek Cypriots", "Catalans", "Croatian Canadians", "Croatian Americans", 
    "Argentines", "Sámi people", "Welsh Americans", "Welsh Italians", "Tulu people",
    "Mohyal", "Anglo-Indian people", "Anglo-Irish people", "Canadians in the United Kingdom",
    "Slovene Americans", "Aromanians", "Swedish-speaking population of Finland", 
    "Bulgarian Canadians", "Ukrainian Americans", "Italians in the United Kingdom",
    "Croatian Australians", "Irish Australians", "Swedish Canadians", 
    "French-speaking Quebecer", "Finns", "Albanians", "Polish Australians", 
    "Mudaliar", "Serbian Australians", "Romani people", "Rajput", "Turkish Americans", "Gibraltarian people",
    "Sri Lankan Americans", "Icelanders", "Québécois", "Italian immigration to Mexico", 
    "Corsicans", "Danish Canadians", "Dutch Canadians", "German Brazilians", 
    "Greek Australians", "Slovenes", "Basque people", "Tatars", 
    "Austrians in the United Kingdom", "Transylvanian Saxons", "Afrikaners",
    "Sierra Leone Creole people", "Georgians", "Italians", "Armenians", "Danish Americans",
    "Russians", "Welsh people", "Italian Canadians", "Scottish Australians", "White Africans of European ancestry",
    "Americans", "British", "Serbian Americans", "Polish Americans", "Germans",
    "Irish migration to Great Britain", "Scotch-Irish Americans", "Black Irish", "Scottish people",
    "British Americans", "Australians", "French Americans", "Czech Americans", "Danes",
    "Armenian Americans", "Irish Canadians", "Scottish Canadians", "Italian Brazilians", "Swedes",
    "names of the Greeks", "Slavs", "Anglo-Celtic Australians", "Eurasian", "Poles",
    "Norwegians", "Croats", "Ukrainian Canadians", "Ukrainians", "Yugoslavs",
    "Rusyn American", "Canadian Australian", "Bohemian People", "Luxembourgish Americans",
    "Armenians in Italy", "Baltic Russians", "Latvian Americans", "Ossetians",
    "Castilians", "Bulgarians", "Armenians of Russia", "Estonians", "Bosnians",
    "Manx people", "peoples of the Caucasus", "Romanians"
]

def classify_actor_ethnicity(df):  
    df["actor_ethnicity_classification"] = df["actor_ethnicity_label"].apply(
        lambda x: "African Americans" if x in african_american_ethnicities else (
            "American Indians" if x in american_indian_ethnicities else (
                "Arab Americans" if x in arab_american_ethnicities else (
                    "Asian Americans" if x in asian_american_ethnicities else (
                        "Latino Americans" if x in latino_ethnicities else (
                            "Jewish Americans" if x in jewish_american_ethnicities else (
                                "Caucasian Americans" if x in caucasian_american_ethnicities else None
                            )
                        )
                    )
                )
            )
        )
    )
    return df

hollywood_data_ethnicity = classify_actor_ethnicity(hollywood_data_ethnicity)
hollywood_data_ethnicity.dropna(subset=['actor_ethnicity_classification'], inplace=True)

# Export to a CSV file
hollywood_data_ethnicity.to_csv("data/final/hollywood/hollywood_data_ethnicity.csv", index=False) 

hollywood_data_ethnicity[['actor_name', 'actor_ethnicity_classification']][:20]

Unnamed: 0,actor_name,actor_ethnicity_classification
9,Bridget Fonda,Caucasian Americans
10,Embeth Davidtz,Caucasian Americans
30,Miriam Cooper,Caucasian Americans
34,Harrison Ford,Caucasian Americans
36,Rutger Hauer,Caucasian Americans
43,Edward James Olmos,Latino Americans
44,James Hong,Asian Americans
54,Dom DeLuise,Caucasian Americans
57,Mel Brooks,Caucasian Americans
58,Madeline Kahn,Jewish Americans


In [85]:
# Classification of ethnicity for hollywood_data

hollywood_main = classify_actor_ethnicity(hollywood_main)

# Export to a CSV file
hollywood_main.to_csv("data/final/hollywood/hollywood_data.csv", index=False) 


#### Real-world for Hollywood

### We processed the real-world Hollywood ethnicity data this way :
The Hollywood industry is usually associated with the North-American population. We thus wanted to verify whether or not the Hollywood industry mirrored accurately the corresponding population (United States of America and Canada), this being ethnicity-wise, age-wise and gender-wise. 

For the ethnicity analysis, we extracted the data of the real-world Hollywood ethnicity from this source, as we needed a dataset that could also have information about the timeframe (from approximately 1950 to the present times, ideally 2012) of the ethnicity proportions of the population (for further analysis) : https://icr.ethz.ch/data/epr/core/ 

Considering the non diverse/relevant aspect of the Canada ethnicity analysis found (only taking into account "English speakers", "French speakers", "Indigenous peoples"), we decided to take into account the fairly similar nature of the population ethnicity between the United States and Canadian populations historically, and base our North-American ethnicity analysis on the United States data only. 

In [86]:
hollywood_ethnic_realworld = ethnic_realworld[
    ~ethnic_realworld["group"].isin(["English speakers", "French speakers", "Indigenous peoples"]) &
    ethnic_realworld["statename"].isin(["United States of America", "Canada"])
]

hollywood_ethnic_realworld.head()

Unnamed: 0,gwid,statename,from,to,group,groupid,gwgroupid,umbrella,size,status,reg_aut
0,2,United States of America,1946,1965,Whites,1000,201000,,0.691,MONOPOLY,
1,2,United States of America,1946,1965,African Americans,3000,203000,,0.124,DISCRIMINATED,False
2,2,United States of America,1946,1965,American Indians,5000,205000,,0.0078,POWERLESS,True
3,2,United States of America,1966,2008,Whites,1000,201000,,0.691,DOMINANT,
4,2,United States of America,1966,2008,Latinos,2000,202000,,0.125,POWERLESS,False


In [87]:
process_ethnic_group_data(hollywood_ethnic_realworld)

Unnamed: 0,from to,group,size,counts
0,1946-1965,African Americans,0.124,1
1,1946-1965,American Indians,0.0078,1
2,1946-1965,Whites,0.691,1
3,1966-2008,African Americans,0.124,1
4,1966-2008,American Indians,0.0078,1
5,1966-2008,Arab Americans,0.0042,1
6,1966-2008,Asian Americans,0.036,1
7,1966-2008,Latinos,0.125,1
8,1966-2008,Whites,0.691,1
9,2009-2014,African Americans,0.124,1


#### We also did some data processing/completing concerning the latter ethnicity data chosen :
- When missing, filling-in past unrecovered data by a proportion of ethnicity population of 0, since it would likely mean that the population proportion was negligeable at the time of the recensement to be accounted for.
- Considering the high proportion of actors from a Jewish ethnicity (cf. the prior surfacic data exploration), we decided to add from a separate source (https://www.pewresearch.org/religion/2013/10/01/chapter-1-population-estimates/ ,  https://www.pewresearch.org/religion/2021/05/11/the-size-of-the-u-s-jewish-population/) the proportion of the Jewish-American population in the count.
- For the time fragmentation of our data, we decided to take into account 4 periods of time from 1950 to 2012 : 1950-1965, 1966-1980, 1981-1995 and 1996-2012, this to be able to use a common fragmentation all throughout our analysis. We thus had to remap the periods provided in the ethnicity dataset to an approximately equivalent time fragmentation. 

In [88]:
# Original dataset
data = {
    "from to": [
        "1946-1965", "1946-1965", "1946-1965", "1966-2008", "1966-2008",
        "1966-2008", "1966-2008", "1966-2008", "1966-2008", "2009-2014",
        "2009-2014", "2009-2014", "2009-2014", "2009-2014", "2009-2014",
        "2015-2017", "2015-2017", "2015-2017", "2015-2017", "2015-2017",
        "2015-2017", "2018-2021", "2018-2021", "2018-2021", "2018-2021",
        "2018-2021", "2018-2021"
    ],
    "group": [
        "African Americans", "American Indians", "Whites", "African Americans", "American Indians",
        "Arab Americans", "Asian Americans", "Latinos", "Whites", "African Americans",
        "American Indians", "Arab Americans", "Asian Americans", "Latinos", "Whites",
        "African Americans", "American Indians", "Arab Americans", "Asian Americans", "Latinos",
        "Whites", "African Americans", "American Indians", "Arab Americans", "Asian Americans",
        "Latinos", "Whites"
    ],
    "size": [
        0.1240, 0.0078, 0.6910, 0.1240, 0.0078,
        0.0042, 0.0360, 0.1250, 0.6910, 0.1240,
        0.0078, 0.0050, 0.0440, 0.1500, 0.6600,
        0.1240, 0.0078, 0.0050, 0.0440, 0.1500,
        0.6600, 0.1340, 0.0130, 0.0050, 0.0590,
        0.1850, 0.6000
    ],
    "counts": [1] * 27
}

hollywood_ethnic_realworld = pd.DataFrame(data)

# Step 1: Add Jewish Americans data
jewish_data = [
    {"from to": "1946-1965", "group": "Jewish Americans", "size": 0.033, "counts": 1},
    {"from to": "1966-2008", "group": "Jewish Americans", "size": 0.033, "counts": 1},
    {"from to": "2009-2014", "group": "Jewish Americans", "size": 0.024, "counts": 1},
]
hollywood_ethnic_realworld = pd.concat([hollywood_ethnic_realworld, pd.DataFrame(jewish_data)], ignore_index=True)

# Step 2: Map new periods based on the logic
new_period_mapping = {
    "1946-1965": "1950-1965",
    "1966-2008": ["1966-1980", "1981-1995"],
    "2009-2014": "1996-2012",
}

# Step 3: Assign new periods
def assign_periods(row):
    if row["from to"] in new_period_mapping:
        return new_period_mapping[row["from to"]]
    return None

hollywood_ethnic_realworld["new_period"] = hollywood_ethnic_realworld.apply(assign_periods, axis=1)

# Step 4: Expand rows with multiple periods
hollywood_ethnic_realworld = hollywood_ethnic_realworld.explode("new_period").reset_index(drop=True)

# Step 5: Drop rows with periods beyond 2015-2021
hollywood_ethnic_realworld = hollywood_ethnic_realworld[hollywood_ethnic_realworld["new_period"].notna()]

# Step 6: Replace Whites with Caucasian Americans
hollywood_ethnic_realworld["group"] = hollywood_ethnic_realworld["group"].replace("Whites", "Caucasian Americans")

#Step 7: Replace "Latinos" with "Latino Americans" in the 'group' column
hollywood_ethnic_realworld['group'] = hollywood_ethnic_realworld['group'].replace('Latinos', 'Latino Americans')

# Step 8: Add missing groups for the first period (1950-1965)
missing_groups = ['Arab Americans', 'Latino Americans', 'Asian Americans']
new_period = '1950-1965'
missing_rows = pd.DataFrame({
    'new_period': [new_period] * len(missing_groups),
    'from to': ['1946-1965'] * len(missing_groups),
    'group': missing_groups,
    'size': [0] * len(missing_groups),
    'counts': [1] * len(missing_groups)
})

# Append missing rows to the dataset
hollywood_ethnic_realworld = pd.concat([hollywood_ethnic_realworld, missing_rows], ignore_index=True)

# Step 9: Verify the result
hollywood_ethnic_realworld = hollywood_ethnic_realworld[['new_period', 'from to', 'group', 'size', 'counts']]

print(f"Shape of the resulting DataFrame: {hollywood_ethnic_realworld.shape}")

grouped_ethnic_realworld = hollywood_ethnic_realworld.groupby(['new_period', 'from to', 'group', 'size']).size()

# Convert the grouped result to a DataFrame
hollywood_ethnic_realworld = grouped_ethnic_realworld.reset_index(name='counts')

hollywood_ethnic_realworld.drop(columns=['counts'], inplace=True)

# Export to a CSV file
hollywood_ethnic_realworld.to_csv("data/final/hollywood/hollywood_ethnic_realworld.csv", index=False)  

hollywood_ethnic_realworld.head()

Shape of the resulting DataFrame: (28, 5)


Unnamed: 0,new_period,from to,group,size
0,1950-1965,1946-1965,African Americans,0.124
1,1950-1965,1946-1965,American Indians,0.0078
2,1950-1965,1946-1965,Arab Americans,0.0
3,1950-1965,1946-1965,Asian Americans,0.0
4,1950-1965,1946-1965,Caucasian Americans,0.691


### Creation of the dataframe for the Male real-world population representative of the European movie industry, according to Time Period and age gaps of 5 years

- Here, we extracted the dataset of the European population statistics from 1950 to 2012 (year of the latest movie registered in our Movies CMU dataset) from this source: https://population.un.org/wpp/


In [89]:
pop_male_realworld_hollywood = pop_male_realworld[pop_male_realworld['Region, subregion, country or area *'] == 'Northern America']
pop_male_realworld_hollywood.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
19467,19468,Estimates,Northern America,,905,,,21,Region,1840,...,14,10,8,6,4,3,2,1,1,1
19468,19469,Estimates,Northern America,,905,,,21,Region,1840,...,15,11,8,6,4,3,2,1,1,1
19469,19470,Estimates,Northern America,,905,,,21,Region,1840,...,15,11,8,6,4,3,2,1,1,1
19470,19471,Estimates,Northern America,,905,,,21,Region,1840,...,16,12,8,6,4,3,2,1,1,1
19471,19472,Estimates,Northern America,,905,,,21,Region,1840,...,16,12,9,6,4,3,2,1,1,1


In [90]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'Year'] + [col for col in pop_male_realworld_hollywood.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_male_realworld_hollywood = pop_male_realworld_hollywood[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_male_realworld_hollywood.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
19467,Northern America,1950,2020,1947,1958,1939,1733,1561,1561,1556,...,14,10,8,6,4,3,2,1,1,1
19468,Northern America,1951,2068,1980,1939,1956,1936,1729,1560,1564,...,15,11,8,6,4,3,2,1,1,1
19469,Northern America,1952,2100,2026,1969,1936,1954,1933,1724,1559,...,15,11,8,6,4,3,2,1,1,1
19470,Northern America,1953,2140,2061,2012,1963,1935,1954,1930,1719,...,16,12,8,6,4,3,2,1,1,1
19471,Northern America,1954,2189,2107,2052,2005,1960,1935,1953,1927,...,16,12,9,6,4,3,2,1,1,1


In [91]:
# Ensure the 'Year' column is of integer type
pop_male_realworld_hollywood['Year'] = pop_male_realworld_hollywood['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_male_realworld_hollywood['Time Period'] = pop_male_realworld_hollywood['Year'].apply(assign_time_period)

# Group by the time period and calculate the mean
male_hollywood_realworld_averages = (
    pop_male_realworld_hollywood
    .groupby('Time Period')
    .mean(numeric_only=True)
    .reset_index() 
)

# Drop the Year column in male_hollywood_realworld_averages (if it exists)
if 'Year' in male_hollywood_realworld_averages.columns:
    male_hollywood_realworld_averages = male_hollywood_realworld_averages.drop(columns=['Year'])

# Round the averages to two decimal places
male_hollywood_realworld_averages = male_hollywood_realworld_averages.round(2)

# Export to a CSV file
male_hollywood_realworld_averages.to_csv("data/final/hollywood/male_hollywood_realworld_averages.csv", index=False) 

# Display the resulting DataFrame
male_hollywood_realworld_averages

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,2261.38,2211.31,2184.44,2159.25,2119.81,2069.0,2018.88,1969.5,1918.19,...,19.56,14.88,10.94,7.94,5.44,3.75,2.38,1.38,1.0,1.31
1,1966-1980,1985.13,1960.67,1966.87,1984.47,2012.73,2049.0,2090.13,2133.73,2176.87,...,35.73,27.93,21.27,16.07,11.73,8.33,5.8,3.87,2.53,3.6
2,1981-1995,2187.27,2157.47,2134.0,2108.53,2079.6,2046.2,2016.13,1992.13,1975.13,...,52.87,40.8,30.93,23.0,16.87,12.07,8.4,5.73,3.8,6.2
3,1996-2012,2212.41,2208.41,2213.24,2219.18,2225.0,2231.82,2239.06,2247.47,2257.41,...,93.41,71.18,53.18,38.59,27.71,19.35,13.24,8.65,5.47,8.0


### Creation of the dataframe for the Female real world population representative of the Hollywood movie industry (North America), according to Time Period and age gaps of 5 years  

In [92]:
pop_female_realworld_hollywood = pop_female_realworld[pop_female_realworld['Region, subregion, country or area *'] == 'Northern America']
pop_female_realworld_hollywood.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
19467,19468,Estimates,Northern America,,905,,,21,Region,1840,...,22,17,13,10,7,5,3,2,1,3
19468,19469,Estimates,Northern America,,905,,,21,Region,1840,...,23,17,13,10,7,5,4,2,2,3
19469,19470,Estimates,Northern America,,905,,,21,Region,1840,...,24,18,13,10,7,5,4,3,2,3
19470,19471,Estimates,Northern America,,905,,,21,Region,1840,...,25,19,14,10,7,5,4,3,2,3
19471,19472,Estimates,Northern America,,905,,,21,Region,1840,...,26,20,15,11,8,5,4,3,2,3


In [93]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'Year'] + [col for col in pop_female_realworld_hollywood.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_female_realworld_hollywood = pop_female_realworld_hollywood[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_female_realworld_hollywood.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
19467,Northern America,1950,1930,1872,1892,1876,1671,1504,1508,1507,...,22,17,13,10,7,5,3,2,1,3
19468,Northern America,1951,1979,1904,1870,1892,1875,1670,1506,1512,...,23,17,13,10,7,5,4,2,2,3
19469,Northern America,1952,2018,1950,1899,1872,1894,1873,1667,1506,...,24,18,13,10,7,5,4,3,2,3
19470,Northern America,1953,2062,1993,1944,1898,1874,1896,1871,1665,...,25,19,14,10,7,5,4,3,2,3
19471,Northern America,1954,2109,2043,1992,1943,1899,1878,1898,1870,...,26,20,15,11,8,5,4,3,2,3


In [94]:
# Ensure the 'Year' column is of integer type
pop_female_realworld_hollywood['Year'] = pop_female_realworld_hollywood['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_female_realworld_hollywood['Time Period'] = pop_female_realworld_hollywood['Year'].apply(assign_time_period)

# Group by the time period and calculate the mean
female_hollywood_realworld_averages = (
    pop_female_realworld_hollywood
    .groupby('Time Period')
    .mean(numeric_only=True)
    .reset_index() 
)

# Drop the Year column in male_hollywood_realworld_averages (if it exists)
if 'Year' in female_hollywood_realworld_averages.columns:
    female_hollywood_realworld_averages = female_hollywood_realworld_averages.drop(columns=['Year'])

# Round the averages to two decimal places
female_hollywood_realworld_averages = female_hollywood_realworld_averages.round(2)

# Export to a CSV file
female_hollywood_realworld_averages.to_csv("data/final/hollywood/female_hollywood_realworld_averages.csv", index=False) 

# Display the resulting DataFrame
female_hollywood_realworld_averages

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,2178.5,2142.56,2123.25,2103.69,2068.5,2021.44,1974.62,1927.38,1877.25,...,32.19,24.81,18.62,13.81,9.88,6.81,4.88,3.19,2.19,3.25
1,1966-1980,1893.73,1878.2,1887.47,1907.07,1936.73,1973.67,2013.8,2057.13,2100.33,...,73.13,57.73,44.47,33.93,25.27,18.6,13.2,9.2,6.2,9.67
2,1981-1995,2086.47,2060.6,2038.47,2013.8,1985.47,1952.87,1923.27,1898.4,1880.33,...,147.13,118.8,94.13,73.4,56.4,42.47,31.6,22.93,16.2,29.0
3,1996-2012,2112.59,2108.88,2112.06,2116.18,2120.18,2124.82,2129.71,2135.47,2142.35,...,236.59,193.12,154.71,121.29,93.29,70.24,51.71,37.12,25.71,45.12


### Creation of the dataframe for the Both Sexes real world population representative of the Hollywood movie industry (North America), according to Time Period and age gaps of 5 years  

In [95]:
pop_bothsexes_realworld_hollywood = pop_bothsexes_realworld[pop_bothsexes_realworld['Region, subregion, country or area *'] == 'Northern America']
pop_bothsexes_realworld_hollywood.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
19467,19468,Estimates,Northern America,,905,,,21,Region,1840,...,36,27,20,15,11,8,5,3,2,4
19468,19469,Estimates,Northern America,,905,,,21,Region,1840,...,38,28,21,15,11,8,5,4,2,4
19469,19470,Estimates,Northern America,,905,,,21,Region,1840,...,39,29,21,16,11,8,6,4,2,4
19470,19471,Estimates,Northern America,,905,,,21,Region,1840,...,41,30,22,16,12,8,6,4,3,4
19471,19472,Estimates,Northern America,,905,,,21,Region,1840,...,43,32,23,17,12,8,6,4,3,4


In [96]:
# Ensure the 'Year' column is of integer type
pop_bothsexes_realworld_hollywood['Year'] = pop_bothsexes_realworld_hollywood['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_bothsexes_realworld_hollywood['Time Period'] = pop_bothsexes_realworld_hollywood['Year'].apply(assign_time_period)

# Group by the time period and calculate the mean
bothsexes_hollywood_realworld_averages = (
    pop_bothsexes_realworld_hollywood
    .groupby('Time Period')
    .mean(numeric_only=True)
    .reset_index()  # Reset index to make it a regular DataFrame
)

# Drop the Year column in male_hollywood_realworld_averages (if it exists)
if 'Year' in bothsexes_hollywood_realworld_averages.columns:
    bothsexes_hollywood_realworld_averages = bothsexes_hollywood_realworld_averages.drop(columns=['Year'])

# Round the averages to two decimal places
bothsexes_hollywood_realworld_averages = bothsexes_hollywood_realworld_averages.round(2)

# Export to a CSV file
bothsexes_hollywood_realworld_averages.to_csv("data/final/hollywood/bothsexes_hollywood_realworld_averages.csv", index=False) 

# Display the resulting DataFrame
bothsexes_hollywood_realworld_averages

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,4439.62,4353.88,4307.75,4263.0,4188.38,4090.5,3993.56,3896.88,3795.31,...,51.88,39.5,29.5,21.62,15.38,10.56,7.19,4.69,2.94,4.56
1,1966-1980,3878.87,3838.8,3854.4,3891.53,3949.47,4022.87,4103.87,4190.8,4277.4,...,108.87,85.47,66.07,50.0,37.13,26.93,18.93,13.07,8.6,13.47
2,1981-1995,4273.87,4218.13,4172.6,4122.53,4065.13,3999.07,3939.2,3890.4,3855.47,...,199.87,159.6,125.0,96.47,73.2,54.47,40.07,28.67,19.93,35.4
3,1996-2012,4325.24,4317.12,4325.41,4335.41,4345.29,4356.53,4368.82,4382.88,4399.71,...,329.94,264.18,207.76,160.24,121.12,89.65,64.88,45.65,31.24,53.06


In [97]:
# Calculate the proportions of men 
male_hollywood_realworld_proportions = male_hollywood_realworld_averages.copy()
male_hollywood_realworld_proportions.iloc[:, 1:] = (
    male_hollywood_realworld_averages.iloc[:, 1:].values /
    bothsexes_hollywood_realworld_averages.iloc[:, 1:].values
)

# Calculate the proportions of women 
female_hollywood_realworld_proportions = female_hollywood_realworld_averages.copy()
female_hollywood_realworld_proportions.iloc[:, 1:] = (
    female_hollywood_realworld_averages.iloc[:, 1:].values /
    bothsexes_hollywood_realworld_averages.iloc[:, 1:].values
)

In [98]:
male_hollywood_realworld_proportions.to_csv("data/final/hollywood/male_hollywood_realworld_proportions.csv", index=False) 

male_hollywood_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.509363,0.507894,0.507095,0.50651,0.506117,0.505806,0.505534,0.505404,0.505411,...,0.377024,0.376709,0.370847,0.367253,0.353706,0.355114,0.331015,0.294243,0.340136,0.287281
1,1966-1980,0.51178,0.510751,0.510292,0.509946,0.50962,0.509338,0.509307,0.509146,0.508924,...,0.32819,0.326781,0.321931,0.3214,0.315917,0.30932,0.306392,0.296098,0.294186,0.267261
2,1981-1995,0.511777,0.511475,0.511432,0.511465,0.51157,0.511669,0.511812,0.512063,0.512293,...,0.264522,0.255639,0.24744,0.238416,0.230464,0.22159,0.209633,0.19986,0.190667,0.175141
3,1996-2012,0.511511,0.511547,0.511683,0.511873,0.512049,0.512293,0.512509,0.512784,0.513082,...,0.283112,0.269438,0.255968,0.240826,0.228781,0.215839,0.204069,0.189485,0.175096,0.150773


In [99]:
female_hollywood_realworld_proportions.to_csv("data/final/hollywood/female_hollywood_realworld_proportions.csv", index=False) 

female_hollywood_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.490695,0.492104,0.492891,0.493476,0.493866,0.494179,0.494451,0.494596,0.494624,...,0.62047,0.628101,0.631186,0.63876,0.642393,0.644886,0.67872,0.680171,0.744898,0.712719
1,1966-1980,0.488217,0.489267,0.489692,0.490057,0.490377,0.490612,0.490708,0.490868,0.49103,...,0.671719,0.675442,0.673074,0.6786,0.680582,0.69068,0.697306,0.703902,0.72093,0.717892
2,1981-1995,0.488192,0.48851,0.488537,0.488486,0.488415,0.488331,0.488239,0.48797,0.487704,...,0.736128,0.744361,0.75304,0.760858,0.770492,0.779695,0.78862,0.799791,0.812845,0.819209
3,1996-2012,0.488433,0.488492,0.488291,0.488115,0.487926,0.487732,0.487479,0.48723,0.48693,...,0.71707,0.731017,0.744657,0.756927,0.770228,0.783491,0.79701,0.813143,0.822983,0.850358


**3. East-Asia**

In [100]:
east_asia_countries = ['China', 'Japan', 'Mongolia', 'Hong Kong', 'South Korea', 'Taiwan']

In [101]:
east_asia_data_ethnicity = east_asian_main[east_asian_main['actor_ethnicity_label'] != "Unknown"]

# Define the chinese ethnicities
chinese_ethnicities = [
    "Chinese Americans", "Chinese Singaporeans", "British Chinese", "Malaysian Chinese", 
    "Chinese Canadians", "Thai Chinese", "Chinese Filipino", "Zhuang people", "Vietnamese Americans", 
]

taiwanese_ethnicities = [
    "Taiwanese people"
]
hong_kong_ethnicities = [
    "Hongkongers"
]

koreans_ethnicities = [
    "Koreans"
]

# Define the japanese ethnicities
japanese_ethnicities = [
    "Japanese Americans", "Ryukyuan people", "Asian people"
]

# Define the south asian ethnicities
other_asian_ethnicities = [
   "Indians", "Bihari people", "Parsi", "Malayali", "Eurasian", "Javanese"
]

# Define the european ethnicities 
european_ethnicities = [
    "Anglo-Irish people", "Welsh Italians", "Irish Americans", "English people",
    "Scottish people", "Italian Americans", "German Americans", "Hungarians", 
    "Spanish Americans", "names of the Greeks", "Portuguese", "Italians", "Germans", "White British",
    "British", "Irish migration to Great Britain", "Dutch", "Irish people",
    "Swedes", "French", "Scottish Americans", "Scandinavian Americans", "Dutch Americans",
    "Danish Americans", "Greek Americans", "Luxembourgish Americans",
    "Swedish Americans", "Albanian Americans", "Welsh people", "Cajun", "Honduras",
    "White people"
]

# Define the Americans ethnicities
american_ethnicities = [
    "Rusyn American", "Cherokee", "African Americans", "Ojibwe",
    "White Americans", "Canadian Americans", "Vietnamese Americans",
    "Asian Americans", "Jewish people", "British Americans", 
    "Hispanic and Latino Americans", "Ghanaian Americans", 
    "Mexican Americans", "Iranian Americans",
    "Latin American British", "multiracial American", "Native Hawaiians",
    "Québécois", "American Jews", "Indigenous peoples of the Americas", "Australians"
]

african_ethnicities = [
    "Akan people"
]

def classify_actor_ethnicity(df):  
    df["actor_ethnicity_classification"] = df["actor_ethnicity_label"].apply(
        lambda x: "Chinese" if x in chinese_ethnicities else (
            "Hongkongers" if x in hong_kong_ethnicities else (
                "Koreans" if x in koreans_ethnicities else (
                    "Japanese" if x in japanese_ethnicities else (
                        "Other Asians" if x in other_asian_ethnicities else (
                            "Europeans" if x in european_ethnicities else (
                                "Taiwanese" if x in taiwanese_ethnicities else (
                                "Americans" if x in american_ethnicities else None
                            )
                        )
                    )
                )
            )
        )
    )
    )
    return df
east_asia_data_ethnicity = classify_actor_ethnicity(east_asia_data_ethnicity)
east_asia_data_ethnicity.dropna(subset=['actor_ethnicity_classification'], inplace=True)

# Export to a CSV file
east_asia_data_ethnicity.to_csv("data/final/east_asia/eastasia_data_ethnicity.csv", index=False) 


In [102]:
# Classification of ethnicity for hollywood_data (df with all Ethnicities, even Unknown)
east_asian_main = classify_actor_ethnicity(east_asian_main)

# Export to a CSV file
east_asian_main.to_csv("data/final/east_asia/eastasia_data.csv", index=False) 


##### Preprocessing step 2

### We processed the real-world East Asian ethnicity data this way :
The East Asian film industry is usually associated with the populations of countries such as China, Japan, South Korea, Taiwan, Hong Kong, and Mongolia. We aimed to verify whether or not the East Asian film industry accurately mirrored the corresponding populations of these regions in terms of ethnicity, age, and gender.

For the ethnicity analysis, we extracted real-world East Asian ethnicity data from this source, as we needed a dataset that also provided information about the timeframe (from approximately 1950 to the present times, ideally up to 2012) of the ethnic proportions of the population for further analysis: https://icr.ethz.ch/data/epr/core/.

In [103]:
east_asia_ethnic_realworld = ethnic_realworld[
    ethnic_realworld["statename"].isin(east_asia_countries)
]

east_asia_ethnic_realworld.head()

Unnamed: 0,gwid,statename,from,to,group,groupid,gwgroupid,umbrella,size,status,reg_aut
3046,710,China,1946,1949,Chinese (Han),3000,71003000,,0.94,IRRELEVANT,
3047,710,China,1950,1950,Chinese (Han),3000,71003000,,0.94,MONOPOLY,
3048,710,China,1950,1950,Zhuang,13000,71013000,,0.0113,POWERLESS,False
3049,710,China,1950,1950,Uyghur,36000,71036000,,0.0062,POWERLESS,False
3050,710,China,1950,1950,Hui,5000,71005000,,0.0061,POWERLESS,False


In [104]:
process_ethnic_group_data(east_asia_ethnic_realworld)

Unnamed: 0,from to,group,size,counts
0,1946-1949,Chinese (Han),0.9400,1
1,1946-1952,Japanese,0.9730,1
2,1946-2015,Kazakh,0.0500,1
3,1946-2015,Mongols,0.9000,1
4,1949-1986,Mainland Chinese,0.1400,1
...,...,...,...,...
355,2018-2021,Wa,0.0003,1
356,2018-2021,Xibe,0.0001,1
357,2018-2021,Yao,0.0021,1
358,2018-2021,Yi,0.0065,1


In [105]:
chinese_ethnicities = [
    "Chinese (Han)", "Zhuang", "Yi", "Yao", "Uyghur", "Manchu", "Miao", "Dong", "Bouyei", 
    "Bai", "Shui", "Lahu", "Tu", "Tujia", "Wa", "She", "Qiang", "Naxi", "Mulam", "Lisu", 
    "Li", "Jingpo", "Kirghiz", "Hani", "Dongxiang", "Dai", "Daur", "Mainland Chinese", "Hui",
    "Mongolians", "Mongols", "Tibetans", "Hui (proper)", "Salar", "Xibe", "Blang", "Maonan", "Gelao"
]

taiwanese_ethnicities = [
    "Taiwanese", "Indigenous/Aboriginal Taiwanese"
]

japanese_ethnicities = [
    "Japanese", "Burakumin", "Okinawans", "Ainu"
]

koreans_ethnicities = [
    "Koreans"
]

other_asians_ethnicities = [
    "Kazakh"
]

def classify_real_world_ethnicity(group):
    if group in chinese_ethnicities:
        return "Chinese"
    elif group in taiwanese_ethnicities:
        return "Taiwanese"
    elif group in japanese_ethnicities:
        return "Japanese"
    elif group in koreans_ethnicities:
        return "Koreans"
    elif group in other_asians_ethnicities:
        return "Other Asians"
    else:
        return None

east_asia_ethnic_realworld["group_classification"] = east_asia_ethnic_realworld["group"].apply(classify_real_world_ethnicity)
east_asia_ethnic_realworld.head()


Unnamed: 0,gwid,statename,from,to,group,groupid,gwgroupid,umbrella,size,status,reg_aut,from to,group_classification
3046,710,China,1946,1949,Chinese (Han),3000,71003000,,0.94,IRRELEVANT,,1946-1949,Chinese
3047,710,China,1950,1950,Chinese (Han),3000,71003000,,0.94,MONOPOLY,,1950-1950,Chinese
3048,710,China,1950,1950,Zhuang,13000,71013000,,0.0113,POWERLESS,False,1950-1950,Chinese
3049,710,China,1950,1950,Uyghur,36000,71036000,,0.0062,POWERLESS,False,1950-1950,Chinese
3050,710,China,1950,1950,Hui,5000,71005000,,0.0061,POWERLESS,False,1950-1950,Chinese


In [106]:
eastasia_ethnic_realworld_df = pd.read_csv("test20.csv")
eastasia_ethnic_realworld_df.head()

Unnamed: 0,from to,size,group,new_period
0,1950-1952,0.973,Japanese,1950-1965
1,1950-1965,0.14,Chinese,1950-1965
2,1950-1965,0.84,Taiwanese,1950-1965
3,1966-1980,0.14,Chinese,1966-1980
4,1966-1980,0.84,Taiwanese,1966-1980


In [107]:
# Strip whitespace from all column names
eastasia_ethnic_realworld_df.columns = eastasia_ethnic_realworld_df.columns.str.strip()

# Now group by 'new_period' and 'group' and calculate the mean of 'size'
eastasia_ethnic_realworld_df = eastasia_ethnic_realworld_df.groupby(['new_period', 'group'], as_index=False)['size'].mean()

eastasia_ethnic_realworld_df.to_csv("data/final/east_asia/eastasia_ethnic_realworld.csv", index = False)
eastasia_ethnic_realworld_df


Unnamed: 0,new_period,group,size
0,1950-1965,Chinese,0.081377
1,1950-1965,Japanese,0.973
2,1950-1965,Koreans,0.003267
3,1950-1965,Other Asians,0.00085
4,1950-1965,Taiwanese,0.84
5,1966-1980,Chinese,0.039939
6,1966-1980,Japanese,0.4905
7,1966-1980,Koreans,0.003267
8,1966-1980,Other Asians,0.00075
9,1966-1980,Taiwanese,0.84


### Creation of the dataframe for the Male real-world population representative of the East-Asian movie industry, according to Time Period and age gaps of 5 years

- Here, we extracted the dataset of the European population statistics from 1950 to 2012 (year of the latest movie registered in our Movies CMU dataset) from this source: https://population.un.org/wpp/


In [108]:
pop_male_realworld_eastasia = pop_male_realworld[pop_male_realworld['Region, subregion, country or area *'].isin(east_asia_countries)]
pop_male_realworld_eastasia.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
7774,7775,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,13,8,6,3,2,1,1,0,0,0
7775,7776,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,13,8,5,3,2,1,1,0,0,0
7776,7777,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,13,8,5,3,2,1,1,0,0,0
7777,7778,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,12,8,5,3,2,1,1,0,0,0
7778,7779,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,12,8,5,3,2,1,1,0,0,0


In [109]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'Year'] + [col for col in pop_male_realworld_europe.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_male_realworld_eastasia = pop_male_realworld_eastasia[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_male_realworld_eastasia.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
7774,China,1950,9859,8614,7968,7338,6749,6307,6035,5903,...,13,8,6,3,2,1,1,0,0,0
7775,China,1951,10378,9273,8363,7794,7216,6661,6241,5982,...,13,8,5,3,2,1,1,0,0,0
7776,China,1952,11149,9780,9018,8187,7668,7126,6595,6190,...,13,8,5,3,2,1,1,0,0,0
7777,China,1953,11639,10523,9522,8839,8060,7573,7055,6541,...,12,8,5,3,2,1,1,0,0,0
7778,China,1954,11647,10999,10254,9342,8710,7964,7500,6999,...,12,8,5,3,2,1,1,0,0,0


In [110]:
# Ensure the 'Year' column is of integer type
pop_male_realworld_eastasia['Year'] = pop_male_realworld_eastasia['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_male_realworld_eastasia['Time Period'] = pop_male_realworld_eastasia['Year'].apply(assign_time_period)

pop_male_realworld_eastasia.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,92,93,94,95,96,97,98,99,100+,Time Period
7774,China,1950,9859,8614,7968,7338,6749,6307,6035,5903,...,8,6,3,2,1,1,0,0,0,1950-1965
7775,China,1951,10378,9273,8363,7794,7216,6661,6241,5982,...,8,5,3,2,1,1,0,0,0,1950-1965
7776,China,1952,11149,9780,9018,8187,7668,7126,6595,6190,...,8,5,3,2,1,1,0,0,0,1950-1965
7777,China,1953,11639,10523,9522,8839,8060,7573,7055,6541,...,8,5,3,2,1,1,0,0,0,1950-1965
7778,China,1954,11647,10999,10254,9342,8710,7964,7500,6999,...,8,5,3,2,1,1,0,0,0,1950-1965


In [111]:
male_eastasia_realworld_averages = process_grouped_averages_by_columns(pop_male_realworld_eastasia, 'Time Period', drop_cols='Year', decimals=2)
male_eastasia_realworld_averages.to_csv("data/final/east_asia/male_eastasia_realworld_averages.csv", index=False) 
male_eastasia_realworld_averages.head()

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,4161.35,3836.85,3612.81,3428.98,3329.44,3273.6,3215.85,3141.88,3040.65,...,3.75,2.5,1.6,0.98,0.56,0.23,0.15,0.0,0.0,0.0
1,1966-1980,4613.24,4575.93,4630.73,4678.58,4636.2,4537.09,4429.29,4321.87,4236.56,...,3.67,2.51,1.64,1.0,0.6,0.27,0.18,0.0,0.0,0.0
2,1981-1995,4221.22,4197.29,4183.84,4162.36,4140.24,4096.62,4048.2,4033.33,4030.93,...,11.36,7.6,5.0,3.18,1.98,1.24,0.67,0.31,0.16,0.24
3,1996-2012,3251.33,3239.92,3246.96,3260.41,3287.08,3352.22,3443.63,3529.75,3619.02,...,41.57,29.04,19.63,13.06,8.49,5.47,3.37,2.02,1.12,1.43


In [112]:
pop_female_realworld_eastasia = pop_female_realworld[pop_female_realworld['Region, subregion, country or area *'].isin(east_asia_countries)]
pop_female_realworld_eastasia.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
7774,7775,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,32,21,14,9,5,3,2,1,0,0
7775,7776,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,33,22,14,9,6,3,2,1,1,1
7776,7777,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,32,22,15,9,6,3,2,1,1,1
7777,7778,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,32,22,15,9,6,4,2,1,1,1
7778,7779,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,30,21,14,9,6,3,2,1,1,1


In [113]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *', 'Year'] + [col for col in pop_female_realworld_hollywood.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_female_realworld_eastasia = pop_female_realworld_eastasia[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_female_realworld_eastasia.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
7774,China,1950,9313,8023,7290,6662,6169,5717,5393,5244,...,32,21,14,9,5,3,2,1,0,0
7775,China,1951,9898,8794,7795,7124,6543,6080,5650,5341,...,33,22,14,9,6,3,2,1,1,1
7776,China,1952,10630,9363,8559,7625,6998,6452,6013,5599,...,32,22,15,9,6,3,2,1,1,1
7777,China,1953,11093,10067,9123,8385,7496,6899,6377,5956,...,32,22,15,9,6,4,2,1,1,1
7778,China,1954,11098,10516,9814,8945,8253,7396,6823,6319,...,30,21,14,9,6,3,2,1,1,1


In [114]:
# Ensure the 'Year' column is of integer type
pop_female_realworld_eastasia['Year'] = pop_female_realworld_eastasia['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_female_realworld_eastasia['Time Period'] = pop_female_realworld_eastasia['Year'].apply(assign_time_period)
pop_female_realworld_eastasia.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,92,93,94,95,96,97,98,99,100+,Time Period
7774,China,1950,9313,8023,7290,6662,6169,5717,5393,5244,...,21,14,9,5,3,2,1,0,0,1950-1965
7775,China,1951,9898,8794,7795,7124,6543,6080,5650,5341,...,22,14,9,6,3,2,1,1,1,1950-1965
7776,China,1952,10630,9363,8559,7625,6998,6452,6013,5599,...,22,15,9,6,3,2,1,1,1,1950-1965
7777,China,1953,11093,10067,9123,8385,7496,6899,6377,5956,...,22,15,9,6,4,2,1,1,1,1950-1965
7778,China,1954,11098,10516,9814,8945,8253,7396,6823,6319,...,21,14,9,6,3,2,1,1,1,1950-1965


In [115]:
female_eastasia_realworld_averages = process_grouped_averages_by_columns(pop_female_realworld_eastasia, 'Time Period', drop_cols='Year', decimals=2)
female_eastasia_realworld_averages.to_csv("data/final/east_asia/female_eastasia_realworld_averages.csv", index=False) 
female_eastasia_realworld_averages.head()

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,3959.85,3659.27,3442.38,3259.52,3157.44,3096.42,3033.65,2956.65,2853.6,...,9.71,6.73,4.54,2.88,1.85,1.08,0.71,0.25,0.17,0.17
1,1966-1980,4361.13,4332.33,4387.42,4434.22,4394.58,4300.62,4198.8,4097.71,4018.31,...,11.02,7.51,4.98,3.22,2.04,1.29,0.71,0.31,0.16,0.13
2,1981-1995,3854.33,3846.84,3848.6,3841.69,3833.47,3805.16,3772.36,3769.36,3777.24,...,37.29,26.47,18.27,12.27,7.87,5.11,3.09,1.93,1.04,1.22
3,1996-2012,2804.24,2797.86,2808.45,2826.14,2856.69,2922.96,3014.39,3102.41,3194.86,...,122.25,93.8,70.08,51.22,36.94,26.24,18.24,12.37,8.22,12.47


In [116]:
pop_bothsexes_realworld_eastasia = pop_bothsexes_realworld[pop_bothsexes_realworld['Region, subregion, country or area *'].isin(east_asia_countries)]
pop_bothsexes_realworld_eastasia.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
7774,7775,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,44,30,20,12,7,4,2,1,1,1
7775,7776,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,46,30,20,13,8,4,3,1,1,1
7776,7777,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,45,31,20,13,8,5,3,1,1,1
7777,7778,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,44,30,20,12,8,5,3,1,1,1
7778,7779,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,42,29,19,12,8,5,3,1,1,1


In [117]:
# Work on a copy of the DataFrame to avoid modifying the original
pop_bothsexes_realworld_eastasia = pop_bothsexes_realworld_eastasia.copy()

# Apply the function to assign regions using .loc
pop_bothsexes_realworld_eastasia.loc[:, 'region'] = pop_bothsexes_realworld_eastasia['Region, subregion, country or area *'].apply(get_main_region)

# Display the results
pop_bothsexes_realworld_eastasia.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,92,93,94,95,96,97,98,99,100+,region
7774,7775,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,30,20,12,7,4,2,1,1,1,unknown
7775,7776,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,30,20,13,8,4,3,1,1,1,unknown
7776,7777,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,31,20,13,8,5,3,1,1,1,unknown
7777,7778,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,30,20,12,8,5,3,1,1,1,unknown
7778,7779,Estimates,China,5,156,CHN,CN,156,Country/Area,906,...,29,19,12,8,5,3,1,1,1,unknown


In [118]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *','Year'] + [col for col in pop_bothsexes_realworld_eastasia.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_bothsexes_realworld_eastasia= pop_bothsexes_realworld_eastasia[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_bothsexes_realworld_eastasia.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
7774,China,1950,19172,16638,15258,14000,12917,12023,11428,11147,...,44,30,20,12,7,4,2,1,1,1
7775,China,1951,20276,18067,16159,14918,13760,12741,11890,11323,...,46,30,20,13,8,4,3,1,1,1
7776,China,1952,21780,19143,17577,15812,14666,13578,12608,11788,...,45,31,20,13,8,5,3,1,1,1
7777,China,1953,22732,20591,18645,17224,15555,14473,13432,12497,...,44,30,20,12,8,5,3,1,1,1
7778,China,1954,22745,21515,20068,18286,16963,15361,14324,13318,...,42,29,19,12,8,5,3,1,1,1


In [119]:
# Ensure the 'Year' column is of integer type
pop_bothsexes_realworld_eastasia['Year'] = pop_bothsexes_realworld_eastasia['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_bothsexes_realworld_eastasia['Time Period'] = pop_bothsexes_realworld_eastasia['Year'].apply(assign_time_period)
pop_bothsexes_realworld_eastasia.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,92,93,94,95,96,97,98,99,100+,Time Period
7774,China,1950,19172,16638,15258,14000,12917,12023,11428,11147,...,30,20,12,7,4,2,1,1,1,1950-1965
7775,China,1951,20276,18067,16159,14918,13760,12741,11890,11323,...,30,20,13,8,4,3,1,1,1,1950-1965
7776,China,1952,21780,19143,17577,15812,14666,13578,12608,11788,...,31,20,13,8,5,3,1,1,1,1950-1965
7777,China,1953,22732,20591,18645,17224,15555,14473,13432,12497,...,30,20,12,8,5,3,1,1,1,1950-1965
7778,China,1954,22745,21515,20068,18286,16963,15361,14324,13318,...,29,19,12,8,5,3,1,1,1,1950-1965


In [120]:
# Apply the function to your datasets
bothsexes_eastasia_realworld_averages = process_grouped_averages_by_columns(pop_bothsexes_realworld_eastasia, 'Time Period', drop_cols='Year', decimals=2)

# Export to a CSV file
bothsexes_eastasia_realworld_averages.to_csv("data/final/east_asia/bothsexes_eastasia_realworld_averages.csv", index=False)

In [121]:
# Calculate the proportions of men 
male_eastasia_realworld_proportions = male_eastasia_realworld_averages.copy()
male_eastasia_realworld_proportions.iloc[:, 1:] = (
    male_eastasia_realworld_averages.iloc[:, 1:].values /
    bothsexes_eastasia_realworld_averages.iloc[:, 1:].values
)

# Calculate the proportions of women 
female_eastasia_realworld_proportions = female_eastasia_realworld_averages.copy()
female_eastasia_realworld_proportions.iloc[:, 1:] = (
    female_eastasia_realworld_averages.iloc[:, 1:].values /
    bothsexes_eastasia_realworld_averages.iloc[:, 1:].values
)


In [122]:
# Export to a CSV file
male_eastasia_realworld_proportions.to_csv("data/final/east_asia/male_eastasia_realworld_proportions.csv", index=False)

male_eastasia_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.512404,0.51184,0.512075,0.512673,0.513263,0.513914,0.514579,0.515187,0.515865,...,0.277367,0.270856,0.258481,0.248731,0.231405,0.155405,0.163043,0.0,0.0,0.0
1,1966-1980,0.514042,0.513676,0.513492,0.513399,0.513374,0.51338,0.513352,0.513306,0.513219,...,0.249151,0.250499,0.251149,0.235849,0.232558,0.170886,0.183673,0.0,0.0,0.0
2,1981-1995,0.522719,0.521783,0.520869,0.52003,0.51923,0.51845,0.517639,0.516921,0.516249,...,0.233745,0.222809,0.214869,0.207436,0.198795,0.196513,0.170483,0.135371,0.125984,0.161074
3,1996-2012,0.536905,0.536597,0.536212,0.535671,0.53503,0.534203,0.533233,0.532216,0.531129,...,0.253599,0.236482,0.218865,0.2033,0.186634,0.172719,0.156236,0.140963,0.120043,0.102582


In [123]:
# Export to a CSV file
female_eastasia_realworld_proportions.to_csv("data/final/east_asia/female_eastasia_realworld_proportions.csv", index=False) 

female_eastasia_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.487592,0.488151,0.487919,0.487336,0.486748,0.486099,0.485424,0.484814,0.484131,...,0.718195,0.729144,0.733441,0.730964,0.764463,0.72973,0.771739,0.862069,0.809524,0.809524
1,1966-1980,0.48595,0.486331,0.486512,0.486584,0.486619,0.486623,0.486638,0.486683,0.486781,...,0.748133,0.749501,0.762634,0.759434,0.790698,0.816456,0.72449,0.704545,0.888889,0.722222
2,1981-1995,0.477286,0.478217,0.479133,0.479967,0.480758,0.481564,0.482368,0.48309,0.483759,...,0.767284,0.776019,0.785131,0.800391,0.790161,0.809826,0.78626,0.842795,0.818898,0.818792
3,1996-2012,0.463075,0.463383,0.463795,0.464322,0.464976,0.465797,0.466767,0.467781,0.468879,...,0.745791,0.763844,0.781358,0.797323,0.812047,0.828544,0.845619,0.863224,0.881029,0.894548


#### Bollywood

In [124]:
# Filter rows where 'actor_ethnicity_label' is not "Unknown"
bollywood_data_ethnicity = indian_main[indian_main['actor_ethnicity_label'] != "Unknown"]
# Count unique values in the 'actor_ethnicity_label' column
ethnicity_counts = bollywood_data_ethnicity['actor_ethnicity_label'].value_counts()

# Convert the result to a DataFrame for better readability
ethnicity_counts_df = ethnicity_counts.reset_index()
ethnicity_counts_df.columns = ['Ethnicity', 'Count']

# Display the DataFrame
#print(ethnicity_counts_df)

# Iterate over the rows and print each ethnicity and its count
for _, row in ethnicity_counts_df.iterrows():
    eth = row['Ethnicity']
    count = row['Count']
    print(f"Ethnicity: {eth}  //    Number of actors: {count}")

for eth in (ethnicity_counts_df['Ethnicity']):
    print(eth)


Ethnicity: Indians  //    Number of actors: 17050
Ethnicity: Tamil  //    Number of actors: 1227
Ethnicity: Punjabis  //    Number of actors: 1182
Ethnicity: Malayali  //    Number of actors: 1066
Ethnicity: Marathi people  //    Number of actors: 931
Ethnicity: Bengali  //    Number of actors: 849
Ethnicity: Telugu people  //    Number of actors: 514
Ethnicity: Gujarati people  //    Number of actors: 475
Ethnicity: Kayastha  //    Number of actors: 472
Ethnicity: Kashmiri Pandit  //    Number of actors: 411
Ethnicity: Tamil Brahmin  //    Number of actors: 351
Ethnicity: Sindhis  //    Number of actors: 330
Ethnicity: Parsi  //    Number of actors: 279
Ethnicity: Bunt (RAJPUT)  //    Number of actors: 235
Ethnicity: Pashtuns  //    Number of actors: 229
Ethnicity: Kanyakubja Brahmins  //    Number of actors: 190
Ethnicity: Sri Lankan Tamils  //    Number of actors: 182
Ethnicity: Nair  //    Number of actors: 182
Ethnicity: Bengali Hindus  //    Number of actors: 164
Ethnicity: Karna

In [125]:
# Define the Indian Ethnicities
South_Indian_Ethnicities =[
    'Tamil', 'Nair', 'Bunt (RAJPUT)', 'Tamil Brahmin', 'Telugu people', 'Malayali', 'Karnataka Brahmins',
    'Kannada people', 'Niyogi','Sri Lankan Tamils', 'Chitrapur Saraswat Brahmin', 'Tulu people',
    'Konkani people', 'Gaud Saraswat Brahmin', 'Mangaloreans' , 'Mudaliar', 'Telugu Brahmins', 'Chettiar']

North_Indian_Ethnicities = [
    'Punjabis', 'Pashtuns', 'Sindhis', 'Kayastha', 'Kashmiri Pandit', 'Bihari people',
    'Jaat', 'Sikh',  'Kashmiri people', 'Jatt Sikh', 'Pathani', 'Rajput', 'Marwari people',
    'Rohilla', 'Khatri', 'Mohyal', 'Dogra', 'Dalit', 'Agrawal']

Eastern_Indian_Ethnicities = ['Bengali', 'Bengali Hindus', 'Bhutia']

Western_and_Central_Indian_Ethnicities = [
    'Parsi', 'Gujarati people', 'Marathi people', 'Ezhava', 'Chaliyan', 'Indian']

Indian_Diaspora = [
    'British Indian','Nepali Indian','Indian Americans', 'Anglo-Indian people', 'Muhajir diaspora',
    'Indian Australian', 'Indian diaspora in France', 'Punjabi diaspora', 'Indo-Canadians', 'Tamil Americans']

Religious_and_Caste_Groups = [
    'Kanyakubja', 'Brahmins', 'Brahmin', 'Muslim', 'Hindkowans', 'history of the Jews in India', 'Hindu',
    'Mizrahi Jews', 'Jewish people']

Non_Indian_Ethnicities = [
    'Pakistanis','Afghans in India', 'Iranian peoples', 'Italians', 'Romani people',
    'British', 'Irish people', 'White people', 'Asian people', 'English people', 'Australians',
    'African Americans', 'Czechs', 'Pakistani Americans', 'Sudanese Australians', 'Sinhalese', 'French',
    'White British',' White Americans', 'Poles', 'British Americans', 'Native Hawaiians', 'White South Africans',
    'Spanish Americans', 'Italian Americans', 'Swedes', 'Welsh Americans', 'Brazilians', 'Puerto Ricans', 
    'Hispanic and Latino Americans', 'Uruguayans', 'British Asians', 'Germans', 'Irish migration to Great Britain',
    'Asian Americans', 'African people', 'Italian Australians', 'Anglo-Irish people', 'Vietnamese people']

def classify_actor_ethnicity(df):  
    df["actor_ethnicity_classification"] = df["actor_ethnicity_label"].apply(
        lambda x: "South_Indian_Ethnicities" if x in South_Indian_Ethnicities else (
            "North_Indian_Ethnicities" if x in North_Indian_Ethnicities else (
                "Eastern_Indian_Ethnicities" if x in Eastern_Indian_Ethnicities else (
                    "Western_and_Central_Indian_Ethnicities" if x in Western_and_Central_Indian_Ethnicities else (
                        "Religious_and_Caste_Groups" if x in Religious_and_Caste_Groups else None
                    )
                )
            )
        )
    )
    return df

bollywood_data_ethnicity = classify_actor_ethnicity(bollywood_data_ethnicity)

# Filter rows where 'actor_ethnicity_classification' is None
bollywood_data_ethnicity = bollywood_data_ethnicity[bollywood_data_ethnicity["actor_ethnicity_classification"].notnull()]

# Export to a CSV file
bollywood_data_ethnicity.to_csv("data/final/bollywood/bollywood_data_ethnicity.csv", index=False) 

bollywood_data_ethnicity[['actor_name', 'actor_ethnicity_classification']][:20]

Unnamed: 0,actor_name,actor_ethnicity_classification
1440,Albert Brooks,Religious_and_Caste_Groups
1964,Steven Bauer,Religious_and_Caste_Groups
1966,Amy Irving,Religious_and_Caste_Groups
2102,Sam Levene,Religious_and_Caste_Groups
8637,Snitz Edwards,Religious_and_Caste_Groups
10651,Laurence Harvey,Religious_and_Caste_Groups
10652,Laurence Harvey,Religious_and_Caste_Groups
10653,Laurence Harvey,Religious_and_Caste_Groups
20292,Red Buttons,Religious_and_Caste_Groups
20428,Anthony Newley,Religious_and_Caste_Groups


In [126]:
# Classification of ethnicity for hollywood_data (df with all Ethnicities, even Unknown)

bollywood_data = classify_actor_ethnicity(indian_main)

# Export to a CSV file
bollywood_data.to_csv("data/final/bollywood/bollywood_data.csv", index=False) 

### We processed the real-world Bollywood ethnicity data this way :
The Bollywood industry is primarily associated with the Indian population, and we sought to investigate whether Bollywood accurately represents this population in terms of ethnicity, age, and gender distribution.

For the ethnicity analysis, we extracted data on real-world Indian ethnic diversity from a reliable source to compare with Bollywood's representation. We specifically needed a dataset that included information on ethnic proportions over time (ideally from around 1950 to the present) to ensure a historical perspective in our analysis: https://icr.ethz.ch/data/epr/core/ 

Given the vast ethnic diversity within India, including groups like South Indians, North Indians, and various diasporas, we aimed to capture a comprehensive picture. While there may be regional variations in ethnicity within India, we focused on a unified dataset that reflects the diversity of the entire Indian population to serve as a basis for our Bollywood ethnicity analysis. This approach allows us to assess the extent to which Bollywood mirrors or diverges from the rich ethnic tapestry of India.

In [127]:
bollywood_ethnic_realworld = ethnic_realworld[
    ethnic_realworld["statename"].isin(['India'])]

bollywood_ethnic_realworld.head()

Unnamed: 0,gwid,statename,from,to,group,groupid,gwgroupid,umbrella,size,status,reg_aut
3409,750,India,1947,1948,Hindi (Non SC/ST OBCs),5000,75005000,,0.263,SENIOR PARTNER,True
3410,750,India,1947,1948,Scheduled Castes,21000,75021000,,0.1666,JUNIOR PARTNER,False
3411,750,India,1947,1948,Other Muslims,13000,75013000,,0.115,JUNIOR PARTNER,False
3412,750,India,1947,1948,Scheduled Tribes,22000,75022000,,0.0833,JUNIOR PARTNER,False
3413,750,India,1947,1948,Marathi (non-SC/ST),11000,75011000,,0.053,JUNIOR PARTNER,False


In [128]:
# Create a new column "from to" by merging 'from' and 'to'
bollywood_ethnic_realworld["from to"] = bollywood_ethnic_realworld["from"].astype(str) + "-" + bollywood_ethnic_realworld["to"].astype(str)

In [129]:
# Assuming 'hollywood_ethnic_realworld' is your DataFrame
grouped_ethnic_realworld = bollywood_ethnic_realworld.groupby(['from to', 'group', 'size']).size()

# Convert the grouped result to a DataFrame
bollywood_ethnic_realworld = grouped_ethnic_realworld.reset_index(name='counts')

bollywood_ethnic_realworld['group'].value_counts()

group
Assamese (non-SC/ST/OBCs)         14
Bengali (non-SC/ST/OBCs)          14
Telugu (Non SC/ST/OBCs)           14
Tamil (non-SC/ST/OBCs)            14
Scheduled Tribes                  14
Scheduled Castes                  14
Punjabi-Sikhs (non-SC/ST/OBCs)    14
Other Muslims                     14
Oriya (non-SC/ST)                 14
Naga                              14
Mizo                              14
Marathi (non-SC/ST)               14
Manipuri                          14
Malyalam (non-SC/ST)              14
Kannada (non-SC/ST)               14
Indigenous Tripuri                14
Hindi (Non SC/ST OBCs)            14
Gujarati (non-SC/ST)              14
Kashmiri Muslims                  13
Bodo                               9
Other Backward Classes/Castes      6
Name: count, dtype: int64

In [130]:
# Create a dictionary to map each group to its corresponding classification
group_classification_map = {
    'Assamese (non-SC/ST/OBCs)': 'Eastern_Indian_Ethnicities',
    'Bengali (non-SC/ST/OBCs)': 'Eastern_Indian_Ethnicities',
    'Gujarati (non-SC/ST)': 'Western_and_Central_Indian_Ethnicities',
    'Hindi (Non SC/ST OBCs)': 'North_Indian_Ethnicities',
    'Indigenous Tripuri': 'Eastern_Indian_Ethnicities',
    'Punjabi-Sikhs (non-SC/ST/OBCs)': 'North_Indian_Ethnicities',
    'Tamil (non-SC/ST/OBCs)': 'South_Indian_Ethnicities',
    'Telugu (Non SC/ST/OBCs)': 'South_Indian_Ethnicities',
    'Scheduled Tribes': 'Religious_and_Caste_Groups',
    'Scheduled Castes': 'Religious_and_Caste_Groups',
    'Other Muslims': 'Religious_and_Caste_Groups',
    'Oriya (non-SC/ST)': 'Eastern_Indian_Ethnicities',
    'Naga': 'Eastern_Indian_Ethnicities',
    'Mizo': 'Eastern_Indian_Ethnicities',
    'Marathi (non-SC/ST)': 'Western_and_Central_Indian_Ethnicities',
    'Manipuri': 'Eastern_Indian_Ethnicities',
    'Malyalam (non-SC/ST)': 'South_Indian_Ethnicities',
    'Kannada (non-SC/ST)': 'South_Indian_Ethnicities',
    'Kashmiri Muslims': 'North_Indian_Ethnicities',
    'Bodo': 'Eastern_Indian_Ethnicities',
    'Other Backward Classes/Castes': 'Religious_and_Caste_Groups'
}

# Add the new 'group_classification' column by mapping the 'group' column
bollywood_ethnic_realworld['group_classification'] = bollywood_ethnic_realworld['group'].map(group_classification_map)

# Display the updated DataFrame
bollywood_ethnic_realworld.head()

Unnamed: 0,from to,group,size,counts,group_classification
0,1947-1948,Assamese (non-SC/ST/OBCs),0.014,1,Eastern_Indian_Ethnicities
1,1947-1948,Bengali (non-SC/ST/OBCs),0.039,1,Eastern_Indian_Ethnicities
2,1947-1948,Gujarati (non-SC/ST),0.034,1,Western_and_Central_Indian_Ethnicities
3,1947-1948,Hindi (Non SC/ST OBCs),0.263,1,North_Indian_Ethnicities
4,1947-1948,Indigenous Tripuri,0.001,1,Eastern_Indian_Ethnicities


#### We also did some data processing/completing concerning the latter ethnicity data chosen :
- When missing, filling-in past unrecovered data by a proportion of ethnicity population of 0, since it would likely mean that the population proportion was negligeable at the time of the recensement to be accounted for.
- Considering the high proportion of actors from a Jewish ethnicity (cf. the prior surfacic data exploration), we decided to add from a separate source (https://www.pewresearch.org/religion/2013/10/01/chapter-1-population-estimates/ ,  https://www.pewresearch.org/religion/2021/05/11/the-size-of-the-u-s-jewish-population/) the proportion of the Jewish-American population in the count.
- For the time fragmentation of our data, we decided to take into account 4 periods of time from 1950 to 2012 : 1950-1965, 1966-1980, 1981-1995 and 1996-2012, this to be able to use a common fragmentation all throughout our analysis. We thus had to remap the periods provided in the ethnicity dataset to an approximately equivalent time fragmentation. 

In [131]:
bollywood_ethnic_realworld.drop(columns=['counts'], inplace=True)

In [132]:
bollywood_ethnic_realworld.head()

Unnamed: 0,from to,group,size,group_classification
0,1947-1948,Assamese (non-SC/ST/OBCs),0.014,Eastern_Indian_Ethnicities
1,1947-1948,Bengali (non-SC/ST/OBCs),0.039,Eastern_Indian_Ethnicities
2,1947-1948,Gujarati (non-SC/ST),0.034,Western_and_Central_Indian_Ethnicities
3,1947-1948,Hindi (Non SC/ST OBCs),0.263,North_Indian_Ethnicities
4,1947-1948,Indigenous Tripuri,0.001,Eastern_Indian_Ethnicities


In [133]:
# Step 2: Map new periods based on the logic
new_period_mapping = {
    "1947-1948": "1950-1965",
    "1949-1953": "1950-1965",
    "1954-1956": "1950-1965",
    "1957-1960":"1950-1965",
    "1961-1962":"1950-1965",
    "1963-1966" : "1966-1980",
    "1967-1971" : "1966-1980", 
    "1972-1976" :  "1966-1980", 
    "1977-1986" :  "1981-1995",
    "1987-1999" : "1981-1995",
    "2000-2002": "1996-2012",
    "2003-2014": "1996-2012",
    "2015-2019": "1996-2012",
    "2020-2021": "1996-2012",
}

# Step 3: Assign new periods
def assign_periods(row):
    if row["from to"] in new_period_mapping:
        return new_period_mapping[row["from to"]]
    return None

bollywood_ethnic_realworld["new_period"] = bollywood_ethnic_realworld.apply(assign_periods, axis=1)
bollywood_ethnic_realworld.head()

Unnamed: 0,from to,group,size,group_classification,new_period
0,1947-1948,Assamese (non-SC/ST/OBCs),0.014,Eastern_Indian_Ethnicities,1950-1965
1,1947-1948,Bengali (non-SC/ST/OBCs),0.039,Eastern_Indian_Ethnicities,1950-1965
2,1947-1948,Gujarati (non-SC/ST),0.034,Western_and_Central_Indian_Ethnicities,1950-1965
3,1947-1948,Hindi (Non SC/ST OBCs),0.263,North_Indian_Ethnicities,1950-1965
4,1947-1948,Indigenous Tripuri,0.001,Eastern_Indian_Ethnicities,1950-1965


In [134]:
bollywood_ethnic_realworld = bollywood_ethnic_realworld.explode("new_period").reset_index(drop=True)

In [135]:
# Step 5: Drop rows with periods beyond 2015-2021
bollywood_ethnic_realworld = bollywood_ethnic_realworld[bollywood_ethnic_realworld["new_period"].notna()]


# Step 9: Verify the result
bollywood_ethnic_realworld = bollywood_ethnic_realworld[['new_period', 'from to', 'group_classification', 'size']]

print(f"Shape of the resulting DataFrame: {bollywood_ethnic_realworld.shape}")

grouped_ethnic_realworld = bollywood_ethnic_realworld.groupby(['new_period', 'from to', 'group_classification', 'size']).size()

# Convert the grouped result to a DataFrame
bollywood_ethnic_realworld = grouped_ethnic_realworld.reset_index(name='counts')

bollywood_ethnic_realworld.drop(columns=['counts'], inplace=True)

bollywood_ethnic_realworld = bollywood_ethnic_realworld.reset_index().rename(columns={'group_classification': 'group'})

bollywood_ethnic_realworld.drop(columns=['index'], inplace=True)

# Drop rows where the interval ends after 2014
bollywood_ethnic_realworld = bollywood_ethnic_realworld[bollywood_ethnic_realworld['from to'].apply(lambda x: int(x.split('-')[1]) <= 2014)]

# Group the data by 'new_period' and 'group' and calculate the mean of 'size'
bollywood_ethnic_realworld = bollywood_ethnic_realworld.groupby(['new_period', 'group'], as_index=False)['size'].mean()

# Export to a CSV file
bollywood_ethnic_realworld.to_csv("data/final/bollywood/bollywood_ethnic_realworld.csv", index=False)  # Replace with your desired file path

bollywood_ethnic_realworld

Shape of the resulting DataFrame: (280, 4)


Unnamed: 0,new_period,group,size
0,1950-1965,Eastern_Indian_Ethnicities,0.0152
1,1950-1965,North_Indian_Ethnicities,0.102
2,1950-1965,Religious_and_Caste_Groups,0.121633
3,1950-1965,South_Indian_Ethnicities,0.03725
4,1950-1965,Western_and_Central_Indian_Ethnicities,0.0435
5,1966-1980,Eastern_Indian_Ethnicities,0.0152
6,1966-1980,North_Indian_Ethnicities,0.0955
7,1966-1980,Religious_and_Caste_Groups,0.121633
8,1966-1980,South_Indian_Ethnicities,0.03725
9,1966-1980,Western_and_Central_Indian_Ethnicities,0.0435


### Creation of the dataframe for the Male real world population representative of the Hollywood movie industry (North America), according to Time Period and age gaps of 5 years  

- Here, we extracted the dataset of the Hollywood population statistics from 1950 to 2012 (year of last movie registered in our Movies CMU dataset) that we needed from this source : https://population.un.org/wpp/
- Since we could this time afford to use it, we took into account both United States and Canadian populations.

In [136]:
pop_male_realworld_India = pop_male_realworld[pop_male_realworld['Region, subregion, country or area *'].isin(['India'])]

pop_male_realworld_India

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
8662,8663,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,6,5,3,2,1,1,1,1,0,0
8663,8664,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,6,4,3,2,1,1,1,1,0,0
8664,8665,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,6,4,3,2,1,1,0,0,0,0
8665,8666,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,6,4,2,2,1,1,0,0,0,0
8666,8667,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,6,4,3,2,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8731,8732,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,123,93,69,50,36,25,17,12,8,12
8732,8733,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,127,96,72,52,37,26,18,12,8,12
8733,8734,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,126,95,71,51,36,25,16,11,7,10
8734,8735,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,129,98,72,52,37,25,17,11,7,9


In [137]:
pop_male_realworld_India = pop_male_realworld_India.drop(columns=['Index', 'Variant', 'Notes', 'Location code', 'ISO3 Alpha-code', 'ISO2 Alpha-code', 'SDMX code**', 'Type', 'Parent code'])

pop_male_realworld_India.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
8662,India,1950,6746,5651,4927,4671,4499,4351,4229,4131,...,6,5,3,2,1,1,1,1,0,0
8663,India,1951,6943,6113,5354,4796,4597,4453,4323,4211,...,6,4,3,2,1,1,1,1,0,0
8664,India,1952,7138,6308,5804,5217,4723,4552,4425,4305,...,6,4,3,2,1,1,0,0,0,0
8665,India,1953,7342,6500,6001,5661,5140,4677,4523,4407,...,6,4,2,2,1,1,0,0,0,0
8666,India,1954,7545,6704,6198,5859,5580,5091,4648,4504,...,6,4,3,2,1,1,0,0,0,0


In [138]:
# Ensure the 'Year' column is of integer type
pop_male_realworld_India['Year'] = pop_male_realworld_India['Year'].astype(int)


# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_male_realworld_India['Time Period'] = pop_male_realworld_India['Year'].apply(assign_time_period)

# Group by the time period and calculate the mean
male_India_realworld_averages = (
    pop_male_realworld_India
    .groupby('Time Period')
    .mean(numeric_only=True)
    .reset_index()  # Reset index to make it a regular DataFrame
)

# Drop the Year column in male_hollywood_realworld_averages (if it exists)
if 'Year' in male_India_realworld_averages.columns:
    male_India_realworld_averages = male_India_realworld_averages.drop(columns=['Year'])

# Round the averages to two decimal places
male_India_realworld_averages = male_India_realworld_averages.round(2)

# Export to a CSV file
male_India_realworld_averages.to_csv("data/final/bollywood/male_bollywood_realworld_averages.csv", index=False) 

# Display the resulting DataFrame
male_India_realworld_averages

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,8071.88,7243.25,6746.25,6407.0,6128.12,5882.88,5661.62,5457.56,5264.62,...,6.62,4.38,3.0,2.0,1.0,1.0,0.12,0.12,0.0,0.0
1,1966-1980,10551.4,9727.0,9297.67,8997.53,8734.67,8495.33,8276.4,8073.47,7888.47,...,13.07,9.2,5.93,3.93,2.53,1.53,1.0,0.53,0.07,0.27
2,1981-1995,13311.47,12643.8,12309.87,12044.87,11790.87,11538.8,11288.93,11042.87,10798.87,...,26.8,19.4,13.87,9.53,6.53,4.33,2.87,1.73,1.0,1.53
3,1996-2012,14020.53,13782.65,13701.12,13646.53,13590.12,13529.53,13461.94,13385.76,13297.41,...,53.12,39.0,28.0,19.76,13.71,9.35,6.12,4.06,2.65,3.82


### Creation of the dataframe for the Female real world population representative of the Hollywood movie industry (North America), according to Time Period and age gaps of 5 years  

In [139]:
pop_female_realworld_India = pop_female_realworld[pop_female_realworld['Region, subregion, country or area *'] == 'India']

pop_female_realworld_India

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
8662,8663,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,10,8,6,4,3,3,2,1,1,0
8663,8664,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,10,7,5,4,2,2,2,1,1,0
8664,8665,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,10,7,5,3,2,1,1,1,1,1
8665,8666,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,9,7,4,3,2,2,1,1,1,1
8666,8667,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,9,6,4,3,2,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8731,8732,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,171,130,98,72,51,36,25,17,11,18
8732,8733,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,179,136,102,75,54,38,26,17,11,18
8733,8734,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,183,138,103,75,54,37,25,17,11,16
8734,8735,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,190,144,107,78,55,38,26,17,11,15


In [140]:
pop_female_realworld_India = pop_female_realworld_India.drop(columns=['Index', 'Variant', 'Notes', 'Location code', 'ISO3 Alpha-code', 'ISO2 Alpha-code', 'SDMX code**', 'Type', 'Parent code'])
pop_female_realworld_India.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
8662,India,1950,6465,5412,4755,4534,4353,4201,4082,3993,...,10,8,6,4,3,3,2,1,1,0
8663,India,1951,6652,5819,5072,4599,4449,4303,4171,4062,...,10,7,5,4,2,2,2,1,1,0
8664,India,1952,6837,6004,5468,4913,4516,4399,4272,4151,...,10,7,5,3,2,1,1,1,1,1
8665,India,1953,7030,6187,5656,5302,4825,4465,4368,4251,...,9,7,4,3,2,2,1,1,1,1
8666,India,1954,7222,6378,5842,5491,5211,4773,4434,4347,...,9,6,4,3,2,1,1,1,0,1


In [141]:
# Ensure the 'Year' column is of integer type
pop_female_realworld_India['Year'] = pop_female_realworld_India['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_female_realworld_India['Time Period'] = pop_female_realworld_India['Year'].apply(assign_time_period)

# Group by the time period and calculate the mean
female_India_realworld_averages = (
    pop_female_realworld_India
    .groupby('Time Period')
    .mean(numeric_only=True)
    .reset_index()  # Reset index to make it a regular DataFrame
)

# Drop the Year column in male_hollywood_realworld_averages (if it exists)
if 'Year' in female_India_realworld_averages.columns:
    female_India_realworld_averages = female_India_realworld_averages.drop(columns=['Year'])

# Round the averages to two decimal places
female_India_realworld_averages = female_India_realworld_averages.round(2)

# Export to a CSV file
female_India_realworld_averages.to_csv("data/final/bollywood/female_bollywood_realworld_averages.csv", index=False) 

# Display the resulting DataFrame
female_India_realworld_averages

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,7717.44,6884.62,6363.75,6021.81,5752.94,5523.62,5320.94,5135.81,4961.81,...,9.31,6.5,4.38,3.12,2.06,1.25,1.12,0.38,0.25,0.25
1,1966-1980,10012.8,9201.8,8758.93,8451.53,8188.13,7951.8,7737.53,7540.53,7361.4,...,15.27,10.47,7.13,4.53,2.93,1.8,1.27,0.73,0.33,0.47
2,1981-1995,12413.6,11753.0,11416.67,11153.6,10904.13,10658.8,10419.53,10189.47,9966.67,...,39.6,29.4,21.53,15.33,10.73,7.0,4.6,3.0,1.8,2.6
3,1996-2012,12840.76,12561.47,12472.53,12418.0,12365.88,12313.24,12257.18,12194.47,12121.65,...,86.18,64.88,47.82,34.59,24.53,17.12,11.59,7.76,4.94,7.71


### Creation of the dataframe for the Both Sexes real world population representative of the Hollywood movie industry (North America), according to Time Period and age gaps of 5 years  

In [142]:
pop_bothsexes_realworld_India = pop_bothsexes_realworld[pop_bothsexes_realworld['Region, subregion, country or area *'] == 'India']

pop_bothsexes_realworld_India.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,91,92,93,94,95,96,97,98,99,100+
8662,8663,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,16,13,10,6,4,4,3,2,1,0
8663,8664,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,15,11,8,6,4,2,2,2,1,1
8664,8665,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,15,10,7,5,4,2,1,1,1,1
8665,8666,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,15,10,7,5,3,2,1,1,1,1
8666,8667,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,15,10,7,4,3,2,1,1,0,1


In [143]:
# Work on a copy of the DataFrame to avoid modifying the original
pop_bothsexes_realworld_India = pop_bothsexes_realworld_India.copy()

# Apply the function to assign regions using .loc
pop_bothsexes_realworld_India.loc[:, 'region'] = pop_bothsexes_realworld_India['Region, subregion, country or area *'].apply(get_main_region)

# Display the results
pop_bothsexes_realworld_India.head()

Unnamed: 0,Index,Variant,"Region, subregion, country or area *",Notes,Location code,ISO3 Alpha-code,ISO2 Alpha-code,SDMX code**,Type,Parent code,...,92,93,94,95,96,97,98,99,100+,region
8662,8663,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,13,10,6,4,4,3,2,1,0,unknown
8663,8664,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,11,8,6,4,2,2,2,1,1,unknown
8664,8665,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,10,7,5,4,2,1,1,1,1,unknown
8665,8666,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,10,7,5,3,2,1,1,1,1,unknown
8666,8667,Estimates,India,,356,IND,IN,356,Country/Area,5501,...,10,7,4,3,2,1,1,0,1,unknown


In [144]:
# Define columns to keep
columns_to_keep = ['Region, subregion, country or area *','Year'] + [col for col in pop_bothsexes_realworld_India.columns if isinstance(col, int) or col.isdigit() or col == '100+']

# Filter the DataFrame to retain only the specified columns
pop_bothsexes_realworld_India= pop_bothsexes_realworld_India[columns_to_keep]

# Display the resulting DataFrame's columns to verify
pop_bothsexes_realworld_India.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,91,92,93,94,95,96,97,98,99,100+
8662,India,1950,13211,11063,9682,9205,8852,8552,8312,8125,...,16,13,10,6,4,4,3,2,1,0
8663,India,1951,13595,11932,10426,9395,9046,8756,8494,8274,...,15,11,8,6,4,2,2,2,1,1
8664,India,1952,13975,12312,11272,10130,9238,8951,8698,8455,...,15,10,7,5,4,2,1,1,1,1
8665,India,1953,14372,12687,11657,10963,9965,9142,8891,8658,...,15,10,7,5,3,2,1,1,1,1
8666,India,1954,14767,13082,12040,11350,10791,9865,9082,8850,...,15,10,7,4,3,2,1,1,0,1


In [145]:
# Ensure the 'Year' column is of integer type
pop_bothsexes_realworld_India['Year'] = pop_bothsexes_realworld_India['Year'].astype(int)

# Define the time periods
time_periods = {
    "1950-1965": (1950, 1965),
    "1966-1980": (1966, 1980),
    "1981-1995": (1981, 1995),
    "1996-2012": (1996, 2012),
}

# Create a new column to assign each row to a time period
def assign_time_period(year):
    for period, (start, end) in time_periods.items():
        if start <= year <= end:
            return period
    return None

# Assign the time period to each row
pop_bothsexes_realworld_India['Time Period'] = pop_bothsexes_realworld_India['Year'].apply(assign_time_period)
pop_bothsexes_realworld_India.head()

Unnamed: 0,"Region, subregion, country or area *",Year,0,1,2,3,4,5,6,7,...,92,93,94,95,96,97,98,99,100+,Time Period
8662,India,1950,13211,11063,9682,9205,8852,8552,8312,8125,...,13,10,6,4,4,3,2,1,0,1950-1965
8663,India,1951,13595,11932,10426,9395,9046,8756,8494,8274,...,11,8,6,4,2,2,2,1,1,1950-1965
8664,India,1952,13975,12312,11272,10130,9238,8951,8698,8455,...,10,7,5,4,2,1,1,1,1,1950-1965
8665,India,1953,14372,12687,11657,10963,9965,9142,8891,8658,...,10,7,5,3,2,1,1,1,1,1950-1965
8666,India,1954,14767,13082,12040,11350,10791,9865,9082,8850,...,10,7,4,3,2,1,1,0,1,1950-1965


In [146]:
# Apply the function to your datasets
bothsexes_india_realworld_averages = process_grouped_averages_by_columns(pop_bothsexes_realworld_India, 'Time Period', drop_cols='Year', decimals=2)

# Export to a CSV file
bothsexes_india_realworld_averages.to_csv("data/final/bollywood/bothsexes_bollywood_realworld_averages.csv", index=False)

In [147]:
# Calculate the proportions of men 
male_india_realworld_proportions = male_India_realworld_averages.copy()
male_india_realworld_proportions.iloc[:, 1:] = (
    male_India_realworld_averages.iloc[:, 1:].values /
    bothsexes_india_realworld_averages.iloc[:, 1:].values
)

# Calculate the proportions of women 
female_india_realworld_proportions = female_India_realworld_averages.copy()
female_india_realworld_proportions.iloc[:, 1:] = (
    female_India_realworld_averages.iloc[:, 1:].values /
    bothsexes_india_realworld_averages.iloc[:, 1:].values
)


In [148]:
# Export to a CSV file
male_india_realworld_proportions.to_csv("data/final/bollywood/male_bollywood_realworld_proportions.csv", index=False)

male_india_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.511226,0.512688,0.514578,0.515496,0.515786,0.515748,0.515504,0.51518,0.514802,...,0.416877,0.400366,0.403226,0.409836,0.31348,0.471698,0.10084,0.107143,0.0,0.0
1,1966-1980,0.513092,0.513873,0.514918,0.515647,0.516152,0.516521,0.516825,0.517064,0.517284,...,0.45908,0.472522,0.451637,0.467857,0.462523,0.45,0.469484,0.378571,0.09589,0.27
2,1981-1995,0.51745,0.518255,0.518823,0.519213,0.519541,0.519822,0.520023,0.5201,0.520045,...,0.40404,0.396485,0.393252,0.38227,0.381203,0.373276,0.387838,0.355236,0.357143,0.3825
3,1996-2012,0.52196,0.523176,0.523468,0.523566,0.523579,0.523533,0.523424,0.523285,0.52313,...,0.38087,0.375,0.36784,0.361574,0.358525,0.354033,0.343434,0.346712,0.351926,0.333043


In [149]:
# Export to a CSV file
female_india_realworld_proportions.to_csv("data/final/bollywood/female_eastasia_realworld_proportions.csv", index=False) 

female_india_realworld_proportions

Unnamed: 0,Time Period,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100+
0,1950-1965,0.488778,0.487303,0.485403,0.484504,0.484209,0.484252,0.484484,0.484808,0.485191,...,0.586272,0.59415,0.58871,0.639344,0.645768,0.589623,0.941176,0.339286,1.0,0.568182
1,1966-1980,0.486901,0.486127,0.485082,0.484356,0.483856,0.483474,0.483175,0.482932,0.482721,...,0.536354,0.53775,0.543031,0.539286,0.535649,0.529412,0.596244,0.521429,0.452055,0.47
2,1981-1995,0.482548,0.481742,0.481177,0.480793,0.480468,0.480178,0.479974,0.479906,0.479968,...,0.597015,0.600858,0.610434,0.614922,0.626386,0.603448,0.621622,0.616016,0.642857,0.65
3,1996-2012,0.47804,0.476821,0.476528,0.476432,0.476414,0.476468,0.476581,0.476714,0.476875,...,0.617911,0.623846,0.628219,0.632937,0.641475,0.648239,0.650393,0.662681,0.656042,0.672188
