In [2]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tq
import warnings
warnings.filterwarnings("ignore")
from glob import glob as glob

In [65]:
class DataPrep():
    """Build (or reload) the mortality, YLL, population and GDP extracts.

    With ``update_local_files=True`` :meth:`run` recomputes every extract from
    the raw inputs and writes each one to the path listed in
    ``self.save_file_to``; otherwise it only reads those CSV files back.

    All paths are relative to the current working directory. ``base_folder``
    is stored on the instance, but no method of this class reads it.

    Attributes populated by :meth:`run` in both modes: ``gdp``, ``deaths``,
    ``yll``, ``df_pop``, ``pop_us17``, ``deaths_final`` and ``yll_final``.
    """

    def __init__(self, base_folder, update_local_files=False):
        """Load the static configuration and wrap the preparation steps.

        Args:
            base_folder: Kept as ``self.base_folder``; not used by this class.
            update_local_files: When truthy, every preparation step also
                writes its result to the matching ``self.save_file_to`` path.
        """
        # create folder if not exists - used to save prepared data
        os.makedirs('Processed Data', exist_ok=True)

        self.base_folder = base_folder
        self.update_local_files = update_local_files

        # list of rich countries fetched from manually created file
        self.rich_countries = pd.read_csv('Rich Countries.csv')['Country Name'].to_list()

        # used in preparing column - age
        self.ages_to_keep = ['1-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44',
                             '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '<1', '80-84',
                             '85-89', '90-94', '95+']

        # used in preparing column - cause
        self.causes_to_keep = ['HIV/AIDS and sexually transmitted infections',
                               'Interpersonal violence',
                               'Neoplasms',
                               'Self-harm',
                               'Chronic respiratory diseases',
                               'Cirrhosis and other chronic liver diseases',
                               'Transport injuries',
                               'Substance use disorders',
                               'Diabetes and kidney diseases',
                               'Cardiovascular diseases']

        # save file locations for all prepared data
        self.save_file_to = {'mortality': 'Processed Data/mortality.csv',
                             'yll': 'Processed Data/years-of-life-lost.csv',
                             'population': 'Processed Data/population_1990-2019.csv',
                             'standard_population': 'Processed Data/population_usa_2017.csv',
                             'standardized_mortality': 'Processed Data/gbd-rich-countries.csv',
                             'standardized_yll': 'Processed Data/yll-rich-countries.csv',
                             'gdp': 'Processed Data/gdp-pop-pivot.csv'}

        # wrapping functions in decorator - used to save prepared data
        self.prepare_mortality_data = self.dec_SaveFile(self.prepare_mortality_data)
        self.prepare_yll_data = self.dec_SaveFile(self.prepare_yll_data)
        self.prepare_population_data = self.dec_SaveFile(self.prepare_population_data)
        self.prepare_standard_population = self.dec_SaveFile(self.prepare_standard_population)
        self.standardize_mortality_data = self.dec_SaveFile(self.standardize_mortality_data)
        self.standardize_yll_data = self.dec_SaveFile(self.standardize_yll_data)
        self.prepare_gdp_data = self.dec_SaveFile(self.prepare_gdp_data)

    def dec_SaveFile(self, func):
        """Decorator: supply ``save_file`` from ``self.update_local_files``.

        The flag is read at call time, so changing ``update_local_files``
        after construction takes effect on the next call. A ``save_file``
        passed explicitly by the caller is respected instead of raising a
        duplicate-keyword ``TypeError``.
        """
        def wrapper(*args, **kwargs):
            kwargs.setdefault('save_file', bool(self.update_local_files))
            return func(*args, **kwargs)
        return wrapper

    @staticmethod
    def _preview(df, n=5):
        """Print up to ``n`` random rows; safe for frames shorter than ``n``."""
        print(df.sample(min(n, len(df))) if len(df) else df)

    def _report(self, df, title, key, save_file):
        """Print shape and preview of ``df`` and optionally save it as CSV.

        ``key`` selects the destination in ``self.save_file_to``.
        """
        print(f'\n{title} Data Shape: {df.shape}\n')
        self._preview(df)

        if save_file:
            df.to_csv(self.save_file_to[key], index=False)
            print(f"\nFile Saved at: {self.save_file_to[key]}")

        print('- '*40)

    @staticmethod
    def _country_group(countries):
        """Map a country-name Series to 'USA' or 'ORC' (any other country)."""
        return countries.apply(lambda x: 'USA' if x == 'United States of America' else 'ORC')

    def merge_files(self, file_paths, filters=None, ignore_cols=None):
        """Read several CSV files, stack them and apply row-level filters.

        Args:
            file_paths: Iterable of CSV paths.
            filters: Optional mapping ``column -> list-like of allowed values``;
                only rows whose value is in the list are kept.
            ignore_cols: Optional column name or list of names dropped from
                every file before stacking. ``None`` drops nothing.

        Returns:
            The concatenated, filtered DataFrame. Filtering keeps the index
            labels produced by the concatenation.

        Raises:
            ValueError: If ``file_paths`` is empty.
        """
        file_paths = list(file_paths)
        if not file_paths:
            raise ValueError('No files to merge - check the path pattern passed as raw_data_path.')

        # df.drop(None, axis=1) raises, so normalise the default to "drop nothing"
        if ignore_cols is None:
            drop_cols = []
        elif isinstance(ignore_cols, str):
            drop_cols = [ignore_cols]
        else:
            drop_cols = list(ignore_cols)

        dfs = []
        for file in tq(file_paths, desc='Merging Files'):
            dfs.append(pd.read_csv(file).drop(columns=drop_cols))

        dfc = pd.concat(dfs, ignore_index=True)
        if filters:
            for column, values in tq(filters.items(), desc='Applying Filters'):
                # vectorised membership test instead of a per-row Python lambda
                dfc = dfc[dfc[column].isin(values)]

        return dfc

    def _prepare_gbd_measure(self, raw_data_path, measure, value_col):
        """Shared body of the mortality and YLL preparation steps.

        Keeps rows for ``measure``, sex 'Both' and the rich countries, renames
        ``location``/``val`` to ``country``/``value_col``, adds the CN (USA or
        ORC) column, shortens the age label ('1-4 years' => '1-4') and renames
        every cause outside ``self.causes_to_keep`` to 'Other'.
        """
        filters = {'measure': [measure], 'sex': ['Both'], 'location': self.rich_countries}
        merged = self.merge_files(glob(raw_data_path), filters, ignore_cols=['metric', 'upper', 'lower'])

        # rename() returns a new frame, so the assignments below do not write
        # into the filtered result of merge_files
        df = merged.rename(columns={'location': 'country', 'val': value_col})
        df['CN'] = self._country_group(df['country'])
        df['age'] = df['age'].apply(lambda x: x.split(' ')[0])
        df['cause'] = df['cause'].where(df['cause'].isin(self.causes_to_keep), 'Other')
        return df

    def prepare_mortality_data(self, raw_data_path, save_file=None):
        """Prepare mortality data; later aggregated into the final structure.

        Args:
            raw_data_path: Glob pattern of the raw CSV files.
            save_file: When truthy, write the result to
                ``self.save_file_to['mortality']``.

        Returns:
            DataFrame with the death counts in column ``deaths``.
        """
        df_deaths = self._prepare_gbd_measure(raw_data_path, 'Deaths', 'deaths')
        self._report(df_deaths, 'Mortality', 'mortality', save_file)
        return df_deaths

    def prepare_yll_data(self, raw_data_path, save_file=None):
        """Prepare Years of Life Lost data; later aggregated like mortality.

        Args:
            raw_data_path: Glob pattern of the raw CSV files.
            save_file: When truthy, write the result to
                ``self.save_file_to['yll']``.

        Returns:
            DataFrame with the years of life lost in column ``yll``.
        """
        df_yll = self._prepare_gbd_measure(raw_data_path, 'YLLs (Years of Life Lost)', 'yll')
        self._report(df_yll, 'Years of Life Lost', 'yll', save_file)
        return df_yll

    def prepare_population_data(self, raw_data_path, save_file=None):
        """Prepare population data used to standardize mortality and YLL.

        Args:
            raw_data_path: Glob pattern of the raw population CSV files.
            save_file: When truthy, write the result to
                ``self.save_file_to['population']``.

        Returns:
            DataFrame with columns renamed to country, sex, age, pop, year,
            plus the CN (USA or ORC) column.
        """
        # merge multiple data files into one dataframe
        filters = {'sex_name': ['both'], 'location_name': self.rich_countries}
        merged = self.merge_files(glob(raw_data_path), filters,
                                  ignore_cols=['location_id', 'sex_id', 'age_group_id', 'measure_id', 'metric_id'])

        age_labels = {'1 to 4': '1-4', '5 to 9': '5-9', '10 to 14': '10-14', '15 to 19': '15-19', '20 to 24': '20-24', '25 to 29': '25-29',
                      '30 to 34': '30-34', '35 to 39': '35-39', '40 to 44': '40-44', '45 to 49': '45-49', '50 to 54': '50-54',
                      '55 to 59': '55-59', '60 to 64': '60-64', '65 to 69': '65-69', '70 to 74': '70-74', '75 to 79': '75-79',
                      '<1 year': '<1', '80 to 84': '80-84', '85 to 89': '85-89', '90 to 94': '90-94', '95 plus': '95+'}
        df_pop = merged.assign(age_group_name=merged['age_group_name'].replace(age_labels))

        # only keep required age; copy so the assignments below own their data
        df_pop = df_pop[df_pop['age_group_name'].isin(self.ages_to_keep)].copy()

        # create CN(USA or ORC) column, remove unnecessary columns
        df_pop['sex_name'] = df_pop['sex_name'].replace({'male': 'Male', 'female': 'Female'})
        df_pop['CN'] = self._country_group(df_pop['location_name'])
        df = (df_pop
              .drop(columns=['measure_name', 'metric_name', 'upper', 'lower'])
              .rename(columns={'location_name': 'country', 'sex_name': 'sex', 'age_group_name': 'age',
                               'val': 'pop', 'year_id': 'year'}))

        self._report(df, 'Population', 'population', save_file)
        return df

    def prepare_standard_population(self, country='USA', year=2017, save_file=None):
        """Build the standard population (default: USA 2017) and its weights.

        Reads the CSV written by :meth:`prepare_population_data`, so that step
        must have been saved beforehand.

        Args:
            country: Value of the CN column to keep.
            year: Year to keep.
            save_file: When truthy, write the result to
                ``self.save_file_to['standard_population']``.

        Returns:
            DataFrame with one row per age group, the summed population in
            ``pop_us17`` and each group's share of the total in ``weight``.
        """
        # use the configured path: the previous hard-coded name differed in
        # case from the file actually written by prepare_population_data
        df_pop = pd.read_csv(self.save_file_to['population'])
        df = df_pop[(df_pop['CN'] == country) & (df_pop['year'] == year)]

        # apply aggregation
        df = df.groupby(['age', 'year', 'CN']).agg(pop_us17=('pop', 'sum')).reset_index()
        df['weight'] = df['pop_us17']/df['pop_us17'].sum()

        self._report(df, 'Standard Population', 'standard_population', save_file)
        return df

    def _with_population(self, grouped):
        """Attach group population and standard-population columns.

        ``grouped`` is left-joined with ``self.df_pop`` summed per
        age/year/CN, then inner-joined on age with ``self.pop_us17``.
        """
        df_pop_groupped = self.df_pop.groupby(['age', 'year', 'CN']).agg({'pop': 'sum'}).reset_index()
        df_mid = grouped.merge(df_pop_groupped, on=['age', 'year', 'CN'], how='left')
        return df_mid.merge(self.pop_us17[['age', 'pop_us17', 'weight']], on='age')

    def standardize_mortality_data(self, save_file=None):
        """Standardize mortality data using the standard population.

        Reads ``self.deaths``, ``self.df_pop`` and ``self.pop_us17``.

        Args:
            save_file: When truthy, write the result to
                ``self.save_file_to['standardized_mortality']``.

        Returns:
            DataFrame per age/year/cause/CN with 'Deaths Std' (standardized
            deaths), 'Death Rate' (per 10^5) and 'aa_dr' (age adjusted death
            rate).
        """
        deaths_groupped = self.deaths.groupby(['age', 'year', 'cause', 'CN']).agg({'deaths': 'sum'}).reset_index()
        df = self._with_population(deaths_groupped)

        # Deaths Std = standardized(deaths), aa_dr = age adjusted death rate
        df['Deaths Std'] = (df['deaths'] * df['pop_us17'])/df['pop']
        df['Death Rate'] = (df['deaths']/df['pop']) * pow(10,5)
        df['aa_dr'] = ((df['deaths'] * pow(10,5)) / df['pop']) * df['weight']

        self._report(df, 'Standardized Mortality', 'standardized_mortality', save_file)
        return df

    def standardize_yll_data(self, save_file=None):
        """Standardize Years of Life Lost data using the standard population.

        Reads ``self.yll``, ``self.df_pop`` and ``self.pop_us17``.

        Args:
            save_file: When truthy, write the result to
                ``self.save_file_to['standardized_yll']``.

        Returns:
            DataFrame per age/year/cause/CN with ``aa_yll`` and ``aa_yll2``.
        """
        yll_groupped = self.yll.groupby(['age', 'year', 'cause', 'CN']).agg({'yll': 'sum'}).reset_index()
        df = self._with_population(yll_groupped)

        # aa_yll, aa_yll2 = age adjusted years of life lost
        df['aa_yll'] = ((df['yll'] * pow(10,5)) / df['pop']) * df['weight']
        # NOTE(review): unlike 'Deaths Std' this also divides by 10^5 - confirm the intended unit
        df['aa_yll2'] = (df['yll'] / (df['pop'] * pow(10,5))) * df['pop_us17']

        self._report(df, 'Standardized Years of Life Lost', 'standardized_yll', save_file)
        return df

    def prepare_gdp_data(self, save_file=None):
        """Prepare the GDP / population table used for the scatter plots.

        Reads 'P_Data_Extract_From_World_Development_Indicators.xlsx' from the
        working directory.

        Args:
            save_file: When truthy, write the result to
                ``self.save_file_to['gdp']``.

        Returns:
            DataFrame with one row per country and one column per
            (measure, year) pair for the years 1990 and 2017.
        """
        df_raw = pd.read_excel('P_Data_Extract_From_World_Development_Indicators.xlsx')

        # transform data from wide format to long: every column from the fifth
        # onwards is renamed to its first four characters
        year_names = {c: c[:4] for c in df_raw.columns[4:]}
        df_raw = df_raw.rename(columns=year_names).dropna(subset=['Series Code'])
        pivot_cols = list(year_names.values())

        # split gdp and population data into 2 different dataframes
        df_gdp = df_raw[df_raw['Series Name'] == 'GDP per capita, PPP (current international $)']
        df_pop = df_raw[df_raw['Series Name'] == 'Population, total']

        # unpivot population and gdp data
        id_cols = ['Country Name', 'Country Code']
        df_gdp = pd.melt(df_gdp, id_vars=id_cols, value_vars=pivot_cols, var_name='Year', value_name='GDP PPP USD')
        df_pop = pd.melt(df_pop, id_vars=id_cols, value_vars=pivot_cols, var_name='Year', value_name='Population')

        # join the unpivoted dataframes, replace ".." by NaN
        df_processed = df_gdp.merge(df_pop, on=['Country Name', 'Country Code', 'Year'])
        df_processed['GDP PPP USD'] = df_processed['GDP PPP USD'].replace('..', np.nan)
        df_processed['Population'] = df_processed['Population'].replace('..', np.nan)

        # only keep data for years 1990, 2017 and pivot data to get the required format
        df_sc = df_processed[df_processed['Year'].isin(['1990', '2017'])]
        df_sc = df_sc.pivot(index='Country Name', columns='Year', values=['GDP PPP USD', 'Population']).reset_index()
        df_sc.columns = ['_'.join(col) for col in df_sc.columns.values]
        df_sc = df_sc.rename(columns={'Country Name_': 'Country Name1'})

        self._report(df_sc, 'Prepared GDP-POP', 'gdp', save_file)
        return df_sc

    def create_data_extracts(self):
        """Run every preparation step in order and keep the results.

        Results are stored under the same attribute names that
        :meth:`load_data_extracts` uses. ``self.df`` is kept as an alias of
        ``self.yll_final`` because that is what it held after the last step
        in earlier versions.
        """
        steps = [
            ('Preparing GDP Data', 'gdp', self.prepare_gdp_data, {}),
            ('Preparing Mortality Data', 'deaths', self.prepare_mortality_data,
             {'raw_data_path': 'Raw Data/*/*.csv'}),
            ('Preparing Years of Life Lost Data', 'yll', self.prepare_yll_data,
             {'raw_data_path': 'Raw Data/*/*.csv'}),
            ('Preparing Population Data', 'df_pop', self.prepare_population_data,
             {'raw_data_path': 'Population/*/*.CSV'}),
            ('Preparing Standard Population Data', 'pop_us17', self.prepare_standard_population, {}),
            ('Standardizing Mortality Data', 'deaths_final', self.standardize_mortality_data, {}),
            ('Standardizing Years of Life Lost Data', 'yll_final', self.standardize_yll_data, {}),
        ]

        # the context manager closes the bar even when a step raises
        with tq(total=len(steps)) as pbar:
            for description, attribute, step, kwargs in steps:
                pbar.set_description_str(desc=description)
                setattr(self, attribute, step(**kwargs))
                pbar.update(1)

        self.df = self.yll_final

    def load_data_extracts(self):
        """Load every prepared extract from its saved CSV file."""
        self.deaths = pd.read_csv(self.save_file_to['mortality'])
        self.yll = pd.read_csv(self.save_file_to['yll'])
        self.df_pop = pd.read_csv(self.save_file_to['population'])
        self.pop_us17 = pd.read_csv(self.save_file_to['standard_population'])
        self.deaths_final = pd.read_csv(self.save_file_to['standardized_mortality'])
        self.yll_final = pd.read_csv(self.save_file_to['standardized_yll'])
        self.gdp = pd.read_csv(self.save_file_to['gdp'])

    def run(self):
        """Rebuild the extracts when ``update_local_files`` is set, else load them."""
        if self.update_local_files:
            self.create_data_extracts()
        else:
            self.load_data_extracts()

In [66]:
# update_local_files=True: run() rebuilds every extract and saves the CSV files.
x = DataPrep(base_folder='My Drive/mac_gdrive/IS', update_local_files=True)
x.run()

# update_local_files=False: run() only loads the CSV files saved above.
y = DataPrep(base_folder='My Drive/mac_gdrive/IS', update_local_files=False)
y.run()

  0%|          | 0/7 [00:00<?, ?it/s]


Prepared GDP-POP Data Shape: (217, 5)

        Country Name1 GDP PPP USD_1990 GDP PPP USD_2017 Population_1990  \
195             Tonga      2141.035912      6257.089734           98727   
145   North Macedonia      5398.604132     15706.539567         2044174   
69   French Polynesia              NaN              NaN          211089   
13       Bahamas, The     20525.453384     34067.846183          270679   
72            Georgia      5727.624172     13589.707392         4802000   

    Population_2017  
195          105415  
145         2074502  
69           295450  
13           399020  
72          3728004  

File Saved at: Processed Data/gdp-pop-pivot.csv
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


Merging Files:   0%|          | 0/35 [00:00<?, ?it/s]

Applying Filters:   0%|          | 0/3 [00:00<?, ?it/s]


Mortality Data Shape: (399840, 8)

         measure      country   sex    age  cause  year     deaths   CN
11480411  Deaths  New Zealand  Both  20-24  Other  2013   0.960340  ORC
6365363   Deaths  Netherlands  Both  35-39  Other  1996   0.019470  ORC
15412638  Deaths       Israel  Both  15-19  Other  1996   1.812775  ORC
2301529   Deaths        Japan  Both    5-9  Other  2007  17.188784  ORC
9004656   Deaths      Finland  Both  65-69  Other  2010  23.395243  ORC

File Saved at: Processed Data/mortality.csv
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


Merging Files:   0%|          | 0/35 [00:00<?, ?it/s]

Applying Filters:   0%|          | 0/3 [00:00<?, ?it/s]


Years of Life Lost Data Shape: (399840, 8)

                            measure    country   sex    age  \
14625459  YLLs (Years of Life Lost)      Japan  Both  90-94   
16744844  YLLs (Years of Life Lost)    Austria  Both  20-24   
4499256   YLLs (Years of Life Lost)      Spain  Both    5-9   
16873416  YLLs (Years of Life Lost)  Singapore  Both  65-69   
16692320  YLLs (Years of Life Lost)     Sweden  Both  35-39   

                                                 cause  year          yll   CN  
14625459                                         Other  2013  1471.206146  ORC  
16744844                                     Neoplasms  2007  1801.393117  ORC  
4499256   HIV/AIDS and sexually transmitted infections  2016   161.128825  ORC  
16873416                            Transport injuries  2013   269.249914  ORC  
16692320                                         Other  2007   483.714571  ORC  

File Saved at: Processed Data/years-of-life-lost.csv
- - - - - - - - - - - - - - - - - - 

Merging Files:   0%|          | 0/70 [00:00<?, ?it/s]

Applying Filters:   0%|          | 0/2 [00:00<?, ?it/s]


Population Data Shape: (41160, 6)

                country   sex    age  year           pop   CN
2579289           Italy  both     <1  2004  5.454181e+05  ORC
4862919         Germany  both     <1  1966  1.265479e+06  ORC
2103748     Netherlands  both  70-74  2000  5.502980e+05  ORC
5816248  United Kingdom  both  25-29  1999  4.205607e+06  ORC
5721083  United Kingdom  both  75-79  1991  1.872892e+06  ORC

File Saved at: Processed Data/population_1990-2019.csv
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Standard Population Data Shape: (21, 5)

      age  year   CN      pop_us17    weight
5   30-34  2017  USA  2.171028e+07  0.066893
0     1-4  2017  USA  1.551091e+07  0.047792
15  75-79  2017  USA  8.504681e+06  0.026204
8   45-49  2017  USA  2.110497e+07  0.065028
3   20-24  2017  USA  2.184669e+07  0.067313

File Saved at: Processed Data/population_usa_2017.csv
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Standa

# GBD

In [101]:
def prepare_mortality_data(raw_data_path, save_file_path=None):
    """Merge the raw files and prepare the mortality table.

    Keeps rows with measure 'Deaths', sex 'Both' and a location listed in
    'Processed Data/Rich Countries.csv', then renames ``location``/``val`` to
    ``country``/``deaths``, adds the CN (USA or ORC) column, shortens the age
    label to its first space-separated token and renames every cause outside
    ``causes_to_keep`` to 'Other'.

    Args:
        raw_data_path: Glob pattern of the raw CSV files.
        save_file_path: Optional CSV path the result is written to.

    Returns:
        The prepared DataFrame.
    """
    # read the country list once (it used to be read twice, with one copy unused)
    rich_countries = pd.read_csv('Processed Data/Rich Countries.csv')['Country Name'].to_list()

    filters = {'measure': ['Deaths'], 'sex': ['Both'], 'location': rich_countries}
    merged = merge_files(glob(raw_data_path), filters, ignore_cols=['metric', 'upper', 'lower'])

    causes_to_keep = ['HIV/AIDS and sexually transmitted infections',
        'Interpersonal violence', 'Neoplasms',
       'Self-harm', 'Chronic respiratory diseases',
       'Cirrhosis and other chronic liver diseases', 'Transport injuries',
       'Substance use disorders', 'Diabetes and kidney diseases',
       'Cardiovascular diseases']

    # rename() returns a new frame, so the merged frame itself is not mutated
    df_deaths = merged.rename(columns={'location': 'country', 'val': 'deaths'})
    df_deaths['CN'] = df_deaths['country'].apply(lambda x: 'USA' if x == 'United States of America' else 'ORC')
    df_deaths['age'] = df_deaths['age'].apply(lambda x: x.split(' ')[0])
    df_deaths['cause'] = df_deaths['cause'].where(df_deaths['cause'].isin(causes_to_keep), 'Other')

    print(f'Mortality Data Shape: {df_deaths.shape}')
    print(df_deaths.head())

    if save_file_path:
        df_deaths.to_csv(save_file_path, index=False)

    return df_deaths

In [102]:
# Build the mortality table from the raw files and write it to Processed Data/mortality.csv.
deaths = prepare_mortality_data('Raw Data/*/*.csv', save_file_path='Processed Data/mortality.csv')

Merging Files:   0%|          | 0/35 [00:00<?, ?it/s]

Applying Filters:   0%|          | 0/3 [00:00<?, ?it/s]

Mortality Data Shape: (399840, 8)
        measure      country   sex    age  cause  year     deaths   CN
1002510  Deaths  Netherlands  Both    1-4  Other  2014  15.512021  ORC
1002513  Deaths  Netherlands  Both    5-9  Other  2014   7.259305  ORC
1002516  Deaths  Netherlands  Both  10-14  Other  2014   5.849076  ORC
1002519  Deaths  Netherlands  Both  15-19  Other  2014   9.689863  ORC
1002522  Deaths  Netherlands  Both  20-24  Other  2014  13.516623  ORC


# YLL

In [103]:
def prepare_yll_data(raw_data_path, save_file_path=None):
    """Merge the raw files and prepare the Years of Life Lost table.

    Keeps rows with measure 'YLLs (Years of Life Lost)', sex 'Both' and a
    location listed in 'Processed Data/Rich Countries.csv', then renames
    ``location``/``val`` to ``country``/``yll``, adds the CN (USA or ORC)
    column, shortens the age label to its first space-separated token and
    renames every cause outside ``causes_to_keep`` to 'Other'.

    Args:
        raw_data_path: Glob pattern of the raw CSV files.
        save_file_path: Optional CSV path the result is written to.

    Returns:
        The prepared DataFrame.
    """
    # read the country list once (it used to be read twice, with one copy unused)
    rich_countries = pd.read_csv('Processed Data/Rich Countries.csv')['Country Name'].to_list()

    filters = {'measure': ['YLLs (Years of Life Lost)'], 'sex': ['Both'], 'location': rich_countries}
    merged = merge_files(glob(raw_data_path), filters, ignore_cols=['metric', 'upper', 'lower'])

    causes_to_keep = ['HIV/AIDS and sexually transmitted infections',
        'Interpersonal violence', 'Neoplasms',
       'Self-harm', 'Chronic respiratory diseases',
       'Cirrhosis and other chronic liver diseases', 'Transport injuries',
       'Substance use disorders', 'Diabetes and kidney diseases',
       'Cardiovascular diseases']

    # rename() returns a new frame, so the merged frame itself is not mutated
    df_yll = merged.rename(columns={'location': 'country', 'val': 'yll'})
    df_yll['CN'] = df_yll['country'].apply(lambda x: 'USA' if x == 'United States of America' else 'ORC')
    df_yll['age'] = df_yll['age'].apply(lambda x: x.split(' ')[0])
    df_yll['cause'] = df_yll['cause'].where(df_yll['cause'].isin(causes_to_keep), 'Other')

    print(f'Years of Life Lost Data Shape: {df_yll.shape}')
    print(df_yll.head())

    if save_file_path:
        df_yll.to_csv(save_file_path, index=False)

    return df_yll

In [104]:
# Build the Years of Life Lost table and write it to Processed Data/years-of-life-lost.csv.
yll = prepare_yll_data('Raw Data/*/*.csv', save_file_path='Processed Data/years-of-life-lost.csv')

Merging Files:   0%|          | 0/35 [00:00<?, ?it/s]

Applying Filters:   0%|          | 0/3 [00:00<?, ?it/s]

Years of Life Lost Data Shape: (399840, 8)
                        measure  country   sex    age  cause  year  \
1014  YLLs (Years of Life Lost)  Germany  Both    1-4  Other  1997   
1017  YLLs (Years of Life Lost)  Germany  Both    5-9  Other  1997   
1020  YLLs (Years of Life Lost)  Germany  Both  10-14  Other  1997   
1023  YLLs (Years of Life Lost)  Germany  Both  15-19  Other  1997   
1026  YLLs (Years of Life Lost)  Germany  Both  20-24  Other  1997   

              yll   CN  
1014  3932.699052  ORC  
1017  2310.638957  ORC  
1020  2445.463058  ORC  
1023  4377.290960  ORC  
1026  4170.870790  ORC  


# Population Data

In [33]:
def prepare_population_data(raw_data_path, save_file_path=None):
    """Merge the raw population files into one tidy table.

    Rows are restricted to the listed age groups, sex 'both' and the locations
    named in 'Processed Data/Rich Countries.csv'. Age labels are shortened
    ('1 to 4' => '1-4'), a CN (USA or ORC) column is added, and the columns
    are renamed to country, sex, age, pop and year.

    Args:
        raw_data_path: Glob pattern of the raw population CSV files.
        save_file_path: Optional CSV path the result is written to.

    Returns:
        The prepared DataFrame.
    """
    wanted_ages = ['10 to 14', '15 to 19', '20 to 24', '25 to 29', '30 to 34', '35 to 39', '40 to 44',
                   '45 to 49', '50 to 54', '55 to 59', '60 to 64', '65 to 69', '70 to 74', '75 to 79',
                   '80 to 84', '85 to 89', '90 to 94', '95 plus', '1 to 4', '5 to 9', '<1 year']
    wanted_locations = pd.read_csv('Processed Data/Rich Countries.csv')['Country Name'].to_list()
    filters = {'age_group_name': wanted_ages,
               'sex_name': ['both'],
               'location_name': wanted_locations}

    id_columns = ['location_id', 'sex_id', 'age_group_id', 'measure_id', 'metric_id']
    population = merge_files(glob(raw_data_path), filters, ignore_cols=id_columns)

    short_age = {'1 to 4': '1-4', '5 to 9': '5-9', '10 to 14': '10-14', '15 to 19': '15-19', '20 to 24': '20-24', '25 to 29': '25-29',
                 '30 to 34': '30-34', '35 to 39': '35-39', '40 to 44': '40-44', '45 to 49': '45-49', '50 to 54': '50-54',
                 '55 to 59': '55-59', '60 to 64': '60-64', '65 to 69': '65-69', '70 to 74': '70-74', '75 to 79': '75-79',
                 '<1 year': '<1', '80 to 84': '80-84', '85 to 89': '85-89', '90 to 94': '90-94', '95 plus': '95+'}

    # existing columns keep their position; CN is appended as the last column
    population = population.assign(
        age_group_name=population['age_group_name'].replace(short_age),
        sex_name=population['sex_name'].replace({'male': 'Male', 'female': 'Female'}),
        CN=population['location_name'].map(
            lambda name: 'USA' if name == 'United States of America' else 'ORC'),
    )

    tidy = population.drop(columns=['measure_name', 'metric_name', 'upper', 'lower'])
    tidy = tidy.rename(columns={'location_name': 'country', 'sex_name': 'sex',
                                'age_group_name': 'age', 'val': 'pop', 'year_id': 'year'})

    print(f'Population Data Shape: {tidy.shape}')
    print(tidy.head())

    if save_file_path:
        tidy.to_csv(save_file_path, index=False)

    return tidy

In [34]:
# Build the population table and write it to Processed Data/Population_1990-2019.csv.
# NOTE(review): the output saved with this cell is a FileNotFoundError for 'age_group_name'
# (a key of the filters dict) - presumably merge_files had a different signature in the
# kernel at that time; confirm by re-running on a fresh kernel.
df_pop = prepare_population_data(raw_data_path='Population/*/*.CSV', save_file_path='Processed Data/Population_1990-2019.csv')

Merging Files:   0%|          | 0/3 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'age_group_name'

In [31]:
def prepare_standard_population(country='USA', year=2017, save_file_path=None):
    """Build the standard population table and its age weights.

    Reads 'Processed Data/Population_1990-2019.csv', keeps the rows of the
    requested CN group and year, and sums the population per age group.

    Args:
        country: Value of the CN column to keep.
        year: Year to keep.
        save_file_path: Optional CSV path the result is written to.

    Returns:
        DataFrame with columns age, year, CN, pop_us17 (summed population)
        and weight (share of the total population).
    """
    population = pd.read_csv('Processed Data/Population_1990-2019.csv')

    is_country = population['CN'] == country
    is_year = population['year'] == year
    selected = population[is_country & is_year]

    standard = (selected
                .groupby(['age', 'year', 'CN'])
                .agg(pop_us17=('pop', 'sum'))
                .reset_index())
    total_population = standard['pop_us17'].sum()
    standard['weight'] = standard['pop_us17'] / total_population

    print(f'Population Data Shape: {standard.shape}')
    print(standard.head())

    if save_file_path:
        standard.to_csv(save_file_path, index=False)

    return standard

In [108]:
# Standard population with the defaults (country='USA', year=2017), saved as CSV.
pop_us17 = prepare_standard_population(save_file_path='Processed Data/Population_USA_2017.csv')

Population Data Shape: (21, 5)
     age  year   CN      pop_us17    weight
0    1-4  2017  USA  1.551091e+07  0.047792
1  10-14  2017  USA  2.134715e+07  0.065774
2  15-19  2017  USA  2.158521e+07  0.066508
3  20-24  2017  USA  2.184669e+07  0.067313
4  25-29  2017  USA  2.280305e+07  0.070260


# Standardization

In [117]:
# Display the prepared mortality table.
deaths

Unnamed: 0,measure,country,sex,age,cause,year,deaths,CN
1002510,Deaths,Netherlands,Both,1-4,Other,2014,15.512021,ORC
1002513,Deaths,Netherlands,Both,5-9,Other,2014,7.259305,ORC
1002516,Deaths,Netherlands,Both,10-14,Other,2014,5.849076,ORC
1002519,Deaths,Netherlands,Both,15-19,Other,2014,9.689863,ORC
1002522,Deaths,Netherlands,Both,20-24,Other,2014,13.516623,ORC
...,...,...,...,...,...,...,...,...
16499182,Deaths,United Kingdom,Both,80-84,Neoplasms,2002,25329.410294,ORC
16499185,Deaths,United Kingdom,Both,85-89,Neoplasms,2002,17499.620020,ORC
16499188,Deaths,United Kingdom,Both,90-94,Neoplasms,2002,8538.434125,ORC
16499191,Deaths,United Kingdom,Both,95+,Neoplasms,2002,2808.581999,ORC


In [123]:
# Display the deaths summed per age / year / cause / CN.
# NOTE(review): no visible cell assigns deaths_groupped at module level (it is a local of
# standardize_mortality_data) - confirm which cell defines it before a fresh run.
deaths_groupped

Unnamed: 0,age,year,cause,CN,deaths
0,1-4,1990,Cardiovascular diseases,ORC,359.971925
1,1-4,1990,Cardiovascular diseases,USA,247.321974
2,1-4,1990,Chronic respiratory diseases,ORC,181.964379
3,1-4,1990,Chronic respiratory diseases,USA,111.185008
4,1-4,1990,Cirrhosis and other chronic liver diseases,ORC,71.666268
...,...,...,...,...,...
13615,<1,2019,Other,USA,20437.990990
13616,<1,2019,Substance use disorders,ORC,0.000000
13617,<1,2019,Substance use disorders,USA,0.000000
13618,<1,2019,Transport injuries,ORC,49.928416


In [130]:
# Display the prepared population table.
df_pop

Unnamed: 0,country,sex,age,year,pop,CN
5096,Czechia,both,1-4,1954,692251.524051,ORC
5097,Czechia,both,5-9,1954,858392.203682,ORC
5098,Czechia,both,10-14,1954,693383.250461,ORC
5099,Czechia,both,15-19,1954,560568.597839,ORC
5100,Czechia,both,20-24,1954,645732.189148,ORC
...,...,...,...,...,...,...
6592467,Bermuda,both,<1,1994,843.659704,ORC
6592468,Bermuda,both,80-84,1994,678.232727,ORC
6592469,Bermuda,both,85-89,1994,350.199657,ORC
6592470,Bermuda,both,90-94,1994,113.705951,ORC


In [138]:
def standardize_mortality_data(save_file_path=None):
    """Standardize the mortality data with the standard population.

    Reads the notebook globals ``deaths``, ``df_pop`` and ``pop_us17``.
    Deaths are summed per age/year/cause/CN, left-joined with the population
    summed per age/year/CN, and joined with the standard population.

    Args:
        save_file_path: Optional CSV path the result is written to.

    Returns:
        DataFrame with the added columns 'Deaths Std' (standardized deaths),
        'Death Rate' (per 10^5) and 'aa_dr' (age adjusted death rate).
    """
    population_keys = ['age', 'year', 'CN']
    per_100k = 10 ** 5

    deaths_by_group = (deaths
                       .groupby(['age', 'year', 'cause', 'CN'])
                       .agg({'deaths': 'sum'})
                       .reset_index())
    population_by_group = (df_pop
                           .groupby(population_keys)
                           .agg({'pop': 'sum'})
                           .reset_index())

    with_population = deaths_by_group.merge(population_by_group, on=population_keys, how='left')
    # no explicit key: pandas joins on the columns both frames share
    result = with_population.merge(pop_us17[['age', 'pop_us17', 'weight']])

    result['Deaths Std'] = (result['deaths'] * result['pop_us17']) / result['pop']
    result['Death Rate'] = (result['deaths'] / result['pop']) * per_100k
    result['aa_dr'] = ((result['deaths'] * per_100k) / result['pop']) * result['weight']

    print(f'Standardized Mortality Data Shape: {result.shape}')
    print(result.head())

    if save_file_path:
        result.to_csv(save_file_path, index=False)

    return result

In [140]:
# No save_file_path is given, so nothing is written; the returned frame is only displayed.
standardize_mortality_data()

Standardized Mortality Data Shape: (13620, 11)
   age  year                                       cause   CN      deaths  \
0  1-4  1990                     Cardiovascular diseases  ORC  359.971925   
1  1-4  1990                     Cardiovascular diseases  USA  247.321974   
2  1-4  1990                Chronic respiratory diseases  ORC  181.964379   
3  1-4  1990                Chronic respiratory diseases  USA  111.185008   
4  1-4  1990  Cirrhosis and other chronic liver diseases  ORC   71.666268   

            pop      pop_us17    weight  Deaths Std  Death Rate     aa_dr  
0  2.671558e+07  1.551091e+07  0.047792  208.997636    1.347423  0.064396  
1  1.545399e+07  1.551091e+07  0.047792  248.232952    1.600376  0.076485  
2  2.671558e+07  1.551091e+07  0.047792  105.647475    0.681117  0.032552  
3  1.545399e+07  1.551091e+07  0.047792  111.594543    0.719458  0.034384  
4  2.671558e+07  1.551091e+07  0.047792   41.609024    0.268256  0.012820  


Unnamed: 0,age,year,cause,CN,deaths,pop,pop_us17,weight,Deaths Std,Death Rate,aa_dr
0,1-4,1990,Cardiovascular diseases,ORC,359.971925,2.671558e+07,1.551091e+07,0.047792,208.997636,1.347423,0.064396
1,1-4,1990,Cardiovascular diseases,USA,247.321974,1.545399e+07,1.551091e+07,0.047792,248.232952,1.600376,0.076485
2,1-4,1990,Chronic respiratory diseases,ORC,181.964379,2.671558e+07,1.551091e+07,0.047792,105.647475,0.681117,0.032552
3,1-4,1990,Chronic respiratory diseases,USA,111.185008,1.545399e+07,1.551091e+07,0.047792,111.594543,0.719458,0.034384
4,1-4,1990,Cirrhosis and other chronic liver diseases,ORC,71.666268,2.671558e+07,1.551091e+07,0.047792,41.609024,0.268256,0.012820
...,...,...,...,...,...,...,...,...,...,...,...
13615,<1,2019,Other,USA,20437.990990,3.778658e+06,3.801319e+06,0.011713,20560.561793,540.879667,6.335055
13616,<1,2019,Substance use disorders,ORC,0.000000,6.009077e+06,3.801319e+06,0.011713,0.000000,0.000000,0.000000
13617,<1,2019,Substance use disorders,USA,0.000000,3.778658e+06,3.801319e+06,0.011713,0.000000,0.000000,0.000000
13618,<1,2019,Transport injuries,ORC,49.928416,6.009077e+06,3.801319e+06,0.011713,31.584525,0.830883,0.009732


In [141]:
def standardize_yll_data(save_file_path=None, yll_data=None, population=None, standard_population=None):
    """Aggregate years of life lost (YLL) and compute age-adjusted YLL columns.

    YLL is summed per (age, year, cause, CN), joined to the population of the
    same (age, year, CN), and then joined to the standard population by age.

    Parameters
    ----------
    save_file_path : str or None
        When truthy, the resulting frame is written to this path as CSV
        (without the index). Nothing is written otherwise.
    yll_data : pandas.DataFrame or None
        Frame with columns 'age', 'year', 'cause', 'CN', 'yll'.
        Defaults to the notebook-level ``yll`` frame.
    population : pandas.DataFrame or None
        Frame with columns 'age', 'year', 'CN', 'pop'.
        Defaults to the notebook-level ``df_pop_groupped`` frame.
    standard_population : pandas.DataFrame or None
        Frame with columns 'age', 'pop_us17', 'weight'.
        Defaults to the notebook-level ``pop_us17`` frame.

    Returns
    -------
    pandas.DataFrame
        The aggregated rows with 'pop', 'pop_us17', 'weight', 'aa_yll' and
        'aa_yll2' added. Rows whose age has no entry in the standard
        population are dropped (inner join); rows without a matching
        population keep NaN in 'pop'.
    """
    # Fall back to the notebook-level frames so existing no-argument calls keep working.
    yll_data = yll if yll_data is None else yll_data
    population = df_pop_groupped if population is None else population
    standard_population = pop_us17 if standard_population is None else standard_population

    yll_groupped = yll_data.groupby(['age', 'year', 'cause', 'CN']).agg({'yll': 'sum'}).reset_index()

    # Left join: keep every YLL row even when no population figure is available.
    df_mid = yll_groupped.merge(population, on=['age', 'year', 'CN'], how='left')

    # Join key spelled out; 'age' is the only column the two frames are expected to share.
    df = df_mid.merge(standard_population[['age', 'pop_us17', 'weight']], on='age')

    # YLL per 100,000 population, scaled by the standard-population weight of the age group.
    df['aa_yll'] = ((df['yll'] * pow(10, 5)) / df['pop']) * df['weight']
    # Alternative scaling by the standard-population count instead of the weight.
    # NOTE(review): the 10^5 factor sits in the denominator here, unlike 'aa_yll' - confirm this is intended.
    df['aa_yll2'] = (df['yll'] / (df['pop'] * pow(10, 5))) * df['pop_us17']

    print(f'Standardized Years of Life Lost Data Shape: {df.shape}')
    print(df.head())

    if save_file_path:
        df.to_csv(save_file_path, index=False)

    return df

In [142]:
# Run the standardization without a save path, so no CSV is written; the returned
# frame is shown as the cell output.
# NOTE(review): the result is not bound to a name here - confirm where `dff`, read by
# the USA-2017 summary cell, is assigned.
standardize_yll_data()

Standardized Years of Life Lost Data Shape: (13620, 10)
   age  year                                       cause   CN           yll  \
0  1-4  1990                     Cardiovascular diseases  ORC  30963.091500   
1  1-4  1990                     Cardiovascular diseases  USA  21273.472710   
2  1-4  1990                Chronic respiratory diseases  ORC  15651.719802   
3  1-4  1990                Chronic respiratory diseases  USA   9563.611309   
4  1-4  1990  Cirrhosis and other chronic liver diseases  ORC   6164.395182   

            pop      pop_us17    weight    aa_yll   aa_yll2  
0  2.671558e+07  1.551091e+07  0.047792  5.539014  0.179770  
1  1.545399e+07  1.551091e+07  0.047792  6.578859  0.213518  
2  2.671558e+07  1.551091e+07  0.047792  2.799950  0.090873  
3  1.545399e+07  1.551091e+07  0.047792  2.957564  0.095988  
4  2.671558e+07  1.551091e+07  0.047792  1.102754  0.035790  


Unnamed: 0,age,year,cause,CN,yll,pop,pop_us17,weight,aa_yll,aa_yll2
0,1-4,1990,Cardiovascular diseases,ORC,3.096309e+04,2.671558e+07,1.551091e+07,0.047792,5.539014,0.179770
1,1-4,1990,Cardiovascular diseases,USA,2.127347e+04,1.545399e+07,1.551091e+07,0.047792,6.578859,0.213518
2,1-4,1990,Chronic respiratory diseases,ORC,1.565172e+04,2.671558e+07,1.551091e+07,0.047792,2.799950,0.090873
3,1-4,1990,Chronic respiratory diseases,USA,9.563611e+03,1.545399e+07,1.551091e+07,0.047792,2.957564,0.095988
4,1-4,1990,Cirrhosis and other chronic liver diseases,ORC,6.164395e+03,2.671558e+07,1.551091e+07,0.047792,1.102754,0.035790
...,...,...,...,...,...,...,...,...,...,...
13615,<1,2019,Other,USA,1.813077e+06,3.778658e+06,3.801319e+06,0.011713,561.989877,18.239505
13616,<1,2019,Substance use disorders,ORC,0.000000e+00,6.009077e+06,3.801319e+06,0.011713,0.000000,0.000000
13617,<1,2019,Substance use disorders,USA,0.000000e+00,3.778658e+06,3.801319e+06,0.011713,0.000000,0.000000
13618,<1,2019,Transport injuries,ORC,4.416860e+03,6.009077e+06,3.801319e+06,0.011713,0.860906,0.027941


In [137]:
# Per-cause totals of both age-adjusted YLL columns for the USA in 2017,
# plus each cause's share of the respective column total (percent, 2 decimals).
df = (
    dff.loc[(dff['CN'] == 'USA') & (dff['year'] == 2017)]
    .groupby(['cause', 'year', 'CN'])
    .agg({'aa_yll': 'sum', 'aa_yll2': 'sum'})
)
df['percent'] = (df['aa_yll'] / df['aa_yll'].sum() * 100).round(2)
df['percent2'] = (df['aa_yll2'] / df['aa_yll2'].sum() * 100).round(2)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,aa_yll,aa_yll2,percent,percent2
cause,year,CN,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cardiovascular diseases,2017,USA,4356.403445,141.388033,23.27,23.27
Chronic respiratory diseases,2017,USA,1066.771965,34.622319,5.7,5.7
Cirrhosis and other chronic liver diseases,2017,USA,558.393367,18.12278,2.98,2.98
Diabetes and kidney diseases,2017,USA,952.234542,30.904982,5.09,5.09
HIV/AIDS and sexually transmitted infections,2017,USA,91.255054,2.961703,0.49,0.49
Interpersonal violence,2017,USA,326.061226,10.582389,1.74,1.74
Neoplasms,2017,USA,4571.444663,148.367243,24.42,24.42
Other,2017,USA,4531.913755,147.084258,24.2,24.2
Self-harm,2017,USA,604.513498,19.619619,3.23,3.23
Substance use disorders,2017,USA,1049.952599,34.076443,5.61,5.61
