### Import libraries and tools

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# Data Cleaning

### Load Raw Data

In [2]:
# Read the raw life-expectancy table that every later cell transforms
df = pd.read_csv(filepath_or_buffer='LifeExpectancyRaw.csv')

### Drop Rows

In [3]:
# Keep a row only when both conditions hold:
#   * life expectancy is present (10 countries where the country only has one year of data lack it)
#   * the country is not South Sudan, which is dropped due to its large portion of missing data
df = df[df['Life expectancy '].notna() & df['Country'].ne('South Sudan')]

### Convert Columns to Binary

In [4]:
# Convert status column to binary (Developed -> 1, Developing -> 0).
# The result is assigned back to df instead of calling replace(inplace=True) on
# df['Status']: an in-place call on a column selection operates on a temporary
# object and does not update df when pandas Copy-on-Write is active.
df = df.assign(Status=df['Status'].replace({'Developed': 1, 'Developing': 0}))

### Manually Insert Data

In [5]:
# Manually insert population from the external population table
pop_df = pd.read_csv("population-and-demography.csv")

# Align country spellings with those used in df, then rename the columns
# so the table can be merged on Country and Year
pop_df = pop_df.assign(
    Entity=pop_df['Entity'].replace({
        'Bolivia': 'Bolivia (Plurinational State of)',
        'Brunei': 'Brunei Darussalam',
        "Cote d'Ivoire": "Côte d'Ivoire",
        'Cape Verde': 'Cabo Verde',
        'Democratic Republic of Congo': 'Democratic Republic of the Congo',
        'North Korea': "Democratic People's Republic of Korea",
        'Iran': 'Iran (Islamic Republic of)',
        'South Korea': 'Republic of Korea',
        'Laos': "Lao People's Democratic Republic",
        'Micronesia (country)': 'Micronesia (Federated States of)',
        'Moldova': 'Republic of Moldova',
        'North Macedonia': 'The former Yugoslav republic of Macedonia',
        'Tanzania': 'United Republic of Tanzania',
        'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
        'United States': 'United States of America',
        'Venezuela': 'Venezuela (Bolivarian Republic of)',
        'Vietnam': 'Viet Nam',
        'Russia': 'Russian Federation',
        'Eswatini': 'Swaziland',
        'Syria': 'Syrian Arab Republic',
        'East Timor': 'Timor-Leste',
    })
).rename(columns={
    'Entity': 'Country',
    "Population - Sex: all - Age: all - Variant: estimates": "Population",
})

# Left-merge on Country and Year; the incoming column arrives as 'Population_pop_df'
df = df.merge(
    pop_df[['Country', 'Year', 'Population']],
    how='left',
    on=['Country', 'Year'],
    suffixes=('', '_pop_df'),
)

# Overwrite df's Population with the merged values (rows without a match become NaN)
# and remove the helper column in the same step
df['Population'] = df.pop('Population_pop_df')

In [6]:
# Load measles coverage data and name its key columns like df's
measles_df = pd.read_csv('MeaslesCoverage.csv', encoding='latin1')
measles_df = measles_df.rename(columns={'NAME': 'Country', 'YEAR': 'Year'})

# Reduce the coverage table to one row per (Country, Year), keeping the first
# non-null COVERAGE. Without this, a duplicated key would make the left merge
# return more rows than df, and the values would land on the wrong rows.
# NOTE(review): if the file really holds several rows per country-year, confirm
# that "first non-null" is the row that should be used.
coverage_df = measles_df.groupby(['Country', 'Year'], as_index=False)['COVERAGE'].first()

# Merge on the key columns only; with unique keys on the right the result has
# exactly one row per df row, in df's order
merged_df = df[['Country', 'Year']].merge(coverage_df, on=['Country', 'Year'], how='left')

# Replace the original measles values with the coverage figures (NaN where no
# match exists). Assigning by position keeps the column where it is in df and
# does not depend on df's index labels.
df['Measles '] = merged_df['COVERAGE'].to_numpy()

# Replace values in 'Measles' column that fall outside the expected percentage range with NaN
df.loc[df['Measles '] > 100, 'Measles '] = np.nan

In [7]:
# Manually insert under-five deaths from the external child-mortality table
mortality_df = pd.read_csv("child-mortality.csv")

# Rename country values so they match the spelling used in df
mortality_df['Entity'] = mortality_df['Entity'].replace({
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Brunei': 'Brunei Darussalam',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Cape Verde': 'Cabo Verde',
    'Democratic Republic of Congo': 'Democratic Republic of the Congo',
    'North Korea': "Democratic People's Republic of Korea",
    'Iran': 'Iran (Islamic Republic of)',
    'South Korea': 'Republic of Korea',
    'Laos': "Lao People's Democratic Republic",
    'Micronesia (country)': 'Micronesia (Federated States of)',
    'Moldova': 'Republic of Moldova',
    'North Macedonia': 'The former Yugoslav republic of Macedonia',
    'Tanzania': 'United Republic of Tanzania',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Venezuela': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
    'Russia': 'Russian Federation',
    'Eswatini': 'Swaziland',
    'Syria': 'Syrian Arab Republic',
    'East Timor': 'Timor-Leste'
})

# Rename columns so the table can be merged on Country and Year
mortality_df = mortality_df.rename(columns={
    'Entity': 'Country',
    'Under-five mortality rate': 'under-five deaths '})

# Merge df with mortality_df on Country and Year; the incoming column gets the
# '_mortality_df' suffix because df already has 'under-five deaths '
df = df.merge(
    mortality_df[['Country', 'Year', 'under-five deaths ']],
    on=['Country', 'Year'],
    how='left',
    suffixes=('', '_mortality_df'),
)

# Replace under-five deaths with values from mortality_df.
# The whole column is overwritten, so no dtype cast of the old values is needed.
df['under-five deaths '] = df['under-five deaths _mortality_df'] * 10  # Adjust from percentage to rate per 1000

# Drop the additional column from mortality_df after filling in values
df = df.drop(columns=['under-five deaths _mortality_df'])

In [8]:
# Manually insert GDP data: read the wide GDP table, skipping the first four lines of the file
gdp_df = pd.read_csv("GDP.csv", skiprows=4)

# Discard the code columns, the trailing unnamed column and every year column
# outside 2000-2015 (i.e. 1960-1999 and 2016-2023)
gdp_df = gdp_df.drop(
    columns=['Country Code', 'Indicator Code', 'Unnamed: 68']
    + [str(year) for year in range(1960, 2000)]
    + [str(year) for year in range(2016, 2024)]
)

# Rename country values to match the spelling used in df
gdp_df['Country Name'] = gdp_df['Country Name'].replace({
    'Bahamas, The': 'Bahamas',
    'Bolivia': 'Bolivia (Plurinational State of)',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Congo, Rep.': 'Congo',
    'Egypt, Arab Rep.': 'Egypt',
    'Gambia, The': "Gambia",
    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
    "Korea, Dem. People's Rep.": "Democratic People's Republic of Korea",
    'Korea, Rep.': 'Republic of Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    "Lao PDR": "Lao People's Democratic Republic",
    'Micronesia, Fed. Sts.': 'Micronesia (Federated States of)',
    'Moldova': 'Republic of Moldova',
    'North Macedonia': 'The former Yugoslav republic of Macedonia',
    'Slovak Republic': 'Slovakia',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Tanzania': 'United Republic of Tanzania',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Venezuela, RB': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
    'Yemen, Rep.': 'Yemen',
})

# Reshape from one column per year to one row per (country, year)
gdp_df = gdp_df.melt(
    id_vars=['Country Name', 'Indicator Name'],
    var_name='Year',
    value_name='GDP',
)

# The year labels were column headers (strings); cast to int so they can be merged with df['Year']
gdp_df = gdp_df.astype({'Year': int})

# Left-merge the external GDP onto df; because df already has 'GDP',
# the incoming column arrives as 'GDP_new'
gdp_df_merged = df.merge(
    gdp_df[['Country Name', 'Year', 'GDP']],
    how='left',
    left_on=['Country', 'Year'],
    right_on=['Country Name', 'Year'],
    suffixes=('', '_new'),
)

# Hand-entered Venezuela values for 2000-2015
years = list(range(2000, 2016))
gdp_ven = [
    11.9e3, 12.1e3, 10.8e3, 9.83e3, 11.4e3, 12.4e3, 13.4e3, 14.4e3,
    14.9e3, 14.2e3, 13.8e3, 14.2e3, 14.7e3, 14.7e3, 1.4e4, 14.1e3,
]
venezuela_gdp = pd.DataFrame({
    'Country': 'Venezuela (Bolivarian Republic of)',
    'Year': years,
    'GDP_v': gdp_ven,
})

# Hand-entered North Korea values for 2000-2015; these are divided by Population below
# NOTE(review): presumably total GDP converted to a per-capita figure — confirm the units
gdp_nk = [
    10.61e9, 11.02e9, 10.91e9, 11.05e9, 11.17e9, 13.03e9, 13.76e9, 14.37e9,
    13.34e9, 12.04e9, 13.95e9, 15.69e9, 15.91e9, 16.57e9, 17.4e9, 16.28e9,
]
nk_gdp = pd.DataFrame({
    'Country': "Democratic People's Republic of Korea",
    'Year': years,
    'GDP_nk': gdp_nk,
})

# Attach both hand-entered series to the merged frame on Country and Year
for manual_gdp in (venezuela_gdp, nk_gdp):
    gdp_df_merged = gdp_df_merged.merge(manual_gdp, on=['Country', 'Year'], how='outer')

# Fill missing GDP values, in priority order: existing value, external table,
# Venezuela series, North Korea series divided by population
gdp_df_merged['GDP'] = (
    gdp_df_merged['GDP']
    .fillna(gdp_df_merged['GDP_new'])
    .fillna(gdp_df_merged['GDP_v'])
    .fillna(gdp_df_merged['GDP_nk'] / gdp_df_merged['Population'])
)

# Drop the helper columns now that GDP has been filled
df = gdp_df_merged.drop(columns=['Country Name', 'GDP_v', 'GDP_new', 'GDP_nk'])

In [9]:
# Manually insert infant mortality from the external infant-mortality table
infant_df = pd.read_csv("infant-mortality.csv")

# Align country spellings with those used in df and expose the key as 'Country'
infant_df = infant_df.assign(
    Entity=infant_df['Entity'].replace({
        'Bolivia': 'Bolivia (Plurinational State of)',
        'Brunei': 'Brunei Darussalam',
        "Cote d'Ivoire": "Côte d'Ivoire",
        'Cape Verde': 'Cabo Verde',
        'Democratic Republic of Congo': 'Democratic Republic of the Congo',
        'North Korea': "Democratic People's Republic of Korea",
        'Iran': 'Iran (Islamic Republic of)',
        'South Korea': 'Republic of Korea',
        'Laos': "Lao People's Democratic Republic",
        'Micronesia (country)': 'Micronesia (Federated States of)',
        'Moldova': 'Republic of Moldova',
        'North Macedonia': 'The former Yugoslav republic of Macedonia',
        'Tanzania': 'United Republic of Tanzania',
        'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
        'United States': 'United States of America',
        'Venezuela': 'Venezuela (Bolivarian Republic of)',
        'Vietnam': 'Viet Nam',
        'Russia': 'Russian Federation',
        'Eswatini': 'Swaziland',
        'Syria': 'Syrian Arab Republic',
        'East Timor': 'Timor-Leste',
    })
).rename(columns={'Entity': 'Country'})

# Merge df with infant_df on Country and Year; the incoming column arrives as 'infant deaths_infant_df'
df = df.merge(
    infant_df[['Country', 'Year', 'infant deaths']],
    how='left',
    on=['Country', 'Year'],
    suffixes=('', '_infant_df'),
)

# Overwrite infant deaths with the merged values, adjusted from percentage to rate per 1000,
# removing the helper column in the same step
df['infant deaths'] = df.pop('infant deaths_infant_df') * 10

### Forward/Backward Fill and Linear Interpolation by Country

In [10]:
def fill_missing_values(group, col):
    """Fill gaps in one column of a single country's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one country; must contain a 'Year' column and ``col``.
    col : str
        Name of the column to fill.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` in which
        * rows for 2001-2014 take the linearly interpolated value of ``col``
          (interpolation runs over the rows in their current order),
        * a missing 2000 value is filled with the 2001 value, if available,
        * a missing 2015 value is filled with the 2014 value, if available.
    """
    # Work on a copy: functions passed to groupby.apply must not mutate the group they receive
    group = group.copy()
    years = group['Year']

    # Interpolate values for years 2001-2014
    interior = (years > 2000) & (years < 2015)
    group.loc[interior, col] = group[col].interpolate(method='linear')

    # Edge years borrow the adjacent year's value. The neighbour is read as a
    # scalar: passing a Series to fillna() aligns on index labels, and the
    # neighbouring year lives on a different row, so nothing would be filled.
    for edge_year, neighbour_year in ((2000, 2001), (2015, 2014)):
        neighbour = group.loc[years == neighbour_year, col].dropna()
        if neighbour.empty:
            continue  # no usable neighbour: leave the edge year as it is
        missing_edge = (years == edge_year) & group[col].isna()
        group.loc[missing_edge, col] = neighbour.iloc[0]

    return group

# Run the per-country fill on every column from 'Status' through 'Schooling' (inclusive).
# Each pass regroups df by Country, applies fill_missing_values to that column,
# and renumbers the rows.
for col in df.loc[:, 'Status':'Schooling'].columns:
    df = (
        df.groupby('Country', group_keys=False)
        .apply(fill_missing_values, col=col)
        .reset_index(drop=True)
    )

### Fix Values that Fall Outside the Reasonable Range

In [11]:
# Keep adult mortality only where it lies in the reasonable range [1, 500]; everything else becomes NaN
df['Adult Mortality'] = df['Adult Mortality'].where(df['Adult Mortality'].between(1, 500))

# Load the sample data that supplies replacement BMI values
fixed_df = pd.read_csv("LifeExpectancyCleanSample.csv")

# Rename country values in fixed_df to match the spelling used in df
fixed_df['Country'] = fixed_df['Country'].replace({
    'Bahamas, The': 'Bahamas',
    'Bolivia': 'Bolivia (Plurinational State of)',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Congo, Rep.': 'Congo',
    "Egypt, Arab Rep.": "Egypt",
    'Gambia, The': 'Gambia',
    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
    'Kyrgyz Republic': 'Kyrgyzstan',
    "Lao PDR": "Lao People's Democratic Republic",
    'Micronesia, Fed. Sts.': 'Micronesia (Federated States of)',
    'Moldova': 'Republic of Moldova',
    'St. Lucia': 'Saint Lucia',
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Slovak Republic': 'Slovakia',
    'Eswatini': 'Swaziland',
    'North Macedonia': 'The former Yugoslav republic of Macedonia',
    'Turkiye': 'Turkey',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'Tanzania': 'United Republic of Tanzania',
    'United States': 'United States of America',
    'Venezuela, RB': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
    'Yemen, Rep.': 'Yemen',
})

# Replace BMI values with the BMI values from fixed_df where a match exists;
# rows without a match keep their current BMI
df = df.rename(columns={' BMI ': 'BMI'})
df = df.merge(
    fixed_df[['Country', 'Year', 'BMI']],
    how='left',
    on=['Country', 'Year'],
    suffixes=('', '_new'),
)
df['BMI'] = df.pop('BMI_new').combine_first(df['BMI'])

# For countries not found in fixed_df, we drop the BMI values in our df to impute by knn
df['BMI'] = df['BMI'].mask(
    df['Country'].isin(["Democratic People's Republic of Korea", 'Republic of Korea', 'Sudan'])
)

# Replace 0 values in 'percentage expenditure' with NaN
df['percentage expenditure'] = df['percentage expenditure'].mask(df['percentage expenditure'] == 0)

### KNN

In [12]:
# Take 'Country' out of df so that only numeric columns are handed to the imputer
country_col = df.pop('Country')

# Impute every remaining NaN with a 2-neighbour KNN imputer
knn_imputer = KNNImputer(n_neighbors=2)
df = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

# Put 'Country' back as the last column; values are attached by position
df['Country'] = country_col.to_numpy()

### Formatting

In [13]:
# Strip leading and trailing whitespace from every column name
df = df.rename(columns=str.strip)

# Rename columns to create consistent capitalization
df = df.rename(columns={
    'Life expectancy': 'Life Expectancy',
    'infant deaths': 'Infant Deaths',
    'percentage expenditure': 'Percentage Expenditure',
    'under-five deaths': 'Under-five Deaths',
    'Total expenditure': 'Total Expenditure',
    'thinness  1-19 years': 'Thinness 1-19 Years',
    'thinness 5-9 years': 'Thinness 5-9 Years',
    'Income composition of resources': 'Income Composition of Resources',
})

# Move 'Country' to the front, then sort rows by country and year
cols = ['Country', *df.columns.drop('Country')]
df = df[cols].sort_values(by=['Country', 'Year'])

# Rename lengthy country names
df['Country'] = df['Country'].replace({
    'Bolivia (Plurinational State of)': 'Bolivia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Democratic Republic of the Congo': 'DR of the Congo',
    'Iran (Islamic Republic of)': 'Iran',
    "Lao People's Democratic Republic": "Lao People's DR",
    'Micronesia (Federated States of)': 'Micronesia',
    'Saint Vincent and the Grenadines': 'St Vincent and the Grenadines',
    'Russian Federation': 'Russia',
    'The former Yugoslav republic of Macedonia': 'North Macedonia',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
})

### Assertions

In [14]:
# Structural checks: no duplicate rows, no nulls, and a full 2000-2015 run of years per country
assert not df.duplicated().any(), "The DataFrame contains duplicate rows."
assert not df.isnull().any().any(), "There are null values in the DataFrame."
assert df.groupby('Country')['Year'].apply(
    lambda years: set(range(2000, 2016)) <= set(years)
).all(), "Not all countries have entries for each year from 2000 to 2015."

# Range checks: (column, lowest allowed, highest allowed, failure message).
# Bounds are inclusive and the checks run in the listed order.
for column, lower, upper, message in [
    ('Life Expectancy', 35, 100, "Life expectancy values are out of the specified range"),
    ('Adult Mortality', 1, 500, "Adult mortality values are out of the specified range"),
    ('Infant Deaths', 1, 500, "Infant mortality values are out of the specified range"),
    ('Alcohol', 0, 18, "Alcohol consumption values are out of the specified range"),
    ('Percentage Expenditure', 0, 30000, "Percentage expenditure values are out of the specified range"),
    ('Hepatitis B', 0, 100, "Hepatitis B values are out of the specified range"),
    ('Measles', 0, 100, "Measles values are out of the specified range"),
    ('BMI', 18, 33, "BMI values are out of the specified range"),
    ('Under-five Deaths', 1, 1000, "Under-five mortality values are out of the specified range"),
    ('Polio', 0, 100, "Polio values are out of the specified range"),
    ('Total Expenditure', 0, 30000, "Total expenditure values are out of the specified range"),
    ('Diphtheria', 0, 100, "Diptheria values are out of the specified range"),
    ('HIV/AIDS', 0, 300, "HIV/AIDS values are out of the specified range"),
    ('GDP', 0, 300000, "GDP values are out of the specified range"),
    ('Population', 100, 2000000000, "Population values are out of the specified range"),
    ('Thinness 1-19 Years', 0, 60, "Thinness (1-19) values are out of the specified range"),
    ('Thinness 5-9 Years', 0, 60, "Thinness (5-9) values are out of the specified range"),
    ('Income Composition of Resources', 0, 1, "Income composition resources values are out of the specified range"),
    ('Schooling', 0, 25, "Schooling values are out of the specified range"),
]:
    assert df[column].between(lower, upper).all(), message

### Export Clean Data to CSV

In [15]:
# Write the cleaned table without the row index; the feature-engineering stage reads this file back
df.to_csv(path_or_buf="LifeExpectancyDraft.csv", index=False)

# Feature Engineering

In [16]:
# Reload the cleaned table written by the export cell above
df = pd.read_csv(filepath_or_buffer="LifeExpectancyDraft.csv")

### Population Density

In [17]:
# Load the land-area table, rename its columns for clarity and keep
# the first Area value found for each country
area_df = (
    pd.read_csv('LandAreakm.csv')
    .rename(columns={'Entity': 'Country', 'Land area (sq. km)': 'Area'})
    .groupby('Country', as_index=False)
    .agg({'Area': 'first'})
)

# Attach 'Area' to df by country (countries without a match get NaN), then drop duplicate rows
df = df.merge(area_df[['Country', 'Area']], how='left', on='Country').drop_duplicates()

# Population density = population / area; 'Area' is removed from df as it is consumed
df['PopulationDensity'] = df['Population'] / df.pop('Area')

### Population Growth Rate

In [18]:
# Load population data; its population column is renamed because only the 1999 rows are used below
pop_df = pd.read_csv('population-and-demography.csv').rename(
    columns={'Population - Sex: all - Age: all - Variant: estimates': 'Population_1999'}
)

# Align country names with the (shortened) spelling df uses at this stage
pop_df['Entity'] = pop_df['Entity'].replace({
    'Brunei': 'Brunei Darussalam',
    'Cape Verde': 'Cabo Verde',
    'Democratic Republic of Congo': 'DR of the Congo',
    'Laos': "Lao People's DR",
    'Micronesia (country)': 'Micronesia',
    'Moldova': 'Republic of Moldova',
    'Tanzania': 'United Republic of Tanzania',
    'Eswatini': 'Swaziland',
    'Saint Vincent and the Grenadines': 'St Vincent and the Grenadines',
    'South Korea': 'Republic of Korea',
    'Syria': 'Syrian Arab Republic',
    'East Timor': 'Timor-Leste',
})

# Year-over-year growth within each country: (current - previous) / previous.
# The first row of every country has no previous row and is therefore NaN.
df["Population Growth Rate"] = (
    df.groupby("Country")["Population"]
    .apply(lambda population: population.diff() / population.shift())
    .reset_index(level=0, drop=True)
)

# The year-2000 rate needs the 1999 population, which only pop_df has
df_1999 = pop_df[pop_df["Year"] == 1999]
df_2000 = df[df["Year"] == 2000]

# Manually insert North Korea's 1999 population
nk_row = pd.DataFrame({
    "Entity": ["Democratic People's Republic of Korea"],
    "Code": ["CODE"],
    "Year": [1999],
    "Population_1999": [23204498],
})
df_1999 = pd.concat([df_1999, nk_row], ignore_index=True)

# Pair each country's 2000 row with its 1999 population
df_merged = df_2000.merge(
    df_1999,
    left_on="Country",
    right_on="Entity",
    suffixes=("_2000", "_1999"),
)

# Growth rate for the year 2000, relative to 1999
population_change = df_merged["Population"] - df_merged["Population_1999"]
df_merged["Population Growth Rate"] = population_change / df_merged["Population_1999"]

# Country -> year-2000 growth rate
growth_rate_dict = dict(zip(df_merged["Country"], df_merged["Population Growth Rate"]))

# Write the 2000 rates into the main df; countries without a 1999 match stay NaN
is_2000 = df["Year"] == 2000
df.loc[is_2000, "Population Growth Rate"] = df.loc[is_2000, "Country"].map(growth_rate_dict)

### BMI Classification

In [19]:
# One 0/1 indicator column per BMI class; each range includes its lower bound
# and excludes its upper bound
df['UnderweightBMI'] = df['BMI'].lt(18.5).astype(int)
df['HealthyWeightBMI'] = df['BMI'].between(18.5, 25, inclusive='left').astype(int)
df['OverweightBMI'] = df['BMI'].between(25, 30, inclusive='left').astype(int)
df['ObesityBMI'] = df['BMI'].ge(30).astype(int)

### Income Classification

In [20]:
# Income bands, quoted in terms of GNI per capita:
#   Low-income:          $1,135 or less
#   Lower-middle-income: $1,136 to $4,465
#   Upper-middle-income: $4,466 to $13,845
#   High-income:         $13,846 or more
# The thresholds are applied to the 'GDP' column here.
# NOTE(review): GDP is used in place of GNI per capita — confirm this substitution is intended.
df['LowIncome'] = df['GDP'].lt(1136).astype(int)
df['LowerMiddleIncome'] = df['GDP'].between(1136, 4466, inclusive='left').astype(int)
df['UpperMiddleIncome'] = df['GDP'].between(4466, 13846, inclusive='left').astype(int)
df['HighIncome'] = df['GDP'].ge(13846).astype(int)

### Health Coverage Index 
- Positive index means above average 
- Negative index means below average
- Zero index means exactly average

In [21]:
# Reverse the columns where a LOWER value means better health, so that a higher
# number means better health for every input of the index.
# 'Measles' is not reversed: the cleaning stage replaces it with coverage
# percentages (0-100), so it already points the same way as the other
# coverage columns (Hepatitis B, Polio, Diphtheria).
for col in ['HIV/AIDS', 'Thinness 1-19 Years', 'Thinness 5-9 Years']:
    df[f'{col}_adjusted'] = df[col].max() - df[col]

# Standardize all inputs using z-score normalization
index_inputs = ['Hepatitis B', 'Polio', 'Diphtheria', 'Measles', 'HIV/AIDS_adjusted',
                'Thinness 1-19 Years_adjusted', 'Thinness 5-9 Years_adjusted']
for col in index_inputs:
    df[f'{col}_z'] = (df[col] - df[col].mean()) / df[col].std()

# Combine into a health index by taking the weighted mean of z-scores
z_cols = [f'{col}_z' for col in index_inputs]
weights = {
    'Hepatitis B_z': 0.2,
    'Polio_z': 0.2,
    'Diphtheria_z': 0.2,
    'Measles_z': 0.15,
    'HIV/AIDS_adjusted_z': 0.1,
    'Thinness 1-19 Years_adjusted_z': 0.075,
    'Thinness 5-9 Years_adjusted_z': 0.075,
}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
weights = {k: v / total_weight for k, v in weights.items()}


def _weighted_health_index(row):
    """Weighted mean of the z-scores present in ``row``; NaN if none are present."""
    present = row.dropna()
    if present.empty:
        return np.nan
    # Pick the weights of the surviving entries only, so values and weights
    # always have the same length even when some z-scores are missing
    return np.average(present, weights=[weights[name] for name in present.index])


# Calculate the health index (rows with some missing z-scores use the remaining ones)
df['Health Coverage Index'] = df[z_cols].apply(_weighted_health_index, axis=1)

# Drop the helper columns
df = df.drop(columns=['HIV/AIDS_adjusted', 'Thinness 1-19 Years_adjusted',
                      'Thinness 5-9 Years_adjusted', *z_cols])

### Human Rights

In [22]:
# Load the civil-liberties index and expose its entity column as 'Country'
rights_df = pd.read_csv('human-rights-index-vdem.csv').rename(columns={'Entity': 'Country'})

# (Country, Year) -> index value
rights_mapping = (
    rights_df
    .set_index(['Country', 'Year'])
    ['Civil liberties index (best estimate, aggregate: average)']
    .to_dict()
)

# Create the 'Human Rights' column by looking up each row's (Country, Year) pair;
# pairs absent from the mapping yield None.
# NOTE(review): the lookup uses df's country spelling as-is — confirm it matches this file's Entity names.
df['Human Rights'] = pd.Series(
    [rights_mapping.get((country, year)) for country, year in zip(df['Country'], df['Year'])],
    index=df.index,
)

### Electoral Democracy

In [23]:
# Load the electoral-democracy index and expose its entity column as 'Country'
democracy_df = pd.read_csv('electoral-democracy-index.csv').rename(columns={'Entity': 'Country'})

# (Country, Year) -> index value
democracy_mapping = (
    democracy_df
    .set_index(['Country', 'Year'])
    ['Electoral democracy index (best estimate, aggregate: average)']
    .to_dict()
)

# Create the 'Democracy' column by looking up each row's (Country, Year) pair;
# pairs absent from the mapping yield None.
# NOTE(review): the lookup uses df's country spelling as-is — confirm it matches this file's Entity names.
df['Democracy'] = pd.Series(
    [democracy_mapping.get((country, year)) for country, year in zip(df['Country'], df['Year'])],
    index=df.index,
)

### Armed Personnel

In [24]:
# Load the armed-forces share table and expose its entity column as 'Country'
army_df = pd.read_csv('armed-forces-personnel-percent.csv').rename(columns={'Entity': 'Country'})

# (Country, Year) -> armed forces personnel as % of total population
army_mapping = (
    army_df
    .set_index(['Country', 'Year'])
    ['Armed forces personnel (% of total population)']
    .to_dict()
)

# Create the 'Armed Personnel' column by looking up each row's (Country, Year) pair;
# pairs absent from the mapping yield None.
# NOTE(review): the lookup uses df's country spelling as-is — confirm it matches this file's Entity names.
df['Armed Personnel'] = pd.Series(
    [army_mapping.get((country, year)) for country, year in zip(df['Country'], df['Year'])],
    index=df.index,
)

### Suicide Rate

In [25]:
# Load the suicide death-rate table and expose its entity column as 'Country'
suicide_df = pd.read_csv('death-rate-from-suicides-gho.csv').rename(columns={'Entity': 'Country'})

# (Country, Year) -> death rate; the column label below is reproduced exactly as the code reads it
suicide_mapping = (
    suicide_df
    .set_index(['Country', 'Year'])
    ['Age-standardized death rate from self-harm amongboth sexes']
    .to_dict()
)

# Create the 'Suicide Rate' column by looking up each row's (Country, Year) pair;
# pairs absent from the mapping yield None.
# NOTE(review): the lookup uses df's country spelling as-is — confirm it matches this file's Entity names.
df['Suicide Rate'] = pd.Series(
    [suicide_mapping.get((country, year)) for country, year in zip(df['Country'], df['Year'])],
    index=df.index,
)

### Agriculture Employment

In [26]:
# Load the agriculture value-added-per-worker table and expose its entity column as 'Country'
agr_df = pd.read_csv('agriculture-value-added-per-worker-wdi.csv').rename(columns={'Entity': 'Country'})

# (Country, Year) -> value added per worker
agr_mapping = (
    agr_df
    .set_index(['Country', 'Year'])
    ["Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)"]
    .to_dict()
)

# Create the 'Agriculture Employment' column by looking up each row's (Country, Year) pair;
# pairs absent from the mapping yield None.
# NOTE(review): the lookup uses df's country spelling as-is — confirm it matches this file's Entity names.
df['Agriculture Employment'] = pd.Series(
    [agr_mapping.get((country, year)) for country, year in zip(df['Country'], df['Year'])],
    index=df.index,
)

### KNN

In [27]:
# Set 'Country' aside: only the remaining columns are passed to the imputer
country_col = df['Country']
feature_cols = df.columns.drop('Country')

# Impute every remaining NaN with a 2-neighbour KNN imputer
knn_imputer = KNNImputer(n_neighbors=2)
df = pd.DataFrame(knn_imputer.fit_transform(df[feature_cols]), columns=feature_cols)

# Re-attach 'Country' by position as the first column
df.insert(0, 'Country', country_col.to_numpy())

In [28]:
# Write the final table. The row index is written too (the to_csv default),
# so the file starts with an unnamed index column.
# NOTE(review): the draft export passes index=False — confirm the index column is wanted here.
df.to_csv(path_or_buf='LifeExpectancyClean.csv', index=True)