# Feature Engineering

In [122]:
import csv
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [123]:
# Read in cleaned data
df = pd.read_csv('LifeExpectancyDraft.csv')

Population Density

In [124]:
# Load Area Data
area_df = pd.read_csv('LandAreakm.csv')

# Rename Columns for clarity
area_df = area_df.rename(columns={'Entity': 'Country', 'Land area (sq. km)': 'Area'})

# Add 'Area' Column to df
area_df = area_df.groupby('Country', as_index=False).agg({'Area': 'first'})
df = df.merge(area_df[['Country', 'Area']], on='Country', how='left')

# Drop Duplicate Rows
df = df.drop_duplicates()

# Create Population Density Column
df['PopulationDensity'] = df['Population'] / df['Area']

# Remove Area Column
df = df.drop('Area', axis=1)

Population Growth Rate

In [125]:
# Load population data
pop_df = pd.read_csv('population-and-demography.csv')
# rename population column
pop_df.rename(columns={'Population - Sex: all - Age: all - Variant: estimates':'Population_1999'}, inplace=True)

pop_df['Entity'] = pop_df['Entity'].replace({
    'Brunei': 'Brunei Darussalam',
    'Cape Verde': 'Cabo Verde',
    'Democratic Republic of Congo': 'DR of the Congo',
    'Laos': "Lao People's DR",
    'Micronesia (country)': 'Micronesia',
    'Moldova': 'Republic of Moldova',
    'Tanzania': 'United Republic of Tanzania',
    'Eswatini': 'Swaziland', 
    'Saint Vincent and the Grenadines': 'St Vincent and the Grenadines',
    'South Korea': 'Republic of Korea',
    'Syria': 'Syrian Arab Republic',
    'East Timor': 'Timor-Leste'
})

# Calculate population growth rate
df["Population Growth Rate"] = (
    df.groupby("Country")["Population"]
    .apply(lambda x: x.diff() / x.shift()).reset_index(level=0, drop=True)
)

# Filter datasets for years 1999 and 2000
df_1999 = pop_df[pop_df["Year"] == 1999]
df_2000 = df[df["Year"] == 2000]

# Manually insert North Korea Population data
nk_row = pd.DataFrame({"Entity": ["Democratic People's Republic of Korea"], "Code": ["CODE"], "Year": [1999], "Population_1999": [23204498]})
df_1999 = pd.concat([df_1999, nk_row], ignore_index=True)

# Merge the 2000 data with the 1999 poopulation data
df_merged = pd.merge(df_2000, df_1999, left_on="Country", right_on="Entity", suffixes=("_2000", "_1999"))

# Calculate the population growth rate for the year 2000
df_merged["Population Growth Rate"] = (
    (df_merged["Population"] - df_merged["Population_1999"]) 
    / df_merged["Population_1999"]
)

# Create dictionary of population growth rates
growth_rate_dict = dict(zip(df_merged["Country"], df_merged["Population Growth Rate"]))

# Add the Growth Rate from 2000 into the main df
df.loc[df["Year"] == 2000, "Population Growth Rate"] = df.loc[df["Country"].map(growth_rate_dict).notna(), "Country"].map(growth_rate_dict)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.diff() / x.shift()).reset_index(level=0, drop=True)


BMI Classification

In [126]:
# Create columns for each BMI classification based on the BMI range
df['UnderweightBMI'] = (df['BMI'] < 18.5).astype(int)
df['HealthyWeightBMI'] = ((df['BMI'] >= 18.5) & (df['BMI'] < 25)).astype(int)
df['OverweightBMI'] = ((df['BMI'] >= 25) & (df['BMI'] < 30)).astype(int)
df['ObesityBMI'] = (df['BMI'] >= 30).astype(int)

Income Classification

In [127]:
# Low-income economies: GNI per capita of $1,135 or less
# Lower-middle-income economies: GNI per capita between $1,136 and $4,465
# Upper-middle-income economies: GNI per capita between $4,466 and $13,845
# High-income economies: GNI per capita of $13,846 or more
df['LowIncome'] = (df['GDP'] < 1136).astype(int)
df['LowerMiddleIncome'] = ((df['GDP'] >= 1136) & (df['GDP'] < 4466)).astype(int)
df['UpperMiddleIncome'] = ((df['GDP'] >= 4466) & (df['GDP'] < 13846)).astype(int)
df['HighIncome'] = (df['GDP'] >= 13846).astype(int)

Health Coverage Index 
- Positive index means above average 
- Negative index means below average
- Zero index means exactly average

In [128]:
# Reverse necessary columns so a higher number means better health
for col in ['Measles', 'HIV/AIDS', 'Thinness 1-19 Years', 'Thinness 5-9 Years']:
    df[f'{col}_adjusted'] = df[col].max() - df[col]

# Standardize all columns using z-score normalization
for col in ['Hepatitis B', 'Polio', 'Diphtheria', 'Measles_adjusted', 'HIV/AIDS_adjusted', 'Thinness 1-19 Years_adjusted', 'Thinness 5-9 Years_adjusted']:
    df[f'{col}_z'] = (df[col] - df[col].mean()) / df[col].std()

# Combine into a health index by taking the weighted mean of z-scores
z_cols = [f'{col}_z' for col in ['Hepatitis B', 'Polio', 'Diphtheria', 'Measles_adjusted', 'HIV/AIDS_adjusted', 'Thinness 1-19 Years_adjusted', 'Thinness 5-9 Years_adjusted']]
weights = {
    'Hepatitis B_z': 0.2,
    'Polio_z': 0.2,
    'Diphtheria_z': 0.2,
    'Measles_adjusted_z': 0.15,
    'HIV/AIDS_adjusted_z': 0.1,
    'Thinness 1-19 Years_adjusted_z': 0.075,
    'Thinness 5-9 Years_adjusted_z': 0.075,
}

# Normalize weights 
total_weight = sum(weights.values())
weights = {k: v / total_weight for k, v in weights.items()}

# Calculate the health index (handles missing data)
df['Health Coverage Index'] = df[z_cols].apply(
    lambda row: np.average(
        row.dropna(), 
        weights=[weights[col] for col in row.index if col in weights]
    ) if not row.isnull().all() else np.nan,
    axis=1
)

# Drop unnecessary rows
df = df.drop(columns={'Measles_adjusted',
       'HIV/AIDS_adjusted', 'Thinness 1-19 Years_adjusted',
       'Thinness 5-9 Years_adjusted', 'Hepatitis B_z', 'Polio_z',
       'Diphtheria_z', 'Measles_adjusted_z', 'HIV/AIDS_adjusted_z',
       'Thinness 1-19 Years_adjusted_z', 'Thinness 5-9 Years_adjusted_z'})

Human Rights

In [129]:
rights_df = pd.read_csv('human-rights-index-vdem.csv')

# Rename Columns for clarity
rights_df = rights_df.rename(columns={'Entity': 'Country'})

# Create the 'Human Rights' column in df using the mapping
rights_mapping = rights_df.set_index(['Country', 'Year'])['Civil liberties index (best estimate, aggregate: average)'].to_dict()
df['Human Rights'] = df.apply(lambda row: rights_mapping.get((row['Country'], row['Year']), None), axis=1)

Electoral Democracy

In [130]:
democracy_df = pd.read_csv('electoral-democracy-index.csv')

# Rename Columns for clarity
democracy_df = democracy_df.rename(columns={'Entity': 'Country'})

# Create 'Democracy' column
democracy_mapping = democracy_df.set_index(['Country', 'Year'])['Electoral democracy index (best estimate, aggregate: average)'].to_dict()
df['Democracy'] = df.apply(lambda row: democracy_mapping.get((row['Country'], row['Year']), None), axis=1)

Armed Personnel

In [131]:
army_df = pd.read_csv('armed-forces-personnel-percent.csv')

# Rename Columns for clarity
army_df = army_df.rename(columns={'Entity': 'Country'})

# Create 'Army Personnel' column
army_mapping = army_df.set_index(['Country', 'Year'])['Armed forces personnel (% of total population)'].to_dict()
df['Armed Personnel'] = df.apply(lambda row: army_mapping.get((row['Country'], row['Year']), None), axis=1)

Suicide Rate

In [132]:
suicide_df = pd.read_csv('death-rate-from-suicides-gho.csv')

# Rename Columns for clarity
suicide_df = suicide_df.rename(columns={'Entity': 'Country'})

# Create 'Suicide Rate' column
suicide_mapping = suicide_df.set_index(['Country', 'Year'])['Age-standardized death rate from self-harm amongboth sexes'].to_dict()
df['Suicide Rate'] = df.apply(lambda row: suicide_mapping.get((row['Country'], row['Year']), None), axis=1)

Agriculture Employment

In [134]:
agr_df = pd.read_csv('agriculture-value-added-per-worker-wdi.csv')

# Rename Columns for clarity
agr_df = agr_df.rename(columns={'Entity': 'Country'})

# Create 'Emigration' column
agr_mapping = agr_df.set_index(['Country', 'Year'])["Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)"].to_dict()
df['Agriculture Employment'] = df.apply(lambda row: agr_mapping.get((row['Country'], row['Year']), None), axis=1)

KNN

In [136]:
# # Store the 'Country' column temporarily in order to perform KNN
# country_col = df['Country']
# df = df.drop(columns=['Country'])

# # Apply KNN Imputer for Remaining NaN Values
# knn_imputer = KNNImputer(n_neighbors=2)
# df = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

# # Re-add 'Country' to the DataFrame
# df['Country'] = country_col.values
# df = df[['Country'] + [col for col in df.columns if col != 'Country']]

In [137]:
df.to_csv('LifeExpectancyClean.csv')

In [None]:
missing_percentage = df[col].isnull().mean()
print(missing_percentage)

0.7664835164835165
