# Feature Engineering

In [90]:
import csv
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [91]:
# Load the draft life-expectancy table; every later cell adds columns to `df`.
df = pd.read_csv(filepath_or_buffer='LifeExpectancyDraft.csv')

Population Density

In [92]:
# Build a one-row-per-country land-area lookup: read the file, give the
# columns the names used in `df`, and keep the first non-null area seen for
# each country.
area_df = (
    pd.read_csv('LandAreakm.csv')
    .rename(columns={'Entity': 'Country', 'Land area (sq. km)': 'Area'})
    .groupby('Country', as_index=False)
    .agg({'Area': 'first'})
)

# Attach the area to every row of df (countries without a match get NaN),
# discard exact duplicate rows, derive the density, and finally remove the
# helper 'Area' column again.
df = (
    df.merge(area_df[['Country', 'Area']], on='Country', how='left')
    .drop_duplicates()
    .assign(PopulationDensity=lambda frame: frame['Population'] / frame['Area'])
    .drop(columns='Area')
)

BMI Classification

In [93]:
# One 0/1 indicator column per BMI band. Bands are half-open [lower, upper),
# so every non-missing BMI falls into exactly one of them; a missing BMI
# compares False everywhere and therefore gets 0 in all four columns.
bmi = df['BMI']
df['UnderweightBMI'] = bmi.lt(18.5).astype(int)
df['HealthyWeightBMI'] = (bmi.ge(18.5) & bmi.lt(25)).astype(int)
df['OverweightBMI'] = (bmi.ge(25) & bmi.lt(30)).astype(int)
df['ObesityBMI'] = bmi.ge(30).astype(int)

Income Classification

In [94]:
# Income bands (thresholds in US dollars per capita):
#   Low income:           1,135 or less
#   Lower-middle income:  1,136 to 4,465
#   Upper-middle income:  4,466 to 13,845
#   High income:          13,846 or more
# NOTE(review): these thresholds are stated for GNI per capita but are applied
# to the 'GDP' column here - confirm that this substitution is intended.
# A missing GDP compares False everywhere, so such rows get 0 in all four flags.
gdp = df['GDP']
df['LowIncome'] = gdp.lt(1136).astype(int)
df['LowerMiddleIncome'] = (gdp.ge(1136) & gdp.lt(4466)).astype(int)
df['UpperMiddleIncome'] = (gdp.ge(4466) & gdp.lt(13846)).astype(int)
df['HighIncome'] = gdp.ge(13846).astype(int)

Health Coverage Index
- A positive index means above-average health coverage
- A negative index means below-average health coverage
- A zero index means exactly average health coverage

In [95]:
# Build a single 'Health Coverage Index' per row as a weighted mean of
# z-scored health indicators (positive = above the dataset average).

# Indicators where a larger raw value is worse; flip them (max - value) so
# that, like the immunisation columns, a higher number means better health.
reversed_cols = ['Measles', 'HIV/AIDS', 'Thinness 1-19 Years', 'Thinness 5-9 Years']
for col in reversed_cols:
    df[f'{col}_adjusted'] = df[col].max() - df[col]

# Standardize every component with z-score normalization so they share a scale
index_inputs = ['Hepatitis B', 'Polio', 'Diphtheria'] + [f'{col}_adjusted' for col in reversed_cols]
for col in index_inputs:
    df[f'{col}_z'] = (df[col] - df[col].mean()) / df[col].std()

# Relative importance of each z-scored component in the index
z_cols = [f'{col}_z' for col in index_inputs]
weights = {
    'Hepatitis B_z': 0.2,
    'Polio_z': 0.2,
    'Diphtheria_z': 0.2,
    'Measles_adjusted_z': 0.15,
    'HIV/AIDS_adjusted_z': 0.1,
    'Thinness 1-19 Years_adjusted_z': 0.075,
    'Thinness 5-9 Years_adjusted_z': 0.075,
}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
weights = {k: v / total_weight for k, v in weights.items()}

# Weighted mean that really tolerates missing components. The previous
# row-wise np.average call dropped the NaN values but still passed the full
# weight list, so any row with only some components missing raised a
# length-mismatch error. Here the weights of the available components are
# renormalised per row instead.
weight_row = pd.Series(weights).reindex(z_cols)
z_scores = df[z_cols]
# min_count=1 keeps the sum NaN (rather than 0) when every component is missing
weighted_sum = z_scores.mul(weight_row, axis=1).sum(axis=1, min_count=1)
# Total weight of the components that are actually present in each row
available_weight = z_scores.notna().mul(weight_row, axis=1).sum(axis=1)
# Rows with no component at all evaluate NaN / 0 and therefore stay NaN
df['Health Coverage Index'] = weighted_sum / available_weight

# Drop the intermediate helper columns (the adjusted and z-scored copies)
df = df.drop(columns=[f'{col}_adjusted' for col in reversed_cols] + z_cols)

Human Rights

In [96]:
# Load the civil-liberties data and align its country column name with df
rights_df = pd.read_csv('human-rights-index-vdem.csv').rename(columns={'Entity': 'Country'})

# Lookup table keyed by (Country, Year) -> civil liberties index value
rights_mapping = (
    rights_df
    .set_index(['Country', 'Year'])
    ['Civil liberties index (best estimate, aggregate: average)']
    .to_dict()
)

# Fill 'Human Rights' row by row from the lookup; a (Country, Year) pair that
# is absent from the lookup yields None (a missing value).
df['Human Rights'] = [
    rights_mapping.get(country_year, None)
    for country_year in zip(df['Country'], df['Year'])
]

Electoral Democracy

In [97]:
# Load the electoral-democracy data and align its country column name with df
democracy_df = pd.read_csv('electoral-democracy-index.csv').rename(columns={'Entity': 'Country'})

# Lookup table keyed by (Country, Year) -> electoral democracy index value
democracy_mapping = (
    democracy_df
    .set_index(['Country', 'Year'])
    ['Electoral democracy index (best estimate, aggregate: average)']
    .to_dict()
)

# Fill 'Democracy' row by row from the lookup; a (Country, Year) pair that is
# absent from the lookup yields None (a missing value).
df['Democracy'] = [
    democracy_mapping.get(country_year, None)
    for country_year in zip(df['Country'], df['Year'])
]

KNN

In [98]:
# Set the 'Country' column aside: it is excluded from the frame handed to the
# imputer and put back afterwards.
country_col = df['Country']
df = df.drop(columns='Country')

# Fill the remaining NaN values with a 2-nearest-neighbour imputer.
# fit_transform returns a plain array, so the frame is rebuilt with the same
# column labels; note that this gives the rows a fresh default index.
knn_imputer = KNNImputer(n_neighbors=2)
imputed_values = knn_imputer.fit_transform(df)
df = pd.DataFrame(imputed_values, columns=df.columns)

# Put 'Country' back as the first column (matched to rows by position)
df.insert(0, 'Country', country_col.to_numpy())

In [99]:
# Persist the engineered dataset. `index` is left at its default, so the row
# index is written as the first (unnamed) column of the CSV.
df.to_csv(path_or_buf='LifeExpectancyClean.csv')