# Feature Engineering

In [49]:
import numpy as np
import pandas as pd

In [None]:
# TODO: point this at the cleaned dataset once the data-cleaning step is finished.
# Until then the raw draft file is loaded so the feature code below can be exercised.
raw_csv_path = 'LifeExpectancyDraft.csv'
df = pd.read_csv(raw_csv_path)

Population Density

In [52]:
# Population density = Population / land area.
# Land area is looked up by country name from a separate file
# (its source column is labelled 'Land area (sq. km)').
area_df = pd.read_csv('LandAreakm.csv')

# Rename columns so the join key matches the one used in df
area_df = area_df.rename(columns={'Entity': 'Country', 'Land area (sq. km)': 'Area'})

# Keep one area value per country (the first non-missing one in file order),
# so the left merge below cannot multiply rows in df.
area_df = area_df.groupby('Country', as_index=False).agg({'Area': 'first'})
df = df.merge(area_df[['Country', 'Area']], on='Country', how='left')

# Drop exact duplicate rows
df = df.drop_duplicates()

# Surface countries that received no area (name mismatch between the two files,
# or a missing value in the area file); their density will be NaN.
missing_area = df.loc[df['Area'].isna(), 'Country'].dropna().unique()
if len(missing_area) > 0:
    print(f'No land area found for {len(missing_area)} countries: {sorted(map(str, missing_area))}')

# Create Population Density column.
# Non-positive areas are masked to NaN first: dividing by 0 would otherwise
# produce inf, which silently distorts any later statistics or model fitting.
df['PopulationDensity'] = df['Population'] / df['Area'].where(df['Area'] > 0)

# Area was only needed to derive the density; remove it again
df = df.drop('Area', axis=1)

BMI Classification

In [53]:
# One 0/1 indicator column per BMI band. Bands are half-open [lower, upper):
#   Underweight  : BMI < 18.5
#   HealthyWeight: 18.5 <= BMI < 25
#   Overweight   : 25 <= BMI < 30
#   Obesity      : BMI >= 30
# Note: the source column name really is ' BMI ' (with surrounding spaces).
# Note: a missing (NaN) BMI fails every comparison, so such rows get 0 in all four columns.
bmi = df[' BMI ']
bmi_flags = {
    'UnderweightBMI': bmi.lt(18.5),
    'HealthyWeightBMI': bmi.ge(18.5) & bmi.lt(25),
    'OverweightBMI': bmi.ge(25) & bmi.lt(30),
    'ObesityBMI': bmi.ge(30),
}
for flag_name, in_band in bmi_flags.items():
    df[flag_name] = in_band.astype(int)

Income Classification

In [54]:
# Income bands (thresholds are quoted for GNI per capita, in dollars):
#   Low income          : $1,135 or less
#   Lower-middle income : $1,136 to $4,465
#   Upper-middle income : $4,466 to $13,845
#   High income         : $13,846 or more
# NOTE(review): the thresholds are stated for GNI per capita but are applied to the
# 'GDP' column here -- confirm that using GDP as a stand-in is intended.
# Each band is half-open [lower, upper), so fractional values fall into exactly one band;
# a missing (NaN) GDP fails every comparison and gets 0 in all four columns.
LOWER_MIDDLE_MIN = 1136
UPPER_MIDDLE_MIN = 4466
HIGH_INCOME_MIN = 13846

gdp = df['GDP']
df['LowIncome'] = gdp.lt(LOWER_MIDDLE_MIN).astype(int)
df['LowerMiddleIncome'] = (gdp.ge(LOWER_MIDDLE_MIN) & gdp.lt(UPPER_MIDDLE_MIN)).astype(int)
df['UpperMiddleIncome'] = (gdp.ge(UPPER_MIDDLE_MIN) & gdp.lt(HIGH_INCOME_MIN)).astype(int)
df['HighIncome'] = gdp.ge(HIGH_INCOME_MIN).astype(int)

In [None]:
# Persist the engineered dataset.
# index=False keeps the positional row index out of the file; otherwise it is written
# as an unnamed first column that comes back as 'Unnamed: 0' when the CSV is re-read
# with pd.read_csv (and it is no longer contiguous once duplicate rows were dropped).
df.to_csv('LifeExpectancyClean.csv', index=False)