# Life Expectancy Data Cleaning

In [39]:
import numpy as np
import pandas as pd

Load Raw Data

In [40]:
# Read the raw life-expectancy CSV into the working DataFrame
raw_csv_path = "LifeExpectancyRaw.csv"
df = pd.read_csv(raw_csv_path)

Drop Rows

In [None]:
# Convert Status Column to Binary (Developed -> 1, Developing -> 0).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Status']: the in-place form mutates a
# temporary Series, is deprecated, and does not update df at all when pandas
# copy-on-write is active.
# NOTE: the "Convert Columns to Binary" cell repeats this conversion; running it
# twice is harmless because no 'Developed'/'Developing' strings remain afterwards.
df['Status'] = df['Status'].replace({'Developed': 1, 'Developing': 0})

# Drop Countries Due to Large Portion of Missing Data
countries_to_drop = ['South Sudan', 'Cook Islands', 'Niue', 'Saint Kitts and Nevis']
# .copy() detaches the filtered frame from the original so that later column
# assignments on df do not raise SettingWithCopyWarning.
df = df[~df['Country'].isin(countries_to_drop)].copy()

Convert Columns to Binary

In [42]:
# Convert Status Column to Binary (Developed -> 1, Developing -> 0).
# Assign the replaced column back to df rather than using
# df['Status'].replace(..., inplace=True): that form modifies a temporary
# Series, is deprecated, and leaves df unchanged under pandas copy-on-write.
# Values other than the two strings are left as they are, so re-running is safe.
df['Status'] = df['Status'].replace({'Developed': 1, 'Developing': 0})

Manually Insert Data

In [43]:
# Manually insert population
# Countries (spelled as in population-and-demography.csv) whose rows are kept
# from the external file and used to fill missing Population values in df.
pop_countries = [
    'Antigua and Barbuda', 'Bahamas', 'Bahrain', 'Barbados',
    'Bolivia', 'Brunei', 'Congo', 'Cuba', 'Czechia', "Cote d'Ivoire",
    "North Korea", 'Democratic Republic of Congo', 'Egypt', 'Eritrea',
    'Gambia', 'Grenada', 'Iran', 'Kuwait', 'Kyrgyzstan', "Laos", 'Libya',
    'Micronesia (country)', 'New Zealand', 'Oman', 'Qatar', 'South Korea',
    'Moldova', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'Saudi Arabia', 'Singapore', 'Slovakia', 'Somalia', 'North Macedonia',
    'United Arab Emirates', 'United Kingdom', 'Tanzania', 'United States',
    'Venezuela', 'Vietnam', 'Yemen',
]

# External entity name -> country name to merge on, for the names that differ
pop_name_map = {
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Brunei': 'Brunei Darussalam',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Democratic Republic of Congo': 'Democratic Republic of the Congo',
    'North Korea': "Democratic People's Republic of Korea",
    'Iran': 'Iran (Islamic Republic of)',
    'South Korea': 'Republic of Korea',
    'Laos': "Lao People's Democratic Republic",
    'Micronesia (country)': 'Micronesia (Federated States of)',
    'Moldova': 'Republic of Moldova',
    'North Macedonia': 'The former Yugoslav republic of Macedonia',
    'Tanzania': 'United Republic of Tanzania',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Venezuela': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
}

# Load Data, keep the relevant countries, then align names and column labels.
# Written as one chain so that no column is assigned on a boolean-filtered
# slice (that pattern raises SettingWithCopyWarning).
pop_df = pd.read_csv("population-and-demography.csv")
pop_df = (
    pop_df.loc[pop_df['Entity'].isin(pop_countries)]
    .assign(Entity=lambda frame: frame['Entity'].replace(pop_name_map))
    .rename(columns={
        'Entity': 'Country',
        "Population - Sex: all - Age: all - Variant: estimates": "Population",
    })
)

# Merge df with pop_df on Country and Year (the external column arrives as
# 'Population_pop_df'), fill only the missing Population values from it, and
# drop the helper column again.
df = (
    df.merge(
        pop_df[['Country', 'Year', 'Population']],
        on=['Country', 'Year'],
        how='left',
        suffixes=('', '_pop_df'),
    )
    .assign(Population=lambda frame: frame['Population'].fillna(frame['Population_pop_df']))
    .drop(columns=['Population_pop_df'])
)

In [None]:
# Load measles data and rename its key columns so they match df
measles_df = (
    pd.read_csv('MeaslesCoverage.csv', encoding='latin1')
    .rename(columns={'NAME': 'Country', 'YEAR': 'Year'})
)

# Keep one COVERAGE row per (Country, Year). A repeated key would make the left
# merge below return more rows than df, and the values written back to df would
# then land on the wrong rows without any error.
# NOTE(review): the first row is kept for a repeated key - confirm that this is
# the record that should be used if the file contains several per country/year.
coverage_lookup = (
    measles_df[['Country', 'Year', 'COVERAGE']]
    .drop_duplicates(subset=['Country', 'Year'])
)

# Replace original values with NaN
df['Measles '] = np.nan

# Merge df with the coverage lookup on Country and Year
merged_df = df.merge(coverage_lookup, on=['Country', 'Year'], how='left')

# Fill NaN values in the 'Measles' column with the corresponding 'COVERAGE' values.
# With unique lookup keys the left merge yields exactly one row per df row, in
# df's order, so the values are assigned by position (to_numpy) and do not
# depend on df's index labels matching merged_df's fresh index.
df['Measles '] = merged_df['Measles '].fillna(merged_df['COVERAGE']).to_numpy()

# Replace values in 'Measles' column that exceed 100 (outside the expected percentage range) with NaN
df['Measles '] = df['Measles '].mask(df['Measles '] > 100)

In [None]:
# Manually insert child mortality
# Manually change under-five deaths for India (Year -> replacement value)
replacement_data = {2000: 92, 2001: 88, 2002: 85, 2003: 81, 2004: 78, 2005: 74, 2006: 71, 2007: 68,
                    2008: 65, 2009: 61, 2010: 58, 2011: 55, 2012: 52, 2013: 49, 2014: 46, 2015: 44}

# Look each India row's Year up in replacement_data and overwrite the column;
# a Year that is not a key in the dict becomes NaN.
is_india = df['Country'] == 'India'
df.loc[is_india, 'under-five deaths '] = df.loc[is_india, 'Year'].map(replacement_data)

# Countries (spelled as in child-mortality.csv) kept from the external file
mortality_countries = [
    'Antigua and Barbuda', 'Bahamas', 'Bahrain', 'Barbados',
    'Bolivia', 'Brunei', 'Congo', 'Cuba', 'Czechia', "Cote d'Ivoire",
    "North Korea", 'Democratic Republic of Congo', 'Egypt', 'Eritrea',
    'Gambia', 'Grenada', 'Iran', 'Kuwait', 'Kyrgyzstan', "Laos", 'Libya',
    'Micronesia (country)', 'New Zealand', 'Oman', 'Qatar', 'South Korea',
    'Moldova', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'Saudi Arabia', 'Singapore', 'Slovakia', 'Somalia', 'North Macedonia',
    'United Arab Emirates', 'United Kingdom', 'Tanzania', 'United States',
    'Venezuela', 'Vietnam', 'Yemen',
]

# External entity name -> country name to merge on, for the names that differ
mortality_name_map = {
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Brunei': 'Brunei Darussalam',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Democratic Republic of Congo': 'Democratic Republic of the Congo',
    'North Korea': "Democratic People's Republic of Korea",
    'Iran': 'Iran (Islamic Republic of)',
    'South Korea': 'Republic of Korea',
    'Laos': "Lao People's Democratic Republic",
    'Micronesia (country)': 'Micronesia (Federated States of)',
    'Moldova': 'Republic of Moldova',
    'North Macedonia': 'The former Yugoslav republic of Macedonia',
    'Tanzania': 'United Republic of Tanzania',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Venezuela': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
}

# Load Data, filter down to the relevant countries, and rename values and
# columns for merging. Built as one chain so that no column is assigned on a
# boolean-filtered slice (that pattern raises SettingWithCopyWarning).
mortality_df = pd.read_csv("child-mortality.csv")
mortality_df = (
    mortality_df.loc[mortality_df['Entity'].isin(mortality_countries)]
    .assign(Entity=lambda frame: frame['Entity'].replace(mortality_name_map))
    .rename(columns={
        'Entity': 'Country',
        'Under-five mortality rate': 'under-five deaths ',
    })
)

# Merge df with mortality_df on Country and Year; the external column arrives
# as 'under-five deaths _mortality_df' because of the suffix.
df = df.merge(
    mortality_df[['Country', 'Year', 'under-five deaths ']],
    on=['Country', 'Year'],
    how='left',
    suffixes=('', '_mortality_df'),
)

# Replace instances where under-five deaths == 0 with values from mortality_df.
# The column is cast to float first so both sides share a dtype; a zero row
# with no match in mortality_df ends up as NaN.
df['under-five deaths '] = df['under-five deaths '].astype(float)
is_zero = df['under-five deaths '] == 0
df.loc[is_zero, 'under-five deaths '] = df.loc[is_zero, 'under-five deaths _mortality_df']

# Drop the additional column from mortality_df after filling in values
df = df.drop(columns=['under-five deaths _mortality_df'])

In [None]:
# Manually insert Life Expectancy
# Hand-entered Life Expectancy values, keyed by country name
replacement_data = {
    'Monaco': 84.9,
    'San Marino': 82.4,
    'Dominica': 68.9,
    'Tuvalu': 64.0,
    'Palau': 64.2,
    'Marshall Islands': 63.5,
    'Nauru': 61.2,
}

# Only rows for the year 2013 are updated; compute that mask once up front
in_2013 = df['Year'] == 2013

# Write each country's value into its 2013 row(s)
for country_name, value in replacement_data.items():
    df.loc[in_2013 & (df['Country'] == country_name), 'Life expectancy '] = value

Forward/Backward Fill and Linear Interpolation by Country

In [45]:
def fill_missing_values(group, col):
    """Fill the missing values of ``col`` within one group of rows.

    Steps, applied in order:

    1. Linearly interpolate ``col`` in row order and write the result to the
       rows whose Year is 2001-2014.
    2. If the year-2000 value is still missing, copy the year-2001 value.
    3. If the year-2015 value is still missing, copy the year-2014 value.
    4. Forward-fill, then backward-fill, anything that is still missing.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to fill (the caller passes one country's rows). Must contain a
        'Year' column and ``col``. Interpolation follows the row order, so the
        rows are assumed to be ordered by Year.
    col : str
        Name of the column to fill.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``group``; the frame passed in is left untouched.
    """
    # Work on a copy: the group handed over by groupby must not be mutated.
    group = group.copy()
    year = group['Year']

    # Interpolate values for years 2001-2014
    interior = (year > 2000) & (year < 2015)
    group.loc[interior, col] = group[col].interpolate(method='linear')

    # Edge years take the adjacent year's value when their own is missing.
    # The donor value is read as a scalar: passing the donor *Series* to
    # fillna would align on index labels, and the two years sit on different
    # labels, so nothing would ever be filled.
    for edge_year, neighbour_year in ((2000, 2001), (2015, 2014)):
        donor = group.loc[year == neighbour_year, col].dropna()
        if donor.empty:
            continue
        needs_fill = (year == edge_year) & group[col].isna()
        group.loc[needs_fill, col] = donor.iloc[0]

    # Forward fill and backward fill any remaining NaNs
    # (ffill()/bfill() replace the deprecated fillna(method=...)).
    group[col] = group[col].ffill().bfill()

    return group

# Columns to fill: every column from 'Status' through 'Schooling' (inclusive),
# taken in the frame's current column order.
start_col = df.columns.get_loc('Status')
end_col = df.columns.get_loc('Schooling')
fill_cols = list(df.columns[start_col:end_col + 1])

# Fill country by country, handling all target columns in one pass over the
# groups. The groups are iterated explicitly instead of going through
# groupby(...).apply(...) once per column: that rebuilt the whole frame for
# every column and relied on apply passing the grouping column to the callback,
# which pandas has deprecated.
filled_groups = []
for _, country_rows in df.groupby('Country', sort=False):
    for col in fill_cols:
        country_rows = fill_missing_values(country_rows, col)
    filled_groups.append(country_rows)

# Put the rows back in their original order, then renumber them.
# NOTE(review): sort_index() restores the order only if df's index was
# ascending and unique on entry (true for the default index a merge produces).
if filled_groups:
    df = pd.concat(filled_groups).sort_index()
df = df.reset_index(drop=True)

KNN

In [46]:
### NOTE ###
# I'm commenting this code out for now, since it will take care of ALL remaining Null values.

# # Store the 'Country' column temporarily in order to perform KNN on Numeric Data
# country_col = df['Country']
# df = df.drop(columns=['Country'])

# # Apply KNN Imputer for Remaining NaN Values
# knn_imputer = KNNImputer(n_neighbors=2)
# df = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

# # Re-add 'Country' to the DataFrame
# df['Country'] = country_col.values

Formatting

In [47]:
# TO-DO

Assertions

In [48]:
# Sanity check: no row may be an exact duplicate of another row
duplicate_row_count = df.duplicated().sum()
assert duplicate_row_count == 0, "The DataFrame contains duplicate rows"
# assert df.isnull().sum().sum() == 0, "There are null values in the DataFrame."

Export Clean Data to CSV

In [49]:
# Write the cleaned table to disk. index=True is the pandas default, spelled out
# here to make it visible that the row index is saved as the first column.
clean_csv_path = 'LifeExpectancyClean.csv'
df.to_csv(clean_csv_path, index=True)

Number of Null Values Remaining in Data:

In [50]:
# Tally the missing values per column, then report them
null_counts = df.isna().sum()
print("Number of null values in each column:")
print(null_counts)

Number of null values in each column:
Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                             16
percentage expenditure               0
Hepatitis B                        144
Measles                              0
 BMI                                32
under-five deaths                    0
Polio                                0
Total expenditure                   32
Diphtheria                           0
 HIV/AIDS                            0
GDP                                400
Population                           0
 thinness  1-19 years               32
 thinness 5-9 years                 32
Income composition of resources    160
Schooling                          160
dtype: int64
