## Function

This file contains the function that predicts life expectancy based on user input.

In [43]:
#Importing the four main libraries
import numpy as np # for maths
import seaborn as sns # to augment matplotlib/visuals
import matplotlib.pyplot as plt # visuals
import pandas as pd # data
import joblib # to load scaler

# Scaling
from sklearn.preprocessing import RobustScaler


In [45]:
# Import scaler - fitted on the training data
scaler = joblib.load('robust_scaler.pkl')

In [47]:
# Read in the data - to compare results if available
df = pd.read_csv('Life Expectancy Data.csv')

In [49]:
# Feature columns for sensitive data
feature_cols_sens = ['const', 'Region_Asia',
       'Region_Central America and Caribbean', 'Region_European Union',
       'Region_Middle East', 'Region_North America', 'Region_Oceania',
       'Region_Rest of Europe', 'Region_South America', 'Under_five_deaths',
       'Adult_mortality', 'Hepatitis_B', 'GDP_per_capita',
        'Schooling', 'Economy_status_Developed']

In [51]:
# Basic feature columns
feature_cols = ['const',
       'Adult_mortality', 'GDP_per_capita', 'Economy_status_Developed'] 

In [53]:
# Scaling columns
scaling_columns = ['Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
       'Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI', 'Polio',
       'Diphtheria', 'Incidents_HIV', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling']

In [55]:
# Results - basic parameters
results_basic = pd.read_csv("Results_basic.csv")
results_basic = pd.Series(data=results_basic['0'].values, index=results_basic['Unnamed: 0'].values)

In [57]:
# Results - sensitive parameters
results_sensitive = pd.read_csv("Results_sensitive.csv")
results_sensitive = pd.Series(data=results_sensitive['0'].values, index=results_sensitive['Unnamed: 0'].values)

In [59]:
# Life Expectancy Model
def life_expectancy(features, coeff):
    # Lining up features and coefficients
    features = features.sort_index()
    coeff = coeff.sort_index()
    combined = list(zip(features, coeff)) #combine to multiply in the rigt order
    # multiplying coefficients with scaled features values 
    prediction = sum(x * y for x, y in combined)
    return round(prediction, 2)

In [61]:
# User Input Function
def get_user_input(sensitive):
    features = pd.DataFrame(columns=scaling_columns) # creates a features column ready for scaling
    features.loc[0] = 0  # row of zeros for scaling
    while True: # loop until either all basic and optimised features are inputted
        try:
            if sensitive == 'y': # optimised model features
                Region = input('What is your region: ')
                if Region not in feature_cols_sens:
                    raise ValueError("Invalid region. Please enter a valid region.")
                features[Region] = 1
                features['const'] = 1
                features['Under_five_deaths'] = float(input('What is the rate of under five deaths? '))
                features['Adult_mortality'] = float(input('What is the rate of adult mortality? '))
                features['Hepatitis_B'] = float(input('What is the vaccine rate of hepatitis B? '))
                features['GDP_per_capita'] = float(input('What is the GDP per capita? '))
                features['Schooling'] = float(input('What is the average schooling? '))
                features['Economy_status_Developed'] = float(input('Is the country developed (1 for yes and 0 for no)? '))
                break
            elif sensitive == 'n': # basic features
                features['const'] = 1
                features['Adult_mortality'] = float(input('What is the rate of adult mortality? '))
                features['GDP_per_capita'] = float(input('What is the GDP per capita? '))
                features['Economy_status_Developed'] = float(input('Is the country developed (1 for yes and 0 for no)? '))
                break
            else:
                raise ValueError("Invalid input for sensitive. Please enter 'y' or 'n'.")
        except ValueError as e:
            print(f"Input error: {e}. Please try again.")
            sensitive = input('Do you consent to using advance population data, which may include protected data, for a more accurate model? Y/N').lower()
    return features

In [63]:
# Scale Features function - so right columns are scaled
def scale_features(features, scaling_columns):
    df_f = features.copy()
    df_f[scaling_columns] = scaler.transform(features[scaling_columns])
    return df_f

In [65]:
# Missing Columns Functions - so all columns are included even if there is no value
def add_missing_columns(df_f, feature_cols):
    missing_cols = [col for col in feature_cols if col not in df_f.columns]
    for col in missing_cols:
        df_f[col] = 0
    return df_f

In [67]:
def main():
    sensitive = input('Do you consent to using advance population data, which may include protected data, for a more accurate model? Y/N').lower()
    
    features = get_user_input(sensitive)
    
    if sensitive == 'y':
        df_f = scale_features(features, scaling_columns)
        df_f = add_missing_columns(df_f, feature_cols_sens)
        df_f = df_f[feature_cols_sens]
        df_series = df_f.iloc[0]
        print(f'The average life expectancy for the more accurate model is: {life_expectancy(df_series, results_sensitive)}')
        
    elif sensitive == 'n':
        df_f = scale_features(features, scaling_columns)
        df_f = add_missing_columns(df_f, feature_cols)
        df_f = df_f[feature_cols]
        df_series = df_f.iloc[0]
        print(f'The average life expectancy for the basic model is: {life_expectancy(df_series, results_basic)}')

if __name__ == "__main__":
    main()

Do you consent to using advance population data, which may include protected data, for a more accurate model? Y/N y
What is your region:  a


Input error: Invalid region. Please enter a valid region.. Please try again.


Do you consent to using advance population data, which may include protected data, for a more accurate model? Y/N Region_Middle East


Input error: Invalid input for sensitive. Please enter 'y' or 'n'.. Please try again.


Do you consent to using advance population data, which may include protected data, for a more accurate model? Y/N y
What is your region:  Region_Middle East
What is the rate of under five deaths?  a


Input error: could not convert string to float: 'a'. Please try again.


KeyboardInterrupt: Interrupted by user

In [133]:
df[df['Economy_status_Developed'] == 1].head(2)

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,97,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,94,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7
