In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.tools
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
#import data
data = pd.read_csv("Life Expectancy Data.csv")


In [21]:
def stepwise_selection(X, y, threshold_in=0.01, threshold_out=0.05, verbose=False):
    included = []  # This is going to be the list of features we keep
    while True:
        changed = False
        # Forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype='float64')
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # We add the feature with the lowest (best) p-value under the threshold to our 'included' list
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # Backward step: removing features if new features added to the list make them statistically insignificant
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        
        # Use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # Null if pvalues is empty
        # If the p-value exceeds the upper threshold, the feature will be dropped from the 'included' list
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return model, included


In [9]:
def ask_for_consent():
    response = input("Do you consent to using advanced population data, which may include protected information, for better accuracy? (Yes/No): ")
    
    if response.strip().lower() == 'yes':
        print("Thank you for your consent.")
        return True
    elif response.strip().lower() == 'no':
        print("You did not give consent.")
        return False
    else:
        print("Invalid response. Please enter 'Yes' or 'No'.")
        return ask_for_consent()

def ask_for_year_range():
    try:
        year = int(input("Enter the year you wish to predict (between 2000 and 2025): "))
        
        if 2000 <= year <= 2025:
            print(f"You have selected the year: {year}")
            return year
        else:
            print("Invalid range. Please ensure the year is between 2000 and 2025.")
            return ask_for_year_range()
    except ValueError:
        print("Invalid input. Please enter a valid year.")
        return ask_for_year_range()

def ask_for_region():
    region_prompt = (
        "What region is the country you are investigating? Please type:\n"
        "1 for Africa\n"
        "2 for Asia\n"
        "3 for Central America and Caribbean\n"
        "4 for European Union\n"
        "5 for Middle East\n"
        "6 for North America\n"
        "7 for Oceania\n"
        "8 for Rest of Europe\n"
        "9 for South America\n"
        "(range 1 to 9): "
    )
    try:
        region = int(input(region_prompt))
        if 1 <= region <= 9:
            region_names = {
                1: "Africa",
                2: "Asia",
                3: "Central America and Caribbean",
                4: "European Union",
                5: "Middle East",
                6: "North America",
                7: "Oceania",
                8: "Rest of Europe",
                9: "South America"
            }
            print(f"You have selected: {region_names[region]}")
            return region
        else:
            print("Invalid choice. Please select a number between 1 and 9.")
            return ask_for_region()
    except ValueError:
        print("Invalid input. Please enter a number between 1 and 9.")
        return ask_for_region()

def get_population():
    while True:
        try:
            population = float(input("What is the population of the country in millions? "))
            if 0 <= population <= 1400:
                return population
            else:
                print("Please enter a number between 0 and 1400.")
        except ValueError:
            print("Invalid input. Please enter an integer.")

def get_gdp_per_capita():
    while True:
        try:
            gdp_per_capita = float(input("What is the GDP per capita of the country in USD ($)? "))
            if 140 <= gdp_per_capita <= 113000:
                return gdp_per_capita
            else:
                print("Please enter a number between 140 and 113000.")
        except ValueError:
            print("Invalid input. Please enter an integer.")

def ask_for_economy_status():
    while True:
        response = input("Does the country have a developed economy? (Yes/No): ").strip().lower()
        if response == 'yes':
            print("The country has a developed economy.")
            return True
        elif response == 'no':
            print("The country has an undeveloped economy.")
            return False
        else:
            print("Invalid response. Please enter 'Y' or 'N'.")

def get_infant_deaths():
    while True:
        try:
            infant_deaths = float(input("What are the number of infant deaths per 1000 population of the country? "))
            if 1 <= infant_deaths <= 140:
                return infant_deaths
            else:
                print("Please enter a number between 1 and 140.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_under_five_deaths():
    while True:
        try:
            under_five_deaths = float(input("What are the number of under-five deaths per 1000 population of the country? "))
            if 0 <= under_five_deaths <= 225:
                return under_five_deaths
            else:
                print("Please enter a number between 0 and 225.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_thinness_10_to_19():
    while True:
        try:
            thinness_10_to_19 = float(input("What is the prevalence (%) of thinness between the ages of 10 to 19 years in the country? "))
            if 0 <= thinness_10_to_19 <= 30:
                return thinness_10_to_19
            else:
                print("Please enter a percentage between 0 and 30.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_thinness_5_to_9():
    while True:
        try:
            thinness_5_to_9 = float(input("What is the prevalence (%) of thinness between the ages of 5 to 9 years in the country? "))
            if 0 <= thinness_5_to_9 <= 30:
                return thinness_5_to_9
            else:
                print("Please enter a percentage between 0 and 30.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_years_of_schooling():
    while True:
        try:
            years_of_schooling = float(input("What are the average number of years of schooling in the country? "))
            if 1 <= years_of_schooling <= 15:
                return years_of_schooling
            else:
                print("Please enter a number between 1 and 15.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_adult_mortality():
    while True:
        try:
            adult_mortality = float(input("What is the combined adult mortality rates of men and women between 15 and 60 years per 1000 population of the country? "))
            if 49 <= adult_mortality <= 720:
                return adult_mortality
            else:
                print("Please enter a number between 49 and 720.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_alcohol_consumption():
    while True:
        try:
            alcohol_consumption = float(input("What is the alcohol consumption, measured in litres of pure alcohol per person of 15 years of age or older, in the country (litres)? "))
            if 0 <= alcohol_consumption <= 18:
                return alcohol_consumption
            else:
                print("Please enter a number between 0 and 18.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_hepb_coverage():
    while True:
        try:
            hepb_coverage = float(input("What is the percentage of one-year-olds who have received the Hepatitis B (HepB) immunization coverage in a given year in the country (%)? "))
            if 11 <= hepb_coverage <= 100:
                return hepb_coverage
            else:
                print("Please enter a number between 11 and 100.")
        except ValueError:
            print("Invalid input. Please enter an integer.")

def get_measles_cases():
    while True:
        try:
            measles_cases = float(input("What is the number of reported cases of measles per 1000 population in the country? "))
            if 10 <= measles_cases <= 100:
                return measles_cases
            else:
                print("Please enter a number between 10 and 100.")
        except ValueError:
            print("Invalid input. Please enter an integer.")

def get_bmi():
    while True:
        try:
            bmi = float(input("What is the average Body Mass Index of entire population in the country? "))
            if 19 <= bmi <= 33:
                return bmi
            else:
                print("Please enter a number between 19 and 33.")
        except ValueError:
            print("Invalid input. Please enter a float.")

def get_polio_coverage():
    while True:
        try:
            polio_coverage = float(input("What is the percentage of one-year-olds who have received the Polio (Pol3) immunization coverage in a given year in the country (%)? "))
            if 7 <= polio_coverage <= 100:
                return polio_coverage
            else:
                print("Please enter a number between 7 and 100.")
        except ValueError:
            print("Invalid input. Please enter an integer.")

def get_dtp3_coverage():
    while True:
        try:
            dtp3_coverage = float(input("What is the percentage of one-year-olds who have received the Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage in a given year in the country (%)? "))
            if 15 <= dtp3_coverage <= 100:
                return dtp3_coverage
            else:
                print("Please enter a number between 15 and 100.")
        except ValueError:
            print("Invalid input. Please enter an integer.")

def main():
    consent_given = ask_for_consent()
    if consent_given:
        year_range = ask_for_year_range()
        region = ask_for_region()
        population = get_population()
        gdp_per_capita = get_gdp_per_capita()
        economy_status = ask_for_economy_status()
        infant_deaths = get_infant_deaths()
        under_five_deaths = get_under_five_deaths()
        thinness_10_to_19 = get_thinness_10_to_19()
        thinness_5_to_9 = get_thinness_5_to_9()
        years_of_schooling = get_years_of_schooling()
        adult_mortality = get_adult_mortality()
        alcohol_consumption = get_alcohol_consumption()
        hepb_coverage = get_hepb_coverage()
        measles_cases = get_measles_cases()
        bmi = get_bmi()
        polio_coverage = get_polio_coverage()
        dtp3_coverage = get_dtp3_coverage()
        
        # Create a dictionary with the input data
        input_data = {
            'Year': year_range,
            'Population_mln': population,
            'GDP_per_capita': gdp_per_capita,
            'Economy_status_Developing': 1 if economy_status else 0,
            'Infant_deaths': infant_deaths,
            'Under_five_deaths': under_five_deaths,
            'Thinness_ten_nineteen_years': thinness_10_to_19,
            'Thinness_five_nine_years': thinness_5_to_9,
            'Schooling': years_of_schooling,
            'Adult_mortality': adult_mortality,
            'Alcohol_consumption': alcohol_consumption,
            'Hepatitis_B': hepb_coverage,
            'Measles': measles_cases,
            'BMI': bmi,
            'Polio': polio_coverage,
            'Diphtheria': dtp3_coverage
        }
        
        
            # One-hot encode the region and merge with the input data
        regions = ["Africa", "Asia", "Central America and Caribbean", "European Union", "Middle East", "North America", "Oceania", "Rest of Europe", "South America"]
        # One-hot encode the region and merge with the input data
        region_encoded = pd.get_dummies([region], prefix='Region').reindex(columns=[f'Region_{r}' for r in regions], fill_value=0)

        # Convert input data to DataFrame and concatenate with encoded region
        input_df = pd.DataFrame([input_data])
        user_input_df = pd.concat([input_df, region_encoded], axis=1)

        # Ensure the columns match the training data
        model_columns = ['Year',
            'Population_mln',
            'GDP_per_capita',
            'Economy_status_Developing',
            'Infant_deaths',
            'Under_five_deaths',
            'Thinness_ten_nineteen_years',
            'Thinness_five_nine_years',
            'Schooling',
            'Adult_mortality',
            'Alcohol_consumption',
            'Hepatitis_B',
            'Measles',
            'BMI',
            'Polio',
            'Diphtheria'] + [f'Region__{r}' for r in regions]

        # Reindex to ensure all necessary columns are present
        user_input_df = user_input_df.reindex(columns=model_columns, fill_value=0)
            # Create a DataFrame from the collected data
        user_input = pd.DataFrame(user_input_df)
            # Additional code to handle the given data can go here
        print(f"Year: {year_range}, Region: {region}, Population: {population} million, GDP per capita: ${gdp_per_capita}, Developed economy: {'Yes' if economy_status else 'No'}, Infant deaths per 1000: {infant_deaths}, Under-five deaths per 1000: {under_five_deaths}, Thinness 10-19: {thinness_10_to_19}%, Thinness 5-9: {thinness_5_to_9}%, Years of schooling: {years_of_schooling}, Adult mortality: {adult_mortality}, Alcohol consumption: {alcohol_consumption} litres, HepB coverage: {hepb_coverage}%, Measles cases per 1000: {measles_cases}, Average BMI: {bmi}, Polio coverage: {polio_coverage}%, DTP3 coverage: {dtp3_coverage}%")

        return consent_given, user_input
    elif consent_given == False:
        year_range = ask_for_year_range()
        population = get_population()
        gdp_per_capita = get_gdp_per_capita()
        economy_status = ask_for_economy_status()
        infant_deaths = get_infant_deaths()
        under_five_deaths = get_under_five_deaths()
        thinness_10_to_19 = get_thinness_10_to_19()
        thinness_5_to_9 = get_thinness_5_to_9()
        years_of_schooling = get_years_of_schooling()
        adult_mortality = get_adult_mortality()
        alcohol_consumption = get_alcohol_consumption()
        measles_cases = get_measles_cases()
        bmi = get_bmi()
        polio_coverage = get_polio_coverage()
        dtp3_coverage = get_dtp3_coverage()
        
        # Create a dictionary with the input data
        input_data = {
            'Year': year_range,
            'Population_mln': population,
            'GDP_per_capita': gdp_per_capita,
            'Economy_status_Developing': 1 if economy_status else 0,
            'Infant_deaths': infant_deaths,
            'Under_five_deaths': under_five_deaths,
            'Thinness_ten_nineteen_years': thinness_10_to_19,
            'Thinness_five_nine_years': thinness_5_to_9,
            'Schooling': years_of_schooling,
            'Adult_mortality': adult_mortality,
            'Alcohol_consumption': alcohol_consumption,
            'Measles': measles_cases,
            'BMI': bmi,
            'Polio': polio_coverage,
            'Diphtheria': dtp3_coverage
        }
          
        # Convert input data to DataFrame and concatenate
        input_df = pd.DataFrame([input_data])

    else:
        print("Please give a valid answer.")
        main()
    return consent_given, input_df
if __name__ == "__main__":
    consent, user = main()

        


Do you consent to using advanced population data, which may include protected information, for better accuracy? (Yes/No):  yes


Thank you for your consent.


Enter the year you wish to predict (between 2000 and 2025):  2020


You have selected the year: 2020


What region is the country you are investigating? Please type:
1 for Africa
2 for Asia
3 for Central America and Caribbean
4 for European Union
5 for Middle East
6 for North America
7 for Oceania
8 for Rest of Europe
9 for South America
(range 1 to 9):  4


You have selected: European Union


What is the population of the country in millions?  42
What is the GDP per capita of the country in USD ($)?  30000
Does the country have a developed economy? (Yes/No):  yes


The country has a developed economy.


What are the number of infant deaths per 1000 population of the country?  21
What are the number of under-five deaths per 1000 population of the country?  21
What is the prevalence (%) of thinness between the ages of 10 to 19 years in the country?  2
What is the prevalence (%) of thinness between the ages of 5 to 9 years in the country?  2
What are the average number of years of schooling in the country?  12
What is the combined adult mortality rates of men and women between 15 and 60 years per 1000 population of the country?  21


Please enter a number between 49 and 720.


What is the combined adult mortality rates of men and women between 15 and 60 years per 1000 population of the country?  52
What is the alcohol consumption, measured in litres of pure alcohol per person of 15 years of age or older, in the country (litres)?  4
What is the percentage of one-year-olds who have received the Hepatitis B (HepB) immunization coverage in a given year in the country (%)?  78
What is the number of reported cases of measles per 1000 population in the country?  13
What is the average Body Mass Index of entire population in the country?  29
What is the percentage of one-year-olds who have received the Polio (Pol3) immunization coverage in a given year in the country (%)?  89
What is the percentage of one-year-olds who have received the Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage in a given year in the country (%)?  98


Year: 2020, Region: 4, Population: 42.0 million, GDP per capita: $30000.0, Developed economy: Yes, Infant deaths per 1000: 21.0, Under-five deaths per 1000: 21.0, Thinness 10-19: 2.0%, Thinness 5-9: 2.0%, Years of schooling: 12.0, Adult mortality: 52.0, Alcohol consumption: 4.0 litres, HepB coverage: 78.0%, Measles cases per 1000: 13.0, Average BMI: 29.0, Polio coverage: 89.0%, DTP3 coverage: 98.0%


In [27]:
if consent == True:
    #some feature engineering - dropping columns, OHE
    data = data.drop(columns = ['Country', 'Economy_status_Developed', 'Incidents_HIV'])
    data = pd.get_dummies(data, columns = ['Region'], prefix = 'Region_', dtype=int)
elif consent == False:
    data = data.drop(columns=['Region', 'Country', 'Economy_status_Developed', 'Incidents_HIV'])


    
#splitting data into X and Y for the model
feature_cols = list(data.columns)
feature_cols.remove('Life_expectancy')
X = data[feature_cols]
y = data['Life_expectancy']

#test train split for model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.375, random_state=42)

# Columns not to be scaled
columns_not_to_scale = ['Year', 'Economy_status_Developing', "Region__Africa", "Region__Asia", "Region__Central America and Caribbean", "Region__European Union", "Region__Middle East", "Region__North America", "Region__Oceania", "Region__Rest of Europe", "Region__South America"]

# Ensure we only exclude columns that are present in the DataFrame
columns_not_to_scale_present = [col for col in columns_not_to_scale if col in X_train.columns]

# Columns to be scaled
columns_to_scale = [col for col in X_train.columns if col not in columns_not_to_scale_present]

# Initialize the scaler
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.fit_transform(X_test[columns_to_scale])

user_scaled = user.copy()
user_scaled[columns_to_scale] = scaler.transform(user[columns_to_scale])

#creating a model
result, selected_features = stepwise_selection(X_train_scaled, y_train)

In [29]:
user_selected = user_scaled[selected_features]


new_pred = result.predict(sm.add_constant(user_selected, has_constant = 'add')).round(1)
print(f'For the given data, the model predicts a life expectancy of: {new_pred.iloc[0]} years')

0    77.0
dtype: float64
For the given data, the model predicts a life expectancy of: 77.0 years


In [15]:
result.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,7726.0
Date:,"Wed, 29 May 2024",Prob (F-statistic):,0.0
Time:,10:33:02,Log-Likelihood:,-2888.8
No. Observations:,1790,AIC:,5808.0
Df Residuals:,1775,BIC:,5890.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.5478,13.179,0.421,0.674,-20.301,31.396
Under_five_deaths,-2.1883,0.206,-10.598,0.000,-2.593,-1.783
Adult_mortality,-5.1429,0.057,-90.212,0.000,-5.255,-5.031
Economy_status_Developing,-2.7303,0.176,-15.491,0.000,-3.076,-2.385
Region__Central America and Caribbean,1.6403,0.108,15.133,0.000,1.428,1.853
Region__South America,1.4354,0.123,11.701,0.000,1.195,1.676
GDP_per_capita,0.3518,0.046,7.706,0.000,0.262,0.441
Region__Oceania,-1.2283,0.138,-8.879,0.000,-1.500,-0.957
Region__European Union,-1.1004,0.155,-7.106,0.000,-1.404,-0.797

0,1,2,3
Omnibus:,24.097,Durbin-Watson:,2.068
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.227
Skew:,0.261,Prob(JB):,3.33e-06
Kurtosis:,3.258,Cond. No.,917000.0


In [17]:
# Make predictions using the selected features
X_train_selected = X_train_scaled[selected_features]
y_pred = result.predict(sm.add_constant(X_train_selected))


# Calculate RMSE
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
print(f'For the training set, the magnitude of the error is: {rmse}')

For the training set, the magnitude of the error is: 1.215181735301061


In [19]:
# Filter X_test to include only the selected features
X_test_selected = X_test_scaled[selected_features]
y_test_pred = result.predict(sm.add_constant(X_test_selected))

# Calculate RMSE for the test set
test_rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
print(f'For the test set, the magnitude of the error is: {test_rmse}')

For the test set, the magnitude of the error is: 1.293935915094751
