In [75]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

from keras.utils import np_utils
from keras.datasets import fashion_mnist
import time

In [3]:
life = pd.read_csv('Life_Expectancy_Data.csv')
# Data cleaning code
# rename columns
life.rename(columns={'Life expectancy ':'Life Expectancy',
                     'infant deaths':'Infant Deaths',
                     'percentage expenditure':'Percentage Expenditure',
                     'Measles ':'Measles',
                     ' BMI ':'BMI',
                     'under-five deaths ':'Under 5 Deaths',
                     'Diphtheria ':'Diptheria',
                     ' HIV/AIDS':'HIV/AIDS',
                     ' thinness  1-19 years':'Thinness 1-19 years',
                     ' thinness 5-9 years':'Thinness 5-9 years',
                     'Income composition of resources':'Income Comp'},
            inplace=True)

In [4]:
def replace(group, stds):
    """Give a column `group`, find all values that are more than 3 standard 
    deviations from the mean, and convert those to NaN."""
    group[np.abs(group - group.mean()) > stds * group.std()] = np.nan
    return group

In [5]:
def replace_false_values(df):
    """Replaces false life expectancy values for specific countries/years with the true value (per World Bank) """

    cleaner_life = df.copy()
    
    idx = np.where((cleaner_life['Country'] == 'Nicaragua') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.3
    idx = np.where((cleaner_life['Country'] == 'Kuwait') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 73.1
    idx = np.where((cleaner_life['Country'] == 'Ghana') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 57
    idx = np.where((cleaner_life['Country'] == 'Zimbabwe') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 59.5
    idx = np.where((cleaner_life['Country'] == 'Zambia') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 49.8
    idx = np.where((cleaner_life['Country'] == 'Yemen') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.7
    idx = np.where((cleaner_life['Country'] == 'Vanuatu') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69
    idx = np.where((cleaner_life['Country'] == 'Vanuatu') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 68.8
    idx = np.where((cleaner_life['Country'] == 'Vanuatu') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 68.7
    idx = np.where((cleaner_life['Country'] == 'United Republic of Tanzia') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 62.3
    idx = np.where((cleaner_life['Country'] == 'United Republic of Tanzia') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 52.9
    idx = np.where((cleaner_life['Country'] == 'United Kingdom of Great Britain and Northern Ireland') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'United Kingdom of Great Britain and Northern Ireland') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.9
    idx = np.where((cleaner_life['Country'] == 'United Kingdom of Great Britain and Northern Ireland') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Ukraine') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.2
    idx = np.where((cleaner_life['Country'] == 'Ukraine') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.9
    idx = np.where((cleaner_life['Country'] == 'Ukraine') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.8
    idx = np.where((cleaner_life['Country'] == 'Uganda') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 59.9
    idx = np.where((cleaner_life['Country'] == 'Turkey') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Turkey') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70
    idx = np.where((cleaner_life['Country'] == 'Trinidad and Tobago Republic') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 72.4
    idx = np.where((cleaner_life['Country'] == 'Trinidad and Tobago Republic') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 72.2
    idx = np.where((cleaner_life['Country'] == 'Trinidad and Tobago Republic') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 72
    idx = np.where((cleaner_life['Country'] == 'Syrian Arab Republic') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.9
    idx = np.where((cleaner_life['Country'] == 'Syrian Arab Republic') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.8
    idx = np.where((cleaner_life['Country'] == 'Syrian Arab Republic') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.1
    idx = np.where((cleaner_life['Country'] == 'Syrian Arab Republic') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Switzerland') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Switzerland') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.4
    idx = np.where((cleaner_life['Country'] == 'Switzerland') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.2
    idx = np.where((cleaner_life['Country'] == 'Sweden') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.1
    idx = np.where((cleaner_life['Country'] == 'Sweden') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.9
    idx = np.where((cleaner_life['Country'] == 'Sweden') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Sweden') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Sweden') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Sweden') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.1
    idx = np.where((cleaner_life['Country'] == 'Suriname') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.7
    idx = np.where((cleaner_life['Country'] == 'Suriname') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.5
    idx = np.where((cleaner_life['Country'] == 'Sudan') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.5
    idx = np.where((cleaner_life['Country'] == 'Sri Lanka') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 74.3
    idx = np.where((cleaner_life['Country'] == 'Spain') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.9
    idx = np.where((cleaner_life['Country'] == 'Spain') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Spain') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'South Sudan') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.6
    idx = np.where((cleaner_life['Country'] == 'South Sudan') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.1
    idx = np.where((cleaner_life['Country'] == 'South Sudan') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 49.2
    idx = np.where((cleaner_life['Country'] == 'South Africa') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 61.1
    idx = np.where((cleaner_life['Country'] == 'Somalia') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 51.5
    idx = np.where((cleaner_life['Country'] == 'Somalia') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 51.2
    idx = np.where((cleaner_life['Country'] == 'Somalia') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.9
    idx = np.where((cleaner_life['Country'] == 'Slovenia') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Slovenia') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.1
    idx = np.where((cleaner_life['Country'] == 'Slovenia') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.3
    idx = np.where((cleaner_life['Country'] == 'Slovenia') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.1
    idx = np.where((cleaner_life['Country'] == 'Singapore') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.1
    idx = np.where((cleaner_life['Country'] == 'Singapore') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80
    idx = np.where((cleaner_life['Country'] == 'Sierra Leone') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 52.4
    idx = np.where((cleaner_life['Country'] == 'Sierra Leone') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 51.7
    idx = np.where((cleaner_life['Country'] == 'Sierra Leone') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 41.4
    idx = np.where((cleaner_life['Country'] == 'Senegal') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.6
    idx = np.where((cleaner_life['Country'] == 'Samoa') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.4
    idx = np.where((cleaner_life['Country'] == 'Samoa') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.8
    idx = np.where((cleaner_life['Country'] == 'Samoa') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.5
    idx = np.where((cleaner_life['Country'] == 'Samoa') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.1
    idx = np.where((cleaner_life['Country'] == 'Saint Vincent and the Grenadines') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71
    idx = np.where((cleaner_life['Country'] == 'Saint Vincent and the Grenadines') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.9
    idx = np.where((cleaner_life['Country'] == 'Saint Vincent and the Grenadines') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.8
    idx = np.where((cleaner_life['Country'] == 'Rwanda') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 51
    idx = np.where((cleaner_life['Country'] == 'Rwanda') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.6
    idx = np.where((cleaner_life['Country'] == 'Russian Federation') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.7
    idx = np.where((cleaner_life['Country'] == 'Russian Federation') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.2
    idx = np.where((cleaner_life['Country'] == 'Romania') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71
    idx = np.where((cleaner_life['Country'] == 'Romania') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.2
    idx = np.where((cleaner_life['Country'] == 'Romania') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.2
    idx = np.where((cleaner_life['Country'] == 'Republic of Moldova') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Republic of Moldova') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.1
    idx = np.where((cleaner_life['Country'] == 'Republic of Korea') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.1
    idx = np.where((cleaner_life['Country'] == 'Republic of Korea') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80
    idx = np.where((cleaner_life['Country'] == 'Republic of Korea') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 79.5
    idx = np.where((cleaner_life['Country'] == 'Cuba') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 76.7
    idx = np.where((cleaner_life['Country'] == 'Chile') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 79.5
    idx = np.where((cleaner_life['Country'] == 'Chile') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 79.7
    idx = np.where((cleaner_life['Country'] == 'Canada') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.3
    idx = np.where((cleaner_life['Country'] == 'Portugal') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Portugal') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.4
    idx = np.where((cleaner_life['Country'] == 'Portugal') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Portugal') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.1
    idx = np.where((cleaner_life['Country'] == 'Paraguay') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Papua New Guinea') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.8
    idx = np.where((cleaner_life['Country'] == 'Papua New Guinea') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.5
    idx = np.where((cleaner_life['Country'] == 'Norway') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Norway') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Norway') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.4
    idx = np.where((cleaner_life['Country'] == 'Norway') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.3
    idx = np.where((cleaner_life['Country'] == 'Nigeria') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 49.9
    idx = np.where((cleaner_life['Country'] == 'Nigeria') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 49.4
    idx = np.where((cleaner_life['Country'] == 'Niger') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.6
    idx = np.where((cleaner_life['Country'] == 'Niger') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 59.5
    idx = np.where((cleaner_life['Country'] == 'New Zealand') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.9
    idx = np.where((cleaner_life['Country'] == 'New Zealand') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'New Zealand') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Netherlands') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.1
    idx = np.where((cleaner_life['Country'] == 'Netherlands') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.3
    idx = np.where((cleaner_life['Country'] == 'Netherlands') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Netherlands') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Myanmar') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 62.7
    idx = np.where((cleaner_life['Country'] == 'Mozambique') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.1
    idx = np.where((cleaner_life['Country'] == 'Mozambique') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.4
    idx = np.where((cleaner_life['Country'] == 'Morocco') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.8
    idx = np.where((cleaner_life['Country'] == 'Mauritania') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 61.2
    idx = np.where((cleaner_life['Country'] == 'Mauritania') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 61.4
    idx = np.where((cleaner_life['Country'] == 'Mauritania') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 61.6
    idx = np.where((cleaner_life['Country'] == 'Malta') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.2
    idx = np.where((cleaner_life['Country'] == 'Malta') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.4
    idx = np.where((cleaner_life['Country'] == 'Malta') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Mali') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 48.8
    idx = np.where((cleaner_life['Country'] == 'Maldives') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71
    idx = np.where((cleaner_life['Country'] == 'Madagascar') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 61.2
    idx = np.where((cleaner_life['Country'] == 'Madagascar') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.8
    idx = np.where((cleaner_life['Country'] == 'Luxembourg') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Luxembourg') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Luxembourg') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Lithuania') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.1
    idx = np.where((cleaner_life['Country'] == 'Lithuania') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.3
    idx = np.where((cleaner_life['Country'] == 'Libya') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.9
    idx = np.where((cleaner_life['Country'] == 'Liberia') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.7
    idx = np.where((cleaner_life['Country'] == 'Latvia') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71
    idx = np.where((cleaner_life['Country'] == 'Latvia') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.9
    idx = np.where((cleaner_life['Country'] == 'Latvia') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.4
    idx = np.where((cleaner_life['Country'] == 'Latvia') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.3
    idx = np.where((cleaner_life['Country'] == 'Kyrgyzstan') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.4
    idx = np.where((cleaner_life['Country'] == 'Kyrgyzstan') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.2
    idx = np.where((cleaner_life['Country'] == 'Italy') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Italy') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Israel') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Israel') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Ireland') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.9
    idx = np.where((cleaner_life['Country'] == 'Ireland') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Ireland') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Iran') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.3
    idx = np.where((cleaner_life['Country'] == 'Iran') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Iran') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.2
    idx = np.where((cleaner_life['Country'] == 'Indonesia') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 67
    idx = np.where((cleaner_life['Country'] == 'Iceland') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Iceland') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Iceland') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Haiti') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.5
    idx = np.where((cleaner_life['Country'] == 'Guatemala') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.5
    idx = np.where((cleaner_life['Country'] == 'Guatemala') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.1
    idx = np.where((cleaner_life['Country'] == 'Guatemala') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.4
    idx = np.where((cleaner_life['Country'] == 'Guatemala') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.8
    idx = np.where((cleaner_life['Country'] == 'Grenada') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 72.9
    idx = np.where((cleaner_life['Country'] == 'Grenada') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 72.8
    idx = np.where((cleaner_life['Country'] == 'Grenada') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 72.7
    idx = np.where((cleaner_life['Country'] == 'Greece') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.4
    idx = np.where((cleaner_life['Country'] == 'Greece') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Greece') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Greece') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.3
    idx = np.where((cleaner_life['Country'] == 'Greece') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.4
    idx = np.where((cleaner_life['Country'] == 'Ghana') & (cleaner_life['Year'] == 2010))
    cleaner_life.at[idx[0],'Life Expectancy'] = 61
    idx = np.where((cleaner_life['Country'] == 'Ghana') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.6
    idx = np.where((cleaner_life['Country'] == 'Ghana') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.2
    idx = np.where((cleaner_life['Country'] == 'Germany') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.4
    idx = np.where((cleaner_life['Country'] == 'Germany') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Germany') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Germany') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.1
    idx = np.where((cleaner_life['Country'] == 'Gambia') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.7
    idx = np.where((cleaner_life['Country'] == 'Gambia') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.4
    idx = np.where((cleaner_life['Country'] == 'Gabon') & (cleaner_life['Year'] == 2005))
    cleaner_life.at[idx[0],'Life Expectancy'] = 58.2
    idx = np.where((cleaner_life['Country'] == 'France') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.2
    idx = np.where((cleaner_life['Country'] == 'France') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.1
    idx = np.where((cleaner_life['Country'] == 'France') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.8
    idx = np.where((cleaner_life['Country'] == 'Finland') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Finland') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Finland') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Finland') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.2
    idx = np.where((cleaner_life['Country'] == 'Ethiopia') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.6
    idx = np.where((cleaner_life['Country'] == 'Estonia') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.3
    idx = np.where((cleaner_life['Country'] == 'Estonia') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.4
    idx = np.where((cleaner_life['Country'] == 'El Salvador') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.3
    idx = np.where((cleaner_life['Country'] == 'El Salvador') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.4
    idx = np.where((cleaner_life['Country'] == 'Egypt') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.9
    idx = np.where((cleaner_life['Country'] == 'Egypt') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.1
    idx = np.where((cleaner_life['Country'] == 'Egypt') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 71.3
    idx = np.where((cleaner_life['Country'] == 'Djibouti') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 59.6
    idx = np.where((cleaner_life['Country'] == 'Denmark') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Denmark') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Cyprus') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80
    idx = np.where((cleaner_life['Country'] == 'Cyprus') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.2
    idx = np.where((cleaner_life['Country'] == 'Cyprus') & (cleaner_life['Year'] == 2015))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.3
    idx = np.where((cleaner_life['Country'] == 'Congo') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 56.4
    idx = np.where((cleaner_life['Country'] == 'Comoros') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.4
    idx = np.where((cleaner_life['Country'] == 'Comoros') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 60.7
    idx = np.where((cleaner_life['Country'] == 'Chad') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.4
    idx = np.where((cleaner_life['Country'] == 'Central African Republic') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.1
    idx = np.where((cleaner_life['Country'] == 'Cabo Verde') & (cleaner_life['Year'] == 2006))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Canada') & (cleaner_life['Year'] == 2007))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Canada') & (cleaner_life['Year'] == 2008))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.7
    idx = np.where((cleaner_life['Country'] == 'Cabo Verde') & (cleaner_life['Year'] == 2002))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.6
    idx = np.where((cleaner_life['Country'] == 'Cabo Verde') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 69.1
    idx = np.where((cleaner_life['Country'] == 'Australia') & (cleaner_life['Year'] == 2004))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.5
    idx = np.where((cleaner_life['Country'] == 'Australia') & (cleaner_life['Year'] == 2003))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.2
    idx = np.where((cleaner_life['Country'] == 'Austria') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Austria') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81
    idx = np.where((cleaner_life['Country'] == 'Azerbaijan') & (cleaner_life['Year'] == 2009))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Bangladesh') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.6
    idx = np.where((cleaner_life['Country'] == 'Bangladesh') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.3
    idx = np.where((cleaner_life['Country'] == 'Belgium') & (cleaner_life['Year'] == 2014))
    cleaner_life.at[idx[0],'Life Expectancy'] = 81.3
    idx = np.where((cleaner_life['Country'] == 'Belgium') & (cleaner_life['Year'] == 2013))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Belgium') & (cleaner_life['Year'] == 2012))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.4
    idx = np.where((cleaner_life['Country'] == 'Belgium') & (cleaner_life['Year'] == 2011))
    cleaner_life.at[idx[0],'Life Expectancy'] = 80.6
    idx = np.where((cleaner_life['Country'] == 'Brazil') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 70.1
    idx = np.where((cleaner_life['Country'] == 'Burkina Faso') & (cleaner_life['Year'] == 2001))
    cleaner_life.at[idx[0],'Life Expectancy'] = 50.9
    idx = np.where((cleaner_life['Country'] == 'Burundi') & (cleaner_life['Year'] == 2000))
    cleaner_life.at[idx[0],'Life Expectancy'] = 49.1
    cleaner_life.at[736,'Life Expectancy'] = 50
    cleaner_life.at[865,'Life Expectancy'] = 55.3
    cleaner_life.at[857,'Life Expectancy'] = 61
    cleaner_life.at[913,'Life Expectancy'] = 65.7
    cleaner_life.at[993,'Life Expectancy'] = 69.9

    return cleaner_life

In [6]:
def clean_na(df):
    col = df.columns
    col = col.drop('Country')
    # get unique countries
    countries = df.Country.unique()
    for country in countries:
        cur_country = df[df.Country==country]
        ind = cur_country.index
        for c in col:
            na_ind = cur_country[cur_country[c].isna()].index
            # proceed with calculations if there are any na values
            if len(na_ind) > 0:
                value_ind = [i for i in ind if i not in na_ind]
                for i in na_ind:
                    # if there is a value in that country in the year before and the year after
                    # fill it with the average between those two
                    if i-1 in value_ind and i+1 in value_ind:
                        df.at[i,c] = np.mean(df.loc[[i-1,i+1],c])
                        # if it is the first or last year, or surrounded by na, just take average of whole column
                    else:
                        df.at[i,c] = np.mean(df.loc[value_ind,c])
    return df

In [7]:
def clean_na_col(df):
    col = df.columns
    col = col.drop('Country')
    # get unique countries
    countries = df.Country.unique()
    for country in countries:
        cur_country = df[df.Country==country]
        ind = cur_country.index
        for c in col:
            na_ind = cur_country[cur_country[c].isna()].index
            # proceed with calculations if there are any na values
            if len(na_ind) > 0:
                value_ind = [i for i in ind if i not in na_ind]
                if len(value_ind) == 0:
                    # developing or developed
                    status = list(cur_country.Status)[0]
                    same_status = df[df.Status==status]
                    # get each year
                    for i in na_ind:
                        year = df.loc[i].Year
                        # find average for respective status and year
                        year_status = same_status[same_status["Year"] == year]
                        df.at[i,c] = np.mean(year_status[c])
    return df

In [8]:
# df is your DataFrame
col = life.columns.drop('Country')
col = col.drop('Status')
life.at[:, col] = life.groupby('Country').transform(lambda g: replace(g, 3))

life = clean_na(life)
life = clean_na_col(life)

# replace false values
life = replace_false_values(life)
life = pd.get_dummies(life, columns=['Status','Country'], drop_first=True)

X = life.drop(columns=['Life Expectancy'])
y = life['Life Expectancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [9]:
X.head()

Unnamed: 0,Year,Adult Mortality,Infant Deaths,Alcohol,Percentage Expenditure,Hepatitis B,Measles,BMI,Under 5 Deaths,Polio,...,Country_United Republic of Tanzania,Country_United States of America,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela (Bolivarian Republic of),Country_Viet Nam,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,2015,263.0,62.0,0.01,71.279624,65.0,1154.0,19.1,83.0,6.0,...,0,0,0,0,0,0,0,0,0,0
1,2014,271.0,64.0,0.01,73.523582,62.0,492.0,18.6,86.0,58.0,...,0,0,0,0,0,0,0,0,0,0
2,2013,268.0,66.0,0.01,73.219243,64.0,430.0,18.1,89.0,62.0,...,0,0,0,0,0,0,0,0,0,0
3,2012,272.0,69.0,0.01,78.184215,67.0,2787.0,17.6,93.0,67.0,...,0,0,0,0,0,0,0,0,0,0
4,2011,275.0,71.0,0.01,7.097109,68.0,3013.0,17.2,97.0,68.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#std dev
def my_score(y_test,y_pred,sd=9.3):
    diff = np.abs(y_pred-y_test)
    true = diff[diff<sd/2]
    res = len(true)/len(y_test)
    return res

mses = []
maes = []
accs = []
scores = []
start = time.time()

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    model = GradientBoostingRegressor(n_estimators=250,
                                    learning_rate=0.02,
                                    min_samples_split = 3,
                                    max_depth = 3,
                                    warm_start=False).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    acc = my_score(y_test, y_pred)
    score = model.score(X_test, y_test)
    mses.append(mse)
    maes.append(mae)
    accs.append(acc)
    scores.append(score)
    # Check similar attribute importances
    # feature_importance = model.feature_importances_
    # indices = np.argsort(feature_importance)[::-1]
    # print(X.columns[indices][:15])
end = time.time()
print("Time: " + str(end - start))
#print(mses)
#print(maes)
#print(accs)
#print(scores)

Time: 24.091590642929077


In [12]:
print(sum(mses)/len(mses))
print(sum(maes)/len(maes))
print(sum(accs)/len(accs))
print(sum(scores)/len(scores))


3.4585671296108815
1.3983584397445141
0.9786848072562357
0.9606520487607801


In [13]:
feature_importance = model.feature_importances_
print(feature_importance[:23])

[4.41751742e-03 1.29086429e-01 6.06112940e-03 2.17602204e-03
 6.65340893e-04 0.00000000e+00 1.01585193e-04 1.87669558e-02
 2.37096825e-02 4.17477469e-03 7.33894381e-04 8.01613204e-03
 4.90910991e-01 4.05165451e-04 1.78255671e-04 9.43040359e-04
 7.82466483e-03 2.79372516e-01 1.08788858e-02 0.00000000e+00
 0.00000000e+00 0.00000000e+00 3.81957457e-04]


In [14]:
indices = np.argsort(feature_importance)[::-1]
print(indices)
print(X.columns[indices][:20])

[ 12  17   1   8   7  18  11  16   2 156   0   9 171   3  15  10   4  76
  60  13  22 159 135 211  92  14   6  63  28 140  91  58  39  50 186 107
 176  77 121  56  74 148 173  64  62  61 193  53  59  52  57  51  65  55
  54 187  70  66  67  86  85  84  83  82  81  80  79  78 192  75  73  72
  71  48  69  68  49  44  47  46  21  20  19 195 196 197 198 199 200 201
 202 203 204 205   5 206 207 208 209 194  23  24  36  45  88  43  42  41
  40  38  37  35  25  34  33  32  31  30  29  27  26  87  93  89 139 161
 160 189 158 157 190 155 154 153 152 151 150 149 147 146 145 144 143 142
 162 163 164 177 185 184 183 182 181 180 179 178 175 165 174 172 188 170
 169 168 167 166 141 138  90 137 113 112 111 110 109 108 106 210 104 103
 102 101 100  99  98  97  96  95  94 114 115 116 128 136 191 134 133 132
 131 130 129 127 117 126 125 124 123 122 120 119 118 105]
Index(['HIV/AIDS', 'Income Comp', 'Adult Mortality', 'Under 5 Deaths', 'BMI',
       'Schooling', 'Diptheria', 'Thinness 5-9 years', 'Infan

In [35]:
def evaluate_model(model, param_grid, X_train, X_test, y_train, y_test):
    # Grid Search
    model_gs = GridSearchCV(model, param_grid, n_jobs=-1)
    model_gs.fit(X_train, y_train)

    # Results
    model_gs_score = model_gs.score(X_test, y_test)
    model_gs_best = model_gs.best_estimator_
    model_gs_best_params = model_gs.best_params_
    model_gs_best_index = model_gs.best_index_

    # Calculate error of test set
    model_predictions = model_gs_best.predict(X_test)
    model_mse = mean_squared_error(y_test, model_predictions)
    model_mae = mean_absolute_error(y_test, model_predictions)

    return model_gs, model_gs_score, model_gs_best, model_gs_best_params, model_gs_best_index, model_mse, model_mae

In [36]:
# Lasso Regression

param_grid = {"normalize": [True, False],
              "alpha": [10**i for i in range(-8, 4)]}

# Model
lasso_reg = Lasso()

# Results
res = evaluate_model(lasso_reg, param_grid, X_train, X_test, y_train, y_test)
lasso_reg_gs = res[0]
lasso_reg_gs_best = res[1]
lasso_reg_gs_best_params = res[2]
lasso_reg_gs_best_index = res[3]
lasso_reg_mse = res[4]
lasso_reg_mae = res[5]

print()

print("MSE: " + str(lasso_reg_mse))
print("MAE: " + str(lasso_reg_mae))



MSE: 9
MAE: 1.8543627601606938


  model = cd_fast.enet_coordinate_descent(


In [37]:
print(lasso_reg_gs_best_params)

Lasso(alpha=0.0001)


In [58]:
lasso_reg = Lasso(alpha=.0001).fit(X_train, y_train)
coeffs = lasso_reg.coef_
print(coeffs[:23])
indices = [i for i, coef in enumerate(coeffs) if abs(coef) < 10e-4]
print(indices)
print(X.columns[indices])
print(len(X.columns[indices]))

[ 2.43731435e-01 -2.24106498e-03  2.65414204e-02 -5.05325199e-02
  5.04659139e-05 -2.61281279e-03 -7.78485005e-06  2.02871207e-03
 -2.69024674e-02  3.68887575e-03 -1.05480634e-02  9.11912510e-03
 -3.16023089e-01 -5.57393954e-06  2.13177451e-09 -4.60154610e-02
  5.91171923e-03 -1.26769950e-01  2.09372952e-01 -1.56110475e+01
  1.19989417e+01  1.03341912e+01 -9.65944035e+00]
[4, 6, 13, 14, 83, 161]
Index(['Percentage Expenditure', 'Measles', 'GDP', 'Population',
       'Country_Germany', 'Country_Saint Kitts and Nevis'],
      dtype='object')
6


  model = cd_fast.enet_coordinate_descent(


In [62]:
mses = []
maes = []
accs = []
scores = []
start = time.time()

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    acc = my_score(y_test, y_pred)
    score = model.score(X_test, y_test)
    mses.append(mse)
    maes.append(mae)
    accs.append(acc)
    scores.append(score)
end = time.time()
print("Time: " + str(end - start))

Time: 0.3089942932128906


In [63]:
print(sum(mses)/len(mses))
print(sum(maes)/len(maes))
print(sum(accs)/len(accs))
print(sum(scores)/len(scores))


1.74720533436526
0.7920394801033187
0.9852607709750565
0.9797972700498045


In [64]:
mses = []
maes = []
accs = []
scores = []
start = time.time()

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    model = Lasso(alpha=.0001).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    acc = my_score(y_test, y_pred)
    score = model.score(X_test, y_test)
    mses.append(mse)
    maes.append(mae)
    accs.append(acc)
    scores.append(score)
end = time.time()
print("Time: " + str(end - start))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Time: 2.816836357116699


  model = cd_fast.enet_coordinate_descent(


In [65]:
print(sum(mses)/len(mses))
print(sum(maes)/len(maes))
print(sum(accs)/len(accs))
print(sum(scores)/len(scores))

1.5302148860015432
0.7932605253286853
0.9878684807256235
0.9821635084473705


In [67]:
mses = []
maes = []
accs = []
scores = []
start = time.time()

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    model = Ridge(alpha=.0001).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    acc = my_score(y_test, y_pred)
    score = model.score(X_test, y_test)
    mses.append(mse)
    maes.append(mae)
    accs.append(acc)
    scores.append(score)
end = time.time()
print("Time: " + str(end - start))

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


Time: 0.292999267578125


  return linalg.solve(A, Xy, sym_pos=True,


In [68]:
print(sum(mses)/len(mses))
print(sum(maes)/len(maes))
print(sum(accs)/len(accs))
print(sum(scores)/len(scores))

1.674573063694769
0.786071334693432
0.9877551020408163
0.9808159705963518


In [71]:
mses = []
maes = []
accs = []
scores = []
start = time.time()

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    model = ElasticNet().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    acc = my_score(y_test, y_pred)
    score = model.score(X_test, y_test)
    mses.append(mse)
    maes.append(mae)
    accs.append(acc)
    scores.append(score)
end = time.time()
print("Time: " + str(end - start))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Time: 1.6101031303405762


In [72]:
print(sum(mses)/len(mses))
print(sum(maes)/len(maes))
print(sum(accs)/len(accs))
print(sum(scores)/len(scores))

14.409823467227838
2.8223553892526048
0.8319727891156463
0.8327974124507845


In [76]:
# Logistic Regression
def evaluate_model_bins(model, param_grid, X_train, X_test, y_train, y_test):
    # Grid Search
    model_gs = GridSearchCV(model, param_grid, n_jobs=-1)
    model_gs.fit(X_train, y_train)

    # Results
    model_gs_score = model_gs.score(X_test, y_test)
    model_gs_best = model_gs.best_estimator_
    model_gs_best_params = model_gs.best_params_
    model_gs_best_index = model_gs.best_index_

    # Calculate error of test set
    model_predictions = model_gs_best.predict(X_test)
    model_mse = mean_squared_error(y_test, model_predictions)
    model_mae = mean_absolute_error(y_test, model_predictions)

    def my_score(y_test,y_pred,sd=8.796834):
        diff = np.abs(y_pred-y_test)
        true = diff[diff<sd/2]
        return len(true)/len(y_test)

    accuracy = my_score(y_test, model_predictions)

    return model_gs, accuracy, model_gs_score, model_gs_best, model_gs_best_params, model_gs_best_index, model_gs_best_index, model_mse, model_mae

life['age bin'] = 40.
ages = [45.,50.,55.,60.,65.,70.,75.,80.,85.]
for age in ages:
    mask = life['Life Expectancy'] >= age
    life.loc[mask,'age bin'] = age
y_bins = life['age bin']

X_train_bins, X_test_bins, y_train_bins, y_test_bins = train_test_split(life.drop(columns=['Life Expectancy','age bin']), y_bins,test_size=.3)

param_grid = {"penalty": ['none','l2'],
              "solver":['newton-cg','lbfgs'],
              "C":[1,.1],
              "multi_class": ['ovr','multinomial']
              }

# Model
log_reg = LogisticRegression(max_iter=1000)


log_gs = GridSearchCV(log_reg, param_grid, n_jobs=-1)
log_gs.fit(X_train_bins, y_train_bins)
log_best = log_gs.best_estimator_

print(log_gs.best_params_)

MSEs = []
MAEs = []
scores = []
for _ in range(10):
  X_train_bins, X_test_bins, y_train_bins, y_test_bins = train_test_split(life.drop(columns=['Life Expectancy','age bin']), y_bins,test_size=.3)
  log_best.fit(X_train_bins, y_train_bins)
  score = log_best.score(X_test_bins,y_test_bins)

  pred = log_best.predict(X_test_bins)

  mse = mean_squared_error(y_test_bins,pred)
  mae = mean_absolute_error(y_test_bins,pred)
  MSEs.append(mse)
  MAEs.append(mae)
  scores.append(score)

print(f'MSE: {np.mean(MSEs)}, MAE: {np.mean(MAEs)}, score: {np.mean(scores)}')



















































{'C': 1, 'multi_class': 'ovr', 'penalty': 'none', 'solver': 'newton-cg'}












































































































































































































































































































































































































































































































































MSE: 8.129251700680271, MAE: 1.3628117913832198, score: 0.7452380952380951


