In [47]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from keras.utils import np_utils
from keras.datasets import fashion_mnist
import time

In [22]:
# Load Fashion-MNIST and flatten every image into a single row of
# `input_dim` values (28*28 pixels per the constant below).
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
input_dim = 784 #28*28
X_train, X_test = X_train.reshape(60000, input_dim), X_test.reshape(10000, input_dim)
print(X_train.shape)
print(y_train.shape)

# Append the labels as a trailing column so that a single in-place row
# shuffle keeps every image paired with its label.
# NOTE(review): the shuffle is unseeded, so the sample (and every score
# computed from it) differs between runs -- consider seeding.
block = np.hstack((X_train, y_train[:, np.newaxis]))
np.random.shuffle(block)

sample_size = 1000

# First `sample_size` shuffled rows: all but the last column are pixels,
# the last column is the label.
X_train_sample, y_train_sample = block[:sample_size, :-1], block[:sample_size, -1]

print(y_train_sample.shape)

# Divide by 255 (presumably the maximum pixel intensity) to rescale features.
X_train_sample = X_train_sample/255
X_test = X_test/255

(60000, 784)
(60000,)
(1000,)


In [27]:
# Time the construction + fit of a gradient-boosting classifier on the
# training sample, then report its score on the held-out test set.
start = time.time()
model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.02,
    min_samples_split=3,
)
model = model.fit(X_train_sample, y_train_sample)  # fit() returns the estimator itself
end = time.time()
print(f"Time to train: {end - start}")
print("Fitted")
score = model.score(X_test, y_test)
print(score)

Time to train: 37.33916091918945
Fitted
0.7408


In [30]:
# Time the construction + fit of a random-forest classifier on the
# training sample, then report its score on the held-out test set.
start = time.time()
model = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=3,
)
model = model.fit(X_train_sample, y_train_sample)  # fit() returns the estimator itself
end = time.time()
print(f"Time to train: {end - start}")
print("Fitted")
score = model.score(X_test, y_test)
print(score)

Time to train: 0.34105491638183594
Fitted
0.7993


In [34]:
# Data cleaning code
# Read the raw table and rename columns in one step. Several raw headers
# carry stray leading/trailing (or doubled) spaces, so the keys below must
# match the file byte-for-byte.
# NOTE(review): the target name 'Diptheria' is kept as spelled here because
# later cells may refer to it.
life = pd.read_csv('Life_Expectancy_Data.csv').rename(
    columns={
        'Life expectancy ': 'Life Expectancy',
        'infant deaths': 'Infant Deaths',
        'percentage expenditure': 'Percentage Expenditure',
        'Measles ': 'Measles',
        ' BMI ': 'BMI',
        'under-five deaths ': 'Under 5 Deaths',
        'Diphtheria ': 'Diptheria',
        ' HIV/AIDS': 'HIV/AIDS',
        ' thinness  1-19 years': 'Thinness 1-19 years',
        ' thinness 5-9 years': 'Thinness 5-9 years',
        'Income composition of resources': 'Income Comp',
    }
)

In [82]:
def replace(group, stds):
    """Blank out outliers in `group` by overwriting them with NaN.

    A value counts as an outlier when its absolute distance from the mean of
    `group` exceeds `stds` times the standard deviation of `group`.

    Parameters
    ----------
    group : array-like exposing ``.mean()`` / ``.std()`` and boolean-mask
        assignment (e.g. a pandas Series). It is modified IN PLACE.
    stds : number
        How many standard deviations a value may lie from the mean before it
        is replaced.

    Returns
    -------
    The same `group` object that was passed in, after modification.
    """
    distance_from_mean = np.abs(group - group.mean())
    cutoff = stds * group.std()
    group[distance_from_mean > cutoff] = np.nan
    return group

In [83]:
# (country, year, corrected life expectancy) triples applied by
# `replace_false_values`, in the order the original notebook listed them.
# NOTE(review): a triple whose country name matches no row is silently
# skipped. A few names look unusual ('United Republic of Tanzia',
# 'Trinidad and Tobago Republic', 'Iran') -- verify them against the
# spellings actually used in the dataset's 'Country' column.
# NOTE(review): the original code listed ('Spain', 2006, 80.8) twice; it is
# kept once here. Possibly one copy was meant for a different year -- confirm.
_LIFE_EXPECTANCY_CORRECTIONS = (
    ('Nicaragua', 2002, 70.3),
    ('Kuwait', 2000, 73.1),
    ('Ghana', 2000, 57),
    ('Zimbabwe', 2015, 59.5),
    ('Zambia', 2006, 49.8),
    ('Yemen', 2000, 60.7),
    ('Vanuatu', 2009, 69),
    ('Vanuatu', 2008, 68.8),
    ('Vanuatu', 2007, 68.7),
    ('United Republic of Tanzia', 2014, 62.3),
    ('United Republic of Tanzia', 2003, 52.9),
    ('United Kingdom of Great Britain and Northern Ireland', 2013, 81),
    ('United Kingdom of Great Britain and Northern Ireland', 2012, 80.9),
    ('United Kingdom of Great Britain and Northern Ireland', 2011, 81),
    ('Ukraine', 2014, 71.2),
    ('Ukraine', 2012, 70.9),
    ('Ukraine', 2011, 70.8),
    ('Uganda', 2013, 59.9),
    ('Turkey', 2001, 70.6),
    ('Turkey', 2000, 70),
    ('Trinidad and Tobago Republic', 2012, 72.4),
    ('Trinidad and Tobago Republic', 2011, 72.2),
    ('Trinidad and Tobago Republic', 2010, 72),
    ('Syrian Arab Republic', 2015, 69.9),
    ('Syrian Arab Republic', 2014, 69.8),
    ('Syrian Arab Republic', 2013, 70.1),
    ('Syrian Arab Republic', 2012, 70.6),
    ('Switzerland', 2003, 80.5),
    ('Switzerland', 2002, 80.4),
    ('Switzerland', 2001, 80.2),
    ('Sweden', 2008, 81.1),
    ('Sweden', 2007, 80.9),
    ('Sweden', 2006, 80.8),
    ('Sweden', 2005, 80.6),
    ('Sweden', 2004, 80.5),
    ('Sweden', 2003, 80.1),
    ('Suriname', 2011, 70.7),
    ('Suriname', 2010, 70.5),
    ('Sudan', 2005, 60.5),
    ('Sri Lanka', 2004, 74.3),
    ('Spain', 2007, 80.9),
    ('Spain', 2006, 80.8),
    ('South Sudan', 2003, 50.6),
    ('South Sudan', 2002, 50.1),
    ('South Sudan', 2000, 49.2),
    ('South Africa', 2013, 61.1),
    ('Somalia', 2002, 51.5),
    ('Somalia', 2001, 51.2),
    ('Somalia', 2000, 50.9),
    ('Slovenia', 2015, 80.8),
    ('Slovenia', 2014, 81.1),
    ('Slovenia', 2013, 80.3),
    ('Slovenia', 2012, 80.1),
    ('Singapore', 2006, 80.1),
    ('Singapore', 2005, 80),
    ('Sierra Leone', 2014, 52.4),
    ('Sierra Leone', 2013, 51.7),
    ('Sierra Leone', 2002, 41.4),
    ('Senegal', 2005, 60.6),
    ('Samoa', 2009, 71.4),
    ('Samoa', 2003, 69.8),
    ('Samoa', 2002, 69.5),
    ('Samoa', 2001, 69.1),
    ('Saint Vincent and the Grenadines', 2002, 71),
    ('Saint Vincent and the Grenadines', 2001, 70.9),
    ('Saint Vincent and the Grenadines', 2000, 70.8),
    ('Rwanda', 2002, 51),
    ('Rwanda', 2008, 60.6),
    ('Russian Federation', 2014, 70.7),
    ('Russian Federation', 2015, 71.2),
    ('Romania', 2002, 71),
    ('Romania', 2001, 71.2),
    ('Romania', 2000, 71.2),
    ('Republic of Moldova', 2012, 70.6),
    ('Republic of Moldova', 2011, 70.1),
    ('Republic of Korea', 2010, 80.1),
    ('Republic of Korea', 2009, 80),
    ('Republic of Korea', 2008, 79.5),
    ('Cuba', 2000, 76.7),
    ('Chile', 2014, 79.5),
    ('Chile', 2015, 79.7),
    ('Canada', 2006, 80.3),
    ('Portugal', 2011, 80.5),
    ('Portugal', 2012, 80.4),
    ('Portugal', 2013, 80.7),
    ('Portugal', 2014, 81.1),
    ('Paraguay', 2000, 70.6),
    ('Papua New Guinea', 2006, 60.8),
    ('Papua New Guinea', 2005, 60.5),
    ('Norway', 2009, 80.8),
    ('Norway', 2008, 80.6),
    ('Norway', 2007, 80.4),
    ('Norway', 2006, 80.3),
    ('Nigeria', 2008, 49.9),
    ('Nigeria', 2007, 49.4),
    ('Niger', 2001, 50.6),
    ('Niger', 2013, 59.5),
    ('New Zealand', 2011, 80.9),
    ('New Zealand', 2010, 80.7),
    ('New Zealand', 2009, 80.7),
    ('Netherlands', 2007, 80.1),
    ('Netherlands', 2008, 80.3),
    ('Netherlands', 2009, 80.6),
    ('Netherlands', 2010, 80.7),
    ('Myanmar', 2008, 62.7),
    ('Mozambique', 2004, 50.1),
    ('Mozambique', 2005, 50.4),
    ('Morocco', 2005, 71.8),
    ('Mauritania', 2004, 61.2),
    ('Mauritania', 2005, 61.4),
    ('Mauritania', 2006, 61.6),
    ('Malta', 2009, 80.2),
    ('Malta', 2010, 81.4),
    ('Malta', 2011, 80.8),
    ('Mali', 2001, 48.8),
    ('Maldives', 2001, 71),
    ('Madagascar', 2005, 61.2),
    ('Madagascar', 2004, 60.8),
    ('Luxembourg', 2011, 81),
    ('Luxembourg', 2010, 80.6),
    ('Luxembourg', 2009, 80.6),
    ('Lithuania', 2006, 71.1),
    ('Lithuania', 2005, 71.3),
    ('Libya', 2000, 70.9),
    ('Liberia', 2012, 60.7),
    ('Latvia', 2007, 71),
    ('Latvia', 2006, 70.9),
    ('Latvia', 2005, 71.4),
    ('Latvia', 2003, 71.3),
    ('Kyrgyzstan', 2014, 70.4),
    ('Kyrgyzstan', 2013, 70.2),
    ('Italy', 2005, 80.8),
    ('Italy', 2004, 80.8),
    ('Israel', 2007, 80.5),
    ('Israel', 2006, 80.6),
    ('Ireland', 2012, 80.9),
    ('Ireland', 2011, 80.8),
    ('Ireland', 2010, 80.7),
    ('Iran', 2003, 71.3),
    ('Iran', 2001, 70.6),
    ('Iran', 2000, 70.2),
    ('Indonesia', 2004, 67),
    ('Iceland', 2004, 81),
    ('Iceland', 2003, 81),
    ('Iceland', 2002, 80.5),
    ('Haiti', 2010, 60.5),
    ('Guatemala', 2010, 71.5),
    ('Guatemala', 2009, 71.1),
    ('Guatemala', 2007, 70.4),
    ('Guatemala', 2008, 70.8),
    ('Grenada', 2002, 72.9),
    ('Grenada', 2001, 72.8),
    ('Grenada', 2000, 72.7),
    ('Greece', 2010, 80.4),
    ('Greece', 2011, 80.7),
    ('Greece', 2012, 80.6),
    ('Greece', 2013, 81.3),
    ('Greece', 2014, 81.4),
    ('Ghana', 2010, 61),
    ('Ghana', 2009, 60.6),
    ('Ghana', 2008, 60.2),
    ('Germany', 2011, 80.4),
    ('Germany', 2012, 80.5),
    ('Germany', 2013, 80.5),
    ('Germany', 2014, 81.1),
    ('Gambia', 2014, 60.7),
    ('Gambia', 2013, 60.4),
    ('Gabon', 2005, 58.2),
    ('France', 2008, 81.2),
    ('France', 2007, 81.1),
    ('France', 2006, 80.8),
    ('Finland', 2011, 80.5),
    ('Finland', 2012, 80.6),
    ('Finland', 2013, 81),
    ('Finland', 2014, 81.2),
    ('Ethiopia', 2009, 60.6),
    ('Estonia', 2001, 70.3),
    ('Estonia', 2000, 70.4),
    ('El Salvador', 2006, 70.3),
    ('El Salvador', 2002, 69.4),
    ('Egypt', 2013, 70.9),
    ('Egypt', 2014, 71.1),
    ('Egypt', 2015, 71.3),
    ('Djibouti', 2009, 59.6),
    ('Denmark', 2015, 80.7),
    ('Denmark', 2014, 80.7),
    ('Cyprus', 2013, 80),
    ('Cyprus', 2014, 80.2),
    ('Cyprus', 2015, 80.3),
    ('Congo', 2009, 56.4),
    ('Comoros', 2006, 60.4),
    ('Comoros', 2007, 60.7),
    ('Chad', 2009, 50.4),
    ('Central African Republic', 2014, 50.1),
    ('Cabo Verde', 2006, 70.6),
    ('Canada', 2007, 80.5),
    ('Canada', 2008, 80.7),
    ('Cabo Verde', 2002, 69.6),
    ('Cabo Verde', 2001, 69.1),
    ('Australia', 2004, 80.5),
    ('Australia', 2003, 80.2),
    ('Austria', 2012, 81),
    ('Austria', 2011, 81),
    ('Azerbaijan', 2009, 70.6),
    ('Bangladesh', 2012, 70.6),
    ('Bangladesh', 2011, 70.3),
    ('Belgium', 2014, 81.3),
    ('Belgium', 2013, 80.6),
    ('Belgium', 2012, 80.4),
    ('Belgium', 2011, 80.6),
    ('Brazil', 2000, 70.1),
    ('Burkina Faso', 2001, 50.9),
    ('Burundi', 2000, 49.1),
)

# (row label, corrected life expectancy) pairs that the original notebook
# addressed directly by index label instead of by country/year. They are
# applied after the country/year corrections, as in the original.
# NOTE(review): these labels only mean something for the specific CSV the
# notebook was written against -- confirm which rows they refer to.
_LIFE_EXPECTANCY_ROW_OVERRIDES = (
    (736, 50),
    (865, 55.3),
    (857, 61),
    (913, 65.7),
    (993, 69.9),
)


def replace_false_values(df):
    """Replace false life expectancy values for specific countries/years with
    the true value (per World Bank, according to the original author).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Country', 'Year' and 'Life Expectancy'.
        It is NOT modified; the corrections are applied to a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which every row matching a (country, year) entry of
        `_LIFE_EXPECTANCY_CORRECTIONS` has its 'Life Expectancy' overwritten,
        followed by the label-addressed `_LIFE_EXPECTANCY_ROW_OVERRIDES`.
        Entries that match no row are skipped.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    cleaner_life = df.copy()

    # These two columns are never written below, so look them up once.
    countries = cleaner_life['Country']
    years = cleaner_life['Year']

    for country, year, value in _LIFE_EXPECTANCY_CORRECTIONS:
        # Select rows with a boolean mask rather than np.where positions fed
        # to `.at`: `.at` is a scalar *label* accessor, so array keys are
        # rejected by current pandas, and positions only coincide with labels
        # on a default RangeIndex. `.to_numpy()` makes the mask purely
        # positional, so it is also safe on non-unique indexes.
        matches = ((countries == country) & (years == year)).to_numpy()
        cleaner_life.loc[matches, 'Life Expectancy'] = value

    for label, value in _LIFE_EXPECTANCY_ROW_OVERRIDES:
        # Assigning through `.at` to a missing label would append a new,
        # mostly-NaN row; only touch labels that actually exist.
        if label in cleaner_life.index:
            cleaner_life.at[label, 'Life Expectancy'] = value

    return cleaner_life

In [35]:
def clean_na(df):
    """Impute missing cells country by country, writing into ``df`` in place.

    Every column except 'Country' is scanned. A missing cell at index label
    ``i`` is filled with:

    * the mean of the cells at labels ``i-1`` and ``i+1`` when both labels
      belong to the same country and both hold a value in that column;
    * otherwise the mean of all values that country has in that column.

    When a country has no value at all in a column there is nothing to
    average, so those cells end up still missing.

    Neighbours are found by index-label arithmetic (``i-1`` / ``i+1``), so the
    frame is expected to carry integer labels with a country's rows adjacent.

    Returns the same ``df`` object that was passed in.
    """
    feature_cols = df.columns.drop('Country')
    for country in df.Country.unique():
        # Snapshot of this country's rows; missingness is judged on the snapshot.
        country_rows = df[df.Country == country]
        row_labels = country_rows.index
        for c in feature_cols:
            missing = country_rows[country_rows[c].isna()].index
            if len(missing) == 0:
                continue
            # Labels of this country that originally held a value in column c.
            present = [label for label in row_labels if label not in missing]
            for label in missing:
                before, after = label - 1, label + 1
                if before in present and after in present:
                    # Both adjacent rows have data: interpolate between them.
                    fill = np.mean(df.loc[[before, after], c])
                else:
                    # Edge row or neighbouring gap: fall back to the country's mean.
                    fill = np.mean(df.loc[present, c])
                df.at[label, c] = fill
    return df

In [36]:
def clean_na_col(df):
    """Fill columns that are missing for an entire country, in place.

    For each country and each column except 'Country', nothing happens unless
    every row of that country is missing in the column. In that case each row
    is filled with the mean of the column over all rows that share the
    country's ``Status`` and the row's ``Year``.

    The same-status rows are re-read from ``df`` each time a column is
    handled, so values filled for previously processed countries take part in
    later averages.

    Returns the same ``df`` object that was passed in.
    """
    feature_cols = df.columns.drop('Country')
    for country in df.Country.unique():
        country_rows = df[df.Country == country]
        for c in feature_cols:
            missing = country_rows[country_rows[c].isna()].index
            if len(missing) == 0:
                continue
            # Partially populated columns are not this function's job.
            if not all(label in missing for label in country_rows.index):
                continue
            # developing or developed
            status = list(country_rows.Status)[0]
            peers = df[df.Status == status]
            for label in missing:
                year = df.loc[label].Year
                # average over rows with the same status in the same year
                same_year_peers = peers[peers["Year"] == year]
                df.at[label, c] = np.mean(same_year_peers[c])
    return df

In [84]:
# Run the notebook's `replace` helper (defined in an earlier cell) over each
# country's columns; the literal 3 is passed straight through to it.
# NOTE(review): presumably an outlier threshold in standard deviations - confirm.
col = life.columns.drop('Country')
col = col.drop('Status')
# Block assignment must go through .loc: .at is the scalar accessor for a
# single row/column label pair, and newer pandas releases raise
# InvalidIndexError when it is handed a slice and an Index of columns.
life.loc[:, col] = life.groupby('Country').transform(lambda g: replace(g, 3))

# Fill the remaining gaps: first inside each country, then columns that are
# missing for a whole country.
life = clean_na(life)
life = clean_na_col(life)

# replace false values
life = replace_false_values(life)
# One-hot encode the categorical columns, dropping the first level of each.
life = pd.get_dummies(life, columns=['Status','Country'], drop_first=True)

# Features are every column except the regression target.
X = life.drop(columns=['Life Expectancy'])
y = life['Life Expectancy']

# No random_state is passed, so this 70/30 split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [85]:
# Preview the first rows of the one-hot-encoded feature matrix.
X.head()

Unnamed: 0,Year,Adult Mortality,Infant Deaths,Alcohol,Percentage Expenditure,Hepatitis B,Measles,BMI,Under 5 Deaths,Polio,...,Country_United Republic of Tanzania,Country_United States of America,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela (Bolivarian Republic of),Country_Viet Nam,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,2015,263.0,62.0,0.01,71.279624,65.0,1154.0,19.1,83.0,6.0,...,0,0,0,0,0,0,0,0,0,0
1,2014,271.0,64.0,0.01,73.523582,62.0,492.0,18.6,86.0,58.0,...,0,0,0,0,0,0,0,0,0,0
2,2013,268.0,66.0,0.01,73.219243,64.0,430.0,18.1,89.0,62.0,...,0,0,0,0,0,0,0,0,0,0
3,2012,272.0,69.0,0.01,78.184215,67.0,2787.0,17.6,93.0,67.0,...,0,0,0,0,0,0,0,0,0,0
4,2011,275.0,71.0,0.01,7.097109,68.0,3013.0,17.2,97.0,68.0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
# Sanity-check the split sizes before sampling.
print(X_train.shape)
print(y_train.shape)

# Append the target as a last column so a single shuffle keeps rows paired.
block = np.hstack((X_train, np.asarray(y_train).reshape(-1, 1)))
np.random.shuffle(block)

# Slicing past the end is harmless: when fewer than sample_size rows exist,
# the whole shuffled training set is used.
sample_size = 3000
X_train_sample, y_train_sample = block[:sample_size, :-1], block[:sample_size, -1]

print(X_train_sample.shape)

# Small gradient-boosting baseline fitted on the sampled rows.
model = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.02,
    min_samples_split=3,
)
model.fit(X_train_sample, y_train_sample)
print("Fitted")
score = model.score(X_test, y_test)
print(score)

(2056, 212)
(2056,)
(2056, 212)
Fitted
0.7648888034760681


In [87]:
# Predict once and reuse the result for both error metrics instead of
# running the model over the test set twice.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(mse)
print(mae)

20.605161363953563
3.5746921172347466


In [94]:
#std dev
def my_score(y_test,y_pred,sd=9.3):
    """Return the fraction of predictions within ``sd/2`` of the true value.

    Parameters
    ----------
    y_test : array-like
        True target values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.
    sd : float, default 9.3
        Tolerance scale; a prediction counts as a hit when its absolute error
        is strictly less than ``sd/2``.
        NOTE(review): the "#std dev" remark suggests 9.3 is the standard
        deviation of the target - confirm.

    Returns
    -------
    float
        Number of hits divided by ``len(y_test)``.

    Raises
    ------
    ZeroDivisionError
        If ``y_test`` is empty.
    """
    # Compare position by position, as the sklearn metrics do. Subtracting
    # pandas objects directly would align on index labels (turning mismatched
    # labels into NaN misses), and plain lists cannot be subtracted at all.
    truth = np.asarray(y_test)
    preds = np.asarray(y_pred)
    diff = np.abs(preds - truth)
    hits = np.count_nonzero(diff < sd/2)
    return hits/len(truth)

# Per-split metrics gathered over repeated random train/test splits.
mses, maes, accs, scores = [], [], [], []
start = time.time()

for _ in range(10):
    # Fresh 70/30 split and a freshly fitted model on every repetition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    model = GradientBoostingRegressor(
        n_estimators=250,
        learning_rate=0.02,
        min_samples_split=3,
        max_depth=3,
        warm_start=False,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Record each metric straight after computing it.
    mse = mean_squared_error(y_test, y_pred)
    mses.append(mse)
    mae = mean_absolute_error(y_test, y_pred)
    maes.append(mae)
    acc = my_score(y_test, y_pred)
    accs.append(acc)
    score = model.score(X_test, y_test)
    scores.append(score)

end = time.time()
print("Time: " + str(end - start))

Time: 23.781676530838013


In [95]:
# Average of each metric across the repeated splits, printed in the order
# MSE, MAE, within-tolerance accuracy, model.score.
for _runs in (mses, maes, accs, scores):
    print(sum(_runs)/len(_runs))


3.5026814361860006
1.413141322722893
0.9798185941043084
0.9598553384859734


In [96]:
# Importances are read from whichever estimator `model` currently refers to.
# NOTE(review): when cells run top to bottom that is the fit from the last
# iteration of the repeated-split loop - confirm that this is intended.
feature_importance = model.feature_importances_
# Show only the first 23 entries of the importance vector.
print(feature_importance[:23])

[3.20046373e-03 1.53095136e-01 8.39258535e-03 3.67826823e-03
 4.68715489e-04 1.55994077e-05 1.73629308e-05 1.37543283e-02
 1.32170055e-02 5.83023403e-03 8.94909688e-04 6.84614998e-03
 4.26029918e-01 4.15896954e-04 2.06370469e-04 5.05146332e-04
 9.24054490e-03 3.35926292e-01 9.52538205e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 3.91547132e-04]


In [97]:
# Feature positions ordered from highest to lowest importance
# (argsort sorts ascending, so the result is reversed).
indices = np.argsort(feature_importance)[::-1]
print(indices)
# Column names of the 20 highest-ranked features.
print(X.columns[indices][:20])

[ 12  17   1   7   8  18  16   2  11   9   3 171   0 156  10  15   4  13
  60  22  92 176  58  14 107 186  28  39 159  77 208   6  76   5 198 148
  74  67  73  72  71  70  69  68  90  75  66  65  64  98  63  97  96  89
  83  88  87  86  91  85  62  82  95  93  81  80  79  78  94  84  54  61
  29  36  35  34  33  32  31  30  27  38  26  25  24  23  21  20  19  37
  40  59  49  57  56  55  53  52  51  50  48  41 100  47  46  45  44  43
  42  99 211 101 169 181 180 179 178 177 175 174 173 172 170 168 154 167
 166 165 164 163 162 161 160 158 157 182 183 184 185 209 207 206 205 204
 203 202 201 200 199 197 196 195 194 193 192 191 190 189 188 187 155 153
 102 115 125 124 123 122 121 120 119 118 117 116 114 152 113 112 111 110
 109 108 106 210 104 103 126 127 128 129 151 150 149 147 146 145 144 143
 142 141 140 139 138 137 136 135 134 133 132 131 130 105]
Index(['HIV/AIDS', 'Income Comp', 'Adult Mortality', 'BMI', 'Under 5 Deaths',
       'Schooling', 'Thinness 5-9 years', 'Infant Deaths', 'D