# Life Expectancy Regressor

In [62]:
import numpy as np
import pandas as pd
from   sklearn.pipeline        import Pipeline
from   sklearn.model_selection import train_test_split
from   sklearn.experimental    import enable_iterative_imputer
from sklearn.decomposition import PCA
from   sklearn.preprocessing   import *
from   sklearn.impute          import *
from   sklearn.compose         import *
# from   category_encoders       import *
from sklearn.model_selection import RandomizedSearchCV
from   sklearn.linear_model    import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base            import BaseEstimator
from   sklearn.metrics         import r2_score, mean_squared_error

In [2]:
life = pd.read_csv('Life Expectancy Data.csv')

## 1. Load data and feature engineering

In [3]:
life.head(3)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9


In [4]:
life.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [5]:
life.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [6]:
# since our target varible is Life expectancy, so we need to delete those missing life expectancy
life = life[~life['Life expectancy '].isnull()]

In [7]:
life.shape

(2928, 22)

In [8]:
# Make those int values except year as float so that to treat them as numeric
int_cols = ['infant deaths', 'Measles ', 'under-five deaths ']
life[int_cols] = life[int_cols].astype(float)

In [9]:
y = life['Life expectancy ']

In [10]:
cols = list(life.columns)

In [11]:
cols.remove('Life expectancy ')

In [13]:
X = life[cols]

In [14]:
X.Year.unique()

array([2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005,
       2004, 2003, 2002, 2001, 2000])

## 1.1 Split Data for training and testing
Here we choose data before 2015 for model training and data of 2015 for model testing

In [27]:
X_train = X[X.Year != 2015]
X_test = X[X.Year == 2015]

In [28]:
y_train = y[X.Year != 2015]
y_test = y[X.Year == 2015]

In [29]:
cols.remove('Year')

ValueError: list.remove(x): x not in list

In [30]:
X_train = X_train[cols]

In [31]:
X_test = X_test[cols]

In [40]:
X_train.shape, X_test.shape

((2745, 20), (183, 20))

In [41]:
X_train.columns

Index(['Country', 'Status', 'Adult Mortality', 'infant deaths', 'Alcohol',
       'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

## 2. Data Preprocessing
In this process, we are going to include data imputation, PCA to better process data for analyzing.

In [42]:
mask_num = X_train.dtypes == (float or int)
columns_num = X_train.columns[mask_num].tolist()
columns_cat = X_train.columns[~mask_num].tolist()

print('-Num', columns_num)
print('-Cat', columns_cat, '\n')

-Num ['Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']
-Cat ['Country', 'Status'] 



In [43]:
life[columns_cat].head(3)

Unnamed: 0,Country,Status
0,Afghanistan,Developing
1,Afghanistan,Developing
2,Afghanistan,Developing


### 2.1 Imputation

In [44]:
cat_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan,
                                               strategy='most_frequent')),
                      ('ohe', OneHotEncoder())
                     ])
con_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan,
                                               strategy='median')),
                    ('scaler', StandardScaler())
                      ])
preprocessing = ColumnTransformer([('categorical', cat_pipe, columns_cat),
                                   ('continuous',  con_pipe, columns_num)
                                   ])

## 3. Model Training

In [45]:
# Train-Test split
# X_train, X_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2)

In [56]:
search_space = [
    {
        'clf': [LinearRegression()]
    },
    {
        'clf': [Ridge()],
        'clf__alpha': [200, 230, 250,265, 270, 275, 290, 300, 500]
    },
    {
        'clf': [Lasso()],
        'clf__alpha': [0.02, 0.024, 0.025, 0.026, 0.03]
    },
    {
        'clf': [DecisionTreeRegressor()],
        'clf__criterion': ["mse", "mae"],
        'clf__min_samples_split': [10, 20, 40],
        'clf__max_depth': [2, 6, 8],
        'clf__min_samples_leaf': [20, 40, 100],
        'clf__max_leaf_nodes': [5, 20, 100]
        
    },
    {
        'clf': [RandomForestRegressor()],
        'clf__bootstrap': [True, False],
        'clf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'clf__max_features': ['auto', 'sqrt'],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__min_samples_split': [2, 5, 10],
        'clf__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    }
]

In [57]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [58]:
pipe = Pipeline([('preprocessing', preprocessing),
                ('clf', DummyEstimator())])

In [59]:
clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1,
                                   scoring = 'neg_root_mean_squared_error')
#  Fit grid search
best_model = clf_algos_rand.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


 nan nan nan nan nan nan nan]


In [60]:
best_model.best_params_

{'clf__n_estimators': 400,
 'clf__min_samples_split': 2,
 'clf__min_samples_leaf': 4,
 'clf__max_features': 'auto',
 'clf__max_depth': None,
 'clf__bootstrap': True,
 'clf': RandomForestRegressor(min_samples_leaf=4, n_estimators=400)}

In [61]:
# hyperparameters = {
#  'bootstrap': True,
#  'ccp_alpha': 0.0,
#  'criterion': 'mse',
#  'max_depth': 40,
#  'max_features': 'sqrt',
#  'max_leaf_nodes': None,
#  'max_samples': None,
#  'min_impurity_decrease': 0.0,
#  'min_impurity_split': None,
#  'min_samples_leaf': 1,
#  'min_samples_split': 5,
#  'min_weight_fraction_leaf': 0.0,
#  'n_estimators': 2000,
#  'n_jobs': None,
#  'oob_score': False,
#  'random_state': None,
#  'verbose': 0,
#  'warm_start': False
# }
pipe = Pipeline([('preprocessing', preprocessing),
                ('rf', RandomForestRegressor(min_samples_leaf=4, n_estimators=400))])
# pipe = Pipeline([('preprocessing', preprocessing),
#                 'rf', RandomForestRegressor(n_estimators=200)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder())]),
                                                  ['Country', 'Status']),
                                                 ('continuous',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Adult Mortality',


## 4. Evaluation

In [66]:
pred = pipe.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

In [67]:
rmse

2.0204132563002566

In [68]:
r2 = r2_score(y_test, pred)

In [69]:
r2

0.9378055388937334