In [1]:
# Let's import required libraries 

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns' , None)
pd.set_option('display.max_rows' , None)

# For Regression model 
import xgboost as xgb 
from sklearn.linear_model import LinearRegression , Lasso , Ridge 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor , AdaBoostRegressor , GradientBoostingRegressor 
from sklearn.svm import SVR 

# For Splitting test and train set. 
from sklearn.model_selection import train_test_split 

# For Pipeline and Preprocessing 
from sklearn.preprocessing import StandardScaler , RobustScaler 
from sklearn.pipeline import Pipeline 

# For Metrics Evaluation 
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score  

# For Saving and loading pickle model. 
import pickle 

In [2]:
df = pd.read_csv('./Data/cleaned_df.csv')
df.drop('Unnamed: 0' ,axis = 1 , inplace = True)
df.head()

Unnamed: 0,Country,Year,Status,life_expectancy,adult_mortality,infant_deaths,Alcohol,percentage_expenditure,hepatitis_b,Measles,BMI,under_five_deaths,Polio,total_expenditure,Diphtheria,HIV_OR_AIDS,GDP,Population,thinness_one_nineteen_years,thinness_five_nine_years,income_composition_of_resources,Schooling
0,0,2015,0,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,0,2014,0,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,0,2013,0,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,0,2012,0,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,0,2011,0,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
df.columns

Index(['Country', 'Year', 'Status', 'life_expectancy', 'adult_mortality',
       'infant_deaths', 'Alcohol', 'percentage_expenditure', 'hepatitis_b',
       'Measles ', ' BMI ', 'under_five_deaths', 'Polio', 'total_expenditure',
       'Diphtheria ', 'HIV_OR_AIDS', 'GDP', 'Population',
       'thinness_one_nineteen_years', 'thinness_five_nine_years',
       'income_composition_of_resources', 'Schooling'],
      dtype='object')

In [4]:
features = df.drop(['life_expectancy'] ,axis = 1)
label = df['life_expectancy']

In [5]:
# Split train and test set. 

x_train , x_test , y_train , y_test = train_test_split(features , label , test_size = 0.2 , random_state = 42)
print(f"X train shape : {x_train.shape}\nY train shape : {y_train.shape}\nX test shape : {x_test.shape}\nY test shape: {y_test.shape}")

X train shape : (2350, 21)
Y train shape : (2350,)
X test shape : (588, 21)
Y test shape: (588,)


In [6]:
# Use pipelines and create multiple models. 

lr_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('linear_reg' , LinearRegression())
])

lasso_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('lasso' , Lasso())
])

ridge_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('ridge' , Ridge())
])

dt_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('tree' , DecisionTreeRegressor())
])

rf_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('random_forest' , RandomForestRegressor())
])

ada_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('Adaboost' , AdaBoostRegressor())
])

gb_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('gradient_model' , GradientBoostingRegressor())
])

svr_pipe = Pipeline([
    ('scaler' , StandardScaler()) , ('sv_regressor' , SVR())
])


In [7]:
model_names = ['Linear Regression' , 'Lasso' , 'Ridge' , 'Decision Tree Regressor' , 'Random Forest Regressor' , 'Adaboost Regressor' , 'Gradientboosting Regressor' , 'Support Vector Machines Regressor']
pipelines = [lr_pipe , lasso_pipe , ridge_pipe , dt_pipe , rf_pipe , ada_pipe , gb_pipe , svr_pipe ]


# Let's fit each model. 

for pipe in pipelines : 
    pipe.fit(x_train , y_train)


# Let's compare train accuracy of each other and find out which model has higher accuracy. 
    
for index , val in enumerate(pipelines) : 
    print(f"{pipelines[index]} has train accuracy : {val.score(x_train,y_train)}")

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_reg', LinearRegression())]) has train accuracy : 0.8185916988298905
Pipeline(steps=[('scaler', StandardScaler()), ('lasso', Lasso())]) has train accuracy : 0.7754099915097948
Pipeline(steps=[('scaler', StandardScaler()), ('ridge', Ridge())]) has train accuracy : 0.8184702554405121
Pipeline(steps=[('scaler', StandardScaler()),
                ('tree', DecisionTreeRegressor())]) has train accuracy : 1.0
Pipeline(steps=[('scaler', StandardScaler()),
                ('random_forest', RandomForestRegressor())]) has train accuracy : 0.9945381629649194
Pipeline(steps=[('scaler', StandardScaler()),
                ('Adaboost', AdaBoostRegressor())]) has train accuracy : 0.9091472422861416
Pipeline(steps=[('scaler', StandardScaler()),
                ('gradient_model', GradientBoostingRegressor())]) has train accuracy : 0.962150279274787
Pipeline(steps=[('scaler', StandardScaler()), ('sv_regressor', SVR())]) has train accura

rf_pipe performs better than every pipeline. 

In [8]:
y_pred_rf = rf_pipe.predict(x_test)

In [9]:
# Metrics Evaluation 

mse = mean_squared_error(y_test , y_pred_rf)
mae = mean_absolute_error(y_test,y_pred_rf)
rmse = np.sqrt(mse)
r2 = r2_score(y_test , y_pred_rf)

print(f"Evaluation Metrics of Regression model Rf Pipe")
print(f"Mean Squared Error : {mse}\nMean Absolute Error : {mae}\nRMSE : {rmse}\nR2 Score:{r2} ")

Evaluation Metrics of Regression model Rf Pipe
Mean Squared Error : 2.7601011240808813
Mean Absolute Error : 1.0410975347106064
RMSE : 1.661355207076705
R2 Score:0.9681411403921453 


r2_score should be closer to 1 , the more the score is closer to r2_score the better model performs.<br>
In this case , rf_pipe's RandomForest Regressor is performing the best. 

In [10]:
# Let's dump the rf_pipe for future use. 

with open('../artifacts/model.pkl' , 'wb') as file : 
    pickle.dump(rf_pipe , file)