In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import RidgeCV,LassoCV,ElasticNetCV
from sklearn.metrics import r2_score

In [None]:
pd.set_option('display.max_columns',None)

In [None]:
# Load the life-expectancy CSV from the Kaggle input directory and preview the first rows.
# NOTE: later cells index columns with names that carry leading/trailing spaces
# (e.g. 'Life expectancy ', ' BMI ', ' HIV/AIDS'), so the header is used verbatim
# and must not be stripped here without updating every downstream reference.
df=pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
df.head()

In [None]:
df.shape

In [None]:
df.info()

# Missing Values

In [None]:
df.isna().sum()

In [None]:
# Percentage of missing values per column, printed as "<column> : <percent>".
n_rows = df.shape[0]
cols = df.columns
percent_of_null = [(df[name].isnull().sum() / n_rows) * 100 for name in cols]

for name, pct in zip(cols, percent_of_null):
    print(name, ':', pct)

In [None]:
# Encode the development status as a binary indicator: Developing -> 1, Developed -> 0.
# A single explicit lookup is used instead of two chained `replace` calls:
#   * it does not rely on `Series.replace` silently downcasting an object column
#     to integers (that downcasting is deprecated in recent pandas releases);
#   * values outside the mapping (including already-encoded 0/1) pass through
#     unchanged, so re-running the cell is a no-op.
STATUS_CODES = {'Developing': 1, 'Developed': 0}
df['Status'] = df['Status'].map(lambda status: STATUS_CODES.get(status, status))

In [None]:
df=df.drop(['Country','Year'],axis=1)

In [None]:
# Fill missing values with each column's median.
# `numeric_only=True` keeps this working if a non-numeric column is still present
# (pandas >= 2.0 raises TypeError from `DataFrame.median()` in that case).
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputation values.
df=df.fillna(df.median(numeric_only=True))

In [None]:
df.isna().sum()   # rechecking missing values after treatment

In [None]:
df.describe()

# Outliers

In [None]:
# Columns shown in the outlier box-plot grid, mapped to their 1-based subplot slot.
# The names (including the stray spaces) must match the DataFrame columns exactly.
col = {
    column: slot
    for slot, column in enumerate(
        [
            'Life expectancy ',
            'Adult Mortality',
            'infant deaths',
            'Alcohol',
            'percentage expenditure',
            'Hepatitis B',
            'Measles ',
            ' BMI ',
            'under-five deaths ',
            'Polio',
            'Total expenditure',
            'Diphtheria ',
            ' HIV/AIDS',
            'GDP',
            'Population',
            ' thinness  1-19 years',
            ' thinness 5-9 years',
            'Income composition of resources',
            'Schooling',
        ],
        start=1,
    )
}

In [None]:
# Draw one box plot per column listed in `col`, placed on a 5x4 grid at the
# subplot position stored for that column.
fig = plt.figure(figsize=(20, 30))

for column_name, slot in col.items():
    axis = fig.add_subplot(5, 4, slot)
    axis.boxplot(df[column_name])
    axis.set_title(column_name)

plt.show()

In [None]:
# Square-root transform applied to EVERY column of df to compress large values.
# This includes the target 'Life expectancy ' and the 0/1 'Status' indicator, so
# all later plots, correlations, model coefficients and error metrics are on the
# square-root scale rather than in the original units.
# NOTE(review): re-running this cell transforms the data again (it is not idempotent).
df=df.transform(lambda x : x**0.5)  # Outliers treatment

# EDA

In [None]:
# Distribution of the (square-root transformed) target with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram
# and `kde=True` is the documented replacement.
sns.histplot(df['Life expectancy '], kde=True, stat='density')

In [None]:
sns.barplot(data=df, x='Life expectancy ',y='Status',orient = 'h')

Developed nations have a higher life expectancy than Developing nations.

In [None]:
disease_cols=df[['Life expectancy ','Alcohol','Hepatitis B','Measles ',' BMI ','Polio','Diphtheria ',' HIV/AIDS']]

In [None]:
sns.pairplot(disease_cols,diag_kind='kde')

In [None]:
disease_cols.corr()

* BMI, Diphtheria, HIV/AIDS are highly correlated to Life expectancy , also evident from the pairplot.
* Polio - Diphtheria, Hepatitis B - Diphtheria are highly correlated, hence multi collinearity is present.

In [None]:
measures_cols=df[['Life expectancy ','Adult Mortality','infant deaths','under-five deaths ',' thinness  1-19 years',' thinness 5-9 years','Schooling']]

In [None]:
sns.pairplot(measures_cols,diag_kind='kde')

In [None]:
measures_cols.corr()

* Adult Mortality,infant deaths,under-five deaths and Schooling are highly correlated to target variable , thus all are significant features to predict target variable.
* All independent features are highly correlated with each other.

In [None]:
income_exp_cols=df[['Life expectancy ','percentage expenditure','Total expenditure','GDP','Population',
                    'Income composition of resources']]

In [None]:
sns.pairplot(income_exp_cols, diag_kind='kde')

In [None]:
income_exp_cols.corr()

* GDP, population, Income composition of resources are highly correlated to Life expectancy.
* percentage expenditure and GDP are highly correlated as percentage expenditure is Expenditure on health as a percentage of Gross Domestic Product per capita(%).

# Base Model

In [None]:
# Separate the target column from the predictors.
target_name = 'Life expectancy '
y = df[target_name]
X = df.drop(columns=target_name)

In [None]:
# Baseline model: ordinary least squares on all predictors.
# An explicit constant column is added to the design matrix so the fit has an
# intercept; X_constant and lin_reg are reused by the diagnostic cells below.
X_constant = sm.add_constant(X)
lin_reg = sm.OLS(y,X_constant).fit()
lin_reg.summary()

* Assuming alpha= 0.05
* Significant features (pvalue < 0.05) are: 
    Status, Adult mortality, infant deaths, Alcohol, Hepatitis B, BMI, under-five deaths, polio, Diphtheria, HIV/AIDS, GDP, population, Income composition of resources, Schooling

# Assumptions of Linear Regression

Assumption 1 : No Autocorrelation

In [None]:
import statsmodels.tsa.api as smt

# Autocorrelation function of the OLS residuals with a 95% confidence band.
acf = smt.graphics.plot_acf(lin_reg.resid, alpha=0.05)
# Use pyplot's show(): calling Figure.show() under the notebook's inline backend
# only emits a "non-GUI backend, cannot show the figure" warning.
plt.show()

The residuals show a pattern, and the Durbin-Watson statistic of 0.637 (well below 2) indicates the presence of positive autocorrelation.

Assumption 2 : Normality of Residuals

In [None]:
from scipy import stats

# Jarque-Bera normality test on the OLS residuals (prints statistic and p-value).
jb_result = stats.jarque_bera(lin_reg.resid)
print(jb_result)

In [None]:
# Distribution of the OLS residuals with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram
# and `kde=True` is the documented replacement.
sns.histplot(lin_reg.resid, kde=True, stat='density')

From the plot it is clear that the residuals are left-skewed, and the p-value of the Jarque-Bera test is less than alpha (0.05); thus the residuals are not normally distributed.

Assumption 3 : Linearity of residuals

In [None]:
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (15.0, 9.0)


def linearity_test(model, y):
    """Plot two linearity diagnostics for a fitted regression model.

    The left panel shows the observed values against the model's in-sample
    predictions; the right panel shows the residuals against the same
    predictions. Each panel includes a red LOWESS trend line.

    Parameters
    ----------
    model : fitted results object exposing ``predict()`` and ``resid``.
    y : observed target values, aligned with the model's predictions.
    """
    fitted = model.predict()
    panels = (
        (y, 'Observed vs. Predicted Values', 'Observed'),
        (model.resid, 'Residuals vs. Predicted Values', 'Residuals'),
    )

    fig, ax = plt.subplots(1, 2)
    for axis, (values, title, ylabel) in zip(ax, panels):
        sns.regplot(x=fitted, y=values, lowess=True, ax=axis, line_kws={'color': 'red'})
        axis.set_title(title, fontsize=16)
        axis.set(xlabel='Predicted', ylabel=ylabel)


linearity_test(lin_reg, y)

To detect nonlinearity one can inspect plots of observed vs. predicted values or residuals vs. predicted values. The desired outcome is that points are symmetrically distributed around a diagonal line in the former plot or around horizontal line in the latter one. In both cases linearity of residuals can be seen.

In [None]:
lin_reg.resid.mean()

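The mean of the residuals is close to zero, which is consistent with the zero-mean error assumption. Note that an OLS fit with an intercept gives a zero residual mean by construction, so linearity itself is judged from the plots above.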

Assumption 4 : Homoscedasticity (using the Goldfeld-Quandt test)

In [None]:
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

# Homoscedasticity diagnostics for the baseline OLS fit:
#   left  - raw residuals against fitted values
#   right - scale-location plot (sqrt of |internally studentized residuals|)
# followed by the Goldfeld-Quandt test on the model's residuals and design matrix.
model = lin_reg
pred_vals = model.predict()
resids = model.resid
resids_standardized = model.get_influence().resid_studentized_internal

panels = (
    (resids, 'Residuals vs Fitted', 'Residuals'),
    (np.sqrt(np.abs(resids_standardized)), 'Scale-Location', 'sqrt(abs(Residuals))'),
)

fig, ax = plt.subplots(1, 2)
for axis, (values, title, ylabel) in zip(ax, panels):
    sns.regplot(x=pred_vals, y=values, lowess=True, ax=axis, line_kws={'color': 'red'})
    axis.set_title(title, fontsize=16)
    axis.set(xlabel='Fitted Values', ylabel=ylabel)

# Pair the leading entries of the test result with readable labels for display.
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(model.resid, model.model.exog)
lzip(name, test)

As the p-value of the Goldfeld-Quandt test is greater than alpha (0.05), the null hypothesis of homoscedasticity is not rejected; this is also consistent with the plots.

Assumption 5 : No Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of the design matrix; the first
# entry belongs to the constant term and is dropped from the displayed table.
design = X_constant.values
n_terms = X_constant.shape[1]
vif = [variance_inflation_factor(design, idx) for idx in range(n_terms)]
pd.DataFrame({'vif': vif[1:]}, index=X.columns)

infant deaths and under-five deaths are highly multi collinear.

# Train-Test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Linear Regression

In [None]:
# Fit ordinary least squares on the training split and predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Feature labels for the coefficient tables. They are read from the training
# matrix itself so they always match the column order the models are fitted on,
# instead of relying on a hand-maintained copy of the column names.
coll = list(X_train.columns)

In [None]:
# Label each fitted slope with its feature name and print the table.
coefficients = pd.Series(data=lr.coef_, index=coll)
print(coefficients)

In [None]:
# Test-set error metrics for the linear regression.
# NOTE: the target was square-root transformed earlier in the notebook, so the
# errors below are on the sqrt(life expectancy) scale, not in years.
mse_test = mean_squared_error(y_test, y_pred)

print('Intercept: ',lr.intercept_)
print('Mean absolute error for test: ',mean_absolute_error(y_test,y_pred))
print('Mean Squared error for test: ',mse_test)
print('Root mean squared error for test: ',np.sqrt(mse_test))
# `score` on a regressor returns R^2 (coefficient of determination), not an
# accuracy, so it is labelled as such; the test R^2 is reported once.
print('R square of train: ',lr.score(X_train, y_train))
print('R square of test: ',r2_score(y_test,y_pred))

# Regularization techniques used to reduce overfitting of the model.

# RidgeCV

In [None]:
# Ridge regression with the penalty strength chosen by 5-fold cross-validation
# over RidgeCV's default alpha grid, then predictions for the held-out rows.
rcv = RidgeCV(cv=5).fit(X_train, y_train)
y_pred_rr = rcv.predict(X_test)

In [None]:
print('Optimal alpha: ',rcv.alpha_)

In [None]:
# Coefficients and test-set error metrics for the cross-validated ridge model.
# NOTE: the target was square-root transformed earlier in the notebook, so the
# errors below are on the sqrt(life expectancy) scale, not in years.
coefficients_rcv=pd.Series(rcv.coef_,index=coll)
mse_test_rr = mean_squared_error(y_test, y_pred_rr)

print(coefficients_rcv)
print('Intercept: ',rcv.intercept_)
print('Mean absolute error for test: ',mean_absolute_error(y_test,y_pred_rr))
print('Mean Squared error for test: ',mse_test_rr)
print('Root mean squared error for test: ',np.sqrt(mse_test_rr))
# `score` on a regressor returns R^2 (coefficient of determination), not an
# accuracy, so it is labelled as such; the test R^2 is reported once.
print('R square of train: ',rcv.score(X_train,y_train))
print('R square of test:',r2_score(y_test,y_pred_rr))

# LassoCV

In [None]:
# Lasso regression with the penalty strength chosen by 5-fold cross-validation,
# then predictions for the held-out rows.
lassocv = LassoCV(cv=5, random_state=3).fit(X_train, y_train)
y_pred_lasso = lassocv.predict(X_test)

In [None]:
# Report the selected penalty and the solver's iteration count.
for label, value in (
    ('Optimal alpha: ', lassocv.alpha_),
    ('No. of interations: ', lassocv.n_iter_),
):
    print(label, value)

In [None]:
# Coefficients and test-set error metrics for the cross-validated lasso model.
# NOTE: the target was square-root transformed earlier in the notebook, so the
# errors below are on the sqrt(life expectancy) scale, not in years.
coefficients_lasso=pd.Series(lassocv.coef_,index=coll)
mse_test_lasso = mean_squared_error(y_test, y_pred_lasso)

print(coefficients_lasso)
print('Intercept: ',lassocv.intercept_)
print('Mean absolute error for test: ',mean_absolute_error(y_test,y_pred_lasso))
print('Mean Squared error for test: ',mse_test_lasso)
print('Root mean squared error for test: ',np.sqrt(mse_test_lasso))
# `score` on a regressor returns R^2 (coefficient of determination), not an
# accuracy, so it is labelled as such; the test R^2 is reported once.
print('R square of train: ',lassocv.score(X_train,y_train))
print('R square of test:',r2_score(y_test,y_pred_lasso))

# Elastic-Net CV

In [None]:
# Elastic net with both the penalty strength and the L1/L2 mix chosen by 5-fold
# cross-validation.
# * `normalize=True` is no longer passed: the parameter was deprecated in
#   scikit-learn 1.0 and removed in 1.2, where it raises TypeError.
# * `n_alphas` is deprecated in recent releases; its value (100) was the default.
# * Every other argument that was spelled out was also the library default, so
#   only the non-default settings remain.
# NOTE(review): without `normalize` the features enter the penalty on their raw
# scale; consider standardising them (StandardScaler is already imported) before fitting.
en_cv = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, .995, 1],
    max_iter=2000,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Fit the cross-validated elastic net and predict the held-out rows in one chain.
y_pred_en = en_cv.fit(X_train, y_train).predict(X_test)

In [None]:
# Report the selected hyper-parameters and the solver's iteration count.
for label, value in (
    ('Optimal alpha: ', en_cv.alpha_),
    ('Optimal l1_ratio: ', en_cv.l1_ratio_),
    ('Number of iterations: ', en_cv.n_iter_),
):
    print(label, value)

In [None]:
# Coefficients and test-set error metrics for the cross-validated elastic net.
# NOTE: the target was square-root transformed earlier in the notebook, so the
# errors below are on the sqrt(life expectancy) scale, not in years.
coefficients_en=pd.Series(en_cv.coef_,index=coll)
mse_test_en = mean_squared_error(y_test, y_pred_en)

print(coefficients_en)
print('Intercept: ',en_cv.intercept_)
print('Mean absolute error for test: ',mean_absolute_error(y_test,y_pred_en))
print('Mean Squared error for test: ',mse_test_en)
print('Root mean squared error for test: ',np.sqrt(mse_test_en))
# `score` on a regressor returns R^2 (coefficient of determination), not an
# accuracy, so it is labelled as such; the test R^2 is reported once.
print('R square of train: ',en_cv.score(X_train,y_train))
print('R square of test:',r2_score(y_test,y_pred_en))

In [None]:
coefficients_en[abs(coefficients_en)>0.05]

# Most significant features to predict Life expectancy

* Status
* infant deaths
* under-five deaths
* HIV/AIDS
* Income composition of resources
* Schooling