# Life Expectancy

### [1] Introduce DataSet

In [None]:
#Imports for data handling
import numpy as np
import pandas as pd
import seaborn as sb
from pandas import DataFrame
from matplotlib import pyplot as plt
from sklearn import preprocessing
from scipy.stats.mstats import winsorize

%matplotlib inline

In [None]:
# Load the Life Expectancy CSV into a DataFrame (dataset source: Kaggle.com).
# The relative path matches the Kaggle notebook input layout.
data = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')

In [None]:
# Give the columns clean, identifier-style names. Several raw headers carry
# stray leading/trailing spaces, so each key must match the CSV header exactly.
column_renames = {
    "Life expectancy ": "Life_Expectancy",
    "Adult Mortality": "Adult_Mortality",
    "infant deaths": "Infant_Deaths",
    "percentage expenditure": "Percentage_Exp",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    " BMI ": "BMI",
    "under-five deaths ": "Under_Five_Deaths",
    "Total expenditure": "Tot_Exp",
    "Diphtheria ": "Diphtheria",
    " HIV/AIDS": "HIV/AIDS",
    " thinness  1-19 years": "thinness_1to19_years",
    " thinness 5-9 years": "thinness_5to9_years",
    "Income composition of resources": "Income_Comp_Of_Resources",
}
data.rename(columns=column_renames, inplace=True)

# Preview the first rows to confirm the rename; 22 columns are expected.
data.head(10)

In [None]:
# Inspect the structure of the dataset: column dtypes and non-null counts,
# which shows the quantitative vs. qualitative features.
data.info()

In [None]:
# Size of the dataset as (rows, columns).
data.shape

In [None]:
# Keep an independent snapshot of the frame before the categorical columns
# are encoded. A plain assignment would only alias `data`, so every later
# in-place change would also show up in the "backup"; .copy() prevents that.
data_bckup = data.copy()

# Record how many distinct categories exist so the same counts can be
# confirmed again after the categorical-to-numerical conversion.
print(len(data['Country'].unique()))
print(len(data['Status'].unique()))

In [None]:
# Correlations are inspected before nulls/outliers are handled, so that no
# important data is thrown away blindly. A correlation matrix needs numeric
# input, hence the two text columns are label-encoded with sklearn first.

# Status -> integer codes (the fitted encoder is kept for later decoding).
status = preprocessing.LabelEncoder()
data['Status'] = status.fit_transform(data['Status'])

# Country -> integer codes.
country = preprocessing.LabelEncoder()
data['Country'] = country.fit_transform(data['Country'])

# Confirm the resulting dtypes.
data.info()

In [None]:
# Re-count the distinct values after encoding; the numbers must match the
# counts printed before encoding.
for encoded_col in ('Country', 'Status'):
    print(len(data[encoded_col].unique()))

In [None]:
# Pairwise (Pearson) correlation across all 22 columns.
correlation = data.corr(method='pearson')
correlation

### [2] Data Preparation

#### [2.1] Data Cleaning => Handling Null and Erroneous Values

In [None]:
# Missing values per column, largest count first.
data.isna().sum().sort_values(ascending=False)

In [None]:
# The dataset contains plenty of null values.
# Next, look at the summary statistics (count, mean, min, max, quartiles)
# to spot erroneous values.
data.describe()

In [None]:
# The summary statistics hint at errors (e.g. a Population of only 34, an
# Adult Mortality of 1), so box-plot the columns that look suspicious or
# appear to hold implicit missing values.
suspect_cols = ['Adult_Mortality', 'Infant_Deaths', 'BMI', 'Under_Five_Deaths', 'GDP', 'Population']

# One 2x3 grid, one seaborn box plot per column.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, col in zip(axes.flat, suspect_cols):
    sb.boxplot(y=data[col], palette='plasma', ax=ax)

plt.show()

In [None]:
# Errors and outliers were detected (e.g. infant deaths = 0, under-five
# deaths = 0). Instead of imputing right away, implausible values are first
# turned into NaN so that the null-handling steps below treat them together
# with the genuinely missing values.
# Series.mask is used instead of a row-wise DataFrame.apply: same result,
# but vectorised. NaN compares False, so existing NaNs simply stay NaN.

# Adult mortality: discard everything below the 5th percentile.
percentile_5th = np.percentile(data['Adult_Mortality'].dropna(), 5)
data['Adult_Mortality'] = data['Adult_Mortality'].mask(data['Adult_Mortality'] < percentile_5th)

# BMI: values below 10 or above 50 are treated as invalid.
data['BMI'] = data['BMI'].mask((data['BMI'] < 10) | (data['BMI'] > 50))

# Infant deaths: a count of 0 is treated as missing.
data['Infant_Deaths'] = data['Infant_Deaths'].replace(0, np.nan)

# Under-five deaths: a count of 0 is treated as missing.
data['Under_Five_Deaths'] = data['Under_Five_Deaths'].replace(0, np.nan)


In [None]:
# Null counts per column after the erroneous values were turned into NaN.
data.isna().sum().sort_values(ascending=False)

In [None]:
# BMI now holds a very large share of nulls, which could badly distort the
# model. Dropping the affected rows would cost roughly half of the data, and
# the correlation check showed BMI correlating at most ~0.4 with the other
# features, so the whole column is removed instead.
data.drop('BMI', axis=1, inplace=True)

# Nulls that are still left.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Population, GDP, Income_Comp_Of_Resources, Schooling and Alcohol are not
# direct health measurements, so their gaps are filled by interpolation.
Inter_cols = ['Population', 'GDP', 'Income_Comp_Of_Resources', 'Schooling', 'Alcohol']

# Interpolate inside each country separately. Interpolating over the whole
# stacked frame would blend the last rows of one country with the first rows
# of the next one. Gaps that cannot be interpolated within a country (e.g. a
# column that is entirely missing for it) stay NaN here.
# NOTE(review): assumes each country's rows are ordered by year - confirm.
data[Inter_cols] = data.groupby('Country')[Inter_cols].transform(lambda s: s.interpolate())

# Nulls that are still left.
(data.isnull().sum()).sort_values(ascending=False)

In [None]:
# The remaining gaps are filled with a group mean. Grouping by Country
# (region based) would give one country the same fill value for every year,
# so the mean is taken per Year (time based) instead.

# The first three columns are left untouched
# (presumably Country, Year, Status - verify against the CSV layout).
leading_cols = 3

per_year_frames = []
for yr in data.Year.unique():
    subset = data.loc[data.Year == yr].copy()
    for col in subset.columns[leading_cols:]:
        # mean() skips NaN, so this is the mean of the known values only
        year_mean = subset[col].mean()
        subset[col] = subset[col].fillna(year_mean)
    per_year_frames.append(subset)

# Re-assemble the frame; rows end up grouped year by year.
data = pd.concat(per_year_frames)

# Nulls that are still left.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Final confirmation of the per-column null counts after imputation.
data.isna().sum().sort_values(ascending=False)

#### [2.2] Exploring/Analysing the DataSet

In [None]:
# Visualise the data to get a feel for each distribution.
# Year, Country and Status are left out of the histogram / box-plot views.
df_num = data.drop(["Year", "Country", "Status"], axis=1)

# Names of the remaining numerical columns (reused by the box plots).
num_col = list(df_num.columns)

# One histogram per numerical column.
df_num.hist(bins=25, figsize=(20, 15))
plt.show()

In [None]:
# Box plot for every numerical column, laid out on a 7x6 subplot grid.
plt.figure(figsize=(20, 25))

for position, col in enumerate(num_col, start=1):
    plt.subplot(7, 6, position)
    sb.boxplot(y=df_num[col], palette='plasma')

plt.show()

In [None]:
# Status was label-encoded for the correlation step; decode it again here.
# LabelEncoder numbers its classes in sorted order, so the code -> label
# mapping is read from the fitted encoder rather than hard-coded (a
# hand-written mapping is easy to get the wrong way round).
status_labels = dict(enumerate(status.classes_))
data['Status'] = data['Status'].replace(status_labels)

# Set up the seaborn figure
sb.set(rc={'figure.figsize': (6, 6)})

# Bar plot of the average life expectancy per country status
sb.barplot(data=data, y='Life_Expectancy', x='Status', orient='v')

# Axis labels and title
plt.xlabel("Status", fontsize=12)
plt.ylabel("Avg Life_Expectancy", fontsize=12)
plt.title("Life_Expectancy vs Status")

plt.show()

# Average life expectancy for each status, rounded to two decimals.
round(data[['Status', 'Life_Expectancy']].groupby(['Status']).mean(), 2)

In [None]:
# Range of Life Expectancy values and where the counts peak.
# The figure size has to be configured before the plot is drawn, otherwise
# it only affects figures created afterwards.
sb.set(rc={'figure.figsize': (8, 8)})

# Pass the column as the x variable by keyword: newer seaborn versions treat
# the first positional argument as `data`, not as x.
sb.countplot(x=data['Life_Expectancy'])

plt.show()

### [3] Feature Engineering 

In [None]:
# The dataset has been cleaned of nulls at this point and BMI was already
# removed in the previous phase. Plot the correlations before going further.

# Status holds text labels again, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so restrict the matrix to the numeric columns.
correlation = data.select_dtypes(include='number').corr()

# Plot the matrix with seaborn for better readability.
sb.set(font_scale=1)

# Boolean mask hiding the upper triangle (diagonal included). Building it
# from the correlation values themselves would leave any cell that is
# exactly 0 unmasked.
mask = np.triu(np.ones_like(correlation, dtype=bool))

plt.figure(figsize=(15, 15))
# Draw the heatmap with the mask and correct aspect ratio
sb.heatmap(correlation, mask=mask, cmap="YlOrRd_r", vmax=1, vmin=-1, center=0, annot=True,
           square=True, fmt='.1g', linewidths=.25, cbar_kws={"shrink": 0.5})


In [None]:
# Correlation matrix as a table. Only numeric columns are used: Status is
# text at this point and DataFrame.corr() raises on it in pandas >= 2.0.
correlation = data.select_dtypes(include='number').corr()
correlation

In [None]:
# Rank the feature pairs by absolute correlation, strongest first.
# Only numeric columns are used (Status is text here and would make
# DataFrame.corr() raise in pandas >= 2.0). drop_duplicates() removes
# repeated values, which collapses each symmetric (a, b) / (b, a) pair.
correlation = (
    data.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .drop_duplicates()
    .sort_values(kind="quicksort", ascending=False)
)

# Skip the first entry (the 1.0 self-correlation) and show the next nine.
correlation.iloc[1:10]

In [None]:
# Correlation of every numeric feature with the target, Life_Expectancy,
# sorted from most positive to most negative. Non-numeric columns (Status)
# are excluded because DataFrame.corr() raises on them in pandas >= 2.0.
feature_list = (
    data.select_dtypes(include='number')
    .corr()['Life_Expectancy']
    .sort_values(kind="quicksort", ascending=False)
)
feature_list

In [None]:
# Show both ends of the ranking: the 8 most positively and the 6 most
# negatively correlated entries.
print(feature_list.head(8))
print(feature_list.tail(6))

# Hand-picked final feature set taken from both ends of the ranking
# (Status is deliberately left out).
feature_list = [
    'Schooling',
    'Income_Comp_Of_Resources',
    'Diphtheria',
    'Alcohol',
    'GDP',
    'Polio',
    'Measles',
    'Percentage_Exp',
    'thinness_5to9_years',
    'thinness_1to19_years',
    'HIV/AIDS',
    'Adult_Mortality',
]

banner = '*' * 30
print(banner + ' Final selected features ' + banner)
feature_list

In [None]:
# One regression plot per selected feature against the target variable,
# arranged on a 5x4 subplot grid.
plt.figure(figsize=(20, 30))

for slot, col_name in enumerate(feature_list, start=1):
    # Subplot position follows the feature's position in the list
    plt.subplot(5, 4, slot)

    # Feature on x, Life_Expectancy on y; blue points, red fitted line
    sb.regplot(
        x=data[col_name],
        y=data['Life_Expectancy'],
        scatter_kws={"color": "blue"},
        line_kws={"color": "red"},
    )

    # Title and axis labels
    plt.title(col_name)
    plt.xlabel(col_name)
    plt.ylabel('Life Expectancy')

plt.show()

### [4] Modeling & Testing

In [None]:
#Import sklearn packages for modeling the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

#### Splitting Data

In [None]:
# Collects one result row per model for the final comparison table; the
# first (empty) slot is filled in by the linear-regression cell.
Model_Compare = [[]]

# Features and target.
X_feature = data[feature_list]
Y_label = data['Life_Expectancy']

# 80% training data / 20% test data, shuffled with a fixed seed.
X_feature_train, X_feature_test, Y_label_train, Y_label_test = train_test_split(
    X_feature,
    Y_label,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Report the shape of each partition.
for caption, part in (
    ('X_feature_training_set : ', X_feature_train),
    ('X_feature_test_set : ', X_feature_test),
    ('Y_label_train_set :', Y_label_train),
    ('Y_label_test_set : ', Y_label_test),
):
    print(caption, part.shape)

#### [Model-1] Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_feature_train, Y_label_train)

# Predictions on the training data; the error metrics below are therefore
# training-set metrics.
Y_pred = lin_reg.predict(X_feature_train)

lr_train_mse = mean_squared_error(Y_label_train, Y_pred)
lr_train_rmse = np.sqrt(lr_train_mse)
lr_r2_score = r2_score(Y_label_train, Y_pred)

# R^2 on the held-out test split.
test_score = lin_reg.score(X_feature_test, Y_label_test)

print("Linear Regression:")
print(f"Mean squared error: {lr_train_mse:.2f}")
print(f"Root Mean Squared error: {lr_train_rmse:.2f}")
print(f"R^2 score for lin_reg training set: {lr_r2_score:.2f}")
print("Test-set score:", test_score)

# Store the result as the first row of the comparison table.
Model_Compare[0] = [
    'Linear Regression',
    round(lr_train_mse, 3),
    round(lr_train_rmse, 3),
    round(lr_r2_score, 3),
    round(test_score, 3),
]

#### [Model-2] Random Forest Regressor

In [None]:
# Random forest regressor. The seed is fixed (same value as the train/test
# split) so that the scores in the comparison table are reproducible
# between runs.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_feature_train, Y_label_train)

# Predictions on the training data; the error metrics below are therefore
# training-set metrics.
Y_pred = forest_reg.predict(X_feature_train)

# MSE and RMSE
rf_train_mse = mean_squared_error(Y_label_train, Y_pred)
rf_train_rmse = np.sqrt(rf_train_mse)

# R^2 on the training data
forest_r2_score = r2_score(Y_label_train, Y_pred)

# R^2 on the held-out test split (computed once and reused below)
test_score = forest_reg.score(X_feature_test, Y_label_test)

print('Random Forest:')
print("Mean squared error: %.2f" % rf_train_mse)
print("Root Mean Squared error: %.2f" % rf_train_rmse)
print("R^2 score for forest training set: %.2f" % forest_r2_score)
print("Test-set score:", test_score)

Model_Compare.append(['Random Forest', round(rf_train_mse, 3), round(rf_train_rmse, 3), round(forest_r2_score, 3), round(test_score, 3)])

#### Comparing Models

In [None]:
# Build and print the model comparison table from the collected result rows.
table_columns = ['Model', 'MSE', 'RMSE', 'R2 Score', 'Test-Score']
Comparision_Table = pd.DataFrame(Model_Compare, columns=table_columns)
print(Comparision_Table)