# Predicting the life expectancy of different countries

![](http://abcscprod.azureedge.net/abc-of-money/-/media/Project/ABCL/Internal_Images_526x230/526x230/LI/li_ph3_181_How_Has_Life_Expectancy_of_an_Average_Person_Changed_Over_Years_526_230.webp?revision=9aaa8d4f-a7dc-4ede-92e1-2c02f03e755e&modified=20210208054305&extension=webp)

### The project tries to create a model based on data provided by the World Health Organization (WHO) to evaluate the life expectancy for different countries in years. 
### The data covers the years 2000 to 2015 and originates from here: https://www.kaggle.com/kumarajarshi/life-expectancy-who/data. The resulting models are tested to see whether they maintain their accuracy when predicting the life expectancy for data they haven't been trained on.

# 1.Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 2.Reading the data
**The data is read from the CSV file `Life Expectancy Data.csv` and stored in the `dataset` variable. The Year column is dropped as it will not be used in the analysis. Below, the first 5 rows are shown. The data contains 21 columns and 2938 rows with the header row. The table contains data about:**

1. Country
1. Status
1. Life Expectancy
1. Adult Mortality
1. Alcohol
1. percentage expenditure
1. Hepatitis B
1. Measles
1. BMI
1. under-five deaths
1. Polio
1. Total expenditure
1. Diphtheria
1. HIV/AIDS
1. GDP
1. Population
1. thinness 1-19 years
1. thinness 5-9 years
1. Income composition of resources
1. Schooling

**With the exclusion of Country name and Status (either developed or developing), all of the data is numeric. The values are either in years, percentages, millions or dollars in the case of Gross Domestic Product (GDP).**

In [None]:
# Install dataprep; its create_report function is imported by the EDA report cell.
!pip install dataprep

In [None]:
# Load the CSV into `dataset` and preview the first rows.
dataset = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
dataset.head()

In [None]:
# Print each column's dtype and non-null count.
dataset.info()

In [None]:
# Summary statistics of the numeric columns.
dataset.describe()

In [None]:
# Bar chart of the number of rows per Status category.
sns.countplot(x='Status',data=dataset)

In [None]:
# Build a dataprep report for the raw dataset; the trailing expression
# displays it as the cell output.
from dataprep.eda import create_report
report = create_report(dataset, title='My Report')
report

# 3.Data Cleaning & Preprocessing the Data

In [None]:
# Drop the two columns that are not used as features.
dataset = dataset.drop(['Year','Country'],axis=1)

# NOTE(review): head() is not the last expression of the cell, so a notebook
# will not display its result.
dataset.head()
# Heatmap of the boolean null mask: highlights where values are missing.
sns.heatmap(pd.isnull(dataset))
# Number of missing values per column.
dataset.isnull().sum()

In [None]:
# Fill the gaps in the target and in Adult Mortality with their column means.
# NOTE(review): mean-imputing the target ('Life expectancy ') manufactures
# labels; consider dropping those rows instead.
dataset['Life expectancy ']=dataset['Life expectancy '].fillna(value=dataset['Life expectancy '].mean())
dataset['Adult Mortality']=dataset['Adult Mortality'].fillna(value=dataset['Adult Mortality'].mean())
# Correlate the numeric columns only: 'Status' is still a string column at
# this point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_data=dataset.select_dtypes(include='number').corr()
corr_data


In [None]:
# Annotated correlation heatmap, colour scale centred on zero.
plt.figure(figsize=(15, 12))
# Restrict to numeric columns: the string 'Status' column makes a plain
# dataset.corr() raise in pandas >= 2.0.
sns.heatmap(dataset.select_dtypes(include='number').corr(),center=0,annot=True)

In [None]:
# Scatter of Alcohol (y) against Schooling (x), the predictor used by impute_Alcohol.
sns.scatterplot(x=dataset['Schooling'],y=dataset['Alcohol'])
def impute_Alcohol(cols):
    """Return a row's Alcohol value, filling a missing one from its Schooling bin.

    Args:
        cols: Two values in the order ``(Alcohol, Schooling)``, e.g. the row
            Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Alcohol value when it is present, otherwise the constant
        assigned to the Schooling bin. NaN when Schooling is missing as well.
    """
    # Unpack by position: integer keys such as cols[0] on a label-indexed
    # Series are deprecated in pandas.
    al, sc = cols
    if not pd.isnull(al):
        return al
    if pd.isnull(sc):
        # No predictor either; leave the gap for a later fallback fill.
        return float('nan')
    # (inclusive upper bound of the Schooling bin, Alcohol fill value)
    bins = ((2.5, 4.0), (5.0, 1.5), (7.5, 2.5), (10.0, 3.0), (15, 4.0))
    for upper, fill in bins:
        if sc <= upper:
            return fill
    return 10.0  # Schooling above 15
    
# Row-wise imputation of Alcohol from Schooling.
# NOTE(review): Schooling is only imputed in a later cell, so rows where both
# values are missing keep a NaN Alcohol here.
dataset['Alcohol']=dataset[['Alcohol','Schooling']].apply(impute_Alcohol,axis=1)

In [None]:
# Re-plot the null mask to see which values are still missing.
sns.heatmap(pd.isnull(dataset))

In [None]:
# Fill any Alcohol values that are still missing with the column mean.
dataset['Alcohol']=dataset['Alcohol'].fillna(value=dataset['Alcohol'].mean())

In [None]:
# Scatter of Polio (y) against Life expectancy (x), the predictor used by impute_polio.
sns.scatterplot(x=dataset['Life expectancy '],y=dataset['Polio']);

In [None]:
def impute_polio(c):
    """Return a row's Polio value, filling a missing one from its life-expectancy bin.

    Args:
        c: Two values in the order ``(Polio, Life expectancy)``, e.g. the row
            Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Polio value when it is present, otherwise the constant
        assigned to the life-expectancy bin. NaN when life expectancy is
        missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    polio, life = c
    if not pd.isnull(polio):
        return polio
    if pd.isnull(life):
        return float('nan')
    # (inclusive upper bound of the life-expectancy bin, Polio fill value)
    bins = ((45, 80.0), (50, 67.0), (60, 87.44), (70, 91), (80, 94.3))
    for upper, fill in bins:
        if life <= upper:
            return fill
    return 95  # life expectancy above 80
    
# Row-wise imputation of Polio from Life expectancy.
dataset['Polio']=dataset[['Polio','Life expectancy ']].apply(impute_polio,axis=1)

In [None]:
# Scatter of Diphtheria (y) against Polio (x), the predictor used by impute_Diptheria.
sns.scatterplot(x=dataset['Polio'],y=dataset['Diphtheria '])

In [None]:
def impute_Diptheria(c):
    """Return a row's Diphtheria value, filling a missing one from its Polio bin.

    Args:
        c: Two values in the order ``(Diphtheria, Polio)``, e.g. the row
            Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Diphtheria value when it is present, otherwise the
        constant assigned to the Polio bin. NaN when Polio is missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    diphtheria, polio = c
    if not pd.isnull(diphtheria):
        return diphtheria
    if pd.isnull(polio):
        return float('nan')
    # (inclusive upper bound of the Polio bin, Diphtheria fill value)
    bins = (
        (10, 75.0),
        (40, 37.0),
        (45, 40.0),
        (50, 50.0),
        (60, 55.0),
        (80, 65.0),
    )
    for upper, fill in bins:
        if polio <= upper:
            return fill
    return 90.0  # Polio above 80
# Row-wise imputation of Diphtheria from the (already imputed) Polio column.
dataset['Diphtheria ']=dataset[['Diphtheria ','Polio']].apply(impute_Diptheria,axis=1)

In [None]:
# Scatter of Hepatitis B (y) against Diphtheria (x), the predictor used by impute_HepatatisB.
sns.scatterplot(x=dataset['Diphtheria '],y=dataset['Hepatitis B']);

In [None]:
def impute_HepatatisB(cols):
    """Return a row's Hepatitis B value, filling a missing one from its Diphtheria bin.

    Args:
        cols: Two values in the order ``(Hepatitis B, Diphtheria)``, e.g. the
            row Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Hepatitis B value when it is present, otherwise the
        constant assigned to the Diphtheria bin. NaN when Diphtheria is
        missing as well.
    """
    # Unpack by position: integer keys such as cols[0] on a label-indexed
    # Series are deprecated in pandas.
    hep, dip = cols
    if not pd.isnull(hep):
        return hep
    if pd.isnull(dip):
        return float('nan')
    # (inclusive upper bound of the Diphtheria bin, Hepatitis B fill value)
    bins = ((15, 75.0), (30, 20.0), (45, 38.0), (60, 43.0), (80, 63.0))
    for upper, fill in bins:
        if dip <= upper:
            return fill
    return 88.4  # Diphtheria above 80
    
# Row-wise imputation of Hepatitis B from the (already imputed) Diphtheria column.
dataset['Hepatitis B']=dataset[['Hepatitis B','Diphtheria ']].apply(impute_HepatatisB,axis=1)

In [None]:
# Mean Hepatitis B over the rows whose Diphtheria value is above 80.
# NOTE(review): this runs after the imputation above, so it includes imputed values.
dataset[dataset['Diphtheria ']>80.0]['Hepatitis B'].mean()

In [None]:
# Scatter of BMI (y) against Life expectancy (x), the predictor used by impute_BMI.
sns.scatterplot(x=dataset['Life expectancy '],y=dataset[' BMI ']);

In [None]:
def impute_BMI(c):
    """Return a row's BMI value, filling a missing one from its life-expectancy bin.

    Args:
        c: Two values in the order ``(BMI, Life expectancy)``, e.g. the row
            Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original BMI value when it is present, otherwise the constant
        assigned to the life-expectancy bin. NaN when life expectancy is
        missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    bmi, life = c
    if not pd.isnull(bmi):
        return bmi
    if pd.isnull(life):
        return float('nan')
    # (inclusive upper bound of the life-expectancy bin, BMI fill value)
    bins = ((50, 25.0), (60, 25.0), (70, 32.0), (80, 46.8))
    for upper, fill in bins:
        if life <= upper:
            return fill
    # The top bin is open-ended (it used to stop at 100), so every non-missing
    # life expectancy yields a fill value, like the sibling impute_* helpers.
    return 60.0
    
# Row-wise imputation of BMI from Life expectancy.
dataset[' BMI ']=dataset[[' BMI ','Life expectancy ']].apply(impute_BMI,axis=1)

In [None]:
# Scatter of Total expenditure (y) against Alcohol (x), the predictor used by impute_Total_exp.
sns.scatterplot(y=dataset['Total expenditure'],x=dataset['Alcohol']);

In [None]:
def impute_Total_exp(c):
    """Return a row's Total expenditure, filling a missing one from its Alcohol bin.

    Args:
        c: Two values in the order ``(Total expenditure, Alcohol)``, e.g. the
            row Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Total expenditure when it is present, otherwise the
        constant assigned to the Alcohol bin. NaN when Alcohol is missing as
        well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    total, alcohol = c
    if not pd.isnull(total):
        return total
    if pd.isnull(alcohol):
        return float('nan')
    # (inclusive upper bound of the Alcohol bin, Total expenditure fill value)
    bins = ((2.5, 5.08), (5.0, 6.0), (10.0, 6.71), (12.5, 6.9))
    for upper, fill in bins:
        if alcohol <= upper:
            return fill
    return 6.68  # Alcohol above 12.5
    
# Row-wise imputation of Total expenditure from the (already filled) Alcohol column.
dataset['Total expenditure']=dataset[['Total expenditure','Alcohol']].apply(impute_Total_exp,axis=1)

In [None]:
# Scatter of GDP (y) against percentage expenditure (x), the predictor used by impute_GDP.
sns.scatterplot(x=dataset['percentage expenditure'],y=dataset['GDP']);

In [None]:
def impute_GDP(c):
    """Return a row's GDP, filling a missing one from its percentage-expenditure bin.

    Args:
        c: Two values in the order ``(GDP, percentage expenditure)``, e.g. the
            row Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original GDP when it is present, otherwise the constant assigned
        to the percentage-expenditure bin. NaN when percentage expenditure is
        missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    gdp, pct_exp = c
    if not pd.isnull(gdp):
        return gdp
    if pd.isnull(pct_exp):
        return float('nan')
    # (inclusive upper bound of the percentage-expenditure bin, GDP fill value)
    bins = (
        (1250, 1100.0),
        (2500, 1800.0),
        (3750, 2900.0),
        (7500, 3500.0),
        (8750, 4500.0),
        (10000, 5000.0),
        (11250, 5700.0),
        (12500, 7000.0),
        (15000, 8000.0),
        (17500, 9000.0),
    )
    for upper, fill in bins:
        if pct_exp <= upper:
            return fill
    return 8500.0  # percentage expenditure above 17500

# Row-wise imputation of GDP from percentage expenditure.
dataset['GDP']=dataset[['GDP','percentage expenditure']].apply(impute_GDP,axis=1)

In [None]:

# Scatter of Population (y) against infant deaths (x), the predictor used by impute_population.
sns.scatterplot(x=dataset['infant deaths'],y=dataset['Population']);

In [None]:
def impute_population(c):
    """Return a row's Population, filling a missing one from its infant-deaths bin.

    Args:
        c: Two values in the order ``(Population, infant deaths)``, e.g. the
            row Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Population when it is present, otherwise the constant
        assigned to the infant-deaths bin. NaN when infant deaths is missing
        as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    population, infant_deaths = c
    if not pd.isnull(population):
        return population
    if pd.isnull(infant_deaths):
        return float('nan')
    billion = 10 ** 9
    # (inclusive upper bound of the infant-deaths bin, Population fill value)
    bins = (
        (100, 0.19 * billion),
        (250, 0.18 * billion),
        (350, 0.02 * billion),
        (900, 0.1 * billion),
        (1100, 0.18 * billion),
        (1250, 0.05 * billion),
        (1500, 0.19 * billion),
        (1750, 0.05 * billion),
    )
    for upper, fill in bins:
        if infant_deaths <= upper:
            return fill
    return 0.1 * billion  # infant deaths above 1750
    
# Row-wise imputation of Population from infant deaths.
dataset['Population']=dataset[['Population','infant deaths']].apply(impute_population,axis=1)

In [None]:
# Scatter of thinness 1-19 years (y) against BMI (x), the predictor used by impute_Thin_1.
sns.scatterplot(x=dataset[' BMI '],y=dataset[' thinness  1-19 years']);

In [None]:
def impute_Thin_1(c):
    """Return a row's thinness value, filling a missing one from its BMI bin.

    Args:
        c: Two values in the order ``(thinness, BMI)``, e.g. the row Series
            passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original thinness value when it is present, otherwise the constant
        assigned to the BMI bin. NaN when BMI is missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    thinness, bmi = c
    if not pd.isnull(thinness):
        return thinness
    if pd.isnull(bmi):
        return float('nan')
    # (inclusive upper bound of the BMI bin, thinness fill value)
    bins = ((10, 5.0), (20, 10.0), (30, 8.0), (40, 6.0), (50, 3.0), (70, 4.0))
    for upper, fill in bins:
        if bmi <= upper:
            return fill
    return 1.0  # BMI above 70
    
# Row-wise imputation of thinness 1-19 years from the (already imputed) BMI column.
dataset[' thinness  1-19 years']=dataset[[' thinness  1-19 years',' BMI ']].apply(impute_Thin_1,axis=1)

In [None]:
# Scatter of thinness 5-9 years (y) against BMI (x).
sns.scatterplot(x=dataset[' BMI '],y=dataset[' thinness 5-9 years'])

In [None]:
def impute_Thin_1(c):
    """Return a row's thinness value, filling a missing one from its BMI bin.

    NOTE(review): this redefines ``impute_Thin_1`` from the earlier
    thinness 1-19 cell with the same bins; the name is kept because the
    thinness 5-9 ``apply`` call right after it refers to it.

    Args:
        c: Two values in the order ``(thinness, BMI)``, e.g. the row Series
            passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original thinness value when it is present, otherwise the constant
        assigned to the BMI bin. NaN when BMI is missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    thinness, bmi = c
    if not pd.isnull(thinness):
        return thinness
    if pd.isnull(bmi):
        return float('nan')
    # (inclusive upper bound of the BMI bin, thinness fill value)
    bins = ((10, 5.0), (20, 10.0), (30, 8.0), (40, 6.0), (50, 3.0), (70, 4.0))
    for upper, fill in bins:
        if bmi <= upper:
            return fill
    return 1.0  # BMI above 70
    
# Row-wise imputation of thinness 5-9 years from BMI, using the same BMI bins as the 1-19 column.
dataset[' thinness 5-9 years']=dataset[[' thinness 5-9 years',' BMI ']].apply(impute_Thin_1,axis=1)

In [None]:
# Scatter of Income composition of resources (y) against Life expectancy (x), the predictor used by impute_Income.
sns.scatterplot(x=dataset['Life expectancy '],y=dataset['Income composition of resources'])

In [None]:
def impute_Income(c):
    """Return a row's income composition, filling a missing one from its life-expectancy bin.

    Args:
        c: Two values in the order ``(Income composition of resources,
            Life expectancy)``, e.g. the row Series passed in by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original income value when it is present, otherwise the constant
        assigned to the life-expectancy bin. NaN when life expectancy is
        missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    income, life = c
    if not pd.isnull(income):
        return income
    if pd.isnull(life):
        return float('nan')
    # (inclusive upper bound of the life-expectancy bin, income fill value)
    bins = ((40, 0.4), (50, 0.42), (60, 0.402), (70, 0.54), (80, 0.71))
    for upper, fill in bins:
        if life <= upper:
            return fill
    return 0.88  # life expectancy above 80
        
# Row-wise imputation of Income composition of resources from Life expectancy.
dataset['Income composition of resources']=dataset[['Income composition of resources','Life expectancy ']].apply(impute_Income,axis=1)

In [None]:
# Scatter of Schooling (y) against Life expectancy (x), the predictor used by impute_schooling.
sns.scatterplot(x=dataset['Life expectancy '],y=dataset['Schooling']);

In [None]:
def impute_schooling(c):
    """Return a row's Schooling value, filling a missing one from its life-expectancy bin.

    Args:
        c: Two values in the order ``(Schooling, Life expectancy)``, e.g. the
            row Series passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original Schooling value when it is present, otherwise the
        constant assigned to the life-expectancy bin. NaN when life expectancy
        is missing as well.
    """
    # Unpack by position: integer keys such as c[0] on a label-indexed
    # Series are deprecated in pandas.
    schooling, life = c
    if not pd.isnull(schooling):
        return schooling
    if pd.isnull(life):
        return float('nan')
    # (inclusive upper bound of the life-expectancy bin, Schooling fill value)
    # The 44-50 bin now includes 50 itself; previously a life expectancy of
    # exactly 50 matched no branch and the value stayed missing.
    bins = (
        (40, 8.0),
        (44, 7.5),
        (50, 8.1),
        (60, 8.2),
        (70, 10.5),
        (80, 13.4),
    )
    for upper, fill in bins:
        if life <= upper:
            return fill
    return 16.5  # life expectancy above 80
    
# Row-wise imputation of Schooling from Life expectancy.
dataset['Schooling']=dataset[['Schooling','Life expectancy ']].apply(impute_schooling,axis=1)

In [None]:
# Mean Schooling over the rows with Life expectancy in (80, 90].
# NOTE(review): this runs after the imputation above, so it includes imputed values.
dataset[(dataset['Life expectancy ']>80) & (dataset['Life expectancy ']<=90)]['Schooling'].mean()

## Clean dataset with no null values

In [None]:
# Tabulate the remaining missing values per column to confirm the cleaning
# left none behind.
a = list(dataset.columns)
b = [dataset[column].isnull().sum() for column in a]
null_df = pd.DataFrame({'Feature name': a, 'no. of Nan': b})
null_df

In [None]:
# Separate the target (y) from the features (X), then list the distinct
# Status categories; Status is the categorical column that is turned into
# dummy variables in the next cell.
y=dataset['Life expectancy ']
X=dataset.drop('Life expectancy ',axis=1)
X['Status'].unique()

In [None]:
# One indicator column per Status category; the original text column is
# then removed from the feature frame.
status_dummy = pd.get_dummies(X['Status'])
X = X.drop(columns=['Status'])

In [None]:
# Append the Status indicator columns to the features and show the resulting shape.
X=pd.concat([X,status_dummy],axis=1)
X.shape

# 4.Train/Test split

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101)

## RandomForest Regression

In [None]:
# Fit a 100-tree random forest regressor on the training split (seeded for reproducibility).
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the predictions next to the true values.
np.set_printoptions(precision=2)
y_pred = np.array(regressor.predict(X_test))
y_test = np.array(y_test)

# One row per test sample: column 0 is the prediction, column 1 the actual value.
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

In [None]:
# For a scikit-learn regressor, score() returns the coefficient of
# determination (R^2) on the given data, not a classification accuracy.
# NOTE(review): the variable name is misleading, but a later cell reads it.
accuracy_score =  regressor.score(X_test,y_test)
print(accuracy_score)

In [None]:
# The model is a RandomForestRegressor and the value comes from its score()
# method, i.e. R^2, so label it as such instead of "Classifier Accuracy".
print('Random Forest Regressor R^2 score:',(accuracy_score)*100,'%')

In [None]:
# Root-mean-squared error on the test split (square root of the MSE).
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print(mean_squared_error(y_test,y_pred)**(0.5))

In [None]:
# 10-fold cross-validation on the training split; the cell shows the mean of
# the per-fold scores (the estimator's default scorer is used).
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(regressor,X_train,y_train,cv=10)
accuracies.mean()