## Importing necessary packages for the Analysis:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
import datetime
import math

# Suppress every warning for the rest of the session (this also hides
# deprecation notices emitted by the plotting and data libraries).
warnings.filterwarnings("ignore")
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" theme to all subsequent plots.
sns.set_style("whitegrid")

## Loading the Dataset:

In [None]:
# Load the CSV at this relative path into the working frame `data`.
data=pd.read_csv("../input/life-expectancy-who/Life Expectancy Data.csv")

## Getting a Brief Look at the Data:

In [None]:
# Display the first three rows.
data.head(3)

In [None]:
# Display the last three rows.
data.tail(3)

In [None]:
# (number of rows, number of columns) of the raw frame.
data.shape

In [None]:
# Print each column's dtype and non-null count.
data.info()

## Checking Missing Values:

In [None]:
def checkna(data):
    """Summarise the columns of ``data`` that contain missing values.

    Returns a DataFrame with one row per column holding at least one NaN:
    the column name (``Features``), its NaN count (``Missing_Values``) and
    that count as a percentage of the number of rows, rounded to two
    decimals (``Missing_Percent``).  Row labels are each column's position
    in ``data``; columns without NaNs are left out.
    """
    na_counts = data.isna().sum()
    summary = na_counts.reset_index()
    summary.columns = ["Features", "Missing_Values"]

    share_of_rows = summary["Missing_Values"] / len(data) * 100
    summary["Missing_Percent"] = round(share_of_rows, 2)

    has_missing = summary["Missing_Values"] > 0
    return summary[has_missing]

In [None]:
# Missing-value summary for the raw frame (target column included).
checkna(data)

In [None]:
# Replace the raw headers with underscore-separated names.  The list is
# assigned positionally, so it must contain one name per existing column,
# in the frame's column order.
data.columns=['Country', 'Year', 'Status', 'Life_expectancy', 'Adult_Mortality',
       'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B',
       'Measles', 'BMI', 'under_five_deaths', 'Polio', 'Total_expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
       'thinness', 'thinness_5-9_years',
       'Income_composition_of_resources', 'Schooling']

In [None]:
# Target column on its own; every other column becomes a predictor.
y = data["Life_expectancy"]
df = data.drop(columns=["Life_expectancy"])

# Partition the predictors by dtype: object-typed columns vs. everything else.
categorical = df.select_dtypes(include=["O"])
numerical = df.select_dtypes(exclude=["O"])

## Exploratory Data Analysis:

### Non Graphical EDA:

In [None]:
# Summary statistics for the object-typed predictors.
categorical.describe()

In [None]:
# Summary statistics for the non-object predictors, rounded to three decimals.
round(numerical.describe(),3)

#### Countries Vs Life Expectancy:

In [None]:
# Rank countries by the mean of Life_expectancy over each country's rows.
separator = "=" * 50
life_by_country = data.groupby("Country").Life_expectancy
country_mean = life_by_country.mean()

print("Top 10 Countries with Most Life Expectancy")
print(separator)
print(country_mean.sort_values(ascending=False).head(10))
print(separator)
print("Top 10 Countries with Least Life Expectancy")
print(separator)
print(country_mean.sort_values(ascending=True).head(10))
print(separator)
# Last table: the ten highest per-country medians (printed without a heading).
print(life_by_country.median().sort_values(ascending=False).head(10))

#### Countries Vs Life Expectancy on the basis of Status:

In [None]:
# Mean life expectancy for each value of the Status column.
data.groupby("Status").Life_expectancy.mean()

In [None]:
# Ten highest, then ten lowest, per-country mean life expectancies,
# reported separately for each development status.
separator = "=" * 50
for rank_label, ascending in (("Top", False), ("Least", True)):
    for status in ("Developed", "Developing"):
        status_rows = data[data.Status == status]
        status_mean = status_rows.groupby("Country").Life_expectancy.mean()
        print(separator)
        print(f"{rank_label} 10 {status} Countries Life Expectancy")
        print(separator)
        print(status_mean.sort_values(ascending=ascending).head(10))

#### Countries Vs Population:

In [None]:
# Ten largest and ten smallest per-country mean populations.
separator = "=" * 50
population_mean = data.groupby("Country").Population.mean()
for heading, ascending in (
    ("Top 10 Countries with Most Population", False),
    ("Top 10 Countries with Least Population", True),
):
    print(separator)
    print(heading)
    print(separator)
    print(population_mean.sort_values(ascending=ascending).head(10))

#### Countries Vs GDP:

In [None]:
# Ten highest and ten lowest per-country mean GDP values.
separator = "=" * 50
gdp_mean = data.groupby("Country").GDP.mean()
for heading, ascending in (
    ("Top 10 Countries with Highest GDP", False),
    ("Top 10 Countries with Lowest GDP", True),
):
    print(separator)
    print(heading)
    print(separator)
    print(gdp_mean.sort_values(ascending=ascending).head(10))

### Graphical EDA:

In [None]:
# Frequency bar chart for every object-typed predictor.
for feature in categorical.columns:
    # Pass the series as `x` by keyword: in seaborn >= 0.12 the first
    # positional parameter is `data` and the rest are keyword-only, so a
    # positional series is no longer interpreted as the x variable.
    sns.countplot(x=categorical[feature], dodge=True)
    plt.show()

In [None]:
# One histogram per non-object predictor, each shown as its own figure.
axis_label_font = {"fontsize": 13, "fontweight": "bold"}
for feature in numerical.columns:
    sns.histplot(numerical[feature])
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.ylabel("Count", **axis_label_font, name="helvetica")
    plt.xlabel(feature, **axis_label_font)
    plt.show()

In [None]:
# Distribution of the target variable.
bold_13 = {"fontsize": 13, "fontweight": "bold"}
sns.histplot(y, color="olive")
# NOTE(review): `name` and `family` are both font-family settings here;
# confirm which of the two is meant to take effect.
plt.ylabel("Count", **bold_13, name="Sans", family="monospace")
plt.xlabel("Life Expectancy", **bold_13, family="monospace")

In [None]:
# Spread of the target within each Status group.
status_column = data["Status"]
axis_label_font = {"fontsize": 13, "fontweight": "bold"}

sns.boxplot(x=status_column, y=y, fliersize=5)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel("Life Expectancy", **axis_label_font, name="helvetica")
plt.xlabel("Status", **axis_label_font)
plt.show()

In [None]:
# Horizontal bars for the 20 countries with the highest mean life expectancy.
plt.figure(figsize=(7, 5))
country_mean = data.groupby("Country").Life_expectancy.mean()
order = country_mean.nlargest(20).index
title_font = {"fontsize": 18, "fontweight": "bold", "fontstyle": "italic"}
axis_label_font = {"fontsize": 14, "fontweight": "bold"}

sns.barplot(y="Country", x="Life_expectancy", data=data, order=order, palette="YlGnBu_r")
plt.title("Top 20 Countries with Highest Life Expectancy", **title_font)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel("Country", **axis_label_font)
plt.xlabel("Life Expectancy", **axis_label_font)

In [None]:
# Horizontal bars for the 20 countries with the lowest mean life expectancy.
plt.figure(figsize=(7, 5))
country_mean = data.groupby("Country").Life_expectancy.mean()
order = country_mean.sort_values(ascending=True).iloc[:20].index
title_font = {"fontsize": 18, "fontweight": "bold", "fontstyle": "italic"}
axis_label_font = {"fontsize": 14, "fontweight": "bold"}

sns.barplot(y="Country", x="Life_expectancy", data=data, order=order, palette="RdYlBu")
plt.title("Top 20 Countries with Lowest Life Expectancy", **title_font)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel("Country", **axis_label_font)
plt.xlabel("Life Expectancy", **axis_label_font)

In [None]:
# Scatter of each non-object predictor against the target, coloured by Status.
axis_label_font = {"fontsize": 13, "fontweight": "bold"}
status_hue = categorical["Status"]
for feature in numerical.columns:
    sns.scatterplot(x=numerical[feature], y=y, hue=status_hue)
    plt.xticks(rotation=90, fontsize=12)
    plt.yticks(fontsize=12)
    plt.ylabel("Life Expectancy", **axis_label_font, name="helvetica")
    plt.xlabel(feature, **axis_label_font)
    plt.show()

In [None]:
# Bars for the 20 countries with the highest mean GDP.
plt.figure(figsize=(8, 6))
gdp_mean = data.groupby("Country").GDP.mean()
order = gdp_mean.sort_values(ascending=False).head(20).index
title_font = {"fontsize": 18, "fontweight": "bold", "fontstyle": "italic"}
axis_label_font = {"fontsize": 14, "fontweight": "bold"}

sns.barplot(x="Country", y="GDP", data=data, order=order, errwidth=False, palette="mako")
plt.title("Top 20 Countries with Highest GDP", **title_font)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel("GDP", **axis_label_font)
plt.xlabel("Country", **axis_label_font)

In [None]:
# Bars for the 20 countries with the lowest mean GDP.
plt.figure(figsize=(8, 6))
gdp_mean = data.groupby("Country").GDP.mean()
order = gdp_mean.sort_values(ascending=True).head(20).index
title_font = {"fontsize": 18, "fontweight": "bold", "fontstyle": "italic"}
axis_label_font = {"fontsize": 14, "fontweight": "bold"}

sns.barplot(x="Country", y="GDP", data=data, order=order, errwidth=False, palette="mako")
plt.title("Top 20 Countries with Lowest GDP", **title_font)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel("GDP", **axis_label_font)
plt.xlabel("Country", **axis_label_font)

## Feature Engineering:

In [None]:
# Missing-value summary for the predictors only (target already removed).
checkna(df)

In [None]:
def imputer(data, feature, method):
    """Fill the NaNs of ``data[feature]`` with a statistic of that column.

    ``method`` picks the statistic: ``"mode"`` uses the first modal value,
    ``"median"`` the median, and any other value falls back to the mean.
    The filled column is assigned back onto ``data``; nothing is returned.
    """
    column = data[feature]
    if method == "mode":
        fill_value = column.mode()[0]
    elif method == "median":
        fill_value = column.median()
    else:
        fill_value = column.mean()
    data[feature] = column.fillna(fill_value)

In [None]:
# Mean-impute every predictor column that still contains at least one NaN.
has_nan = df.isna().any()
features_missing = df.columns[has_nan]
for feature in features_missing:
    imputer(df, feature, "mean")

In [None]:
# Fill missing target values with the target's median.  `y` is rebound to
# the filled copy instead of using `inplace=True`: `y` was pulled out of
# `data`, and whether an in-place fill also writes through to `data`
# differs between pandas versions.
y = y.fillna(y.median())

##### We have imputed the missing feature values with the mean of each column (the target uses its median):

### Label Encoding:

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace each listed column with integer codes from a fresh LabelEncoder.
columns = ["Country", "Year", "Status"]
for feature in columns:
    le = LabelEncoder()
    # `astype` returns a new object rather than converting in place, so the
    # cast has to be part of the assignment to have any effect.
    df[feature] = le.fit_transform(df[feature]).astype("int64")

### Feature Selection:

In [None]:
# Annotated correlation matrix of all predictors together with the target.
plt.figure(figsize=(12, 12))
features_and_target = pd.concat([df, y], axis=1)
sns.heatmap(features_and_target.corr(), annot=True, cmap="YlGnBu_r")

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the listed predictors one column at a time, each with its own
# freshly fitted scaler.
# NOTE(review): 'thinness_5-9_years' is not in this list, so it keeps its
# original scale — confirm whether that is intended.
columns = [
    "Adult_Mortality",
    "infant_deaths",
    "Alcohol",
    "percentage_expenditure",
    "Hepatitis_B",
    "Measles",
    "BMI",
    "under_five_deaths",
    "Polio",
    "Total_expenditure",
    "Diphtheria",
    "HIV/AIDS",
    "GDP",
    "Population",
    "thinness",
    "Income_composition_of_resources",
    "Schooling",
]
for feature in columns:
    sc = StandardScaler()
    single_column = df[[feature]]
    df[[feature]] = sc.fit_transform(single_column)

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Horizontal bar chart of each predictor's mutual information with the target.
# No random_state is passed, so the estimates may differ slightly per run.
plt.figure(figsize=(8, 6))
mi_scores = pd.Series(mutual_info_regression(df, y), index=df.columns)
mi_scores.sort_values(ascending=False).plot(kind="barh")
plt.title("Feature importances", fontsize=20)
plt.yticks(fontsize=12)

## Training and Testing the Model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split over all predictors with a fixed seed.
# NOTE: the next cell reassigns these four names from a 10-feature subset
# before any model is fitted, so this split is not used downstream.
X_train, X_test, y_train, y_test= train_test_split(df, y, test_size= 0.30, random_state=9)

In [None]:
# Keep the ten predictors with the highest mutual information with the target.
mi_ranking = pd.Series(mutual_info_regression(df, y), index=df.columns)
columns = mi_ranking.sort_values(ascending=False)[:10].index
datacopy = df[columns]
target = y

# 67/33 split with a fixed seed over the selected predictors.
X_train, X_test, y_train, y_test = train_test_split(
    datacopy, target, test_size=0.33, random_state=9
)

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_percentage_error as mape, mean_squared_error as mse

# Candidate regressors, all with library-default hyper-parameters.
alg = [
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    SVR(),
]

# Fit each candidate and report RMSE plus (1 - MAPE) * 100, labelled
# "Accuracy", on the training and the test split.
for model in alg:
    model.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)
    print(model, "Train Root Mean Squared error:", np.sqrt(mse(y_train, train_pred)))
    print(model, "Train Accuracy:", (1 - mape(y_train, train_pred)) * 100)
    print("*" * 50)
    print(model, "Test Root Mean Squared error:", np.sqrt(mse(y_test, test_pred)))
    print(model, "Test Accuracy:", (1 - mape(y_test, test_pred)) * 100)
    print("=" * 70)

In [None]:
# Default random forest on the current split; report its test RMSE.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred3 = rf.predict(X_test)
test_rmse = np.sqrt(mse(y_test, y_pred3))
print("RMSE:", test_rmse)

In [None]:
# Re-select the ten predictors with the highest mutual information with the
# target and redo the 67/33 split with the same seed.
columns= pd.Series(mutual_info_regression(df, y), index= df.columns).sort_values(ascending= False)[:10].index
datacopy= df[columns]
target= y

X_train,X_test, y_train, y_test= train_test_split(datacopy, target, test_size= 0.33, random_state=9)
from sklearn.ensemble import RandomForestRegressor
# Final model: a 90-tree random forest; predictions are kept for both splits
# because later cells plot them.
rf= RandomForestRegressor(n_estimators=90)
rf.fit(X_train, y_train)
y_pred= rf.predict(X_test)
y_pred2= rf.predict(X_train)

print("Test Root Mean Squared error:", round(np.sqrt(mse(y_test, y_pred)),2))
print("Test Accuracy:", round((1-mape(y_test, y_pred))*100,2))
# The precision argument belongs inside round(); with the parenthesis closed
# too early the value was rounded to an integer and a stray "2" was printed.
print("Mean Absolute Percentage Error:", round(mape(y_test, y_pred)*100, 2))

In [None]:
# Training-split scatter: prediction on the x axis, (prediction - actual) on
# the y axis.
plt.figure(figsize=(7, 5))
train_error = y_pred2 - y_train
title_font = {"fontsize": 18, "fontweight": "bold", "fontstyle": "italic"}
axis_label_font = {"fontsize": 14, "fontweight": "bold"}

sns.scatterplot(y=train_error, x=y_pred2, color="brown")
plt.title("Residuals Vs Predicted value", **title_font)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel("Residual", **axis_label_font)
plt.xlabel("Predicted", **axis_label_font)

In [None]:
# Side-by-side histograms: test-set predictions (left) and true values (right).
n_rows, n_cols = 1, 2

plt.subplot(n_rows, n_cols, 1)
sns.histplot(y_pred)
plt.title("Predicted")
plt.xlabel("Predicted")

plt.subplot(n_rows, n_cols, 2)
sns.histplot(y_test)
plt.title("Actual")
plt.xlabel("Actual")