In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
data = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')

In [None]:
data.head()

Let's take a look at the dimension of the data.

In [None]:
data.shape

Data is not too big.

Let's take a look at what the data tells us.

In [None]:
data.info()

Looks like there are 2 features which are categorical in nature.

Let's also see if there are any missing values.

In [None]:
data.isnull().sum()

Now while this gives the missing values, it's not that good when it comes to readability.
And as Data Scientists, we should also be careful that our code looks clean and readable.
So, let's write a function that produces a cleaner, more readable output.

In [None]:
def missing_values(df):
    """Summarise the missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        ``'missing_values(%)'`` (share of rows that are null, on a 0-100
        scale) and ``'missing_values(numbers)'`` (absolute null count),
        sorted by the percentage in descending order.
    """
    null_counts = df.isnull().sum()
    # Divide by the length of the frame that was passed in, not by the
    # notebook-level `data` variable, so the result is right for any frame.
    n_rows = len(df)
    if n_rows:
        null_pct = null_counts / n_rows * 100
    else:
        # An empty frame has nothing missing; avoid a 0/0 division.
        null_pct = null_counts.astype(float)
    missing = pd.DataFrame({
        'missing_values(%)': null_pct,
        'missing_values(numbers)': null_counts,
    })
    return missing.sort_values(by='missing_values(%)', ascending=False)
missing_values(data)

Hmmm, looks clean enough. Now let's take care of these missing values.

But first, let's rename the columns, because several names contain stray leading/trailing spaces and spaces between the words.

In [None]:
# Normalise the column names: drop the stray leading/trailing spaces and
# replace the spaces between words with underscores or compact names.
column_renames = {
    " BMI ": "BMI",
    "Life expectancy ": "Life_Expectancy",
    "Adult Mortality": "Adult_Mortality",
    "infant deaths": "Infant_Deaths",
    "percentage expenditure": "Percentage_Exp",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    "under-five deaths ": "Under_Five_Deaths",
    "Diphtheria ": "Diphtheria",
    " HIV/AIDS": "HIV/AIDS",
    " thinness  1-19 years": "thinness_1to19_years",
    " thinness 5-9 years": "thinness_5to9_years",
    "Income composition of resources": "Income_Comp_Of_Resources",
    "Total expenditure": "Tot_Exp",
}
data.rename(columns=column_renames, inplace=True)

In [None]:
data.head()

Now taking care of missing values is always a pickle, because we'd be uncertain of choosing the values that are going to replace missing values. 
Now that depends upon the data.

If the feature that has missing values also has outliers, it's better to replace missing values with ***`median()`***.
If there are no outliers then go ahead and use ***`mean()`*** of the feature to replace missing values.
Now while this holds good for numerical data, in categorical data we can maybe use ***`mode()`*** of that feature to fill the missing values.

In [None]:
# Fill missing values column by column. Numeric columns get their median;
# non-numeric columns get their most frequent value, because a median is not
# defined for strings and calling it there would raise.
for label in data.columns[data.isnull().any()]:
    content = data[label]
    if pd.api.types.is_numeric_dtype(content):
        fill_value = content.median()
    else:
        modes = content.mode()
        if modes.empty:
            # The column is entirely null: there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    data[label] = content.fillna(fill_value)

Let's check if we have successfully replaced missing values with ***`median()`***

In [None]:
missing_values(data)

Beautiful, all our missing values have been taken care of.

Now let's take care of categorical features.
While there are multiple ways of taking care of categorical features, I've decided to use the **`pd.get_dummies()`** function from the **`pandas`** library. 
Let's see how that works.

There are only two categorical features.

In [None]:
# One-hot encode the two categorical columns. The dtype is pinned so the dummy
# columns are numeric 0/1 regardless of the pandas version (pandas 2.0 changed
# the default from uint8 to bool), which keeps later arithmetic such as the
# IQR computation working on them.
data = pd.get_dummies(data, columns=['Country', 'Status'], dtype='uint8')

Let's take a look at our data now.

In [None]:
data.head()

Nice, looks good.
Now let's divide the data into X & y, so that we can split the data into training & test sets.

In [None]:
# Separate the target column from the predictors.
target_col = 'Life_Expectancy'
y = data[target_col]
X = data.drop(columns=[target_col])

In [None]:
X.head()

In [None]:
y.head()

Now let's split our data into training & test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Everything looks good.

Now here comes the dilemma, choosing the right estimator. But lucky for us **scikit-learn** is kind enough to provide us with a map, that'll help us choose the right estimator.
You can find it [here](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)

Now I'll first try with **`GradientBoostingRegressor`**
But first let's import our evaluation metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

# Seed the estimator (same seed as the train/test split) so the reported
# score is identical on every Restart & Run All.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
gbr_pred = gbr.predict(X_test)
print('R2 score is : {:.2f}'.format(r2_score(y_test, gbr_pred)))

Not bad. But wait a minute, we've not yet checked for outliers.

But first, let's make a copy of our dataframe.

In [None]:
df = data.copy()

In [None]:
# Count, per column, the values lying outside the 1.5 * IQR fences.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = df.lt(lower_fence) | df.gt(upper_fence)
outliers = outside_fences.sum().to_frame()

# The frame is long, so show it in four slices of (up to) 60 columns each.
outliers1 = outliers.iloc[:60]
outliers2 = outliers.iloc[60:120]
outliers3 = outliers.iloc[120:180]
outliers4 = outliers.iloc[180:]
outliers1, outliers2, outliers3, outliers4

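Wow! Almost all of the features have outliers. (Keep in mind that the one-hot encoded 0/1 columns are included in this count: when such a column is dominated by a single value its IQR is 0, so every occurrence of the other value is flagged as an outlier.)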

So instead of taking care of these outliers, let's use `RandomForestRegressor`, since random forests are fairly robust to outliers in the input features (though not completely immune to them). 

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest so the score and the feature importances reported below are
# the same on every run; bootstrapping and feature sampling are random otherwise.
rf = RandomForestRegressor(random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf_pred=rf.predict(X_test)

Let's evaluate our model.

In [None]:
print('R2 score is : {:.2f}'.format(r2_score(y_test, rf_pred)))

This did noticeably better than `GradientBoostingRegressor`.

Now let's take a look at important features according to our model.

In [None]:
rf.feature_importances_

Woaahhh ! That looks overwhelming.

Let's try and visualize it, so that we can understand it better.

Let's look at the top 10 features.

In [None]:
import seaborn as sns
# Helper function for plotting feature importance
def plot_features(columns, importances, n=10):
    """Draw a horizontal bar chart of the ``n`` largest feature importances.

    Parameters
    ----------
    columns : sequence
        Feature names, in the same order as ``importances``.
    importances : sequence
        Importance value for each feature.
    n : int, default 10
        Number of top-ranked features to plot.
    """
    ranked = pd.DataFrame({"features": columns,
                           "feature_importance": importances})
    ranked = ranked.sort_values("feature_importance", ascending=False)
    ranked = ranked.reset_index(drop=True)
    top_n = ranked.iloc[:n]

    sns.barplot(data=top_n,
                x="feature_importance",
                y="features",
                orient="h")

In [None]:
plot_features(X_train.columns, rf.feature_importances_)

Hmmm, top 10 features according to our model are : 
                                                    
`'HIV/AIDS',
'Income_Comp_Of_Resources',
'Adult_Mortality',
'BMI',
'Under_Five_Deaths',
'Schooling',
'thinness_5to9_years',
'Year',
'Alcohol',
'thinness_1to19_years'`.
                                                    
So let's just use these 10 features and see if the model still works well.

Because as data scientists, we should always look for ways to cut down computational costs, and one way to do that is to reduce the dimensionality of the data.

In [None]:
# Pick the 10 most important features straight from the fitted forest rather
# than from a hand-typed list, so the subset always matches what the model
# reports. (The previous hard-coded list contained 'Infant_Deaths', whereas the
# top-10 list in the write-up names 'Alcohol'.)
top_features = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .nlargest(10)
    .index
    .tolist()
)
new_data = data[top_features]

In [None]:
new_data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(new_data,y, test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Now let's re-train our model on this new data.

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf_pred_new = rf.predict(X_test)

Now let's evaluate on new data.

In [None]:
print('R2 score is : {:.2f}'.format(r2_score(y_test, rf_pred_new)))

Worked like a charm.

In [None]:
# Wrap the predictions in a labelled frame that keeps the test-set row index,
# so every exported prediction can be traced back to its input row.
# np.ravel makes the cell safe to re-run after rf_pred_new is already a frame.
rf_pred_new = pd.DataFrame(
    {'Life_Expectancy_pred': np.ravel(rf_pred_new)},
    index=X_test.index,
)

In [None]:
rf_pred_new.to_csv('predictions.csv')