In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory Data Analysis 

## Schedule

- Load data
- Get data basic statistics
- Check missing values
- Data Imputation or Feature Removal
- Categorical Variables countplot
- Numerical Variables Distribution
- Effect Plots

## Load Data

In [None]:
# Read the life-expectancy CSV into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer=r"../input/life-expectancy-who/Life Expectancy Data.csv"
)
df

In [None]:
# Show the dtype pandas inferred for every column of the loaded frame.
df.dtypes

## Basic Statistics

In [None]:
# Summary statistics produced by pandas' describe() for the loaded frame.
df.describe()

## Checking Missing Values

In [None]:
# Share of missing entries per column (mean of the boolean NaN mask), drawn as bars.
missing_share = df.isna().mean()
missing_share.plot.bar(figsize=(12, 5))

In [None]:
# Heatmap of the boolean NaN mask: one cell per (row, column) of the frame.
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(df.isna(), cbar=False, ax=ax)

## Data Imputation or Feature Removal

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
def DataImputation(data: pd.DataFrame, strategy: str = "mean") -> pd.DataFrame:
    """Return a copy of ``data`` whose numeric columns have missing values imputed.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to impute. It is left untouched; the work is done on a copy so
        the caller keeps access to the raw values.
    strategy : str, default "mean"
        Forwarded unchanged to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns as ``data``. Non-numeric
        columns are copied as they are.
    """
    # Work on a copy: writing into ``data`` would silently change the frame
    # the caller passed in (and every earlier cell that displayed it).
    imputed = data.copy()

    # Select numeric columns explicitly instead of "everything that is not
    # object", so dedicated string / datetime dtypes never reach the imputer.
    numerical_columns = imputed.select_dtypes(include="number").columns
    if len(numerical_columns) == 0:
        # Nothing to impute; fitting an imputer on zero columns would fail.
        return imputed

    imp = SimpleImputer(strategy=strategy)
    imputed.loc[:, numerical_columns] = imp.fit_transform(imputed[numerical_columns])
    return imputed

In [None]:
# Impute with the mean strategy, then show the share of missing values left per column.
df_clean = DataImputation(df, strategy="mean")
df_clean.isna().mean()

## Categorical Variables Countplots

In [None]:
# Keep only the object-dtype columns and remember their names for plotting.
df_cat = df_clean.select_dtypes(include=["object"])
cat_features = df_cat.columns

In [None]:
# One count plot per categorical feature.
# The Series is passed explicitly as ``x=``: seaborn's plotting variables are
# keyword-only in current releases, so a bare positional Series would be
# interpreted as ``data`` instead of the variable to count.
for feature in cat_features:
    plt.figure(figsize=(12,5))
    sns.countplot(x=df_cat[feature])
    plt.title(f"Observations per {feature}")

## Numerical Variables Distribution

In [None]:
# Histogram of every non-object column, each on its own figure.
df_num = df_clean.select_dtypes(exclude=["object"])
for feature, values in df_num.items():
    plt.figure(figsize=(10, 5))
    sns.histplot(values)

## Effects Plot

`Country` has many unique values and every country has the same number of observations, so only `Status` will be used as the categorical variable in the effects plots.

In [None]:
# Split the target off from the remaining columns.
y = df_clean["Life expectancy "]
df_clean2 = df_clean.drop(columns=["Life expectancy "])

# Scatter every non-object column against the target, coloured by Status.
status = df_clean2["Status"]
numeric_predictors = df_clean2.select_dtypes(exclude="O").columns
for feature in numeric_predictors:
    plt.figure(figsize=(12, 5))
    sns.scatterplot(x=df_clean2[feature], y=y, hue=status)

In [None]:
# Pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly: calling ``corr()`` on a frame
# that still holds text columns raises in pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
plt.figure(figsize=(12,8))
numeric_corr = df_clean.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr,annot=True,cbar=False)

# Model Creation

## Splitting Data

- Target: Life expectancy

We'll use all the variables except `Country` and `Year` to predict our target variable.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoding Status
# The integer codes are written into the feature matrix only. ``df_clean`` is
# left as it was, so the EDA cells above can be re-run and the cell itself is
# idempotent. Assigning through ``assign`` also gives the column a proper
# integer dtype instead of writing integers into an object-dtype column.
enc = LabelEncoder()
status_codes = enc.fit_transform(df_clean["Status"])

y = df_clean["Life expectancy "]
X = (
    df_clean
    .drop(labels=["Country","Year","Life expectancy "],axis=1)
    .assign(Status=status_codes)
)

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 train/validation split. The seed is fixed so that every
# "Restart & Run All" produces the same split and therefore comparable metrics.
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=42)

## Model Selection

In [None]:
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import time

# Seed for the estimators that use randomness, so the comparison is reproducible.
RANDOM_STATE = 42

models = [LinearRegression(),Ridge(),RandomForestRegressor(random_state=RANDOM_STATE),
        DecisionTreeRegressor(random_state=RANDOM_STATE),SVR(),KNeighborsRegressor()]

models_names = ["MLR","Ridge","RandomForest","DecisionTree","SVR","KNN"]

# zip() below would silently drop entries if the two lists got out of sync.
if len(models) != len(models_names):
    raise ValueError("models and models_names must have the same length")

# Per-model results, appended in the same order as ``models_names``.
models_time = []

mse_trains = []
mse_valids = []

mae_trains = []
mae_valids = []


for name,model in zip(models_names,models):

    # Every model is fitted behind the same standardisation step.
    pipe = Pipeline([("scaler",StandardScaler()),(name,model)])

    # perf_counter is a monotonic clock meant for measuring intervals.
    start = time.perf_counter()
    pipe.fit(X_train,y_train)
    fitting_time = time.perf_counter()-start

    y_train_pred = pipe.predict(X_train)
    y_valid_pred = pipe.predict(X_valid)

    mse_trains.append(mean_squared_error(y_train,y_train_pred))
    mse_valids.append(mean_squared_error(y_valid,y_valid_pred))
    mae_trains.append(mean_absolute_error(y_train,y_train_pred))
    mae_valids.append(mean_absolute_error(y_valid,y_valid_pred))
    models_time.append(fitting_time)


In [None]:
# One row per model: train/validation errors plus the measured fitting time,
# which the loop above collects but which was previously never reported.
# The index is set by chaining rather than ``inplace=True`` so that re-running
# the cell always rebuilds the table from the result lists.
df_models = pd.DataFrame(
    {
        "Model": models_names,
        "MSE Train": mse_trains,
        "MSE Valid": mse_valids,
        "MAE Train": mae_trains,
        "MAE Valid": mae_valids,
        "Fit Time (s)": models_time,
    }
).set_index("Model")

df_models

In [None]:
# The bundled seaborn style sheets were renamed to "seaborn-v0_8*" in newer
# matplotlib releases and the old names were later removed; pick whichever
# name this matplotlib installation provides.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)

# One grouped bar chart per metric (train vs. validation for every model).
for metric in ("MSE", "MAE"):
    metric_columns = df_models.columns[df_models.columns.str.match(metric)]
    ax = df_models.loc[:, metric_columns].plot.bar(figsize=(12,5),rot=0,fontsize=13)
    ax.set_title(f"{metric} by model")
    ax.set_ylabel(metric)


## Model Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Seeded forest so that the search result is reproducible between runs.
rf = RandomForestRegressor(random_state=42)
steps = [("scaler",StandardScaler()),("rf",rf)]
pipe = Pipeline(steps)

# ``max_features="auto"`` is no longer accepted by recent scikit-learn
# releases. For a regressor it meant "use all features", which is what the
# fraction 1.0 expresses.
param_grid = {"rf__max_depth":[5,10,None],
                "rf__bootstrap":[True],
                "rf__max_features":[1.0,"log2"],
                "rf__n_estimators":np.arange(100,210,10)}

# Exhaustive search over the grid. No ``scoring`` is given, so the
# pipeline's own ``score`` method ranks the candidates.
search = GridSearchCV(estimator = pipe,param_grid=param_grid,return_train_score=True,verbose=3)

search.fit(X,y)

## Checking Best Model

In [None]:
# List the winning parameter combination, one "name: value" pair per line.
print("Best Parameters")
for param_name, param_value in search.best_params_.items():
    print(param_name, param_value, sep=": ")

In [None]:
best = search.best_estimator_

# ``best`` comes from a search that was fitted on X, and it is scored on X
# here. This MAE is therefore an in-sample figure, and the output says so
# explicitly instead of presenting it as the model's expected error.
y_pred_best = best.predict(X)
mae_best = mean_absolute_error(y,y_pred_best)
print(f"Best Model MAE (in-sample): {mae_best:.2f}")

# Out-of-fold estimate for the selected parameters: the mean cross-validated
# score recorded by the search.
print(f"Best Model mean CV score: {search.best_score_:.3f}")

In [None]:
# Feature importances of the forest step of the best pipeline, sorted ascending
# so the longest bar is drawn at the top of the horizontal bar chart.
features = X.columns
importances = best.named_steps["rf"].feature_importances_
indices = np.argsort(importances)

fig = plt.figure(1)
ax = fig.gca()
positions = range(len(indices))

ax.set_title('Feature Importances', fontsize=20, fontweight="bold")
ax.barh(positions, importances[indices], color='b', align='center')
ax.set_yticks(positions)
ax.set_yticklabels(features[indices], fontsize=13)
ax.set_xlabel('Relative Importance', fontsize=15)
fig.tight_layout()