In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Get the Data

In [None]:
def _read_year(year):
    """Read the happiness CSV for one survey year from ../input/world-happiness."""
    csv_path = f'../input/world-happiness/{year}.csv'
    return pd.read_csv(csv_path, parse_dates=True, encoding="cp1252")


# One raw frame per survey year; every file is read with the same options.
happines_data_2015 = _read_year(2015)
happines_data_2016 = _read_year(2016)
happines_data_2017 = _read_year(2017)
happines_data_2018 = _read_year(2018)
happines_data_2019 = _read_year(2019)

In [None]:
# 2015 file: discard the columns that are not used later and rename the remaining
# ones to the column names the rest of the notebook works with.
happines_data_2015 = (
    happines_data_2015
    .drop(columns=['Region', 'Standard Error', 'Dystopia Residual'])
    .rename(columns={
        'Happiness Rank': 'Overall rank',
        'Country': 'Country or region',
        'Happiness Score': 'Score',
        'Economy (GDP per Capita)': 'GDP per capita',
        'Family': 'Social support',
        'Health (Life Expectancy)': 'Healthy life expectancy',
        'Freedom': 'Freedom to make life choices',
        'Trust (Government Corruption)': 'Perceptions of corruption',
    })
)
happines_data_2015.head()

In [None]:
# 2016 file: same renaming as 2015; this year carries confidence-interval columns
# instead of a standard error, and those are dropped as well.
happines_data_2016 = (
    happines_data_2016
    .drop(columns=['Region', 'Lower Confidence Interval', 'Upper Confidence Interval', 'Dystopia Residual'])
    .rename(columns={
        'Happiness Rank': 'Overall rank',
        'Country': 'Country or region',
        'Happiness Score': 'Score',
        'Economy (GDP per Capita)': 'GDP per capita',
        'Family': 'Social support',
        'Health (Life Expectancy)': 'Healthy life expectancy',
        'Freedom': 'Freedom to make life choices',
        'Trust (Government Corruption)': 'Perceptions of corruption',
    })
)
happines_data_2016.head()

In [None]:
# 2017 file: its headers use dots instead of spaces/parentheses, so the mapping keys
# differ from 2015/2016 while the target names stay the same.
happines_data_2017 = (
    happines_data_2017
    .drop(columns=['Whisker.high', 'Whisker.low', 'Dystopia.Residual'])
    .rename(columns={
        'Happiness.Rank': 'Overall rank',
        'Country': 'Country or region',
        'Happiness.Score': 'Score',
        'Economy..GDP.per.Capita.': 'GDP per capita',
        'Family': 'Social support',
        'Health..Life.Expectancy.': 'Healthy life expectancy',
        'Freedom': 'Freedom to make life choices',
        'Trust..Government.Corruption.': 'Perceptions of corruption',
    })
)
happines_data_2017.head()

In [None]:
# Stack the five yearly frames into one table (rows are aligned by column name).
# ignore_index=True gives every row a unique 0..n-1 label; without it each file's
# own 0-based index is repeated once per year, which makes label-based lookups
# (.loc) return several rows for a single label.
happines_data = pd.concat(
    [happines_data_2019, happines_data_2018, happines_data_2017, happines_data_2016, happines_data_2015],
    ignore_index=True,
)
# Rank and country name are not used as model inputs, so they are removed here.
happines_data = happines_data.drop(columns=['Overall rank', 'Country or region'])
happines_data.head(800)

In [None]:
# Summary statistics of the combined table, shown as the cell output.
happines_data.describe()

In [None]:
# True when at least one 'Perceptions of corruption' value is missing.
happines_data['Perceptions of corruption'].isna().to_numpy().any()

In [None]:
# Number of missing 'Perceptions of corruption' values.
happines_data['Perceptions of corruption'].isna().sum()

In [None]:
# Replace missing 'Perceptions of corruption' values with 0; other columns are left untouched.
happines_data = happines_data.fillna({'Perceptions of corruption': 0})

### Split the data into training and test sets

In [None]:
# Feature matrix: the six explanatory columns, in the order the models are trained on.
X = happines_data.loc[:, ['GDP per capita',
                          'Social support',
                          'Healthy life expectancy',
                          'Freedom to make life choices',
                          'Generosity',
                          'Perceptions of corruption']]

# Target: the happiness score.
y = happines_data.loc[:, 'Score']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error, mean_poisson_deviance, mean_gamma_deviance
from sklearn.model_selection import KFold

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Candidate regressors, all with default hyper-parameters. Estimators that rely on
# randomness are seeded with random_state=0 (the seed used elsewhere in the notebook)
# so the comparison printed below is the same on every run.
models=[("Linear Regression", LinearRegression()),
        ("Ridge Regression", Ridge()),
        ("Lasso Regression", Lasso()),
        ("Elastic-Net Regression", ElasticNet()),
        ("Stochastic Gradient Descent", SGDRegressor(random_state=0)),
        ("Decision Tree", DecisionTreeRegressor(random_state=0)),
        ("Random Forest", RandomForestRegressor(random_state=0)),
        ("Extra Trees", ExtraTreesRegressor(random_state=0)),
        ("Gradient Boosting", GradientBoostingRegressor(random_state=0)),
        ("KNeighbors", KNeighborsRegressor()),
        ("SVM linear", SVR(kernel='linear')),
        ("SVM rbf", SVR(kernel='rbf')),
        ("Ada Boost", AdaBoostRegressor(random_state=0))]

# 10-fold cross-validation on the training split only; prints mean ± std of the
# per-fold score (the estimator's default scorer) with ANSI colours.
for name, model in models:
    results = cross_val_score(model, X_train, y_train, cv=10)
    print(f"\x1b[96m{name}\x1b[0m: \x1b[93m{results.mean():.4f}\x1b[0m ± {results.std():.4f}")

In [None]:
# Fit an Elastic-Net baseline on the training split and evaluate it on the held-out rows.
enet = ElasticNet(random_state=0)
enet.fit(X_train, y_train)

enet_predict = enet.predict(X_test)

print("score: ", enet.score(X_test, y_test))
print("cross_val_score: ", cross_val_score(enet, X_train, y_train, cv = 10).mean())
print("r2_score: ", r2_score(y_test, enet_predict))
print("")
print("mean_absolute_error: ", mean_absolute_error(y_test, enet_predict))
print("mean_squared_error: ", mean_squared_error(y_test, enet_predict))
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
# deprecated in recent scikit-learn releases, while this form works on every version.
print("root_mean_squared_error: ", np.sqrt(mean_squared_error(y_test, enet_predict)))
print("max_error: ", max_error(y_test, enet_predict))

In [None]:
# Fit an Extra-Trees regressor and evaluate it on the held-out rows.
# random_state=0 makes the randomised trees (and therefore every metric printed
# below) reproducible, consistent with the other seeded models in the notebook.
et = ExtraTreesRegressor(random_state=0)
et.fit(X_train, y_train)

et_predict = et.predict(X_test)

print("score: ", et.score(X_test, y_test))
print("cross_val_score: ", cross_val_score(et, X_train, y_train, cv = 10).mean())
print("r2_score: ", r2_score(y_test, et_predict))
print("")
print("mean_absolute_error: ", mean_absolute_error(y_test, et_predict))
print("mean_squared_error: ", mean_squared_error(y_test, et_predict))
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
# deprecated in recent scikit-learn releases, while this form works on every version.
print("root_mean_squared_error: ", np.sqrt(mean_squared_error(y_test, et_predict)))
print("max_error: ", max_error(y_test, et_predict))

## Train the model

In [None]:
# Gradient-boosting regressor with hand-picked hyper-parameters; this is the model
# that the later cells of the notebook inspect and query.
gr_boosting = GradientBoostingRegressor(random_state=0,
                                        loss='huber',
                                        max_depth=5,
                                        max_features=3,
                                        learning_rate=0.2,
                                        n_estimators=27,
                                        min_samples_split=6,
                                        min_samples_leaf=4)

gr_boosting.fit(X_train, y_train)

# Score on both splits; a large gap between the two numbers indicates over-fitting.
# The labels are plain ASCII so the output can be searched and compared reliably.
print(f"""Train: {gr_boosting.score(X_train, y_train)}\nTest: {gr_boosting.score(X_test, y_test)}""")

gr_predict = gr_boosting.predict(X_test)
print("")
print("mean_absolute_error: ", mean_absolute_error(y_test, gr_predict))
print("mean_squared_error: ", mean_squared_error(y_test, gr_predict))
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
# deprecated in recent scikit-learn releases, while this form works on every version.
print("root_mean_squared_error: ", np.sqrt(mean_squared_error(y_test, gr_predict)))
print("max_error: ", max_error(y_test, gr_predict))

In [None]:
# Per-sample comparison of real and predicted scores on the test split.
y_list = list(y_test)

# A single vectorised call replaces one predict() per row; the predictions come
# back in the row order of X_test, which is also the order of y_list.
predictions = gr_boosting.predict(X_test)

# Absolute error per sample, and the same error relative to the real score
# (a fraction; `correct` keeps its original name for compatibility).
error = [abs(r - p) for r, p in zip(y_list, predictions)]
correct = [abs_err / r for abs_err, r in zip(error, y_list)]

for r, p, abs_err, rel_err in zip(y_list, predictions, error, correct):
    # `.2%` multiplies the fraction by 100, so the printed number matches its % sign.
    print(f'real: {r:.4f} - predicted: {p:.4f} - difference: {abs_err:.4f} ({rel_err:.2%})')

# "Mean accuracy" is 1 minus the mean relative error.
print(f"\nMean accuracy: {1-sum(correct) / len(correct)}")
print(f"Mean error: {sum(error) / len(error)}")
print(f"Max error: {max(error)}")
print(f"Min error: {min(error)}")

### Feature Importance

In [None]:
# Impurity-based importances of the fitted booster, drawn as a horizontal bar chart
# sorted so that the most important feature ends up at the top.
feature_importance = gr_boosting.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(len(sorted_idx)) + .5   # bar centres

fig = plt.figure(figsize=(17, 6))
importance_ax = fig.gca()
importance_ax.barh(pos, feature_importance[sorted_idx], align='center')
importance_ax.set_yticks(pos)
importance_ax.set_yticklabels(X_train.columns[sorted_idx])
importance_ax.set_title('Feature Importance')

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance measured on the test split: each feature is shuffled
# 30 times (seeded) and the mean importance per feature is plotted.
perm_importance = permutation_importance(gr_boosting, X_test, y_test, n_repeats=30, random_state=0)
sorted_idx = np.argsort(perm_importance.importances_mean)
pos = np.arange(len(sorted_idx)) + .5   # bar centres

fig = plt.figure(figsize=(17, 6))
permutation_ax = fig.gca()
permutation_ax.barh(pos, perm_importance.importances_mean[sorted_idx], align='center')
permutation_ax.set_yticks(pos)
permutation_ax.set_yticklabels(X_train.columns[sorted_idx])
permutation_ax.set_title('Permutation Importance')

In [None]:
import shap

# Tree-model SHAP explainer built around the fitted gradient-boosting regressor.
explainer = shap.TreeExplainer(gr_boosting)
# SHAP values computed for the rows of the test split.
shap_values = explainer.shap_values(X_test)

# Bar variant of the summary plot.
# NOTE(review): presumably ranks features by mean |SHAP value| - confirm against the shap docs.
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# Default (non-bar) summary plot of the SHAP values computed for the test split.
shap.summary_plot(shap_values, X_test)

In [None]:
# SHAP dependence plot for 'GDP per capita', with 'Healthy life expectancy'
# selected as the interaction feature.
shap.dependence_plot('GDP per capita', shap_values, X_test, interaction_index='Healthy life expectancy')

In [None]:
# Load shap's JavaScript support before drawing the force plot.
shap.initjs()
# Force plot for all test rows, starting from the explainer's expected value.
shap.force_plot(explainer.expected_value, shap_values, X_test)

In [None]:
from sklearn.inspection import PartialDependenceDisplay

# `plot_partial_dependence` was removed from scikit-learn in release 1.2, so importing
# it unconditionally fails there. Its replacement, `PartialDependenceDisplay.from_estimator`,
# takes the same arguments used below and exists from release 1.0 on. Use whichever this
# installation offers, so the cell runs on both old and new versions.
if hasattr(PartialDependenceDisplay, "from_estimator"):
    plot_partial_dependence = PartialDependenceDisplay.from_estimator
else:
    from sklearn.inspection import plot_partial_dependence

# The six model inputs, one partial-dependence panel each.
pdp_features = ["GDP per capita", "Social support", 'Healthy life expectancy',
                'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']

fig, ax = plt.subplots(figsize=(17, 8))
# Gradient-boosting curves are drawn first, inside the single bounding axes.
grb_plot = plot_partial_dependence(gr_boosting, X_test, pdp_features,
                                   ax=ax, method='brute', n_jobs=-1)

# Extra-trees curves are overlaid in red on the axes created by the first call.
et_plot = plot_partial_dependence(et, X_test, pdp_features,
                                  ax=grb_plot.axes_, line_kw={"color": "red"}, n_jobs=-1)

In [None]:
# Redraw the two partial-dependence displays side by side instead of overlaid:
# a 2x6 grid whose first row (ax1) receives the gradient-boosting curves in blue
# and whose second row (ax2) receives the extra-trees curves in red.
fig, (ax1, ax2) = plt.subplots(2, 6, figsize=(20, 6))
grb_plot.plot(ax=ax1, line_kw={"color": "blue"})
# Trailing semicolon suppresses the textual repr of the returned display object.
et_plot.plot(ax=ax2, line_kw={"color": "red"});

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the gradient-boosting regressor. Only the learning rate and the
# number of estimators are searched; the remaining axes are kept here, disabled.
param_rf = dict(
    # max_depth=[3, 4, 5, 6],
    # max_features=[3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1, 0.15, 0.2],
    # min_samples_split=[4, 5, 6, 7],
    # min_samples_leaf=[2, 3, 4, 5],
    # loss=['ls', 'lad', 'huber', 'quantile'],
    n_estimators=[20, 30, 50, 100, 200, 300],
)

# Exhaustive search with the default cross-validation, using all available cores.
gs_rf = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=param_rf, n_jobs=-1)
gs_rf.fit(X_train, y_train.to_numpy().ravel())

# gs_rf.best_estimator_ holds the refitted model if the full configuration is needed.
print(gs_rf.best_params_)
print('score=',gs_rf.best_score_)

## Predictions

In [None]:
import ipywidgets as widgets
from ipywidgets import Button, Layout

# Shared look for all sliders: wide label column and a fixed overall width.
style = {'description_width': '170px'}
layout = Layout(width='600px')


def _feature_slider(label, upper, start):
    """Create a 0..upper slider (step 0.01) for one model input, using the shared style/layout."""
    return widgets.FloatSlider(min=0, max=upper, step=0.01, value=start,
                               description=label, style=style, layout=layout)


# One slider per model feature, in the column order the model was trained on.
gdp = _feature_slider('GDP per capita', 2.1, 1)
social = _feature_slider('Social support', 1.7, 0.5)
health = _feature_slider('Healthy life expectancy', 1.2, 0.5)
freedom = _feature_slider('Freedom to make life choices', 0.7, 0.5)
gen = _feature_slider('Generosity', 0.6, 0.3)
cor = _feature_slider('Perceptions of corruption', 0.5, 0.2)


def f(gdp, social, health, freedom, gen, cor):
    """Print the gradient-boosting prediction for one combination of slider values."""
    sample = [[gdp, social, health, freedom, gen, cor]]
    print(f'Predicted value: {gr_boosting.predict(sample)[0]:.5f}')


# Re-run f whenever a slider moves; the keys must match f's parameter names.
out = widgets.interactive_output(f, dict(gdp=gdp, social=social, health=health,
                                         freedom=freedom, gen=gen, cor=cor))

# Sliders stacked on the left, live prediction on the right.
widgets.HBox([widgets.VBox([gdp, social, health, freedom, gen, cor]), out])

## Effect of individual hyperparameter values

In [None]:
# Sweep max_depth from 1 to 30 and record the score on both splits for each value.
max_depths = np.arange(1, 31)
results_train = []
results_test = []

for depth in max_depths:
    gbr = GradientBoostingRegressor(random_state=0, max_depth=depth, loss='huber').fit(X_train, y_train)
    results_train.append(gbr.score(X_train, y_train))
    results_test.append(gbr.score(X_test, y_test))

# Blue = training score, red = test score.
fig, ax = plt.subplots(figsize=(17,8))
ax.plot(max_depths, results_train, 'b')
ax.plot(max_depths, results_test, 'r')

# Grid drawn behind the curves, with both major and minor lines.
ax.set_axisbelow(True)
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

ax.set_title('max_depth')

# One major tick per depth value.
ax.xaxis.set_major_locator(plt.MultipleLocator(1))

In [None]:
# Highest test score of the sweep (first occurrence on ties) and the max_depth that produced it.
best_idx = results_test.index(max(results_test))
print(results_test[best_idx])
print(max_depths[best_idx])

In [None]:
# Sweep the learning rate in steps of 0.01 (max_depth fixed at 5) and record the
# score on both splits for each value.
learning_rates = np.arange(0.01, 0.5, 0.01)
results_train = []
results_test = []

for rate in learning_rates:
    gbr = GradientBoostingRegressor(random_state=0, max_depth=5, learning_rate=rate, loss='huber').fit(X_train, y_train)
    results_train.append(gbr.score(X_train, y_train))
    results_test.append(gbr.score(X_test, y_test))

# Blue = training score, red = test score.
fig, ax = plt.subplots(figsize=(17,8))
ax.plot(learning_rates, results_train, 'b')
ax.plot(learning_rates, results_test, 'r')

# Grid drawn behind the curves, with both major and minor lines.
ax.set_axisbelow(True)
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

ax.set_title('learning_rates')

# Major ticks every 0.1.
ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))

In [None]:
# Highest test score of the sweep (first occurrence on ties) and the learning rate that produced it.
best_idx = results_test.index(max(results_test))
print(results_test[best_idx])
print(learning_rates[best_idx])

In [None]:
# Sweep min_samples_split from 2 to 19 (learning_rate=0.2, max_depth=5 fixed) and
# record the score on both splits for each value.
min_samples_splits = np.arange(2, 20)
results_train = []
results_test = []

for split_size in min_samples_splits:
    gbr = GradientBoostingRegressor(random_state=0, min_samples_split=split_size,
                                    learning_rate=0.2, max_depth=5, loss='huber').fit(X_train, y_train)
    results_train.append(gbr.score(X_train, y_train))
    results_test.append(gbr.score(X_test, y_test))

# Blue = training score, red = test score.
fig, ax = plt.subplots(figsize=(17,8))
ax.plot(min_samples_splits, results_train, 'b')
ax.plot(min_samples_splits, results_test, 'r')

# Grid drawn behind the curves, with both major and minor lines.
ax.set_axisbelow(True)
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

ax.set_title('min_samples_split')

# One major tick per integer value.
ax.xaxis.set_major_locator(plt.MultipleLocator(1))

In [None]:
# Highest test score of the sweep (first occurrence on ties) and the min_samples_split that produced it.
best_idx = results_test.index(max(results_test))
print(results_test[best_idx])
print(min_samples_splits[best_idx])

In [None]:
# Sweep min_samples_leaf over float values in steps of 0.01 (learning_rate=0.2,
# max_depth=5 fixed) and record the score on both splits for each value.
min_samples_leafs = np.arange(0.01, 0.5, 0.01)
results_train = []
results_test = []

for leaf_fraction in min_samples_leafs:
    gbr = GradientBoostingRegressor(random_state=0, min_samples_leaf=leaf_fraction,
                                    learning_rate=0.2, max_depth=5, loss='huber').fit(X_train, y_train)
    results_train.append(gbr.score(X_train, y_train))
    results_test.append(gbr.score(X_test, y_test))

# Blue = training score, red = test score.
fig, ax = plt.subplots(figsize=(17,8))
ax.plot(min_samples_leafs, results_train, 'b')
ax.plot(min_samples_leafs, results_test, 'r')

# Grid drawn behind the curves, with both major and minor lines.
ax.set_axisbelow(True)
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

ax.set_title('min_samples_leaf')

# Major ticks every 0.1.
ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))

In [None]:
# Highest test score of the sweep (first occurrence on ties) and the min_samples_leaf that produced it.
best_idx = results_test.index(max(results_test))
print(results_test[best_idx])
print(min_samples_leafs[best_idx])

In [None]:
# Sweep max_features from 1 up to the number of columns in X (learning_rate=0.2,
# max_depth=5 fixed) and record the score on both splits for each value.
max_features = list(range(1, len(X.columns) + 1))
results_train = []
results_test = []

for n_features in max_features:
    gbr = GradientBoostingRegressor(random_state=0, max_features=n_features,
                                    learning_rate=0.2, max_depth=5, loss='huber').fit(X_train, y_train)
    results_train.append(gbr.score(X_train, y_train))
    results_test.append(gbr.score(X_test, y_test))

# Blue = training score, red = test score.
fig, ax = plt.subplots(figsize=(17,8))
ax.plot(max_features, results_train, 'b')
ax.plot(max_features, results_test, 'r')

# Grid drawn behind the curves, with both major and minor lines.
ax.set_axisbelow(True)
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

ax.set_title('max_features')

# One major tick per integer value.
ax.xaxis.set_major_locator(plt.MultipleLocator(1))

In [None]:
# Sweep n_estimators from 1 to 99 (learning_rate=0.2, max_depth=5 fixed) and record
# the score on both splits for each value.
n_estimator = np.arange(1, 100)
results_train = []
results_test = []

for n_trees in n_estimator:
    gbr = GradientBoostingRegressor(random_state=0, n_estimators=n_trees,
                                    learning_rate=0.2, max_depth=5, loss='huber').fit(X_train, y_train)
    results_train.append(gbr.score(X_train, y_train))
    results_test.append(gbr.score(X_test, y_test))

# Blue = training score, red = test score.
fig, ax = plt.subplots(figsize=(17,8))
ax.plot(n_estimator, results_train, 'b')
ax.plot(n_estimator, results_test, 'r')

# Grid drawn behind the curves, with both major and minor lines.
ax.set_axisbelow(True)
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

ax.set_title('n_estimators')

# Major ticks every 10 estimators.
ax.xaxis.set_major_locator(plt.MultipleLocator(10))

In [None]:
# Highest test score of the sweep (first occurrence on ties) and the n_estimators that produced it.
best_idx = results_test.index(max(results_test))
print(results_test[best_idx])
print(n_estimator[best_idx])