# 

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from collections import Counter
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
import warnings

# Install a blanket filter that ignores warnings of every category, so that
# library notices do not clutter the cell outputs below.
warnings.filterwarnings(action='ignore')

# Checking data information

In [None]:
# Load the two World Happiness Report tables: the 2021 edition and the
# historical table (labelled 2005 - 2020 below).
whr_21    = pd.read_csv('../input/world-happiness-report-2021/world-happiness-report-2021.csv')
whr_05_20 = pd.read_csv('../input/world-happiness-report-2021/world-happiness-report.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each summary.
print('WHR 2021')
whr_21.info()
print('WHR 2005 - 2020')
whr_05_20.info()

# Correlation between some variables

In [None]:
# Annotated heatmap of the pairwise correlations between a positional
# selection of columns from the 2021 table.
corr_positions = [2, 6, 7, 8, 9, 10, 11]
corr_matrix = whr_21.iloc[:, corr_positions].corr()

plt.figure(dpi=150)
sns.heatmap(corr_matrix, annot=True, cmap='crest')

# Use small tick labels on both axes and tilt the x labels.
for axis_name in ("x", "y"):
    plt.tick_params(axis=axis_name, labelsize=6)
plt.xticks(rotation=45)
plt.show()

# Variable distributions (histogram + density curve) with an overlaid boxplot. Brazil is the red dashed line

In [None]:
# Plot the 2021 distribution of six indicators on a 3x2 grid. Each panel shows
# a density histogram with a KDE curve, a boxplot on a twin y-axis, and a red
# dashed vertical line at Brazil's value.

# Look up Brazil's row once instead of re-filtering the table per indicator.
# Raises IndexError when no row has 'Country name' == 'Brazil', as the
# original per-column `[0]` indexing did.
brazil = whr_21.loc[whr_21['Country name'] == 'Brazil'].iloc[0]
br_ladder = brazil['Ladder score']
br_gdp    = brazil['Logged GDP per capita']
br_ss     = brazil['Social support']
br_hle    = brazil['Healthy life expectancy']
br_free   = brazil['Freedom to make life choices']
br_co     = brazil['Perceptions of corruption']

# (column to plot, Brazil's value for that column), in panel order.
panels = [
    ('Ladder score', br_ladder),
    ('Logged GDP per capita', br_gdp),
    ('Social support', br_ss),
    ('Healthy life expectancy', br_hle),
    ('Freedom to make life choices', br_free),
    ('Perceptions of corruption', br_co),
]

fig = plt.figure(figsize=(18, 12))
for position, (column, brazil_value) in enumerate(panels, start=1):
    values = whr_21[column].values

    plt.subplot(3, 2, position)
    plt.title('{}. Min is {} and max is {}.'.format(column, values.min(), values.max()))

    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is its documented axes-level replacement (cut=3 keeps the KDE tails
    # extending past the data as distplot did). Bin edges may differ slightly
    # from the ones distplot chose.
    ax = sns.histplot(x=values, kde=True, stat='density', kde_kws=dict(cut=3))

    # The boxplot lives on a twin y-axis; the wide y-limits squeeze it into a
    # thin strip near the bottom of the panel.
    ax2 = ax.twinx()
    sns.boxplot(x=values, ax=ax2)
    plt.axvline(brazil_value, 0, 1, color='r', linestyle='--')
    ax2.set(ylim=(-.5, 10))

plt.show()

# Linear regression to see which variables are most influential in predicting ladder scores above 60% of the normalized (min-max) range

In [None]:
# Build the modelling table from a positional selection of 2021 columns, then
# append a min-max normalized ladder score and a binary target derived from it.
# .copy() makes useful_df an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
useful_df = whr_21.iloc[:, [2, 6, 7, 8, 9, 10, 11]].copy()

# Min-max scaling: the lowest ladder score maps to 0 and the highest to 1.
ladder = useful_df['Ladder score'].values
norm_lc = (ladder - ladder.min()) / (ladder.max() - ladder.min())
useful_df['Normalized Ladder score'] = norm_lc

# Binary target: 1 when the normalized score is strictly above 0.6, else 0.
bin_ladder = np.where(norm_lc > 0.6, 1, 0)
useful_df['Binary LC'] = bin_ladder

# Last expression of the cell: show 10 randomly sampled rows.
useful_df.sample(10)

In [None]:
# Feature labels must match the columns of X one-to-one, because they are
# later used as feature names when plotting. Taking every column of useful_df
# (as before) put 'Ladder score' first and shifted every label by one.
column_names = list(useful_df.columns[1:7])

# Features: the six columns after 'Ladder score'. Target: the binary label.
X = np.array(useful_df[column_names])
y = np.array(useful_df['Binary LC'])

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=80)

# Report the class balance of each split.
y_train_count = Counter(y_train)
y_test_count = Counter(y_test)
print('Output count for train set is {} and for test set is {}'.format(y_train_count, y_test_count))

In [None]:
# Fit an ordinary least-squares model on the binary target and turn its
# continuous predictions into class labels by thresholding at 0.5.
linear_reg  = LinearRegression()
lr_model    = linear_reg.fit(X_train, y_train)
lr_predict  = lr_model.predict(X_test)
round_preds = np.where(lr_predict > 0.5, 1, 0)
lr_acc      = accuracy_score(y_test, round_preds)
lr_matrix   = confusion_matrix(y_test, round_preds)

# Builtin round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the latter has no .round() method.
print('Accuracy: {}'.format(round(lr_acc, 2)))
print(classification_report(y_test, round_preds))

# Confusion matrix of the thresholded predictions on the test set.
fig = plt.figure(figsize=(12, 6))
sns.heatmap(lr_matrix, annot=True, cmap='crest')
plt.show()

# Using SHAP to check the average impact of each variable on the model output

In [None]:
# Compute SHAP values for the fitted linear model on the training set and
# draw a violin-style summary plot of them.
impute_masker = shap.maskers.Impute(data=X_train)
lr_explainer = shap.LinearExplainer(lr_model, masker=impute_masker)
# Despite its name, explainer_lr holds the SHAP values, not the explainer.
explainer_lr = lr_explainer.shap_values(X_train)

plt.figure(dpi=100)
# NOTE(review): column_names supplies the feature labels; confirm it lines up
# one-to-one with the columns of X_train.
shap.summary_plot(explainer_lr, X_train, feature_names=column_names, plot_type="violin")
plt.show()

## What are the insights from SHAP?
* Nothing surprising for the first 4 variables: the higher their values, the more they push the model towards a positive output (ladder score above 60%);
* Generosity does not play a significant role in the model output. It was possible to see this trend in the confusion matrix. Interestingly, higher values of generosity were linked to lower output values, which suggests a possible negative correlation between this variable and ladder scores above 60%;
* Freedom to make life choices also does not play a significant role here. However, it is possible to observe a negative correlation between this variable and high output scores. Interesting...

## Final consideration
### The method used to evaluate happiness is the Cantril Ladder, "which asks survey respondents to place the status of their lives on a “ladder” scale ranging from 0 to 10, where 0 means the worst possible life and 10 the best possible life" (see [here](https://worldhappiness.report/ed/2019/changing-world-happiness/) for more information).


Social support, healthy life expectancy, and logged GDP per capita were positively correlated with higher output values (ladder scores above 60%). However, the perception of generosity and freedom to make life choices were (minimally) negatively correlated with higher output values. This observation is intriguing, so I plan to explore it in more depth with the WHR values from 2005 to 2020.