# World happiness report - Visualize 2020 scores and contributions

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Load the 2020 report into a DataFrame and preview its first rows.
df2020 = pd.read_csv(filepath_or_buffer='../input/world-happiness-report/2020.csv')
df2020.head()

In [None]:
# list the column names of the 2020 table
df2020.columns

In [None]:
# Number of rows in the 2020 table (used later to slice the last 20 rows).
n_countries = len(df2020)
print(n_countries)

In [None]:
# Isolate the residual: what remains of 'Dystopia + residual' after
# subtracting the 'Ladder score in Dystopia' column.
dystopia_baseline = df2020['Ladder score in Dystopia']
df2020['Residual'] = df2020['Dystopia + residual'] - dystopia_baseline

# Histogram of the residual over all rows.
residual = df2020['Residual']
residual.plot(kind='hist')
plt.xlabel('Residual')
plt.show()

In [None]:
# summary statistics of the residual column
df2020['Residual'].describe()

In [None]:
# Ladder score with the residual effect removed.
residual_free_score = df2020['Ladder score'] - df2020['Residual']
df2020['Ladder score ex residual'] = residual_free_score

In [None]:
# Scatter the residual against the residual-free score to inspect
# visually whether the two are related.
plt.scatter(x=df2020['Ladder score ex residual'], y=df2020['Residual'])
plt.xlabel('Ladder score ex Residual')
plt.ylabel('Residual')
plt.title('Residual vs. Ladder score ex Residual')
plt.show()

In [None]:
# Pearson correlation between the residual-free score and the residual
# (the cell displays the value returned by scipy.stats.pearsonr).
stats.pearsonr(
    x=df2020['Ladder score ex residual'],
    y=df2020['Residual'],
)

### Residual is not correlated with the sum of the other score components. 

# Show the 20 countries with the highest ladder score

In [None]:
# Bar chart of the ladder score for the first 20 rows of the table.
# NOTE(review): "top 20" assumes the CSV is sorted by ladder score,
# highest first - confirm against the data.
top20 = df2020.iloc[:20]
sns.barplot(data=top20, x='Country name', y='Ladder score')
plt.xticks(rotation=90)
plt.title('Ladder Score for Top 20 countries')
plt.show()

In [None]:
# Columns whose values are treated as the contributions/explanations
# that add up to the ladder score.
contribution_features = [
    'Explained by: Log GDP per capita',
    'Explained by: Social support',
    'Explained by: Healthy life expectancy',
    'Explained by: Freedom to make life choices',
    'Explained by: Generosity',
    'Explained by: Perceptions of corruption',
    'Dystopia + residual',
]

# Sub-table holding only the contribution columns.
df2020_contributions = df2020[contribution_features]

In [None]:
# Stacked bar chart for the first 20 rows: one bar per country, split
# into the individual contribution columns.
sns.set()
contributions_by_country = df2020_contributions.set_index(df2020['Country name'])
contributions_by_country.iloc[:20].plot(kind='bar', stacked=True)
plt.title('Top 20 including contribution split')
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Ladder score of the first 20 rows with error bars of twice the
# 'Standard error of ladder score' column.
sns.set()
top20 = df2020.iloc[:20]
plt.bar(
    x=top20['Country name'],
    height=top20['Ladder score'],
    yerr=2 * top20['Standard error of ladder score'],
    color='lightblue',
    ecolor='black',
)
plt.title('Top 20 with error bars')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Same first 20 rows as above, but showing only the residual component.
top20 = df2020.iloc[:20]
sns.barplot(data=top20, x='Country name', y='Residual')
plt.xticks(rotation=90)
plt.title('Top 20 - Showing residual effect only')
plt.show()

In [None]:
# Countries ordered by residual, largest first. reset_index() renumbers
# the rows 0..n-1 and keeps the previous row labels in an 'index' column.
df2020res = (
    df2020[['Country name', 'Residual']]
    .sort_values(by=['Residual'], ascending=False)
    .reset_index()
)
df2020res

In [None]:
# The 20 largest residuals (first 20 rows of the residual-sorted table).
largest_residuals = df2020res.iloc[:20]
sns.barplot(data=largest_residuals, x='Country name', y='Residual')
plt.xticks(rotation=90)
plt.title('Top 20 Residual effects')
plt.show()

In [None]:
# The 20 smallest residuals (last 20 rows of the residual-sorted table).
last_20_rows = slice(n_countries - 20, n_countries)
smallest_residuals = df2020res.iloc[last_20_rows]
sns.barplot(data=smallest_residuals, x='Country name', y='Residual')
plt.xticks(rotation=90)
plt.title('Bottom 20 Residual effects')
plt.show()

# Show the 20 countries with the lowest ladder score

In [None]:
# Bar chart of the ladder score for the last 20 rows of the table.
# NOTE(review): "bottom 20" assumes the CSV is sorted by ladder score,
# highest first - confirm against the data.
last_20_rows = slice(n_countries - 20, n_countries)
bottom20 = df2020.iloc[last_20_rows]
sns.barplot(data=bottom20, x='Country name', y='Ladder score')
plt.xticks(rotation=90)
plt.title('Ladder Score for Bottom 20 countries')
plt.show()

In [None]:
# Stacked bar chart for the last 20 rows: one bar per country, split
# into the individual contribution columns.
sns.set()
last_20_rows = slice(n_countries - 20, n_countries)
contributions_by_country = df2020_contributions.set_index(df2020['Country name'])
contributions_by_country.iloc[last_20_rows].plot(kind='bar', stacked=True)
plt.title('Bottom 20 including contribution split')
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Ladder score of the last 20 rows with error bars of twice the
# 'Standard error of ladder score' column.
sns.set()
# Take x, height and yerr from the SAME rows. The error values were
# previously sliced with [0:20], which drew the first 20 rows' standard
# errors on the bars of the last 20 rows.
bottom20 = df2020.iloc[n_countries - 20:n_countries]
plt.bar(
    x=bottom20['Country name'],
    height=bottom20['Ladder score'],
    yerr=2 * bottom20['Standard error of ladder score'],
    color='lightblue',
    ecolor='black',
)
plt.xticks(rotation=90)
plt.title('Bottom 20 with error bars')
plt.show()

In [None]:
# Same last 20 rows as above, but showing only the residual component.
last_20_rows = slice(n_countries - 20, n_countries)
bottom20 = df2020.iloc[last_20_rows]
sns.barplot(data=bottom20, x='Country name', y='Residual')
plt.xticks(rotation=90)
plt.title('Bottom 20 - Residual effect only')
plt.show()

# Averages by Region

In [None]:
# Unweighted mean of every numeric column per region (each country counts
# once). numeric_only=True is required because the frame also holds string
# columns such as 'Country name': pandas >= 2.0 raises a TypeError when
# asked to average them, while older versions silently dropped them.
df2020_by_region = df2020.groupby('Regional indicator').mean(numeric_only=True)
df2020_by_region

In [None]:
# Mean ladder score per region, one bar per region.
region_scores = df2020_by_region['Ladder score']
sns.barplot(x=region_scores.index, y=region_scores)
plt.xticks(rotation=90)
plt.title('Ladder score by region (same weight for all countries)')
plt.show()

In [None]:
# Mean residual per region, one bar per region.
region_residuals = df2020_by_region['Residual']
sns.barplot(x=region_residuals.index, y=region_residuals)
plt.xticks(rotation=90)
plt.title('Residual effect by region (same weight for all countries)')
plt.show()

### Region "Latin America and Caribbean" has, on average, the highest residual impact. Let's have a closer look:

In [None]:
# Keep only the rows of the 'Latin America and Caribbean' region.
# reset_index() renumbers them 0..k-1 and keeps the previous row labels
# in an 'index' column.
is_lacb = df2020['Regional indicator'] == 'Latin America and Caribbean'
df2020_LACB = df2020[is_lacb].reset_index()
df2020_LACB

In [None]:
# Residual of every country in the Latin America and Caribbean subset.
sns.barplot(data=df2020_LACB, x='Country name', y='Residual')
plt.xticks(rotation=90)
plt.title('Residuals - Latin America and Caribbean')
plt.show()