In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px
import plotly.subplots as psp

import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# World Happiness Report

In the following we visualize the results of the World Happiness Report.

- We first take a look at which countries have been in the top and bottom 10 of the ranking through the years.

- We then move to analyse the 2021 report, specifically the happiness score in different world regions and countries, as well as the happiness measures used to build the score.

In [None]:
# Historical report (2005 - 2020)
path_report = '/kaggle/input/world-happiness-report-2021/world-happiness-report.csv'
df = pd.read_csv(path_report)

# Rank the 'Life Ladder' score within each year, in both directions.
# method='first' breaks ties by row order, so every rank in a year is unique.
ladder_by_year = df.groupby('year')['Life Ladder']
rank_from_top = ladder_by_year.rank(method='first', ascending=False)
rank_from_bottom = ladder_by_year.rank(method='first', ascending=True)

df['Yearly Ranking'] = rank_from_top              # 1 = highest score of the year
df['Inverse Yearly Ranking'] = rank_from_bottom   # 1 = lowest score of the year

# 2021 report
path_report_2021 = '/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv'
df_2021 = pd.read_csv(path_report_2021)

## World Happiness Report 2005-2020

Let's start with an animation of the happiness score obtained by the top and bottom 10 countries throughout the years.

### Top 10 Countries

In [None]:
# Rows ranked 1-10 within their year, ordered chronologically for the animation.
df_top_happiness = df[df['Yearly Ranking'] <= 10].sort_values('year')

# Horizontal bar race: one frame per year, one bar per ranking position.
# NOTE(review): plotly express documents that animations may be inconsistent when the
# categories mapped to `color` change between frames, which is the case here - confirm
# that every frame shows all ten countries.
ranking_happiness = px.bar(
    data_frame=df_top_happiness,
    orientation='h',
    x='Life Ladder',
    y='Yearly Ranking',
    color='Country name',
    text='Country name',
    animation_frame='year',
    animation_group='Country name',
    range_x=[0,8.5],
    title='Happiness score - Top 10 countries'
)

# Frame duration (ms) used by the first button of the animation menu.
ranking_happiness.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000

# Country names are printed on the bars, so the legend is hidden.
# The y axis is reversed so that rank 1 sits at the top, with one tick per rank.
ranking_happiness.update_layout(showlegend=False).update_yaxes(
    tickmode='linear',
    autorange='reversed'
)

ranking_happiness.show()

It seems that Western countries are consistently at the top of the ranking, in particular the Northern European ones. Let's take a closer look at how often each country appears in the top 10.

In [None]:
# Number of years in which each country was part of the yearly top 10.
series_top_appearences = df_top_happiness['Country name'].value_counts()

# Country -> world region lookup, taken from the 2021 report.
df_regions = df_2021[['Country name','Regional indicator']].set_index('Country name')

# rename_axis + reset_index(name=...) yield the columns 'Country name' and
# 'Total appearences' directly, without depending on what name value_counts()
# gives to its index.
df_top_appearences = (
    series_top_appearences
    .rename_axis('Country name')
    .reset_index(name='Total appearences')
    .join(df_regions, on='Country name')
)

# Sunburst: inner ring = region, outer ring = country, sector size = appearances.
top_appearences = px.sunburst(
    df_top_appearences,
    path=['Regional indicator','Country name'],
    values='Total appearences',
    title='Total appearances of countries in the top 10 list'
)

top_appearences.show()

Interestingly, Denmark is the only country that appears in the top 10 every year since the ranking began.

### Bottom 10 Countries

In [None]:
# Rows ranked 1-10 from the bottom within their year, ordered chronologically.
df_bottom_happiness = df[df['Inverse Yearly Ranking'] <= 10].sort_values('year')

# Horizontal bar race: one frame per year, one bar per (inverse) ranking position.
# NOTE(review): plotly express documents that animations may be inconsistent when the
# categories mapped to `color` change between frames, which is the case here - confirm
# that every frame shows all ten countries.
ranking_happiness = px.bar(
    data_frame=df_bottom_happiness,
    orientation='h',
    x='Life Ladder',
    y='Inverse Yearly Ranking',
    color='Country name',
    text='Country name',
    animation_frame='year',
    animation_group='Country name',
    range_x=[0,7],
    title='Happiness score - Bottom 10 countries'
)

# Frame duration (ms) used by the first button of the animation menu.
ranking_happiness.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000

# Country names are printed on the bars, so the legend is hidden.
# The y axis is reversed so that position 1 (the lowest score) sits at the top.
ranking_happiness.update_layout(showlegend=False).update_yaxes(
    tickmode='linear',
    autorange='reversed'
)

ranking_happiness.show()

It seems that a large number of countries at the bottom of the ranking are from the African continent. Let's check whether this is indeed the case.

In [None]:
# Number of years in which each country was part of the yearly bottom 10.
series_bottom_appearences = df_bottom_happiness['Country name'].value_counts()

# Country -> world region lookup, taken from the 2021 report.
df_regions = df_2021[['Country name','Regional indicator']].set_index('Country name')

# Countries that are not listed in the 2021 report come out of the join without a
# region. Only the region column is filled (not the whole frame), and no frame is
# modified in place, so re-running the cell always starts from the same inputs.
# NOTE(review): this labels *every* unmatched country as Sub-Saharan African -
# verify by listing the countries whose region is missing before the fill.
df_bottom_appearences = (
    series_bottom_appearences
    .rename_axis('Country name')
    .reset_index(name='Total appearences')
    .join(df_regions, on='Country name')
    .fillna({'Regional indicator': 'Sub-Saharan Africa'})
)

# Sunburst: inner ring = region, outer ring = country, sector size = appearances.
bottom_appearences = px.sunburst(
    df_bottom_appearences,
    path=['Regional indicator','Country name'],
    values='Total appearences',
    title='Total appearances of countries in the bottom 10 list'
)

bottom_appearences.show()

Indeed, African countries overwhelmingly populate the bottom of the ranking. However, it is interesting to notice that the countries at the bottom of the list tend to change from year to year, while this does not seem to be the case for the ones at the top of the ranking.

### Distinct countries at bottom/top of the ranking

We can better visualize this phenomenon by showing the number of distinct countries appearing at the top/bottom of the Happiness ranking throughout the years.

In [None]:
def _countries_per_region(df_appearances, ranking_position):
    """Count the rows of ``df_appearances`` per 'Regional indicator'.

    Parameters
    ----------
    df_appearances : pandas.DataFrame
        Frame with a 'Regional indicator' column (one row per country in the
        frames built by the previous cells).
    ranking_position : str
        Label stored in the 'Ranking Position' column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'Regional indicator', 'Total appearences' and 'Ranking Position',
        sorted by region name. Rows without a region are not counted, because
        ``value_counts`` drops missing values by default.
    """
    return (
        df_appearances
        .value_counts('Regional indicator')
        .to_frame(name='Total appearences')
        .reset_index(level=0)
        .sort_values('Regional indicator')
        .assign(**{'Ranking Position': ranking_position})
    )


df_top = _countries_per_region(df_top_appearences, 'Top')
df_bottom = _countries_per_region(df_bottom_appearences, 'Bottom')

df_appearnces = pd.concat((df_top,df_bottom))

# Constant category, so that every segment is stacked on one single horizontal bar.
df_appearnces['Countires appearing'] = 'all'

fig = px.bar(
    df_appearnces,
    x='Total appearences',
    y='Countires appearing',
    # the stacked bar spans the whole axis: its length is the sum of all segments
    range_x=[0,df_appearnces['Total appearences'].sum()],
    color='Ranking Position',
    hover_data={'Ranking Position':True,'Regional indicator':True,'Total appearences':True,'Countires appearing':False},
    orientation='h',
    title='Number of distinct countries appearing in the top/bottom of the ranking',
    height=250
)

# The single y category carries no information, so its tick label is hidden.
fig.update_yaxes(showticklabels=False)

fig.show()

### Distribution of the Happiness index

From the animation, we get an idea of the tails of the distribution of Happiness scores, but not of the distribution as a whole. Furthermore, we see that when the study started in 2005, the bottom countries had an unusually high score compared to those in the following years. This suggests that when the study started, not many countries were accounted for.

Here, we plot the distribution of the Happiness score for each year, together with the number of countries accounted for in that year.

In [None]:
# One violin per year, with every individual score drawn next to it.
# The y range leaves head-room above the scores for the count boxes added below.
fig = px.violin(
    data_frame=df,
    x='year',
    y='Life Ladder',
    points='all',
    range_y=[1.5,10],
    title='Distribution of Happiness score throughout the years'
)

# Number of non-missing 'Life Ladder' values recorded in each year.
df_countries = (
    df.groupby('year')['Life Ladder']
    .count()
    .to_frame(name='total countries')
    .reset_index()
)

# Shared look of the boxes that display the yearly count.
count_box_style = {
    'showarrow': False,
    'bordercolor': "#94c5d1",
    'borderwidth': 2,
    'borderpad': 4,
    'bgcolor': "#65a2db",
}

# Put the count above each violin, at a fixed height of 9.5 on the score axis.
for year, n_countries in df_countries.to_numpy():
    fig.add_annotation(x=year, y=9.5, text=str(n_countries), **count_box_style)

fig.show()

In the above figure, the number in the box is the total number of countries surveyed in the corresponding year. We see that during the first few years the study was limited to fewer countries than later on, and that in 2020 the number of countries studied dropped significantly, possibly due to the COVID-19 pandemic.

## World Happiness Report 2021

We can now focus on this year's report. First, let's take a bird's-eye look at the happiness index on the world map.

In [None]:
# World map coloured by the 2021 ladder score; rows are matched to map shapes
# through the country names.
px.choropleth(
    data_frame=df_2021,
    title='Happiness index - world view',
    color='Ladder score',
    locations='Country name',
    locationmode='country names'
)

It seems that the countries with the highest Happiness score are situated in Europe, North America, and Oceania. Let's explore a bit more.

### Happiness index

First, we can check how the score is distributed in the different regional zones,

In [None]:
# Distribution of the 2021 ladder score within each world region
# (one box per region, each region drawn in its own colour).
px.box(
    data_frame=df_2021,
    x='Regional indicator',
    y='Ladder score',
    color='Regional indicator',
    title='Happiness index - regional distribution'
)

The above figure seems to confirm the impression we had, that Europe, North America and Oceania are the regions with highest Happiness score.

Let's check the Happiness score of each country within a given world region, explained by different measures such as the amount of social support available, the life expectancy, and so on.

You can replace the field below with any other available region to check the Happiness score of the countries.

In [None]:
# Region whose countries are plotted in the next cell. The value is compared for
# exact equality with the 'Regional indicator' column of the 2021 report.
world_region = 'Latin America and Caribbean'

In [None]:
# Identifier columns that are kept unchanged for every country.
index_fields = [
    'Regional indicator',
    'Country name',
    'Ladder score',
    'Standard error of ladder score'
]

# Columns of the 2021 report that are stacked in the bar chart, mapped to the
# shorter labels shown in the legend.
index_explained = {
    'Dystopia + residual' : 'Dystopia + Residuals',
    'Explained by: Social support' : 'Social support',
    'Explained by: Perceptions of corruption' : 'Perceived corruption',
    'Explained by: Healthy life expectancy' : 'Life expectancy',
    'Explained by: Freedom to make life choices' : 'Freedom',
    'Explained by: Generosity' : 'Generosity',
    'Explained by: Log GDP per capita' : 'Log GDP per capita'
}

# Long format: one row per (country, measure) pair.
df_index_score = (
    df_2021[index_fields+list(index_explained.keys())]
    .rename(columns=index_explained)
    .melt(
        id_vars=index_fields,
        value_vars=list(index_explained.values()),
        var_name='Explained by',
        value_name='Contribution to score'
    )
)

# `@world_region` lets pandas read the variable directly instead of pasting its
# value into the query string.
df_region_score = df_index_score.query("`Regional indicator` == @world_region")

region_plot = px.bar(
    data_frame=df_region_score,
    x='Country name',
    y='Contribution to score',
    color='Explained by',
    title='Happiness index for {}'.format(world_region)
)

# The standard error of the ladder score is attached to one trace only. In the
# long-format frame that value is repeated once per measure, so only the rows of
# that trace are kept: the array then holds exactly one value per bar.
error_trace = 'Log GDP per capita'
ladder_std_error = df_region_score.loc[
    df_region_score['Explained by'] == error_trace,
    'Standard error of ladder score'
].to_numpy()

region_plot.update_traces(
    patch={'error_y': {'array': ladder_std_error}},
    selector={'name': error_trace}
)

# Reference line at the 'Ladder score in Dystopia' value (read from the first row).
dystopia_score = df_2021.loc[0,'Ladder score in Dystopia']

region_plot.add_hline(
    y=dystopia_score,
    line_width=2,
    line_dash="dash"
)

# The label is anchored to the line's value on the y axis (yref="y"), so it stays
# next to the dashed line whatever region is selected; x stays in paper
# coordinates to keep it to the right of the plot area.
region_plot.add_annotation(
    text="Dystopia score",
    xref="paper",
    yref="y",
    x=1.13,
    y=dystopia_score,
    showarrow=False
)

region_plot.show()

The figure shows the happiness index of each country in a given geographical region, with the contribution of each happiness measure highlighted. The dashed line represents the happiness score of Dystopia, an imaginary country where all measures take their minimum value.

One can use the distance between the dashed line and the top of the blue bar (the 'Dystopia + Residuals' bar) to compute the residual, that is, how much of the Happiness index is not explained by the main happiness measures used in the study. For more info, see this [link](https://worldhappiness.report/faq/).

Before moving on, we can check which countries are in the top/bottom 5 list this year.

In [None]:
# Rank of every country in 2021 (1 = highest ladder score); ties are broken by row order.
df_2021['Ranking'] = df_2021['Ladder score'].rank(method='first',ascending=False)

# Size of the top and of the bottom list. The bottom threshold is derived from
# n_top, so changing this one value keeps both lists the same length.
n_top = 5
n_bot = df_2021['Ranking'].count() - n_top

df_2021_top = df_2021.query("Ranking <= {}".format(n_top)).copy()
df_2021_bottom = df_2021.query("Ranking > {}".format(n_bot)).copy()

df_2021_top['Ranking position'] = 'top'
df_2021_bottom['Ranking position'] = 'bottom'

df_2021_rank = pd.concat((df_2021_top,df_2021_bottom)).sort_values('Ladder score')

# Colours are bound to the category names, not to the position of the traces in
# fig.data, so they stay correct if the order of the rows (and traces) changes.
fig = px.bar(
    df_2021_rank,
    x='Ladder score',
    y='Country name',
    color='Ranking position',
    color_discrete_map={'bottom': '#d12e13', 'top': '#46b361'},
    title='Happiness Index - Top and bottom countries'
)

fig.show()

### Happiness measures analysis

Here we have a look at the measures used to compute the happiness index. We consider their distribution in different geographical regions, as well as their correlations and trends.

In [None]:
# The report stores GDP per capita as a logarithm; exponentiate it to get the plain value.
df_2021['GDP per capita'] = np.exp(df_2021['Logged GDP per capita'])

# Measures shown in this cell and reused by the cells that follow.
happiness_measures = [
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption'
]

df_happiness_measures = df_2021[['Regional indicator']+happiness_measures]

# Long format: one row per (country, measure) pair, as needed for faceting.
df_happiness_measures = df_happiness_measures.melt(
    id_vars='Regional indicator',
    value_vars=happiness_measures,
    var_name='Happiness measure',
    value_name='Measure value'
)

# One row of box plots per measure. The title names the measures: the previous
# wording was copied from the ladder-score box plot and described the wrong figure.
happiness_plot = px.box(
    data_frame=df_happiness_measures,
    x='Regional indicator',
    y='Measure value',
    color='Regional indicator',
    facet_row='Happiness measure',
    title='Happiness measures - regional distribution',
    height=2000
)

# Facet labels read "Happiness measure=<name>"; keep only the part after the "=".
happiness_plot.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# The measures are on different scales, so each facet gets its own y axis and ticks.
happiness_plot.update_yaxes(matches=None,showticklabels=True)

happiness_plot.show()

It gives some info, but it is also a lot to digest and possibly it is not very instructive. Let's see how the different measures are correlated, and how they are correlated with the Happiness score.

In [None]:
# Heat map of the pairwise correlations between the ladder score and the happiness measures.
px.imshow(
    img=df_2021[['Ladder score', *happiness_measures]].corr(),
    title='Happiness measures - correlations'
)

From the above correlation matrix we see how Happiness score, GDP per capita, Social support, Life expectancy and Freedom are positively correlated with one another, while the perception of corruption is negatively correlated with these measures. On the other hand, generosity is only weakly correlated with the other measures.

Let's visualize these quantities against one another, to better understand their relation.

In [None]:
# Ladder score against healthy life expectancy: one marker per row of df_2021,
# coloured by world region and sized by GDP per capita.
px.scatter(
    data_frame=df_2021,
    title='Happiness measures - life expectancy vs happiness',
    x='Healthy life expectancy',
    y='Ladder score',
    size='GDP per capita',
    color='Regional indicator'
)

The above plot very clearly shows how the Happiness score is positively correlated with both the life expectancy and the GDP per capita.

In [None]:
# Perceived corruption against freedom to make life choices: one marker per row
# of df_2021, coloured by world region and sized by GDP per capita.
px.scatter(
    data_frame=df_2021,
    title='Happiness measures - freedom vs corruption',
    x='Freedom to make life choices',
    y='Perceptions of corruption',
    size='GDP per capita',
    color='Regional indicator'
)

We see that, in countries where freedom is limited, the perception of corruption is generally pretty high, as one might expect. For countries with high freedom, instead, we see that the perception of corruption ranges from low to high. However, we see that countries with a high GDP per capita and a high freedom score tend to have a low perception of corruption, compared to countries with a lower GDP per capita.

In [None]:
# Healthy life expectancy against social support: one marker per row of df_2021,
# coloured by world region and sized by GDP per capita.
px.scatter(
    data_frame=df_2021,
    title='Happiness measures - social support vs life expectancy',
    x='Social support',
    y='Healthy life expectancy',
    size='GDP per capita',
    color='Regional indicator'
)

We see that, in countries with high social support and a high GDP per capita, the life expectancy tends to be high. Vice versa, when social support is lacking and the country has a low GDP per capita, the life expectancy is generally low.