# World Happiness Report - Data Exploration

Exploration of the world happiness report data published by the United Nations.

In [15]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import cartopy
import cartopy.io.shapereader as shpreader
import cartopy.crs as ccrs

In [16]:
# Apply seaborn's default theme, then make 'colorblind' the active palette.
sns.set(color_codes=True)
sns.set_palette('colorblind')
# Notebook magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Load and normalize data

Data from Kaggle https://www.kaggle.com/unsdsn/world-happiness

See the World Happiness Report homepage at http://worldhappiness.report/ for more detailed information.

In [17]:
# path = '/tmp/data/world-happiness-report'   # path to local data
path = '../input/world-happiness'  # path to data in Kaggle notebook

# One CSV file per report year, all read from the same directory.
dat2015, dat2016, dat2017 = (
    pd.read_csv(os.path.join(path, filename))
    for filename in ('2015.csv', '2016.csv', '2017.csv')
)

The data format differs slightly between years. Let's even out the differences.

In [18]:
# Preview the first rows of the 2015 table.
dat2015.head()

In [19]:
# Preview the first rows of the 2017 table for comparison with 2015.
dat2017.head()

The 2015 and 2016 data look quite similar. However, the 2017 data has different column names and lacks region information. We will make the 2017 dataset consistent with the others.

In [20]:
# The 2017 data does not contain a region column; we create it from the 2016 data.

# Alternative: generating the new column row by row using apply
# def get_region_by_country(country):
#    row = dat2016.query('Country == @country')
#    if row.shape[0] > 0:
#        return row.iloc[0].loc['Region']
#    else:
#        return np.nan
#
# dat2017['Region'] = dat2017.apply(lambda row: get_region_by_country(row['Country']), axis=1)

# Creating the region info with a join is much nicer IMO.
# how='left' keeps every 2017 country: a country that is missing from the 2016
# data gets Region = NaN (same result as the apply variant above) instead of
# being silently dropped by the default inner join.
dat2017 = pd.merge(dat2017, dat2016.loc[:, ['Country', 'Region']], on='Country', how='left')

In [21]:
# Bring the dotted 2017 column names in line with the 2015/2016 naming.
dat2017 = dat2017.rename(columns={
    'Happiness.Rank': 'Happiness Rank',
    'Happiness.Score': 'Happiness Score',
    'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
    'Health..Life.Expectancy.': 'Health (Life Expectancy)',
    'Trust..Government.Corruption.': 'Trust (Government Corruption)',
    'Dystopia.Residual': 'Dystopia Residual',
})

In [22]:
# Check the result: the 2017 table after adding 'Region' and renaming columns.
dat2017.head()

### Change index from numerical to country name

This ensures that the dataframes are properly aligned by country when they are combined in operations.

Just in case check for duplicates:

In [23]:
# True if any of the yearly tables lists a country more than once.
any(frame.duplicated('Country').any() for frame in (dat2015, dat2016, dat2017))

In [24]:
# Re-index each yearly table by country name.
dat2015, dat2016, dat2017 = (
    frame.set_index('Country') for frame in (dat2015, dat2016, dat2017)
)

## Initial data visualization

### World map of happiness

In [25]:
# Alternative way to obtain the shapefile path through cartopy:
# shp_filename = shpreader.natural_earth(resolution='110m', category='cultural', name='admin_0_countries')
# Country shapefile taken from the notebook's input directory.
shp_filename = '../input/natural-earth/110m_cultural/ne_110m_admin_0_countries.shp'
# Reader whose records() are iterated when drawing the maps.
shp_reader = shpreader.Reader(shp_filename)

Some country names from the shapefile are different from the names used in the happiness data. We need to fix this (still incomplete).

In [26]:
# Shapefile country name -> country name used in the happiness data.
country_name_map = dict([
    ('Bosnia and Herz.', 'Bosnia and Herzegovina'),
    ('Czechia', 'Czech Republic'),
    ('Congo', 'Congo (Brazzaville)'),
    ('Dem. Rep. Congo', 'Congo (Kinshasa)'),
    ('Dominican Rep.', 'Dominican Republic'),
    ('Greenland', 'Denmark'),
    ('Palestine', 'Palestinian Territories'),
    ('Somaliland', 'Somalia'),
    ('United States of America', 'United States'),
])

In [27]:
plt.figure(figsize=(12,5))
ax = plt.axes(projection=ccrs.PlateCarree())
# ax.add_feature(cartopy.feature.OCEAN)
ax.set_extent([-150, 60, -25, 60])

map_colors = sns.color_palette('Blues_r', 8)
# Loop-invariant: ranks are scaled by (max rank + 1), which keeps the
# resulting palette index strictly below len(map_colors).
rank_scale = dat2017['Happiness Rank'].max() + 1

for country in shp_reader.records():
    shp_name = country.attributes['NAME']
    # Translate shapefile names to the names used in the happiness data;
    # names without a mapping are used unchanged.
    name = country_name_map.get(shp_name, shp_name)
    if name in dat2017.index:
        rank = dat2017.loc[name, 'Happiness Rank']
        facecolor = map_colors[int(rank / rank_scale * len(map_colors))]
    else:
        # Countries without happiness data are drawn in bright red.
        # print(name)
        facecolor = (1, 0, 0)
    # add_geometries takes a collection of geometries, so wrap the single
    # country geometry in a list.
    ax.add_geometries([country.geometry], ccrs.PlateCarree(),
                      facecolor=facecolor,
                      label=country.attributes['ADM0_A3'])

Happy countries are shown in dark blue, least happy ones in white.

Bright red colored countries are not part of the analysis.

In [28]:
# Kernel density estimate of the happiness score, one curve per report year.
for year, frame in (('2015', dat2015), ('2016', dat2016), ('2017', dat2017)):
    sns.kdeplot(frame['Happiness Score'], label=year)
plt.xlabel('Happiness Score')
# Draw the legend explicitly so the year labels are shown on the plot.
plt.legend()

Plotting the density estimates of the yearly happiness scores on top of each other shows a small shift towards more happiness from 2015 to 2017.

In [29]:
# Columns whose values are summed to check the happiness score.
# 'Dystopia Residual' is kept last so it can be sliced off with [:-1].
happiness_factors = [
    'Economy (GDP per Capita)',
    'Family',
    'Health (Life Expectancy)',
    'Freedom',
    'Generosity',
    'Trust (Government Corruption)',
    'Dystopia Residual',
]

def plot_columns_on_grid(data, columns, grid):
    """Draw the distribution of each column in its own subplot.

    data    -- table providing the columns (accessed as data[column])
    columns -- column names to plot; one subplot per name, filled in order
    grid    -- (rows, cols) of the subplot grid on the current figure
    """
    n_rows, n_cols = grid[0], grid[1]
    for position, column in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.distplot(data[column])

# One subplot per happiness factor of the 2017 data, on a 3x3 grid.
plt.figure(figsize=(12, 12))
plot_columns_on_grid(data=dat2017, columns=happiness_factors, grid=(3, 3))

Some of the distributions look like we have at least two distinct groups of countries. For instance the Health data has the majority clustered around 0.7 but also a second group of countries around 0.3.

## Do some sanity checks

According to the dataset description, the columns GDP per Capita, Family, Life Expectancy, Freedom, Generosity, Trust Government Corruption describe the extent to which these factors contribute in evaluating the happiness in each country. The Dystopia Residual metric actually is the Dystopia Happiness Score(1.85) + the Residual value for each country.

Summing these contributions up should give the Happiness Score. Let's verify this for the 2017 dataset:

In [30]:
# Row-wise sum of all factor contributions for each country ...
dat = dat2017.loc[:, happiness_factors].sum(axis='columns')
# ... compared with the reported score; the difference should be close to zero.
residual = dat2017['Happiness Score'].sub(dat)
residual.describe()

The part of the happiness score that is not explained by the sum of the factors is quite small --> check passed.

## Happiness change from 2015 to 2017

In [31]:
# Both tables are indexed by country, so the subtraction is aligned per country.
dat2017['Happiness Change'] = dat2017['Happiness Score'].sub(dat2015['Happiness Score'])

In [32]:
# Summary statistics of the 2015 -> 2017 score change.
dat2017['Happiness Change'].describe()

Looks like mean global happiness did not change much at all over the last two years!

### Find the country with the biggest improvement of happiness

In [33]:
# Index label (country) with the largest score change, then its full 2017 row.
country_max_chg = dat2017['Happiness Change'].idxmax()
dat2017.loc[country_max_chg]

The prize goes to Latvia!

In [34]:
# Per-factor difference between the 2017 and 2015 values for that country.
dat2017.loc[country_max_chg, happiness_factors] - dat2015.loc[country_max_chg, happiness_factors]

### Find the country with the biggest loss of happiness

In [35]:
# Index label (country) with the smallest (most negative) score change, then its full 2017 row.
country_min_chg = dat2017['Happiness Change'].idxmin()
dat2017.loc[country_min_chg]

In [36]:
# Per-factor difference between the 2017 and 2015 values for that country.
dat2017.loc[country_min_chg, happiness_factors] - dat2015.loc[country_min_chg, happiness_factors]

## Happiness by region

In [37]:
# Group the 2017 table by region; reused by the aggregations that follow.
by_region = dat2017.groupby('Region')

In [38]:
# Regional averages of score, score change and all factors, happiest region first.
(
    by_region[['Happiness Score', 'Happiness Change'] + happiness_factors]
    .mean()
    .sort_values(by='Happiness Score', ascending=False)
)

Australia and New Zealand is the region with the most happy people, closely followed by North America. North America seems to be on a downtrend, though.

The least happy people are living in Sub-Saharan Africa.

## Factors contributing to happiness

Influence of the 6 factors (Economy, Family, etc.) on happiness, by region. The factors are normalized by the total happiness score.

In [39]:
# Regional mean of each factor (without the trailing 'Dystopia Residual'),
# expressed as a share of the regional mean happiness score.
sns.heatmap(
    by_region[happiness_factors[:-1]]
    .mean()
    .div(by_region['Happiness Score'].mean(), axis='index')
)

The economy and family are by far the most important contributors to the total happiness score. Generosity and freedom are the least important factors.

### Cluster analysis

In the original data, the happiness factors such as Economy, Family, etc. sum up to the happiness score. Consequently, countries with a high happiness score also tend to have high factor values. To analyze how the influence of, e.g., the economy on happiness varies between countries, we first normalize the factors by the total happiness score.

In [40]:
# Work on a copy: a plain assignment would only alias dat2017, so the division
# below would overwrite the raw factor columns of dat2017 itself (and divide
# them again every time this cell is re-run).
dat2017_norm = dat2017.copy()
# Express every factor as a share of the country's happiness score. The right-hand
# side reads from the untouched dat2017, which keeps this cell idempotent.
dat2017_norm[happiness_factors] = dat2017[happiness_factors].div(dat2017['Happiness Score'].values, axis=0)

Cluster analysis based on the happiness factors using the k-means method (see https://de.wikipedia.org/wiki/K-Means-Algorithmus)

In [41]:
cluster_n = 3
# Fixed seed: k-means starts from randomly chosen centres, so without it the
# cluster numbering (and therefore the colours in the plots) can change
# from one run to the next.
k_means = KMeans(init='k-means++', n_clusters=cluster_n, n_init=10, random_state=0)
# Cluster on all normalized factors except the last entry ('Dystopia Residual').
cluster_labels = k_means.fit_predict(dat2017_norm[happiness_factors[:-1]])

Plot distributions of the factors for each cluster:

In [42]:
plt.figure(figsize=(12,12))
for i, factor in enumerate(happiness_factors):
    ax = plt.subplot(3, 3, i+1)
    # The title only needs to be set once per subplot, not once per cluster.
    ax.set_title(factor)
    for cluster in range(cluster_n):
        # Boolean mask selects the countries assigned to this cluster.
        sns.kdeplot(dat2017_norm.loc[cluster_labels == cluster, factor], label=cluster)
    # Show which curve belongs to which cluster.
    ax.legend()

Compare happiness score distribution for the clusters:

In [43]:
# Happiness score distribution of the countries in each cluster.
for cluster in range(cluster_n):
    sns.kdeplot(dat2017.loc[cluster_labels == cluster, 'Happiness Score'], label=cluster)
plt.xlabel('Happiness Score')
# Show which curve belongs to which cluster.
plt.legend()

There is a big difference between the happiness score distributions of the clusters.

In [44]:
# Store each country's cluster label as a column so it can be looked up by country name.
dat2017['Cluster'] = cluster_labels

In [45]:
plt.figure(figsize=(12,5))
ax = plt.axes(projection=ccrs.PlateCarree())
# ax.add_feature(cartopy.feature.OCEAN)
ax.set_extent([-150, 60, -25, 60])

# Loop-invariant: fetch the active palette once instead of once per country.
cluster_colors = sns.color_palette()

for country in shp_reader.records():
    shp_name = country.attributes['NAME']
    # Translate shapefile names to the names used in the happiness data;
    # names without a mapping are used unchanged.
    name = country_name_map.get(shp_name, shp_name)
    if name in dat2017.index:
        # One palette colour per cluster label.
        facecolor = cluster_colors[dat2017.loc[name, 'Cluster']]
    else:
        # Countries without happiness data are drawn in bright red.
        facecolor = (1, 0, 0)
    # add_geometries takes a collection of geometries, so wrap the single
    # country geometry in a list.
    ax.add_geometries([country.geometry], ccrs.PlateCarree(),
                      facecolor=facecolor,
                      label=country.attributes['ADM0_A3'])

This map visualizes countries clustered by the factors contributing to happiness.

Bright red colored countries are not part of the analysis.