In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the World Happiness Report 2021 CSV from the Kaggle input directory.
happiness_csv = '/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv'
happiness_data = pd.read_csv(happiness_csv)

# Preview the first rows to check that the columns loaded as expected.
happiness_data.head()

In [None]:
# Distinct values of the 'Regional indicator' column, in sorted order.
regions = np.sort(happiness_data['Regional indicator'].unique())
regions

In [None]:
# Distinct values of the 'Country name' column, in sorted order.
countries = np.sort(happiness_data['Country name'].unique())
countries

Let's compare the healthy life expectancy for North American countries.

In [None]:
# Re-index the data by country name so rows can be looked up by country.
country_idx_df = happiness_data.set_index('Country name')

# One group per country, each holding that country's 'Healthy life expectancy' value(s).
na_life_expectancy = country_idx_df.groupby('Country name')['Healthy life expectancy']

for country, exp in na_life_expectancy:
    if country in ['United States', 'Canada', 'Mexico']:
        print(country.upper())
        # `exp` is indexed by country name, so take the first value by position
        # with .iloc. `exp[0]` relied on pandas falling back from a label lookup
        # to a positional one, which is deprecated for non-integer indexes.
        print('Age: ', int(exp.iloc[0]), '\n')

We can see that Canada has the highest healthy life expectancy of the three countries in 2021, but overall the values for all three fall within a narrow range.

Let's plot this on a graph:

In [None]:
# Healthy life expectancy for the three countries, in the order listed here.
na_countries = ['United States', 'Canada', 'Mexico']
na_to_plot = country_idx_df.loc[na_countries, 'Healthy life expectancy']

plt.figure(figsize=(8, 6))
sns.barplot(x=na_to_plot.index, y=na_to_plot)
plt.title('North American Countries and their healthy life expectancies')
# Set the axis labels after plotting so they replace the ones seaborn derives from the data.
plt.xlabel('Country')
plt.ylabel('Age')

# Annotate each bar with its value truncated to a whole number.
label_font = dict(fontsize=16)
for bar_pos, life_exp in enumerate(na_to_plot):
    plt.text(x=bar_pos, y=life_exp, s=str(int(life_exp)), fontdict=label_font)

The differences here are negligible. Let's take a look at differences in life expectancy across different regions. I'll use groupby() and .mean() to find the average life expectancy for each region.

In [None]:
# Mean of 'Healthy life expectancy' within each 'Regional indicator' group.
life_exp_col = happiness_data['Healthy life expectancy']
regions_life_exp = life_exp_col.groupby(happiness_data['Regional indicator']).mean()

regions_life_exp

In [None]:
plt.figure(figsize=(14, 8))
ax = sns.barplot(x=regions_life_exp.index, y=regions_life_exp)
ax.set_title('Average healthy life expectancy across all regions')
# Region names are long: tilt them and right-align so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_xlabel('Region')
ax.set_ylabel('Average Life Expectancy')

# Annotate each bar with its mean truncated to a whole number.
label_font = dict(fontsize=14)
for bar_pos, mean_life_exp in enumerate(regions_life_exp):
    ax.text(x=bar_pos, y=mean_life_exp, s=str(int(mean_life_exp)), fontdict=label_font)

The highest healthy life expectancy by region is in Western Europe, while the lowest is in Sub-Saharan Africa. I would like to know more about both of these regions. Let's investigate some of the other columns for these two regions.

In [None]:
# The two regions singled out in the preceding analysis.
min_max_regions = ['Sub-Saharan Africa', 'Western Europe']
# One group per (country, region) pair, each holding that pair's
# 'Logged GDP per capita' value(s).
reg_countries = country_idx_df.groupby(['Country name', 'Regional indicator'])['Logged GDP per capita']

for reg in min_max_regions:
    print(reg.upper(), ': \n')
    # Each group key is a (country, region) tuple; unpack it instead of indexing.
    for (country, region), gdp in reg_countries:
        if region == reg:
            print('Country: ', country)
            # `gdp` is indexed by country name, so take the first value by
            # position with .iloc. The previous `group[1][0]` relied on the
            # deprecated integer-key positional fallback.
            print('GDP per capita: ', gdp.iloc[0], '\n')

Now we will look at the average logged GDP per capita across all regions in the dataset.

In [None]:
# Mean of 'Logged GDP per capita' within each 'Regional indicator' group.
logged_gdp_col = country_idx_df['Logged GDP per capita']
reg_count_avg = logged_gdp_col.groupby(country_idx_df['Regional indicator']).mean()

reg_count_avg

Perhaps there is some correlation between GDP per capita and healthy life expectancy? We can see that once again Western Europe is at the top while Sub-Saharan Africa is at the bottom.

Let's graph it!!

In [None]:
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=reg_count_avg.index, y=reg_count_avg)
ax.set_title('Average GDP per capita by Region')
# Region names are long: tilt them and right-align so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_xlabel('Region')
ax.set_ylabel('GDP Per Capita')

# Annotate each bar with its mean rounded to two decimal places.
label_font = dict(fontsize=12)
for bar_pos, mean_gdp in enumerate(reg_count_avg):
    ax.text(x=bar_pos, y=mean_gdp, s=str(round(mean_gdp, 2)), fontdict=label_font)

What about the relationship between GDP per capita and the average ladder score (happiness) for each region?

I'll start with North America and ANZ:

In [None]:
# Rows for the 'North America and ANZ' region (the frame is indexed by country name).
na_data = country_idx_df[country_idx_df['Regional indicator'] == 'North America and ANZ']

# Regional means, computed once and reused for the reference lines and the label.
na_gdp_mean = na_data['Logged GDP per capita'].mean()
na_ladder_mean = na_data['Ladder score'].mean()

plt.figure(figsize=(10, 6))
plt.title('Relationship between GDP per capita and Happiness in North America and ANZ',
          weight='bold')
sns.scatterplot(x=na_data['Logged GDP per capita'], y=na_data['Ladder score'],
                hue=na_data.index, s=200)
# Dashed reference lines at the regional means; they cross at the 'Average' label.
plt.axvline(na_gdp_mean, c='black', ls='--')
plt.axhline(na_ladder_mean, c='gray', ls='--')
plt.text(x=na_gdp_mean, y=na_ladder_mean, s='Average', size=12)
# The x values come from the 'Logged GDP per capita' column, so the axis is
# labelled as logged GDP rather than thousands of dollars.
plt.xlabel('Logged GDP per capita', weight='bold')
plt.ylabel('Happiness Score', weight='bold')

Same analysis for Western Europe:

In [None]:
# Rows for the 'Western Europe' region (the frame is indexed by country name).
we_data = country_idx_df[country_idx_df['Regional indicator'] == 'Western Europe']

# Regional means, computed once and reused for the reference lines and the label.
we_gdp_mean = we_data['Logged GDP per capita'].mean()
we_ladder_mean = we_data['Ladder score'].mean()

plt.figure(figsize=(12, 6))
plt.title('Relationship between GDP per capita and Happiness in Western Europe',
          weight='bold')
ax = sns.scatterplot(x=we_data['Logged GDP per capita'], y=we_data['Ladder score'],
                     hue=we_data.index, s=100)
# Overlay a regression fit on the same axes; scatter=False because the points
# are already drawn by the scatterplot above.
sns.regplot(x=we_data['Logged GDP per capita'], y=we_data['Ladder score'],
            scatter=False, ax=ax)
# Dashed reference lines at the regional means; they cross at the 'Average' label.
plt.axvline(we_gdp_mean, c='black', ls='--')
plt.axhline(we_ladder_mean, c='gray', ls='--')
plt.text(x=we_gdp_mean, y=we_ladder_mean, s='Average', size=12)
# The x values come from the 'Logged GDP per capita' column, so the axis is
# labelled as logged GDP rather than thousands of dollars.
plt.xlabel('Logged GDP per capita', weight='bold')
plt.ylabel('Happiness Score', weight='bold')
# Anchor the legend at the right edge of the axes.
plt.legend(bbox_to_anchor=(1.0, 1.0))

One more time for our lowest scorers in Sub-Saharan Africa:

In [None]:
# Rows for the 'Sub-Saharan Africa' region (the frame is indexed by country name).
ssa_data = country_idx_df[country_idx_df['Regional indicator'] == 'Sub-Saharan Africa']

# Regional means, computed once and reused for the reference lines and the label.
ssa_gdp_mean = ssa_data['Logged GDP per capita'].mean()
ssa_ladder_mean = ssa_data['Ladder score'].mean()

plt.figure(figsize=(10, 6))
plt.title('Relationship between GDP per capita and Happiness in Sub-Saharan Africa',
          weight='bold')
sns.scatterplot(x=ssa_data['Logged GDP per capita'], y=ssa_data['Ladder score'],
                hue=ssa_data.index, s=100)
# Dashed reference lines at the regional means; they cross at the 'Average' label.
plt.axvline(ssa_gdp_mean, c='black', ls='--')
plt.axhline(ssa_ladder_mean, c='gray', ls='--')
plt.text(x=ssa_gdp_mean, y=ssa_ladder_mean, s='Average', size=12)
# The x values come from the 'Logged GDP per capita' column, so the axis is
# labelled as logged GDP rather than thousands of dollars.
plt.xlabel('Logged GDP per capita', weight='bold')
plt.ylabel('Happiness Score', weight='bold')
# Anchor the legend at the right edge of the axes, slightly above the top.
plt.legend(bbox_to_anchor=(1.0, 1.10))

That's all well and good, but now I would like to examine the differences in ladder score (happiness) on its own by region.

In [None]:
# Overall mean ladder score, reused for the reference line and its label.
ladder_mean = happiness_data['Ladder score'].mean()

plt.figure(figsize=(12, 6))
plt.title('Happiness Score by Region', weight='bold', size=20)
# Pass the scores as `x=` explicitly. In seaborn >= 0.12 the first positional
# parameter of kdeplot is `data`, and a bare Series passed there is treated as
# wide-form data, which cannot be combined with `hue`.
sns.kdeplot(x=happiness_data['Ladder score'], hue=happiness_data['Regional indicator'])
# Dashed vertical line at the overall mean ladder score.
plt.axvline(x=ladder_mean, c='black', ls='--')
# y=0.12 is a hand-picked height (in density units) for the annotation.
plt.text(x=ladder_mean, y=0.12, s='Average Happiness Score', size=10)
plt.xlabel('Happiness Score')

**WORK IN PROGRESS**