In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

World Happiness Report Dataset

In [None]:
# Read the happiness-report CSV into a DataFrame and preview its first rows.
WHR_2023_CSV = "whr2023.csv"
whr_df = pd.read_csv(WHR_2023_CSV)
whr_df.head()

Filter the columns and copy only the ones that will be used for analysis.

In [None]:
# Columns kept for the analysis; everything else in whr_df is left behind.
columns = [
    'Country name',
    'iso alpha',
    'Regional indicator',
    'Happiness score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
# .copy() gives happy_df its own data, so later edits never touch whr_df.
happy_df = whr_df.loc[:, columns].copy()
happy_df.head()

Convert all column names to lowercase

In [None]:
# Lowercase every column label so later cells can use one spelling throughout.
happy_df.columns = [label.lower() for label in happy_df.columns]
happy_df.head()

Data Info

In [None]:
# Print a concise summary of happy_df: column dtypes and non-null counts.
happy_df.info()

In [None]:
# Count the missing values in each column.
missing_per_column = happy_df.isna().sum()
missing_per_column

In [None]:
# Show every row that has at least one missing value.
has_missing = happy_df.isna().any(axis="columns")
happy_df.loc[has_missing]

In [None]:
# Numerical columns used for the rest of the analysis. 'happiness score' is
# listed first because later cells use numerical_columns[1:] as the factors
# plotted against it.
numerical_columns = [
    'happiness score',
    'logged gdp per capita',
    'social support',
    'healthy life expectancy',
    'freedom to make life choices',
    'generosity',
    'perceptions of corruption',
]
happy_df.loc[:, numerical_columns].describe()

Visualize the distribution of each numerical column using both a histogram and a boxplot.

In [None]:
# Create a function that displays the distribution of numerical data using both histograms and boxplots.
def show_distribution(var, data=None):
    """Plot the distribution of one numerical column as a histogram and a boxplot.

    The upper axes show a histogram with a KDE curve, overlaid with dashed
    vertical lines at the minimum, mean, median, mode and maximum of the
    column. The lower axes show a boxplot of the same column.

    Parameters
    ----------
    var : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``happy_df`` is used.
    """
    if data is None:
        data = happy_df

    values = data[var]
    # (legend label, position, colour) of each reference line. mode() can
    # return several values; only the first one is marked.
    reference_lines = [
        ("Minimum", values.min(), 'gray'),
        ("Mean", values.mean(), 'blue'),
        ("Median", values.median(), 'brown'),
        ("Mode", values.mode()[0], 'cyan'),
        ("Maximum", values.max(), 'gray'),
    ]

    fig, ax = plt.subplots(2, 1, figsize=(8, 8))

    # Histogram with the summary statistics drawn on top.
    sns.histplot(data=data, x=var, kde=True, color='red', alpha=0.2, ax=ax[0])
    ax[0].set_ylabel("Frequency")
    for label, position, colour in reference_lines:
        ax[0].axvline(
            x=position,
            color=colour,
            linewidth=2,
            linestyle="--",
            label="{}: {:.2f}".format(label, position),
        )
    # Without a legend the line colours cannot be told apart by the reader.
    ax[0].legend()

    # Box plot
    sns.boxplot(data=data, x=var, width=0.3, ax=ax[1])
    ax[1].set_xlabel("Value")

    fig.suptitle("Data Distribution - " + var, size=15)
    # plt.show() renders the figure with the notebook's inline backend;
    # fig.show() only produces a "non-GUI backend" warning there.
    plt.show()

In [None]:
# Draw one histogram/boxplot figure for every numerical column.
for numeric_column in numerical_columns:
    show_distribution(numeric_column)

In [None]:
# Generate a heatmap to identify the correlation
# Correlate the numeric columns only: happy_df also carries non-numeric
# columns (e.g. the country name), and pandas >= 2.0 raises on those instead
# of silently dropping them.
corr = happy_df.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, cmap='Blues', linewidths=0.5, annot=True, ax=ax)
ax.set_title("Correlation between numerical columns", size=20)
# plt.show() keeps the cell from echoing the Axes repr.
plt.show()

In [None]:
#Happiness score vs other factors
# One colour per factor panel.
tab_20_colors = ["#1f77b4", "#2ca02c", "#9467bd", "#e377c2", "#bcbd22", "#9edae5"]
fig, ax = plt.subplots(3, 2, figsize=(12, 12))
# ax.flat walks the 3x2 grid row by row, so the panel, the factor and the
# colour stay aligned without a separate counter.
for panel, column, color in zip(ax.flat, numerical_columns[1:], tab_20_colors):
    # Correlation of this factor with the happiness score, shown in the panel
    # title. A separate name keeps the `corr` matrix from the heatmap cell intact.
    score_corr = happy_df['happiness score'].corr(happy_df[column])
    sns.regplot(data=happy_df, x="happiness score", y=column, color=color, ax=panel)
    panel.set_title("Correlation = {:.4f}".format(score_corr))
fig.tight_layout()
fig.subplots_adjust(hspace=0.3)
# plt.show() renders the figure with the notebook's inline backend;
# fig.show() only produces a "non-GUI backend" warning there.
plt.show()

In [None]:
# Logged GDP per capita against healthy life expectancy, with the marker
# colour encoding social support.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(
    data=happy_df,
    x="logged gdp per capita",
    y="healthy life expectancy",
    hue="social support",
    s=150,
    ax=ax,
)
ax.set_xlabel("GDP per capita", size=15)
ax.set_ylabel("Healthy Life Expectancy", size=15)
ax.set_title("Relationship between GDP per Capita, Healthy Life Expectancy, and Social Support", size=20)
plt.show()

In [None]:
# Number of countries in each region, drawn as horizontal bars.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=happy_df, y='regional indicator', palette='tab20', ax=ax)
ax.set_ylabel("Regions")
ax.set_xlabel("Counts")
ax.set_title("Count of Countries by Region", size=20)
plt.show()

In [None]:
#Boxplot
# Spread of the happiness score within each region.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=happy_df, y='regional indicator', x="happiness score", palette='tab20', ax=ax)
ax.set_xlabel("Happiness score")
ax.set_ylabel("Regions")
ax.set_title("Happiness Score by region", size=20)
# End with plt.show(), as the neighbouring plotting cells do, so the cell
# does not echo the Text(...) repr of the title.
plt.show()

In [None]:
# Mean of every numerical column per region, with one bar chart per factor.
region_wise = happy_df.groupby("regional indicator")[numerical_columns].mean().reset_index()
fig, ax = plt.subplots(3, 2, figsize=(12, 12))
# ax.flat walks the 3x2 grid row by row, matching the order of the factors.
for panel, column in zip(ax.flat, numerical_columns[1:]):
    # Sort into a fresh frame for each panel so region_wise itself is not
    # reordered as a side effect of plotting.
    ranked_regions = region_wise.sort_values(column, ascending=False)
    sns.barplot(data=ranked_regions, x=column, y='regional indicator', palette='tab20', ax=panel)
fig.tight_layout()
fig.subplots_adjust(hspace=0.3)
# plt.show() renders the figure with the notebook's inline backend;
# fig.show() only produces a "non-GUI backend" warning there.
plt.show()

# A column chart was created to analyze the top 10 happiest and least happy countries.
# Both frames hold every country, sorted by score; only the first ten rows
# of each are plotted below.
top_10_happy_country = happy_df.sort_values('happiness score', ascending=False)
bottom_10_happy_country = happy_df.sort_values('happiness score', ascending=True)

fig, ax = plt.subplots(2, 1, figsize=(12, 8))

sns.barplot(data=top_10_happy_country.head(10), y='happiness score', x='country name', palette='tab20', ax=ax[0])
ax[0].set_title("Top 10 happiest countries")

sns.barplot(data=bottom_10_happy_country.head(10), y='happiness score', x='country name', palette='tab20', ax=ax[1])
ax[1].set_title("Top 10 least happy countries")

fig.tight_layout()
plt.show()

In [None]:
# Load the time-line data set and order its rows by the 'year' column.
happy_df_time_line = pd.read_csv("whr_200522.csv").sort_values('year')

In [None]:
# World map of happiness scores, animated with one frame per value of 'year'.
px.choropleth(
    data_frame=happy_df_time_line,
    locations="Iso alpha",
    color="Happiness score",
    animation_frame="year",
    hover_name="Country name",
    range_color=[0, 8],
    color_continuous_scale=px.colors.diverging.RdBu,
    width=950,
    height=600,
    title="Global Happiness Scores: A Country-by-Country Analysis",
)
