In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## ---------------------------------- UNDER CONSTRUCTION ------------------------------------

> *Much of what I have done so far is fairly self-explanatory. Nonetheless, the reader will not yet find much assistance in the form of long paragraphs and explanations, because the notebook is under construction and a range of different things are still to be introduced. However, I think it gives the beginner some starting points for their own research. As I go along, I'll keep adding explanations for most of what I have done so far.*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style

# Notebook-wide plotting defaults: FiveThirtyEight theme, large bold
# sans-serif text and a wide default figure.
style.use('fivethirtyeight')
font = dict(family = 'sans-serif', weight = 'bold', size = 20)
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = (15, 8)

from scipy import stats

In [None]:
# Load the two CSV files of the Kaggle "world-happiness-report-2021" dataset.
# df1 feeds the 'Ladder score' / 'Regional indicator' analysis below;
# df2 carries a 'year' column and is used for the multi-year comparisons.
df1 = pd.read_csv('/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv')
df2 = pd.read_csv('/kaggle/input/world-happiness-report-2021/world-happiness-report.csv')

In [None]:
print('shape of df1: ', df1.shape)
print('shape of df2: ', df2.shape)

In [None]:
df1.head()

## A Warm up exercise

In [None]:
# Pull the three columns used by the warm-up plots out as plain NumPy arrays.
hle, ladder, countries = (
    df1[column].values
    for column in ('Healthy life expectancy', 'Ladder score', 'Country name')
)

In [None]:
from sklearn.linear_model import LinearRegression

# Regress ladder score on healthy life expectancy. Both arrays are reshaped
# into single-column 2-D arrays before being handed to scikit-learn.
hle_column = hle.reshape(-1, 1)
lr = LinearRegression().fit(hle_column, ladder.reshape(-1, 1))
predictions = lr.predict(hle_column)

In [None]:
print(lr.coef_, lr.intercept_)

In [None]:
# Scatter of every country with the fitted regression line drawn on top.
plt.scatter(hle, ladder, s = 20, color = 'indigo')
plt.plot(hle, predictions, color = 'green', linewidth = 4)
plt.xlabel('Healthy Life Expectancy')
plt.ylabel('Subjective Well Being')
# The title names the same variable as the x-axis ("Healthy", not "High").
plt.title('Trend of Healthy Life Expectancy vs Subjective Well Being')

# Point an arrow at India; the label is offset upwards so that it does not
# sit on top of the marker.
for country, x_pos, y_pos in zip(countries, hle, ladder):
    if country == 'India':
        plt.annotate(country, (x_pos, y_pos), color = 'black',
                     arrowprops = dict(arrowstyle="simple", color = 'maroon'),
                     xytext=(x_pos + 0.05, y_pos + 1.0))

In [None]:
# Countries to label on the scatter plot.
annotate_country = ['India', 'Pakistan', 'Bangladesh', 'China', 'Sri Lanka', 'Afghanistan', 'Myanmar', 'Bhutan', 'Nepal']

# Scatter of every country with the fitted regression line drawn on top.
plt.scatter(hle, ladder, s = 20, color = 'indigo')
plt.plot(hle, predictions, color = 'green', linewidth = 4)
plt.xlabel('Healthy Life Expectancy')
plt.ylabel('Subjective Well Being')
# The title names the same variable as the x-axis ("Healthy", not "High").
plt.title('Trend of Healthy Life Expectancy vs Subjective Well Being')

# Label each selected country; the text is offset upwards so that the arrow
# stays visible.
for country, x_pos, y_pos in zip(countries, hle, ladder):
    if country in annotate_country:
        plt.annotate(country, (x_pos, y_pos), color = 'black',
                     arrowprops = dict(arrowstyle="simple", color = 'maroon'),
                     xytext=(x_pos + 0.05, y_pos + 1.0))

In the above warm-up exercise, I depicted where India and its immediate neighbors stand with respect to the trend of "Healthy Life Expectancy" vs "Subjective Well Being" (Ladder score).

# Region wise comparison

***In this section, I demonstrate whether Western European countries have outperformed their South Asian counterparts***

In [None]:
df1.head()

I use the cumulative distribution function (CDF) for comparing the distribution of various parameters characterized by their regional indicators

In [None]:
def EvalCdf(sample, x):
    """Return the empirical CDF of ``sample`` evaluated at ``x``.

    The result is the fraction of elements of ``sample`` that are less than
    or equal to ``x``. ``sample`` must be non-empty, because the count is
    divided by ``len(sample)``.
    """
    at_or_below = sum(1 for value in sample if value <= x)
    return at_or_below / len(sample)

def PlotCDF(df, variable, colorcdf, coloraxv, labelcdf, labelaxv):
    """Plot the empirical CDF of one column and mark its median.

    Parameters
    ----------
    df : DataFrame holding the data; it is not modified.
    variable : name of the column in ``df`` to plot.
    colorcdf : colour of the CDF step curve.
    coloraxv : colour of the vertical median line.
    labelcdf : legend label of the CDF curve.
    labelaxv : text appended to 'Median: ' in the legend.

    Draws on the current matplotlib axes, so repeated calls overlay their
    curves on the same figure.
    """
    # np.sort returns a sorted copy. Sorting `df[variable].values` in place
    # could reorder the column inside the caller's frame, or fail outright
    # when pandas hands back a read-only array.
    sample = np.sort(df[variable].to_numpy())
    # In a sorted sample, the number of values <= x is the right-hand
    # insertion point of x, which gives the whole CDF in one vectorised call.
    cdf = np.searchsorted(sample, sample, side = 'right') / len(sample)
    plt.plot(sample, cdf, color = colorcdf, drawstyle = 'steps', label = labelcdf)
    plt.axvline(np.median(sample), ls = ':', color = coloraxv, label = 'Median: {}'.format(labelaxv))
    plt.legend(shadow = True, framealpha = 0.9, title = 'Label Description')

In [None]:
# Split the table into the two regions compared in this section.
region = df1['Regional indicator']
western_europe = df1[region == 'Western Europe']
south_asia = df1[region == 'South Asia']

In [None]:
PlotCDF(western_europe, 'Ladder score', 'blue', 'black', 'Western Europe', 'Ladder score for WE')
PlotCDF(south_asia, 'Ladder score', 'maroon', 'indigo', 'South Asia', 'Ladder score for SA')
plt.xlabel('Ladder Score')
plt.ylabel('CDF')

In [None]:
PlotCDF(western_europe, 'Social support', 'blue', 'black', 'Western Europe', 'Social Support for WE')
PlotCDF(south_asia, 'Social support', 'maroon', 'indigo', 'South Asia', 'Social Support for SA')
plt.xlabel('Social Support')
plt.ylabel('CDF')

In [None]:
PlotCDF(western_europe, 'Logged GDP per capita', 'blue', 'black', 'Western Europe', 'Logged GDP per capita for WE')
PlotCDF(south_asia, 'Logged GDP per capita', 'maroon', 'indigo', 'South Asia', 'Logged GDP per capita for SA')
plt.xlabel('Logged GDP per capita')
plt.ylabel('CDF')

In [None]:
PlotCDF(western_europe, 'Healthy life expectancy', 'blue', 'black', 'Western Europe', 'Healthy life expectancy for WE')
PlotCDF(south_asia, 'Healthy life expectancy', 'maroon', 'indigo', 'South Asia', 'Healthy life expectancy for SA')
plt.xlabel('Healthy life expectancy')
plt.ylabel('CDF')

In [None]:
PlotCDF(western_europe, 'Freedom to make life choices', 'blue', 'black', 'Western Europe', 'Freedom to make life choices for WE')
PlotCDF(south_asia, 'Freedom to make life choices', 'maroon', 'indigo', 'South Asia', 'Freedom to make life choices for SA')
plt.xlabel('Freedom to make life choices')
plt.ylabel('CDF')

In [None]:
PlotCDF(western_europe, 'Generosity', 'blue', 'black', 'Western Europe', 'Generosity for WE')
PlotCDF(south_asia, 'Generosity', 'maroon', 'indigo', 'South Asia', 'Generosity for SA')
plt.xlabel('Generosity')
plt.ylabel('CDF')

In [None]:
PlotCDF(western_europe, 'Perceptions of corruption', 'blue', 'black', 'Western Europe', 'Perception of corruption for WE')
PlotCDF(south_asia, 'Perceptions of corruption', 'maroon', 'indigo', 'South Asia', 'Perception of corruption for SA')
plt.xlabel('Perceptions of corruption')
plt.ylabel('CDF')

***From the above comparison, it is quite clear that Western Europe has indeed outperformed South Asia on all parameters but 'Generosity'.***

In [None]:
# Columns of df1 that the effect-size, quartile, mode and correlation cells
# below iterate over.
features = ['Ladder score', 'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
            'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']

# Cohen's d

***I use Cohen's d to evaluate the effect sizes of different variables***

Cohen's d is a measure used to describe whether the effect size between two distributions is 'small', 'medium', or 'large'. For more information, visit the following website:

[About Cohen's d](http://www.simplypsychology.org/effect-size.html#:~:text=Cohen%20suggested%20that%20d%20%3D%200.2,if%20it%20is%20statistically%20significant.)

In [None]:
def CohenEffectSize(group1, group2):
    """Return Cohen's d, the standardized difference between two group means.

    d = (mean(group1) - mean(group2)) / sqrt(pooled_var), where ``pooled_var``
    is the size-weighted average of the two groups' population variances.

    Parameters
    ----------
    group1, group2 : non-empty 1-D sequences or arrays of numbers.

    Returns
    -------
    The effect size as a float; positive when ``group1`` has the larger mean.
    If both groups have zero variance the denominator is zero and the result
    is ``inf`` or ``nan``.
    """
    # Cohen's d is defined on the group means. Medians give a different
    # statistic, and can report 0 for groups whose means clearly differ.
    diff = np.mean(group1) - np.mean(group2)

    n1, n2 = len(group1), len(group2)
    pooled_var = (n1 * np.var(group1) + n2 * np.var(group2)) / (n1 + n2)

    return diff / np.sqrt(pooled_var)

In [None]:
# For every feature, report the raw gap between the two regions and its
# standardized size (Cohen's d).
for feat in features:
    we = western_europe[feat].values
    sa = south_asia[feat].values
    # The message promises a difference of means, so means are what is
    # computed here (not medians).
    print("Effect size (difference between population means) for {} is: {}".format(feat, np.mean(we) - np.mean(sa)))
    print("Cohen's d for {} is: ".format(feat), CohenEffectSize(we, sa), '\n')

***The effect size is indeed large in all but 'Generosity' and 'Perceptions of corruption'. However, a small effect size is itself troubling for 'Perceptions of corruption', as it indicates that South Asian countries hold a more corrupt perception of their leaders, which is not a good indicator.***

# Reporting other statistics

***I report a couple of other statistics, namely the 'mode', and the 'IQR' for the same set of 7 variables (features).***

# 1. IQR

In [None]:
# Print the lower and upper quartile (25th and 75th percentile) of every
# feature for each region.
for feat in features:
    we = western_europe[feat].values
    sa = south_asia[feat].values
    we_quartiles = [np.percentile(we, q) for q in (25, 75)]
    sa_quartiles = [np.percentile(sa, q) for q in (25, 75)]
    print("IQR of West European countries for {} is {}".format(feat, we_quartiles))
    print("IQR of South Asian countries for {} is {}".format(feat, sa_quartiles), '\n')

# 2. Mode

In [None]:
# Print the most frequent value of every feature for each region.
for feat in features:
    we = western_europe[feat].values
    sa = south_asia[feat].values
    # Depending on the SciPy version, stats.mode returns a length-1 array or
    # a bare scalar for 1-D input; np.atleast_1d accepts both, whereas a
    # plain `[0][0]` fails on the scalar form.
    we_mode = np.atleast_1d(stats.mode(we)[0])[0]
    sa_mode = np.atleast_1d(stats.mode(sa)[0])[0]
    print("Mode of West European countries for {} is {}".format(feat, we_mode))
    print("Mode of South Asian countries for {} is {}".format(feat, sa_mode), '\n')

# Relationship between variables

In [None]:
df1[features].corr()

In [None]:
# Tier every row by its index label: labels 0-9 -> 1, labels 139 and above
# -> 3, everything in between -> 2.
# NOTE(review): presumably df1 is ordered by ladder score, making these the
# top ten and bottom ten countries — confirm.
row_label = df1.index.to_numpy()
df1['Rank3'] = np.select([row_label <= 9, row_label >= 139], [1, 3], default = 2)

In [None]:
def trend_plot(df, xaxis, yaxis):
    """Scatter ``yaxis`` against ``xaxis`` with a fitted line and mean guides.

    Parameters
    ----------
    df : DataFrame with the columns ``xaxis``, ``yaxis`` and 'Country name'.
        If it also has a 'Rank3' column, the points are coloured by it.
    xaxis, yaxis : names of the columns plotted on the x and y axes.

    Draws on the current matplotlib axes: the scatter, a dotted vertical line
    at the mean of x, a dashed horizontal line at the mean of y, the
    least-squares line, and an arrow pointing at India.
    """
    x, y = df[xaxis].values, df[yaxis].values

    # Fit a private model so that calling this function does not silently
    # re-fit the notebook-level `lr`.
    model = LinearRegression()
    model.fit(x.reshape(-1, 1), y.reshape(-1, 1))
    predictions = model.predict(x.reshape(-1, 1))

    # Colour and label the points from the frame that was passed in, so that
    # they always line up row-for-row with x and y.
    scatter_kwargs = {}
    if 'Rank3' in df.columns:
        scatter_kwargs = dict(hue = df['Rank3'].values, palette = 'coolwarm')
    sns.scatterplot(x = x, y = y, s = 100, **scatter_kwargs)
    plt.axvline(np.mean(x), ls = ':', color = 'maroon', label = 'Mean of {}'.format(xaxis))
    plt.axhline(np.mean(y), ls = '--', color = 'black', label = 'Mean of {}'.format(yaxis))
    plt.xlabel(xaxis)
    plt.ylabel(yaxis)
    plt.plot(x, predictions, color = 'green', linewidth = 4)
    plt.title('Trend of {} vs {}'.format(xaxis, yaxis))
    plt.legend(shadow = True, framealpha = 0.01)

    # The label is offset upwards so that the arrow stays visible.
    for country, x_pos, y_pos in zip(df['Country name'].values, x, y):
        if country == 'India':
            plt.annotate(country, (x_pos, y_pos), color = 'black',
                         arrowprops = dict(arrowstyle="simple", color = 'maroon'),
                         xytext=(x_pos + 0.05, y_pos + 1.0))

In [None]:
trend_plot(df1, 'Healthy life expectancy', 'Ladder score')

In [None]:
trend_plot(df1, 'Logged GDP per capita', 'Ladder score')

In [None]:
trend_plot(df1, 'Social support', 'Ladder score')

In [None]:
trend_plot(df1, 'Freedom to make life choices', 'Ladder score')

In [None]:
trend_plot(df1, 'Perceptions of corruption', 'Ladder score')

# Sandwiches

In [None]:
# Names of the countries in each region, taken from df1.
# Note: `sa` and `we` held per-feature value arrays in the earlier loops;
# from here on they hold country names.
sa = df1[df1['Regional indicator'] == 'South Asia']['Country name'].values
we = df1[df1['Regional indicator'] == 'Western Europe']['Country name'].values

In [None]:
# Rows of df2 whose 'Country name' belongs to each region.
SA = df2[df2['Country name'].isin(sa)]
WE = df2[df2['Country name'].isin(we)]

In [None]:
# South Asian rows for the two years being compared.
y1 = SA[SA['year'] == 2008]
y2 = SA[SA['year'] == 2019]

def sandwich(xaxis):
    """Draw a dumbbell chart of ``xaxis`` for South Asia, 2008 vs 2019.

    Parameters
    ----------
    xaxis : name of a column present in both ``y1`` and ``y2``.

    Reads the notebook-level frames ``y1`` (2008 rows) and ``y2`` (2019 rows)
    and draws on the current matplotlib axes: one marker per country and
    year, joined by a horizontal line. Only countries present in both years
    are drawn.
    """
    # Pair the two years by country name. Pairing by row position would
    # misalign the values, or fail with a shape error, whenever the two
    # years do not cover the same countries in the same order.
    paired = y1[['Country name', xaxis]].merge(
        y2[['Country name', xaxis]], on = 'Country name', suffixes = ('_2008', '_2019'))
    countries = paired['Country name'].values

    x1 = paired[xaxis + '_2008'].values
    x2 = paired[xaxis + '_2019'].values

    plt.plot(x1, countries, 'o', color = 'purple', label = '2008', markersize = 20)
    plt.plot(x2, countries, 's', color = 'indigo', label = '2019', markersize = 20)
    plt.hlines(countries, x1, x2, color = 'maroon')
    plt.xlabel('{}'.format(xaxis))
    plt.ylabel('Countries')
    plt.title('Trends in {}'.format(xaxis))
    plt.legend()

In [None]:
sandwich('Life Ladder')

In [None]:
sandwich('Log GDP per capita')

In [None]:
sandwich('Social support')

In [None]:
sandwich('Healthy life expectancy at birth')

In [None]:
sandwich('Freedom to make life choices')

In [None]:
sandwich('Generosity')

In [None]:
sandwich('Perceptions of corruption')

India has performed miserably on the 'Ladder score' and 'Social support' parameters; let's look closely at the countries it sits among.

In [None]:
ls, ss, countries = (
    df1[column].values
    for column in ('Ladder score', 'Social support', 'Country name')
)

# Restrict both axes to the range below the mean, i.e. the countries that are
# below average on social support and on ladder score.
plt.scatter(ss, ls, s = 100, color = 'maroon')
plt.xlim(0, np.mean(ss))
plt.ylim(0, np.mean(ls))

# Point an arrow at India; the label is placed below and to the right of it.
for country, support, score in zip(countries, ss, ls):
    if country == 'India':
        plt.annotate(country, (support, score), color = 'black',
                     arrowprops = dict(arrowstyle="simple", color = 'forestgreen'),
                     xytext=(support + 0.05, score - 1.0))

In [None]:
# Reshape to a country x year grid of 'Life Ladder' values and draw it.
# The arguments are passed by keyword because pandas 2.x no longer accepts
# them positionally in DataFrame.pivot.
# Note: this rebinds SA to the pivoted table, so re-running this cell
# requires re-running the cell that builds SA from df2 first.
SA = SA.pivot(index = 'Country name', columns = 'year', values = 'Life Ladder')
sns.heatmap(SA, annot = True, cmap="YlGnBu")

In [None]:
# Reshape to a country x year grid of 'Life Ladder' values and draw it.
# The arguments are passed by keyword because pandas 2.x no longer accepts
# them positionally in DataFrame.pivot.
# Note: this rebinds WE to the pivoted table, so re-running this cell
# requires re-running the cell that builds WE from df2 first.
WE = WE.pivot(index = 'Country name', columns = 'year', values = 'Life Ladder')
sns.heatmap(WE, annot = True, cmap="YlGnBu")