In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# df  -> the 2021 report file
# df1 -> the second file of the same dataset (world-happiness-report.csv)
report_2021_csv = '../input/world-happiness-report-2021/world-happiness-report-2021.csv'
report_history_csv = '../input/world-happiness-report-2021/world-happiness-report.csv'
df = pd.read_csv(report_2021_csv)
df1 = pd.read_csv(report_history_csv)

# From here on every Python warning is suppressed for the rest of the session.
warnings.filterwarnings("ignore")

# Read and Analyse Data

In [None]:
df.head()

In [None]:
df1.head()


1. df - contains the 2021 happiness index and its corresponding data
2. df1 - contains the past records by country

# Import plotting libraries

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
import seaborn as sns

In [None]:
# Seven-colour palette shared by the plots below; the swatch strip previews it.
color = [
    "#f94144",
    "#f3722c",
    "#f8961e",
    "#f9c74f",
    "#90be6d",
    "#43aa8b",
    "#577590",
]
sns.palplot(color)

In [None]:
# Top 5 and bottom 5 countries of the 2021 report, drawn as two stacked bar charts
# that share one x-axis.
fig = plt.figure(figsize=(15, 8))
g = gs.GridSpec(ncols=1, nrows=2, figure=fig)
plt.suptitle("Top 5 and Bottom 5 countries in Happiness index 2021", family='Serif', weight='bold', size=20)

# Rank explicitly rather than trusting the CSV row order. A stable sort keeps the
# existing order for a file that is already sorted by score.
ranked = df.sort_values('Ladder score', ascending=False, kind='mergesort')
top_5 = ranked.head(5)
bot_5 = ranked.tail(5)

ax1 = plt.subplot(g[0, 0])
# ax= pins each bar chart to its own panel instead of depending on the current axes.
sns.barplot(data=top_5, x='Ladder score', y='Country name', color=color[4], ax=ax1)
ax1.xaxis.set_visible(False)  # the lower panel carries the shared x-axis
ax1.annotate("Top 5 countries in Happiness index", xy=(8, 2), family='Serif', weight='bold', size=12)

ax2 = plt.subplot(g[1, 0], sharex=ax1)
sns.barplot(data=bot_5, x='Ladder score', y='Country name', color=color[0], ax=ax2)
ax2.annotate("Bottom 5 countries in Happiness index", xy=(8, 2), family='Serif', weight='bold', size=12)

# Remove the frame around both panels.
for s in ['left', 'right', 'top', 'bottom']:
    ax1.spines[s].set_visible(False)
    ax2.spines[s].set_visible(False)

In [None]:
# Density of the 2021 ladder score with the overall mean (dashed black line) and the
# scores of a few selected countries marked as vertical lines.
fig = plt.figure(figsize=(15, 8))
plt.title("Ladder score distribution and mean by countries", family='Serif', weight='bold', size=20)
# fill=True alone requests the filled density; the duplicate shade=True is deprecated
# in seaborn and has been dropped.
sns.kdeplot(df['Ladder score'], fill=True, color=color[-1], edgecolor='black', linewidth=3, ls='--', alpha=0.3)
plt.axvline(df['Ladder score'].mean(), c='black', ls='--')

# (country, line colour) pairs to highlight; colours are unchanged from the original cell.
highlighted = [
    ('Finland', color[4]),
    ('Afghanistan', color[0]),
    ('India', color[1]),
    ('United States', color[1]),
]
for country, line_color in highlighted:
    # Look the score up once and reuse it for both the line and its label.
    score = df.loc[df['Country name'] == country, 'Ladder score'].mean()
    plt.axvline(score, c=line_color)
    plt.text(x=score, y=0.15, s=country, rotation=90)

for s in ['left', 'right', 'top', 'bottom']:
    plt.gca().spines[s].set_visible(False)
# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.text(x=9, y=-0.05, s="There are difference in mean between the Asian countries and EU & American countries", ha='right', family='sans-serif', size=15, weight='bold')

In [None]:
# Density of the 2021 ladder score with the overall mean and one vertical line per
# region, coloured by whether the regional mean is below the overall mean.
fig = plt.figure(figsize=(20, 8))
plt.title("Ladder score distribution and mean Regional Indicators", family='Serif', weight='bold', size=20)
# fill=True alone requests the filled density; deprecated shade=True has been dropped.
sns.kdeplot(df['Ladder score'], fill=True, color=color[-1], linewidth=3, ls='--', edgecolor='black', alpha=0.5)

# Overall mean over all countries, computed once. The original chained assignment
# `pop=x=...` also created an unused global `x`.
pop = df['Ladder score'].mean()
plt.axvline(x=pop, linewidth=4, ls='--', color='white')
plt.text(x=pop, y=0, s='Population Mean', rotation=90)

# sort=False keeps the regions in order of first appearance, as unique() did.
region_means = df.groupby('Regional indicator', sort=False)['Ladder score'].mean()
for region, m in region_means.items():
    c = color[1] if m < pop else color[5]
    plt.axvline(x=m, color=c)
    plt.text(x=m, y=0.05, rotation=90, s=region)

for s in ['left', 'right', 'top', 'bottom']:
    plt.gca().spines[s].set_visible(False)
   
    

In [None]:
# One filled density per region, layered on the same axes, with the overall mean marked.
fig = plt.figure(figsize=(15, 8))
plt.title("Ladder score distribution by Regional indicator", family='Serif', weight='bold', size=20)
# Two arguments from the original call are gone:
#   * shade=True  - deprecated duplicate of fill=True
#   * color=color - seaborn ignores `color` once `hue` is given, so it had no effect
sns.kdeplot(df['Ladder score'], fill=True, hue=df['Regional indicator'], linewidth=2, edgecolor='white', multiple='layer')

pop_mean = df['Ladder score'].mean()
plt.axvline(pop_mean, c='black', ls='--')
plt.text(x=pop_mean, y=-0.01, s='Population mean', size=15)
for s in ['left', 'right', 'top', 'bottom']:
    plt.gca().spines[s].set_visible(False)


In [None]:
# Box plot of the ladder score for each region, with the overall mean as a dashed line.
fig = plt.figure(figsize=(15, 8))
plt.title("Data distribution by Regional indicator", family='Serif', weight='bold', size=20)

ladder = df['Ladder score']
sns.boxplot(x=ladder, y=df['Regional indicator'], palette=color)

overall_mean = ladder.mean()
plt.axvline(overall_mean, c='black', ls='--')
plt.text(x=overall_mean, y=10, s='Population mean', size=15)

# Hide all four sides of the frame.
axes = plt.gca()
for side in ('left', 'right', 'top', 'bottom'):
    axes.spines[side].set_visible(False)


# Hypothesis testing for Regions
# Is there a significant difference between two regions?

# hypothesis test

# Ho = There is no significant difference
# H1 = There is a significant difference

In [None]:
from scipy.stats import ttest_ind, wilcoxon, ttest_rel
def hypo_test(reg1, reg2, alpha=0.05):
    """Run an independent two-sample t-test on the ladder scores of two regions.

    The scores are read from the notebook-level ``df`` (column 'Ladder score',
    filtered on 'Regional indicator'). The verdict is not returned; it is drawn
    as text on a new, otherwise empty matplotlib figure.

    Parameters
    ----------
    reg1, reg2 : str
        Values of the 'Regional indicator' column to compare.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    None
    """
    r1 = df.loc[df['Regional indicator'] == reg1, 'Ladder score']
    r2 = df.loc[df['Regional indicator'] == reg2, 'Ladder score']
    t1 = "Hypothesis test between regions: {} vs {}".format(reg1, reg2)

    # An unknown region name produces an empty sample; skip scipy in that case.
    if len(r1) == 0 or len(r2) == 0:
        p_value = float('nan')
    else:
        _, p_value = ttest_ind(r1, r2)

    if np.isnan(p_value):
        # NaN compares False with everything, so without this branch an undefined
        # test would be reported as "no difference".
        t2 = "The t-test could not be computed for these regions (p_value is NaN)"
    elif p_value < alpha:
        t2 = "since p_value : {} is < {}, we can reject the Null Hypothesis and say there are significant difference".format(np.round(p_value, 3), alpha)
    else:
        t2 = "P_value: {} is >{}, we can't reject the Null Hypothesis, so we say there is no difference".format(np.round(p_value, 3), alpha)

    # The figure is only a canvas for the verdict: no frame, ticks or tick labels.
    fig = plt.figure(figsize=(15, 3))
    plt.title(t1, family='Serif', weight='bold', size=20)
    plt.text(x=plt.xlim()[1] / 2, y=plt.ylim()[1] / 2, s=t2, ha='center', family='Serif', weight='bold', size=15)
    for pos in ['right', 'top', 'bottom', 'left']:
        plt.gca().spines[pos].set_visible(False)
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    plt.tick_params(axis='y', which='both', right=False, left=False, labelleft=False)
    return

In [None]:
# Test the first listed region against the second one and against the last one.
reg = df['Regional indicator'].unique()
baseline = reg[0]
for other in (reg[1], reg[-1]):
    hypo_test(baseline, other)

In [None]:
# Yearly Life Ladder of the five countries with the highest average Life Ladder
# over all recorded years, with India drawn on top as a dashed reference line.
# A separate name is used so the `top_5` frame of the 2021 chart is not overwritten
# by an object of a different type.
top5_by_mean = df1.groupby('Country name')['Life Ladder'].mean().nlargest(5)
fig = plt.figure(figsize=(15, 10))
plt.title("Life Ladder of top 5 countries and India", family='Serif', weight='bold', size=20)
sns.lineplot(data=df1[df1['Country name'].isin(top5_by_mean.index)], x='year', y='Life Ladder', hue='Country name', palette=color[0:5])
# label= adds India to the legend; without it the dashed line is never identified.
sns.lineplot(data=df1[df1['Country name'] == 'India'], x='year', y='Life Ladder', color=color[1], linewidth=3, ls='--', label='India')


Let us check which fields contribute to the happiness index

In [None]:
# Subset of the 2021 frame: the two identifier columns, the ladder score itself
# and six further columns that are compared against it below.
id_columns = ['Country name', 'Regional indicator']
score_and_factors = [
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
df2 = df[id_columns + score_and_factors]

In [None]:
# Correlation of every numeric column with the ladder score, strongest first.
# df2 also contains the country and region names; corr() on string columns raises
# in recent pandas versions, so only the numeric columns are correlated.
cols = df2.select_dtypes(include='number').corr()['Ladder score'].sort_values(ascending=False)

In [None]:
# One regression panel per entry of `cols`, skipping its first entry, laid out on
# a 4x2 grid with the ladder score on the x-axis of every panel.
fig = plt.figure(figsize=(15, 10))
plt.suptitle("Comparing the Features that contribute for Ladder score", family='Serif', weight='bold', size=20)
for j, feature in enumerate(cols.index[1:]):
    ax = plt.subplot(4, 2, j + 1)
    ax = sns.regplot(data=df2, x='Ladder score', y=feature, color=color[-j])
    ax.legend('')

plt.legend('')


# Observation
Above plot clearly shows that the columns 
1. Logged GDP per capita
2. Social support
3. Healthy life expectancy
4. Freedom to make life choices
5. Generosity - less likely
6. Perceptions of corruption

have a clear contribution to the Happiness index

In [None]:
def comp_country(country1, country2):
    """Plot the yearly history of each column in ``cols.index`` for two countries.

    Rows are taken from the notebook-level ``df1``. Three of its columns are
    renamed so that they can be addressed with the names held in the
    notebook-level ``cols`` index. One line plot per column is drawn on a 4x2
    grid, with one line per country.

    Parameters
    ----------
    country1, country2 : str
        Values of the 'Country name' column to compare.

    Returns
    -------
    None
    """
    renamed = {
        'Healthy life expectancy at birth': 'Healthy life expectancy',
        'Log GDP per capita': 'Logged GDP per capita',
        'Life Ladder': 'Ladder score',
    }
    # rename() without inplace=True returns a new frame, so the filtered slice of
    # df1 is never modified in place (the original triggered SettingWithCopyWarning).
    df3 = df1[df1['Country name'].isin([country1, country2])].rename(columns=renamed)

    fig = plt.figure(figsize=(15, 10))
    plt.suptitle("Comparing the Features that contribute for Happiness index for {} vs {}".format(country1, country2), family='Serif', weight='bold', size=20)
    for j, column in enumerate(cols.index):
        plt.subplot(4, 2, j + 1)
        # Line colours come from `hue`; seaborn ignores an explicit `color` when
        # `hue` is set, so none is passed.
        sns.lineplot(data=df3, x='year', y=column, hue='Country name')
    return

In [None]:
comp_country('India','Finland')

In [None]:
comp_country('United States','Finland')

In [None]:
# Column means of the 2021 frame. numeric_only=True skips the country and region
# name columns, which make a plain mean() raise in recent pandas versions.
df.mean(numeric_only=True)

In [None]:
def country_comp1(country1, country2):
    """Draw a back-to-back bar chart of the 2021 values of two countries.

    For every column named in the notebook-level ``cols`` index, the value of
    ``country1`` is drawn on a left panel (x-axis inverted, bars grow leftwards)
    and the value of ``country2`` on a right panel. The compared table is also
    printed.

    Parameters
    ----------
    country1, country2 : str
        Two different values of the 'Country name' column of the notebook-level ``df``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the two names are equal or either one is absent from ``df``.
    """
    if country1 == country2:
        raise ValueError("country1 and country2 must be different")
    by_country = df.set_index('Country name')
    missing = [name for name in (country1, country2) if name not in by_country.index]
    if missing:
        raise ValueError("Unknown country name(s): {}".format(", ".join(missing)))

    # Select the rows in argument order. The original filtered with isin(), which
    # returns rows in file order, and then labelled the columns in argument order,
    # so the countries were swapped whenever country2 came first in the file.
    df4 = by_country.loc[[country1, country2], cols.index].T
    df4.columns = [country1, country2]
    print(df4)

    fig = plt.figure(figsize=(15, 10))
    plt.suptitle("Comparison {} and {}".format(country1, country2), family='Serif', weight='bold', size=20)
    g = gs.GridSpec(nrows=1, ncols=2, wspace=0)

    ax1 = plt.subplot(g[0, 0])
    sns.barplot(data=df4, y=df4.index, x=country1, color=color[-1], ax=ax1)
    # invert_xaxis() returns None. Chaining it into the assignment, as the original
    # did, left ax1 as None, so the sharey= below shared nothing.
    ax1.invert_xaxis()

    ax2 = plt.subplot(g[0, 1], sharey=ax1)
    sns.barplot(data=df4, y=df4.index, x=country2, color=color[-2], ax=ax2)
    ax2.yaxis.tick_right()

    # Remove the frame around both panels.
    for s in ['left', 'right', 'top', 'bottom']:
        ax1.spines[s].set_visible(False)
        ax2.spines[s].set_visible(False)
    return

In [None]:
# NOTE(review): the only notebook-level `ax1` assigned before this cell is the top
# panel of the top/bottom-5 chart (country_comp1 binds its own local `ax1`), so on
# a fresh run this presumably shows that chart's x-limits. It looks like a leftover
# inspection cell; confirm it is still wanted.
ax1.get_xlim()

In [None]:
country_comp1('Finland','India')

In [None]:
country_comp1('United States','Russia')

In [None]:
country_comp1('United States','India')

In [None]:
# Attach each country's 2021 region to its historical rows (inner join on the
# country name). Merging on the shared column avoids the synthetic `key_0` column
# that array keys create and that then had to be dropped.
df6 = df1.merge(df[['Country name', 'Regional indicator']], on='Country name')

In [None]:
df6['Regional indicator'].unique()

In [None]:
# Life Ladder of the South Asia countries: one row per country, one column per year.
# values= restricts the aggregation to the one column shown; pivoting every column
# would also average the string 'Regional indicator' column, which recent pandas
# versions refuse to do.
df6[df6['Regional indicator']=='South Asia'].pivot_table(index='Country name', columns='year', values='Life Ladder')

In [None]:
# One annotated heatmap per region: countries as rows, years as columns and the
# Life Ladder value in each cell.
for region in df6['Regional indicator'].unique():
    # values= keeps the string 'Regional indicator' column out of the aggregation,
    # which recent pandas versions would otherwise reject.
    ladder_by_year = df6[df6['Regional indicator'] == region].pivot_table(
        index='Country name', columns='year', values='Life Ladder')
    _, ax = plt.subplots(figsize=(15, 10))
    ax.set_title("Comparison of Ladder score by countries in {} Region".format(region), family='Serif', weight='bold', size=20)
    # ax= ties the heatmap to the figure created for this region.
    sns.heatmap(ladder_by_year, square=True, linewidths=1, cmap='GnBu_r', annot=True, cbar=False, ax=ax)

In [None]:
# Ladder score changes - India (with Finland for comparison).
# Keeps the columns from 'Country name' through 'Life Ladder' for those two countries.
is_selected = df1['Country name'].isin(['India', 'Finland'])
df7 = df1.loc[is_selected, 'Country name':'Life Ladder']


In [None]:
# Life Ladder per year for the countries in df7: a faint line per country plus
# markers whose size follows the score, each labelled with its value.
fig = plt.figure(figsize=(15, 8))
ax = plt.subplot()
sns.lineplot(data=df7, x='year', y='Life Ladder', alpha=0.5, hue='Country name', ax=ax)
sns.scatterplot(data=df7, x='year', y='Life Ladder', hue='Country name', size='Life Ladder', sizes=(100, 700), ax=ax)
# Write each observation's score at the position of its marker.
for year, ladder in zip(df7['year'], df7['Life Ladder']):
    ax.text(x=year, y=ladder, s=ladder)

# Conclusion
Countries with a healthy life expectancy, strong social support, freedom to make life choices, less corruption, and a good per capita income have a high happiness index