In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the World Happiness Report 2021 CSV and preview its first three rows
DATA_PATH = '../input/world-happiness-report-2021/world-happiness-report-2021.csv'
df = pd.read_csv(DATA_PATH)
df.head(3)

In [None]:
# Print the frame summary (column dtypes and non-null counts) to check for missing values
df.info()

### The data does not contain any null values, as seen in the output above, so dropping null values is not required.

In [None]:
# Summary statistics of the numeric columns, transposed so each column becomes a row
df.describe().T

In [None]:
# Summary of the object-dtype (categorical/text) columns, transposed so each column becomes a row
df.describe(include='object').T

In [None]:
# The file has no rank column, so number the rows 1..N in their existing order.
# The length is taken from the frame itself rather than hard-coded, so the cell
# keeps working if the number of countries in the file changes.
# NOTE(review): this assumes the CSV rows are already ordered from happiest to
# least happy — confirm against the data source.
df['Rank'] = np.arange(1, len(df) + 1)

In [None]:
# Promote 'Rank' to the index. Once that has happened 'Rank' is no longer a
# column, so an unguarded set_index('Rank') would raise KeyError when this cell
# is run a second time; the guard makes the cell safe to re-run.
if 'Rank' in df.columns:
    df = df.set_index('Rank')
df.head()

In [None]:
# Show the last 5 records
df.tail()

# Starting the analysis

In [None]:
# Get names of top 5 countries by rank.
# iloc[:5] selects exactly the first five rows (positions 0-4); the previous
# slice [:6] printed six countries.
print ('Top 5 happiest countires of the world are:\n',df.iloc[:5]['Country name'])

# Finding ranks of countries

In [None]:
def Country_Rank(Name, data=None):
    '''Print the happiness-report rank of a country.

    Parameters
    ----------
    Name : str
        Value to match exactly against the 'Country name' column.
    data : pandas.DataFrame, optional
        Frame with a 'Country name' column whose index holds the rank.
        When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    None
        One line is printed per matching row. If no row matches, a
        "not found" message is printed instead of printing nothing.
    '''
    frame = df if data is None else data
    ranks = frame.loc[frame['Country name'] == Name].index
    if len(ranks) == 0:
        # Make a misspelled or missing country visible to the caller
        print(Name, "was not found in the world happiness report")
        return
    for rank in ranks:
        print(Name, "Ranks", rank, "in the world happiness report")

In [None]:
# Where does India rank? Prints the index label (rank) of the row whose 'Country name' is 'India'
Country_Rank('India')



# Region-wise analysis

In [None]:
# Distinct values of the 'Regional indicator' column
regions = df['Regional indicator'].unique()
print('The regions covered are:\n', regions)

In [None]:
# Number of countries per region, drawn as a horizontal bar chart
region_counts = df['Regional indicator'].value_counts(ascending=False)
region_counts.plot(
    kind='barh',
    title='Countries in each region',
    color='seagreen',
);

In [None]:
# Same counts drawn with seaborn; bars follow the value_counts() order
region_order = df['Regional indicator'].value_counts().index
sns.countplot(y=df['Regional indicator'], order=region_order)
title_style = {'fontsize':15,'color':'darkgreen'}
plt.title('Countries in each region', fontdict=title_style);

In [None]:
# Same chart via groupby: count the unique country names within each region
countries_per_region = (
    df.groupby('Regional indicator')['Country name']
    .nunique()
    .sort_values(ascending=False)
)
countries_per_region.plot(kind='barh', title='Countries in each region', color='gold');

In [None]:
# Mean ladder score per region, rounded to 3 decimals and sorted ascending
Regions_score = (
    df.groupby('Regional indicator')['Ladder score']
    .mean()
    .round(3)
    .sort_values()
)
Regions_score.plot(kind='barh', title='Mean Ladder score by region', color='purple');

In [None]:
# Mean ladder score per region using seaborn (barplot's default estimator is the mean).
# ci=None is the documented way to switch the error bars off; ci=False is not a
# documented value for this parameter.
sns.barplot(x='Ladder score',y='Regional indicator',data=df,ci=None)
plt.title('Mean score by region',fontdict={'fontsize':15,'color':'darkgreen'});

In [None]:
# Distribution of ladder score within each region; x labels rotated so they stay readable
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='Regional indicator', y='Ladder score', ax=ax)
plt.xticks(rotation=90);

###### From the boxplot, we can infer the following:
1) Median score is high for Western Europe and North America & ANZ regions.
2) Median score is lowest for Sub-Saharan Africa.
3) North America & ANZ region has very low variation as IQR is very narrow.
4) Middle East & North Africa has huge variation as IQR is very broad.
5) Top ranked country lies in Western Europe and Bottom Ranked country lies in South Asia.
6) Outliers are observed in 3 regions, viz. Latin America & Caribbean, Central & Eastern Europe, and East Asia.

In [None]:
# Cross-check the values read off the boxplot above:
# median ladder score per region, sorted from highest to lowest
df.groupby('Regional indicator')['Ladder score'].median().sort_values(ascending=False)

In [None]:
# Region for Top 20 countries: first 20 rows, restricted to the columns of interest
top20_columns = [
    'Country name',
    'Regional indicator',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
df_top20 = df.iloc[:20][top20_columns]
df_top20

In [None]:
# Regional distribution of the top 20: countries per region, smallest count first
top20_region_counts = df_top20['Regional indicator'].value_counts(ascending=True)
top20_region_counts.plot(kind='barh', title='Region for top 20 countries')
# Tick the x axis at even counts from 0 to 14
plt.xticks(np.arange(0, 15, 2));

## It is clear from the above plot that 13 of the top 20 countries lie in Western Europe. North America and ANZ has 4 countries in the top 20, while Central and Eastern Europe, Middle East and North Africa, and Latin America and Caribbean each have 1 country in the top 20.

# Asian countries analysis

In [None]:
# Rows whose region name contains 'Asia', restricted to the columns of interest
asian_columns = [
    'Country name',
    'Regional indicator',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
in_asian_region = df['Regional indicator'].str.contains('Asia')
df_Asian = df.loc[in_asian_region, asian_columns]
# Check which regions were picked up by the filter
df_Asian['Regional indicator'].unique()

In [None]:
# Summary statistics of the numeric columns for the Asian subset, one row per column
df_Asian.describe().T

In [None]:
# Ladder score for Asian countries, one horizontal bar per country
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(data=df_Asian, x='Ladder score', y='Country name', ax=ax);

###### From the above plot, it is clear that Taiwan ranks highest while Afghanistan scores lowest.
India ranks second last amongst all Asian countries.




# Relationship between various parameters and ladder score

In [None]:
# Pairwise scatter plots of the ladder score and the six explanatory columns,
# with a KDE of each variable on the diagonal
pairplot_columns = [
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
sns.pairplot(df[pairplot_columns], diag_kind='kde');

# Lm plots of ladder score vs other socio-economic parameters

In [None]:
# Scatter plot with a fitted regression line: ladder score on x, logged GDP per capita on y
sns.lmplot(x='Ladder score',y='Logged GDP per capita',data=df);

In [None]:
# Ladder score vs logged GDP per capita, with a separate fit per region.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Logged GDP per capita',data=df,hue='Regional indicator');

In [None]:
# Ladder score vs social support with a single regression fit.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Social support',data=df);

In [None]:
# Ladder score vs social support, with a separate fit per region.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Social support',data=df,hue='Regional indicator');

In [None]:
# Ladder score vs healthy life expectancy with a single regression fit.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Healthy life expectancy',data=df);

In [None]:
# Ladder score vs healthy life expectancy, with a separate fit per region.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Healthy life expectancy',data=df,hue='Regional indicator');

In [None]:
# Ladder score vs freedom to make life choices with a single regression fit.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Freedom to make life choices',data=df);

In [None]:
# Ladder score vs freedom to make life choices, with a separate fit per region.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Freedom to make life choices',data=df,hue='Regional indicator');

In [None]:
# Ladder score vs generosity with a single regression fit.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Generosity',data=df);

In [None]:
# Ladder score vs generosity, with a separate fit per region.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Generosity',data=df,hue='Regional indicator');

In [None]:
# Ladder score vs perceptions of corruption with a single regression fit.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Perceptions of corruption',data=df);

In [None]:
# Ladder score vs perceptions of corruption, with a separate fit per region.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call only produced an extra blank figure and has been removed.
sns.lmplot(x='Ladder score',y='Perceptions of corruption',data=df,hue='Regional indicator');

In [None]:
# How logged GDP per capita is distributed across all countries:
# 35-bin histogram with a KDE curve overlaid
sns.histplot(df['Logged GDP per capita'],kde=True,bins=35);

In [None]:
# Distribution of social support across all countries: 35-bin histogram with a KDE curve overlaid
sns.histplot(df['Social support'],kde=True,bins=35);

In [None]:
# Distribution of freedom to make life choices: 35-bin histogram with a KDE curve overlaid
sns.histplot(df['Freedom to make life choices'],kde=True,bins=35);

In [None]:
# Distribution of perceptions of corruption: 50-bin histogram with a KDE curve overlaid
sns.histplot(df['Perceptions of corruption'],kde=True,bins=50);

# The End.
Please comment if you found this analysis useful.