In [None]:
import numpy as np 
import pandas as pd 

import os
# List every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the 2021 report into `data`; the path is specific to the Kaggle environment.
data = pd.read_csv('/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's 'darkgrid' style to the figures drawn below.
sns.set_style('darkgrid')

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')

In [None]:
# Column dtypes and non-null counts.
data.info()

No null values and all numeric variables have float type

In [None]:
# Distribution of the overall happiness score.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement (cut=3 keeps distplot's KDE tails).
sns.histplot(data['Ladder score'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

It has 2 modas and looks almost normal))

In [None]:
# One horizontal box per region, showing the spread of the happiness score.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(
    data=data,
    x="Ladder score",
    y="Regional indicator",
    orient="h",
    ax=ax,
)
plt.show()

We can see here some interesting observation: large difference between hi- and low-level of happiness in regions like 'South Asia', 'Sub-Saharan Africa', 'MiddleEast and North Africa'.
An anomalies in 'Latin America and Carribbean' (for the worse), 'Central and Eastern Europe' (in both sides), and 'East Asia' (for the better side).
Very small differnce between levels in 'North America and ANZ'

In [None]:
# Number of countries per region, kept as a one-column DataFrame.
region = data.groupby('Regional indicator')[['Country name']].count()
region

Division by region

In [None]:
# Share of countries per region, with each slice labelled by its percentage.
region.plot.pie(
    y='Country name',
    autopct="%.2f",
    legend=False,
    figsize=(8, 8),
    ylabel='',
)
plt.show()

It can explain some thoughts about boxplot which was earlier. Difference between some regions depend on size of this region.

In [None]:
# Happiness score against social support, coloured by region.
f, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=data,
    x='Ladder score',
    y='Social support',
    hue='Regional indicator',
    ax=ax,
)
plt.show()

In [None]:
# All rows, columns at positions 6..11 (selected by position, not by name).
# NOTE(review): presumably these are the six explanatory factors — confirm against data.columns.
for_hist = data.iloc[:, 6:12]

Names of significant columns

In [None]:
# Pairwise relationships between the selected columns.
sns.pairplot(data=for_hist)
plt.show()

In [None]:
# One histogram per selected column, drawn on a larger canvas.
for_hist.hist(figsize=(15,12))
plt.show()

The main histograms in zoomed size

In [None]:
# Average happiness score per region, kept as a one-column DataFrame.
mean_by_region = data.groupby('Regional indicator')[['Ladder score']].mean()

In [None]:
# Horizontal bars: one per region, length equal to the mean score.
mean_by_region.plot.barh(
    title='Mean by region',
    legend=False,
    grid=True,
)
plt.show()

In [None]:
# Summary statistics of the happiness score.
data['Ladder score'].describe()

In [None]:
# Median of the happiness score, for comparison with the mean reported by describe().
data['Ladder score'].median()

Mean and median are very close

In [None]:
# Lower and upper quartile of the happiness score, plus its mean.
ladder = data['Ladder score']
q25, q75 = ladder.quantile([0.25, 0.75])
mean = ladder.mean()

In [None]:
# Split the countries into three tiers by happiness score.
# A quartile can coincide with an actual score; with strict inequalities on
# both sides such a country would fall into none of the groups. The middle
# tier is therefore inclusive at both ends (Series.between is inclusive by
# default), so every country lands in exactly one tier.
scores = data['Ladder score']
unhappiest = data[scores < q25]
middle = data[scores.between(q25, q75)]
happiest = data[scores > q75]

In [None]:
# Countries per region in each happiness tier, one panel per tier.
# Every panel uses the same y-range so bar heights can be compared directly.
tiers = [('unhappy', unhappiest), ('middle', middle), ('happy', happiest)]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for ax, (tier_title, tier_data) in zip(axes, tiers):
    # Draw on the explicit axes instead of relying on pyplot's current axes.
    sns.countplot(x='Regional indicator', data=tier_data, ax=ax)
    ax.set(title=tier_title, ylim=[0, 25])
    # Region names are long; tilt them so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
# Print the happiest-tier countries, region by region, starting with the
# region that contributes the most countries.
happiest_chart = happiest.groupby('Regional indicator', as_index=False).agg({'Country name': 'count'})
regions_by_count = happiest_chart.sort_values(by='Country name', ascending=False)['Regional indicator']
# The loop variable is deliberately not called `region`: that name holds the
# per-region count DataFrame used by the pie chart, and overwriting it would
# break that cell on re-run.
for region_name in regions_by_count:
    happy_list = happiest.loc[happiest['Regional indicator'] == region_name, 'Country name'].tolist()
    print()
    print('The happiest countries in ' + region_name + ' are:', end=' ')
    print(*happy_list, sep=', ')
    print('------------------------------------------------------------------------------')

In [None]:
# Print the unhappiest-tier countries, region by region, starting with the
# region that contributes the most countries.
unhappiest_chart = unhappiest.groupby('Regional indicator', as_index=False).agg({'Country name': 'count'})
regions_by_count = unhappiest_chart.sort_values(by='Country name', ascending=False)['Regional indicator']
# The loop variable is deliberately not called `region`: that name holds the
# per-region count DataFrame used by the pie chart, and overwriting it would
# break that cell on re-run.
for region_name in regions_by_count:
    unhappy_list = unhappiest.loc[unhappiest['Regional indicator'] == region_name, 'Country name'].tolist()
    print()
    print('The unhappiest countries in ' + region_name + ' are:', end=' ')
    print(*unhappy_list, sep=', ')
    print('------------------------------------------------------------------------------')

In [None]:
# Distinct region labels, iterated by the per-region summary loop below.
regions = data['Regional indicator'].unique()

In [None]:
# For every region, report the country with the highest and the lowest score.
# Rows are looked up by label (idxmax / idxmin) rather than by position in a
# raw `.values` array, so the output does not depend on column order.
# The loop variable is not called `region`, to avoid overwriting the
# per-region count DataFrame of that name.
for region_name in regions:
    region_scores = data.loc[data['Regional indicator'] == region_name, 'Ladder score']
    top = data.loc[region_scores.idxmax()]
    bottom = data.loc[region_scores.idxmin()]
    print('Happiest in ' + region_name + ' is ' + top['Country name'] + ' with score ' + str(top['Ladder score']))
    print('Unhappiest in ' + region_name + ' is ' + bottom['Country name'] + ' with score ' + str(bottom['Ladder score']))
    print('---------------------------------------------------------------')

In [None]:
# Load the second csv of the same Kaggle dataset into `data_per_years`; it has a 'year' column (used below).
data_per_years = pd.read_csv('/kaggle/input/world-happiness-report-2021/world-happiness-report.csv')

In [None]:
# Number of distinct values per column in the multi-year data.
data_per_years.nunique()

The amount of 'Country name' aren't the same

In [None]:
# Number of distinct values per column in the 2021 data, for comparison.
data.nunique()

In [None]:
# Show the distinct region labels present in the 2021 data.
data['Regional indicator'].unique()

In [None]:
# Keep only the yearly rows whose country is 'Russia'.
is_russia = data_per_years['Country name'].eq('Russia')
rus_data = data_per_years.loc[is_russia]

You can write your country instead of "Russia" and have same statistics

In [None]:
# Preview the first rows of the single-country subset.
rus_data.head()

In [None]:
# One line plot per indicator column, showing its value over the years.
# The first two columns are skipped (presumably country name and year — see
# the head() preview of the frame).
indicator_cols = rus_data.columns[2:]
# Derive the grid from the number of indicators instead of assuming exactly
# nine, so an extra column does not make add_subplot fail.
n_grid_cols = 3
n_grid_rows = -(-len(indicator_cols) // n_grid_cols)  # ceiling division
fig = plt.figure(figsize=(20, 20))
for n, col in enumerate(indicator_cols, start=1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, n)
    sns.lineplot(x='year', y=col, data=rus_data, ax=ax).set(xlabel=' ', ylabel=' ', title=col + ' per years')
plt.show()

Lineplots of indicators per years for Russia

In [None]:
# Reference only (not executed): a variant of the per-indicator line plots
# drawn through pyplot's current-axes interface (`plt.subplot`) instead of
# explicit `fig.add_subplot` axes.
# fig,ax = plt.subplots(3, 3, figsize=(20,20))
# n = 1
# for col in rus_data.columns[2:]:
#     plt.subplot(3, 3, n)
#     sns.lineplot(x='year', y=col, data=rus_data)
#     n += 1

Then I'll groupby regions and do lineplot of changes 'Ladder score' by years

In [None]:
# Preview the multi-year data before joining the region labels onto it.
data_per_years.head()

In [None]:
# Country-to-region lookup table taken from the 2021 data.
lookup_cols = ['Country name', 'Regional indicator']
country_region_data = data.loc[:, lookup_cols]

In [None]:
# Attach a region label to every yearly row. The left join keeps all yearly
# rows; countries without a match in the lookup get a missing region.
# `validate` makes pandas raise if a country appears more than once in the
# lookup, which would otherwise silently duplicate yearly rows.
new_data = data_per_years.merge(
    country_region_data,
    how='left',
    on='Country name',
    validate='many_to_one',
)

In [None]:
# Region labels present after the left join; a missing value here marks unmatched countries.
new_data['Regional indicator'].unique()

In [None]:
# Inspect the merged frame.
new_data

In [None]:
# Mean 'Life Ladder' per (region, year), with the group keys kept as regular columns.
by_region_year = new_data.groupby(['Regional indicator', 'year'], as_index=False)
grouped_data = by_region_year['Life Ladder'].mean()

In [None]:
# Inspect the per-region, per-year means.
grouped_data

In [None]:
# One line per region: mean 'Life Ladder' against year.
fig = plt.figure(figsize=(20, 12))
line_ax = sns.lineplot(
    data=grouped_data,
    x='year',
    y='Life Ladder',
    hue='Regional indicator',
)
line_ax.set(title='"Life Ladder" mean values per years for regions')
plt.show()

'Life Ladder' mean values per years for regions