In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 
import os

# Show full cell contents and every column when a DataFrame is displayed.
# The fully qualified "display.*" keys are required: the abbreviated "max_columns"
# is ambiguous on recent pandas (it also matches "styler.render.max_columns")
# and makes set_option raise OptionError.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# List every file available under the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so this is a no-op elsewhere.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the 2021 World Happiness Report and preview its first five rows.
happy_2021 = pd.read_csv(
    "/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv"
)
happy_2021.head(5)

In [None]:
# Plot each raw factor against its "Explained by" counterpart to check whether
# the explained variables are redundant with the raw ones.
factor_pairs = [
    ('Logged GDP per capita', 'Explained by: Log GDP per capita'),
    ('Social support', 'Explained by: Social support'),
    ('Healthy life expectancy', 'Explained by: Healthy life expectancy'),
    ('Freedom to make life choices', 'Explained by: Freedom to make life choices'),
]

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(14,8))
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for axis, (raw_col, explained_col) in zip(ax.flat, factor_pairs):
    sns.scatterplot(data=happy_2021, x=raw_col, y=explained_col, ax=axis)
plt.show()

In [None]:
# Remove every column whose name begins with "Explained"
explained_cols = happy_2021.columns.str.startswith("Explained")
happy_2021 = happy_2021.loc[:, ~explained_cols]

In [None]:
# Variance of the last three columns. A variance of 0 marks a constant column
# (the original analysis notes that 'Ladder score in Dystopia' has no variance).
tail_variances = happy_2021.iloc[:, -3:].var()

# Keep the constant column in its own variable, then remove it from the frame.
# - Select by name: a positional `iloc[:, -3]` depends on column order and may
#   store a different column than the variable name claims.
# - Reassign instead of `inplace=True`, and guard on membership, so re-running
#   this cell does not raise a KeyError once the column is already gone.
if 'Ladder score in Dystopia' in happy_2021.columns:
    Ladder_score_in_Dystopia = happy_2021['Ladder score in Dystopia']
    happy_2021 = happy_2021.drop(columns=['Ladder score in Dystopia'])

# Last expression of the cell, so the variances are actually displayed.
tail_variances

In [None]:
# Columns derived from the ladder score itself; excluded from the correlation table.
corr_cols_drop = ['Standard error of ladder score', 'upperwhisker', 'lowerwhisker']

# Pairwise correlations of the remaining numeric columns, shaded green by magnitude.
# select_dtypes keeps only numeric columns: the frame still contains the string
# columns 'Country name' and 'Regional indicator', on which DataFrame.corr()
# raises in pandas >= 2.0 (older versions silently skipped them).
(
    happy_2021
    .drop(columns=corr_cols_drop)
    .select_dtypes(include='number')
    .corr()
    .style.background_gradient(cmap=sns.light_palette('green', as_cmap=True))
)

In [None]:
# Preview the first three rows of the frame after the column drops above.
happy_2021.head(3)

___
# Statistical tests
1. Do countries in `Sub-Saharan Africa` have a population mean `Ladder score` equal to `6.5`? Equal to `4.5`?
2. Do `Latin America and Caribbean` and `Western Europe` have the same population mean `Ladder score`?
3. Did the population mean `Ladder score` of `Central and Eastern Europe` increase from 2011 to 2018?
4. Does the population mean `Ladder score` differ across `Regional indicator` groups?
5. Are `Ladder score` and `Logged GDP per capita` correlated?

### 1. Do countries in `Sub-Saharan Africa` have a population mean `Ladder score` equal to `6.5`? Equal to `4.5`?

First, we have to see the **distribution** of **Ladder score in Sub-Saharan Africa**.

In [None]:
# Ladder scores of the Sub-Saharan Africa countries, with their histogram and KDE.
is_sub_saharan = happy_2021['Regional indicator'].eq('Sub-Saharan Africa')
data = happy_2021.loc[is_sub_saharan, 'Ladder score']
sns.histplot(data, binwidth=0.2, kde=True);

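The distribution is not perfectly normal, but the one-sample `t-test` is reasonably robust to mild departures from normality, so we will use it. <br>
1.1) $H_0$ : Sub-Saharan Africa has a population mean Ladder score equal to 6.5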

In [None]:
from scipy.stats import ttest_1samp

# One-sample t-test of the Sub-Saharan Africa ladder scores against a mean of 6.5.
result = ttest_1samp(data, popmean=6.5)
t, p = result.statistic, result.pvalue
# Original conclusion: reject H0 — the data let us reject that the mean is 6.5.
print('p_val = {}'.format(p))

1.2) $H_0$ : Sub-Saharan Africa has a population mean Ladder score equal to 4.5

In [None]:
from scipy.stats import ttest_1samp

# One-sample t-test of the Sub-Saharan Africa ladder scores against a mean of 4.5.
result = ttest_1samp(data, popmean=4.5)
t, p = result.statistic, result.pvalue
# Original conclusion: do not reject H0 — the data do not let us reject that the mean is 4.5.
print('p_val = {}'.format(p))

### 2. Do `Latin America and Caribbean` and `Western Europe` have the same population mean `Ladder score`?

We will perform a `two-sample t-test` under the assumption that the Ladder scores in Latin America and in Western Europe are independent of each other.

$H_0$ : Latin America and Western Europe have the same population mean of Ladder score

In [None]:
from scipy.stats import ttest_ind

# Ladder scores of the two regions being compared.
region = happy_2021['Regional indicator']
western_e = happy_2021.loc[region == 'Western Europe', 'Ladder score']
latin_a = happy_2021.loc[region == 'Latin America and Caribbean', 'Ladder score']

# Independent two-sample t-test that assumes equal variances (equal_var=True).
result = ttest_ind(western_e, latin_a, equal_var=True)
t, p = result.statistic, result.pvalue
# Original conclusion: reject H0 — the data let us reject that the means are equal.
print('p_val = {}'.format(p))

### 3. Did the population mean `Ladder score` of `Central and Eastern Europe` increase from 2011 to 2018?

$H_0$ : the population mean difference between 2011 and 2018 is 0

In [None]:
# Import the historical data and attach each country's region from the 2021 report.
# The left merge keeps every historical row; countries absent from the 2021 report
# get a missing 'Regional indicator'.
happy_2008_2020 = pd.read_csv("/kaggle/input/world-happiness-report-2021/world-happiness-report.csv")
happy_2008_2020 = happy_2008_2020.merge(
    happy_2021[['Country name', 'Regional indicator']],
    on='Country name',
    how='left',
)

# Restrict to the region under study.
df = happy_2008_2020.query("`Regional indicator` == 'Central and Eastern Europe'")

# Build properly paired samples: one row per country, one column per year.
# A paired t-test matches observations by position, so both series must cover
# the same countries in the same order. Filtering each year independently would
# break that whenever a country is missing in one of the two years.
paired_2011_2018 = (
    df[df['year'].isin([2011, 2018])]
    .pivot(index='Country name', columns='year', values='Life Ladder')
    .reindex(columns=[2011, 2018])   # both year columns exist even if one year has no rows
    .dropna()                        # keep only countries observed in both years
)
df_2011 = paired_2011_2018[2011]
df_2018 = paired_2011_2018[2018]

In [None]:
from scipy.stats import ttest_rel

# Paired (related-samples) t-test of the 2011 scores against the 2018 scores.
result = ttest_rel(df_2011, df_2018)
t, p = result.statistic, result.pvalue
# Original conclusion: reject H0 — the data let us reject that the mean is unchanged.
print('p_val = {}'.format(p))

### 4. Does the population mean `Ladder score` differ across `Regional indicator` groups?

$H_0$ : all `Regional indicator` groups have the same population mean of Ladder score

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# The formula interface needs identifier-like column names, so the two columns
# are renamed to names without spaces before fitting.
column_aliases = {'Ladder score': 'Ladder', 'Regional indicator': 'Region'}
data = happy_2021[list(column_aliases)].rename(columns=column_aliases)

# One-way ANOVA: OLS fit of Ladder on the categorical Region, then the ANOVA table.
model = ols("Ladder ~ Region", data=data).fit()
table = sm.stats.anova_lm(model)
# Original conclusion: reject H0 — the data let us reject that the means are the same.
print(table)

### 5. Are `Ladder score` and `Logged GDP per capita` correlated?

$H_0$ : ladder score and GDP are not correlated to each other

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the ladder score and logged GDP per capita.
corr_inputs = happy_2021[['Ladder score', 'Logged GDP per capita']]
r, p = pearsonr(corr_inputs['Ladder score'], corr_inputs['Logged GDP per capita'])
# Original conclusion: reject H0 — the data let us reject that the two are unrelated.
print('p_val = {}\ncorr = {}'.format(p, r))