In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Context of Data and Goals

For this analysis I'll use the following datasets:
1. Happiness scores based on economic production, social support, etc., from the World Happiness Report
2. Population by country
3. GDP per capita from Gapminder

# 2. Getting the data

* For the world population I had to use two sets of data. One from 1960 to 2019 and another from 2020. 
* The range of years is from 2005 to 2020. 

In [None]:
#World population 1960 to 2019 from World Bank Population
#Link: https://data.worldbank.org/indicator/SP.POP.TOTL
#Only the country name and the 2005-2019 year columns are read; the first four lines of the file are skipped
population_columns = ["Country Name"] + [str(year) for year in range(2005, 2020)]

df_world_pop_60_to_19 = pd.read_csv(
    "/kaggle/input/world-population-by-country-1960-to-2019/world_population_by_country_1960_to_2019.csv",
    skiprows=range(0, 4),
    usecols=population_columns,
).rename(columns={'Country Name': 'Country'})

#World population for 2020, renamed so that it lines up with the year columns above
df_pop_2020 = pd.read_csv(
    "/kaggle/input/population-by-country-2020/population_by_country_2020.csv",
    usecols=["Country (or dependency)", "Population (2020)"],
).rename(columns={'Country (or dependency)': 'Country', "Population (2020)": "2020"})

#Happiness index
df_happyness_index = pd.read_csv(
    "/kaggle/input/world-happiness-report-2021/world-happiness-report.csv"
).rename(columns={'Country name': 'Country'})

#GDP per capita from Gapminder, year columns 2005-2020
gdp_columns = ["country"] + [str(year) for year in range(2005, 2021)]

gdp_per_capita_wide = pd.read_csv(
    "/kaggle/input/gdp-per-capita/income_per_person.csv",
    usecols=gdp_columns,
).rename(columns={'country': 'Country'})

#Left join that adds the 2020 population as one more year column; every missing value becomes 0
#NOTE(review): this join runs on the raw country names of both sources, so a country
#spelled differently in the two files would end up with a 2020 population of 0 - confirm
world_pop_wide = df_world_pop_60_to_19.merge(
    df_pop_2020[["Country", "2020"]],
    on="Country",
    how="left",
).fillna(0)

#Wide to long: one row per (Country, year)
df_world_pop = world_pop_wide.melt(id_vars="Country", var_name="year", value_name="population")
df_gdp_per_capita = gdp_per_capita_wide.melt(id_vars="Country", var_name="year", value_name="gdp_per_capita")

#The year labels come from column names (strings), so convert them to int
df_world_pop['year'] = df_world_pop['year'].astype(str).astype(int)
df_gdp_per_capita['year'] = df_gdp_per_capita['year'].astype(str).astype(int)

# Data cleaning

The main dataset is the Happiness Index, so I took the countries from the happiness dataset and joined them with the population and GDP per capita datasets, excluding the countries that were not found. For some countries I found an equivalent that was only written in a different way, so I corrected those names.

In [None]:
#Functions
def looking_for_countries_with_diferent_name(df_main, df_for_looking):
    """Return the rows of ``df_main`` whose country is absent from ``df_for_looking``.

    Both frames must have a ``Country`` column. A row of ``df_main`` is kept
    when its ``Country`` value does not occur anywhere in
    ``df_for_looking['Country']``; the kept rows retain their original index.
    """
    known_countries = df_for_looking['Country']
    is_missing = ~df_main['Country'].isin(known_countries)
    return df_main.loc[is_missing]

def replacing_name_for_equivalent_countries(dict_of_countries_names, df_to_replace):
    """Return a copy of ``df_to_replace`` with its country names harmonised.

    Parameters
    ----------
    dict_of_countries_names : dict
        Maps a country name as spelled in ``df_to_replace`` (key) to the
        spelling it should be replaced with (value).
    df_to_replace : pandas.DataFrame
        Frame with a ``Country`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame, identical to the input except that every ``Country`` value
        found among the keys of the mapping is replaced by its value.
    """
    # assign() builds a new frame, so the caller's frame stays untouched and
    # the call can be repeated safely
    return df_to_replace.assign(
        Country=df_to_replace["Country"].replace(dict_of_countries_names)
    )

In [None]:
#Counting the number of distinct countries in each dataframe
print("Country qtd for Df World Pop:", df_world_pop["Country"].nunique())
print("Country qtd for Df Gdp Per Capita:", df_gdp_per_capita["Country"].nunique())
print("Country qtd for Df Happyness Index:", df_happyness_index["Country"].nunique())


#Countries that are in the happiness df but not in the WORLD POP DATASET
print("\nNot founded in World Pop Dataset: ", looking_for_countries_with_diferent_name(df_happyness_index, df_world_pop).Country.unique())

#Mapping from the spelling used in the population dataset (key) to the spelling
#used in the happiness dataset (value)
replace_countries_names_for_df_world_pop = {"Egypt, Arab Rep." : "Egypt",
                                            "Gambia, The" : "Gambia",
                                            "Hong Kong SAR, China" : "Hong Kong S.A.R. of China",
                                            "Iran, Islamic Rep." : "Iran",
                                            #"Korea, Rep." is South Korea; "Korea, Dem. People's Rep." is North Korea
                                            "Korea, Rep." : "South Korea",
                                            "Russian Federation" : "Russia",
                                            "Syrian Arab Republic" : "Syria",
                                            "Venezuela, RB" : "Venezuela",
                                            "Yemen, Rep." : "Yemen"
                                            }

#Replacing countries names. The result is assigned back to the column because a chained
#inplace replace on a selected column is not guaranteed to modify the dataframe
df_world_pop["Country"] = df_world_pop["Country"].replace(replace_countries_names_for_df_world_pop)

#Checking again
print("\nDoes not have in World Pop Dataset: ", looking_for_countries_with_diferent_name(df_happyness_index, df_world_pop).Country.unique())

#Countries that are in the happiness df but not in the GDP PER CAPITA DATASET
print("\nNot founded in Gdp Per Capita Dataset: ", looking_for_countries_with_diferent_name(df_happyness_index, df_gdp_per_capita).Country.unique())

#Same direction as the mapping above: spelling in the gdp dataset (key) to the
#spelling in the happiness dataset (value)
replace_countries_names_for_df_gdp_pop = {"Palestine" : "Palestinian Territories"}

#Replacing countries names on gdp dataset
df_gdp_per_capita["Country"] = df_gdp_per_capita["Country"].replace(replace_countries_names_for_df_gdp_pop)

#Checking again
print("\nDoes not have in Gdp Per Capita Dataset: ", looking_for_countries_with_diferent_name(df_happyness_index, df_gdp_per_capita).Country.unique())

Here the four country levels are created from the income per capita. You can read more about them at:
https://www.gapminder.org/tools/#$chart-type=bubbles&url=v1

In [None]:
#Joining happiness, population and gdp per capita on country and year.
#The default inner join keeps only the (country, year) pairs present in all three datasets
df_happyness_full = df_happyness_index.merge(df_world_pop, on=["Country", "year"]).merge(df_gdp_per_capita, on=["Country", "year"])

#Renaming columns to snake_case to make the rest of the code easier to write
df_happyness_full = df_happyness_full.rename(columns={
    'Country': 'country',
    'Life Ladder': 'life_ladder',
    'Log GDP per capita': 'log_gdp_per_capita',
    'Social support': 'social_support',
    'Healthy life expectancy at birth': 'healthy_life_expectancy_at_birth',
    'Freedom to make life choices': 'freedom_to_make_life_choices',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'perceptions_of_corruption',
    'Positive affect': 'positive_affect',
    'Negative affect': 'negative_affect',
})


#Creating country level using the range gdp per person from Gapminder
country_level_by_income = [
    (df_happyness_full.gdp_per_capita <= 2500),
    (df_happyness_full.gdp_per_capita > 2500) & (df_happyness_full.gdp_per_capita <= 8000),
    (df_happyness_full.gdp_per_capita > 8000) & (df_happyness_full.gdp_per_capita <= 25000),
    (df_happyness_full.gdp_per_capita > 25000)
]

levels = ['Level 1', 'Level 2', 'Level 3', 'Level 4']

#A string default keeps every choice the same dtype (the implicit default is the integer 0,
#which cannot be combined with string labels on recent NumPy versions).
#Rows matching no condition, e.g. a missing gdp_per_capita, are labelled "Unknown"
df_happyness_full["levels"] = np.select(country_level_by_income, levels, default="Unknown")

df_happyness_full = df_happyness_full.replace(np.nan, 0)

#Sorting by year makes "first" and "last" row of each country mean earliest and latest year
happyness_by_country = df_happyness_full.sort_values(["country", "year"]).groupby('country')

#Earliest and latest year of every country stacked in one frame with a fresh index.
#pd.concat replaces DataFrame.append, and the result is assigned directly because
#reset_index(inplace=True) returns None
df_happyness_full_min_max_year = pd.concat(
    [happyness_by_country.head(1), happyness_by_country.tail(1)],
    ignore_index=True,
)

df_happyness_most_recent_year = happyness_by_country.tail(1)

# Exploring the data

**Happiness across the four country levels**

We can see that income per person impacts the overall happiness of countries, but is it just income?
Why is happiness in levels 2 and 3 so similar?

In [None]:
#Features compared across the income levels. They are selected by name so that a
#change in the column order cannot silently pick different columns
list_of_columns = [
    'life_ladder',
    'social_support',
    'healthy_life_expectancy_at_birth',
    'freedom_to_make_life_choices',
    'generosity',
    'perceptions_of_corruption',
    'positive_affect',
    'negative_affect',
]

#Only the last expression of a cell is rendered, so the column names are printed explicitly
print(df_happyness_most_recent_year.columns.tolist())

#Last expression of the cell: rendered by the notebook as the summary table
df_happyness_most_recent_year.describe()

## How the variables are distributed across the levels based on income per capita

In [None]:
sns.set_theme(style="whitegrid")

#Levels in ascending income order, used as the x axis order of every panel
level_order = ["Level " + str(rank) for rank in (1, 2, 3, 4)]

#3x3 grid of panels sharing the x axis
f, axes = plt.subplots(nrows=3, ncols=3, sharex=True, figsize=(22, 22))

#One box plot per feature; zip stops as soon as the features or the panels run out
for feature, ax in zip(list_of_columns, axes.ravel()):
    sns.boxplot(
        x="levels",
        y=feature,
        order=level_order,
        data=df_happyness_most_recent_year,
        ax=ax,
    )

In [None]:
#3x3 grid: one scatter plot of gdp per capita against each feature, coloured by level
f, axes = plt.subplots(3, 3, figsize=(22, 22))
panels = axes.ravel()

for ax, feature in zip(panels, list_of_columns):
    sns.scatterplot(data=df_happyness_most_recent_year,
                    y="gdp_per_capita",
                    x=feature,
                    hue="levels",
                    palette="deep",
                    alpha=0.7,
                    ax=ax)
    ax.set_title(feature)

#The grid can have more panels than features; hide the ones left without a plot
for unused_ax in panels[len(list_of_columns):]:
    unused_ax.set_visible(False)

In [None]:

# get coeffs of a single linear fit over all countries together
# (x: life_ladder, y: gdp_per_capita)
slope, intercept, r_value, p_value, std_err = stats.linregress(
    df_happyness_most_recent_year.life_ladder,
    df_happyness_most_recent_year.gdp_per_capita,
)

# With hue="levels" lmplot draws one regression line per level, so the equation of
# the overall fit is not passed as the label of those lines (it would describe none
# of them); it is shown in the title instead
grid = sns.lmplot(x="life_ladder", y="gdp_per_capita", data=df_happyness_most_recent_year, hue="levels")

# lmplot returns a FacetGrid; without row/col facets its single Axes is exposed as .ax
ax = grid.ax
ax.set_title("Overall fit: y={0:.1f}x+{1:.1f}".format(slope, intercept))

plt.show()