In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the multi-year happiness table (it carries a "year" column used later on)
# and preview its first rows.
happiness_history_csv = "../input/world-happiness-report-2021/world-happiness-report.csv"
df_hapiness = pd.read_csv(happiness_history_csv)
df_hapiness.head()

In [None]:
# Load the 2021 edition of the report, kept in a separate CSV, and preview it.
happiness_2021_csv = "../input/world-happiness-report-2021/world-happiness-report-2021.csv"
df_hapiness_2021 = pd.read_csv(happiness_2021_csv)
df_hapiness_2021.head()

In [None]:
# Show the dtype of every column in the multi-year table.
df_hapiness.dtypes

In [None]:
# Show the dtype of every column in the 2021 table.
df_hapiness_2021.dtypes

In [None]:
# Percentage of missing cells per column: null count scaled by the row count.
print("====== Null Percentage on df_hapiness ======")
missing_per_column = df_hapiness.isnull().sum()
missing_per_column * 100 / len(df_hapiness)

Let's fill all the missing values with 0

In [None]:
# Replace every missing cell with 0 (rebinding the name instead of mutating in place).
# NOTE(review): these zeros are treated as real observations by every later
# describe()/mean() on this frame - confirm that is the intended imputation.
df_hapiness = df_hapiness.fillna(0)

In [None]:
# Percentage of missing cells per column in the 2021 table.
print("====== Null Percentage on df_hapiness_2021 ======")
missing_per_column_2021 = df_hapiness_2021.isnull().sum()
missing_per_column_2021 * 100 / len(df_hapiness_2021)

# Data Understanding

Life Ladder - It's a range from 0 to 10 where 0 represents the worst possible life while 10 stands for the "perfect life".

GDP per capita - It indicates a country's economic output divided by its population. In other words, it's a measure of a country's standard of living based on its economy.

Social support - Perception of available assistance among people, involving a network of a family and friends. It's also related to psychological health

Healthy life expectancy at birth - It's the average life in good health that a person could expect to live

Freedom to make life choices - Concerns someone's autonomy to act according to their own will or preference.

Generosity - Refers to an overall spirit of kindness. It can involve offering time, assets or talents to aid someone in need.

Perceptions of corruption - Perceived levels of public sector corruption, as determined by expert assessments and opinion surveys

In [None]:
# Summary statistics for the multi-year table.
# NOTE(review): missing values were replaced with 0 earlier in the notebook, so
# those zeros are included in these statistics.
df_hapiness.describe()

In [None]:
# Summary statistics for the 2021 table.
df_hapiness_2021.describe()

# Global Analysis

In [None]:
# For every indicator, draw the ten first-ranked countries (left column, green)
# next to the ten last-ranked ones (right column, red).
#
# Each entry is (column name, sort ascending?). Every indicator is ranked from
# the highest value down, except "Perceptions of corruption", which is ranked
# from the lowest value up so that a low perception of corruption ranks first.
ranking_specs = [
    ("Ladder score", False),
    ("Logged GDP per capita", False),
    ("Social support", False),
    ("Healthy life expectancy", False),
    ("Freedom to make life choices", False),
    ("Generosity", False),
    ("Perceptions of corruption", True),
]

figure, axes = plt.subplots(len(ranking_specs), 2, figsize=(15, 20))

for row, (indicator, sort_ascending) in enumerate(ranking_specs):
    # Sort once per indicator and reuse the result for both panels.
    ranked = df_hapiness_2021.sort_values(by=indicator, ascending=sort_ascending)
    panels = [
        (ranked.head(10), "Greens_r", "Top 10"),
        (ranked.tail(10), "Reds", "Last 10"),
    ]
    for column, (subset, palette_name, label) in enumerate(panels):
        bar_ax = sns.barplot(data=subset, x=indicator, y="Country name",
                             palette=palette_name, ax=axes[row, column])
        bar_ax.set_title("{} - {}".format(label, indicator), fontweight="bold")

plt.tight_layout()
plt.show()

**Ladder Score**:
- As we can see, the top 10 happiest countries are mostly in Europe, while the 10 saddest are spread across Sub-Saharan Africa, South Asia, the Caribbean and the Middle East. Finland, Denmark and Switzerland are the top 3.

**Logged GDP per capita**:
- Luxembourg has the highest GDP per capita. The United Arab Emirates and Hong Kong/China also stand out in this ranking. However, all of the lowest-ranked countries are in Sub-Saharan Africa, except for Haiti, which is located in the Caribbean.

**Social Support**:
- Iceland is at the top for Social support. Kazakhstan and Turkmenistan (Commonwealth of Independent States) have some of the highest social support indicators. Notably, despite its high population density, India has one of the lowest social support values.

**Healthy life expectancy**:
- Singapore and Hong Kong/China are almost tied on Healthy life expectancy (around 77 years). The best healthy life expectancies are split between Europe and Asia. In Chad, healthy life expectancy is only around 48 years, and Afghanistan is in this range as well.

**Freedom to make life choices**:
- Uzbekistan, Norway and Cambodia have the highest freedom values, while Afghanistan has the lowest one. Greece, Turkey, Lebanon and Algeria also have low values.

**Generosity**:
- Indonesia and Myanmar are the two most generous countries in the world, well ahead of the other countries in this ranking. Haiti also shows up among the best. Greece and Japan have the lowest generosity values, followed by Botswana and Portugal.

**Perceptions of corruption**
- Singapore and Rwanda have the lowest values of perception of corruption, which is good. But, Croatia, Romania and Bulgaria have high perception of corruption. These 3 are in Central and Eastern Europe.

Ok, now let's find out which variables are most strongly correlated with "Ladder Score" (the heatmap below shows correlation coefficients, not significance tests):

In [None]:
# Pairwise correlation matrix of the six indicators plus the ladder score,
# rendered as an annotated heatmap.
heatmap_columns = [
    "Logged GDP per capita",
    "Social support",
    "Healthy life expectancy",
    "Freedom to make life choices",
    "Generosity",
    "Perceptions of corruption",
    "Ladder score",
]
corr = df_hapiness_2021[heatmap_columns].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(corr, annot=True, ax=heatmap_ax)
heatmap_ax.set_title("Heatmap", fontweight="bold")
plt.show()

In [None]:
# Columns compared against each other in the pair plot; the same list is reused
# further down for the per-indicator country ranking.
v_vars = [
    "Ladder score",
    "Logged GDP per capita",
    "Social support",
    "Healthy life expectancy",
    "Freedom to make life choices",
    "Generosity",
    "Perceptions of corruption",
]

pair_data = df_hapiness_2021[v_vars]
sns.pairplot(data=pair_data)
plt.show()

GDP per capita, Healthy life expectancy and Social support, in that order, show the strongest correlation with a country's overall happiness score. It's important to notice the high correlation between GDP per capita and Healthy life expectancy. A plausible explanation is that a higher GDP allows greater investment in healthcare systems and, as a consequence, people live longer.

Let's take a closer look:

In [None]:
# Three stacked panels: GDP vs. healthy life expectancy, then the ladder score
# regressed on each of those two indicators.
#
# The seaborn style must be set BEFORE the figure is created: it works by
# updating matplotlib rcParams, and axes pick those up when they are built.
# Setting it after plt.subplots() would leave this figure in the previous style.
sns.set(style="ticks")

figure, axes = plt.subplots(nrows=3, figsize=(15, 15))

sns.scatterplot(x="Healthy life expectancy", y="Logged GDP per capita", data=df_hapiness_2021, color="black", ax=axes[0])
sns.regplot(x="Logged GDP per capita", y="Ladder score", data=df_hapiness_2021, color="darkblue", ax=axes[1])
sns.regplot(x="Healthy life expectancy", y="Ladder score", data=df_hapiness_2021, color="darkred", ax=axes[2])

plt.show()
plt.close()

**How's Ladder Score according to the Regional Indicator?**

In [None]:
# Distribution of the ladder score inside each region, one box per region.
sns.set(style="white", font_scale=1.5)

region_fig, region_ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x="Regional indicator", y="Ladder score", data=df_hapiness_2021, ax=region_ax)

region_ax.set_xlabel("Region", fontsize=15)
region_ax.set_ylabel("Ladder Score", fontsize=15)
region_ax.set_title("Life Ladder Score Box Plot by Region", fontsize=20)

# Region names are long, so print them vertically.
plt.xticks(rotation=90)
plt.show()

- Western Europe and North America & ANZ have the happiest people, and the small variability of their scores reinforces this.

- The saddest people are both in South Asia and Sub-Saharan Africa

- Middle East/North Africa, Sub-Saharan Africa and South Asia have the widest ranges of Ladder Score values (Not considering Latin America/Caribbean outliers)

Creating a categorical variable for Ladder Score based on its range so that we can analyze the "behavioral happiness" along the years:

In [None]:
# Bucket every Life Ladder value into a happiness category:
#   [0, 4]  -> "Sad"
#   (4, 7)  -> "Normal"
#   [7, 10] -> "Happy"
# Plain comparisons are used instead of Series.between(inclusive=True/False):
# the boolean form of `inclusive` is deprecated and rejected by recent pandas.
ladder = df_hapiness["Life Ladder"]

df_hapiness["Score_Catg"] = np.select(
    [
        (ladder >= 0) & (ladder <= 4),
        (ladder > 4) & (ladder < 7),
        (ladder >= 7) & (ladder <= 10),
    ],
    ["Sad", "Normal", "Happy"],
    # Explicit string fallback for values outside 0-10 (or NaN). np.select's
    # implicit default is the integer 0, which does not mix with string choices.
    default="Unknown",
)

In [None]:
import matplotlib.ticker as mtick

# Share of countries in each happiness category, per year (each row sums to 1).
category_share = (
    df_hapiness.groupby("year")["Score_Catg"]
    .value_counts(normalize=True)
    .unstack()
    .reset_index()
    .fillna(0)
)

# Fix the stacking order (and therefore the colour assignment) of the categories.
order = ["year", "Sad", "Normal", "Happy"]
stck = category_share.reindex(columns=order)

share_ax = stck.plot(
    x="year",
    kind="bar",
    stacked=True,
    color=["#AD1524", "#FFBE16", "#00BF45"],
    figsize=(15, 5),
)
# Shares are fractions of 1, so show the y axis as percentages.
share_ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))

share_ax.set_xlabel("Year", fontsize=15)
share_ax.set_ylabel("Percentage", fontsize=15)
share_ax.set_title("Happiness Distribuiton", fontweight="bold", fontsize=20)

# Park the legend outside the plotting area, to the right.
share_ax.legend(loc="upper left", bbox_to_anchor=(1, 1), ncol=1)

plt.show()


- In 2005, there was no "Sad" Country in the World and around 38% of them were "Happy"!
- Every year, most countries are classified as "Normal".
- The "Sad" percentage decreased between 2019 and 2020, despite the struggle against the Covid-19 pandemic.

# Zooming in Brazil

In [None]:
def _ordinal(position):
    """Return `position` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, ...)."""
    if 10 <= position % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(position % 10, "th")
    return "{}{}".format(position, suffix)


# Print Brazil's position in the 2021 ranking of every indicator in v_vars.
for varname in v_vars:
    # Every indicator is ranked from highest to lowest, except corruption
    # perception, which is ranked from lowest to highest.
    sort_ascending = varname == "Perceptions of corruption"
    ranking = df_hapiness_2021.sort_values(by=varname, ascending=sort_ascending).reset_index(drop=True)

    # After reset_index the row label is the zero-based rank.
    brazil_positions = ranking.index[ranking["Country name"] == "Brazil"].tolist()
    if not brazil_positions:
        # Report the absence rather than failing with an IndexError.
        print("== Brazil is not present in the World {} Ranking ==".format(varname))
        continue

    print("== Brazil is on {} place of {} in the World {} Ranking ==".format(
        _ordinal(brazil_positions[0] + 1), len(ranking), varname))


The worst indicator in Brazil is Generosity, followed by Healthy life expectancy and Freedom to make life choices.

Brazil's Life Ladder (6.330) is above the 75th percentile, so we can consider this a great value!

In [None]:
# Align the 2021 table with the multi-year one: use the same column names and
# add the missing "year" column. The name df_hapiness_2021 is rebound to the
# aligned frame, so every later cell sees the renamed columns.
df_hapiness_2021 = (
    df_hapiness_2021
    .rename(columns={
        "Ladder score": "Life Ladder",
        "Logged GDP per capita": "Log GDP per capita",
        "Healthy life expectancy": "Healthy life expectancy at birth",
    })
    .assign(year=2021)
)

# Columns shared by both tables once the names are aligned.
col = [
    "Country name", "year", "Life Ladder", "Log GDP per capita", "Social support",
    "Healthy life expectancy at birth", "Freedom to make life choices",
    "Generosity", "Perceptions of corruption",
]

df_hapiness_2021_v2 = df_hapiness_2021[col]
df_hapiness_v2 = df_hapiness[col]

# Stack the history on top of the 2021 rows.
df_all = pd.concat([df_hapiness_v2, df_hapiness_2021_v2])

# Tag each row as "Brazil" or "World" so Brazil can be compared with the rest.
is_brazil = df_all["Country name"] == "Brazil"
df_all["View"] = np.where(is_brazil, "Brazil", "World")


In [None]:
# Show only the rows tagged as Brazil.
df_all.loc[df_all["View"] == "Brazil"]

Well, we can see that Brazil has no record for 2006. Let's impute it as the average of Brazil's 2005 and 2007 values:


In [None]:
# Insert a Brazil row for 2006 whose indicators are the mean of Brazil's 2005
# and 2007 rows.
#
# Changes from the previous version:
#   * pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
#   * The mean is taken over the numeric indicator columns only, never over the
#     string columns.
#   * The imputed values go only into the new row, instead of filling every NaN
#     found anywhere in df_all with Brazil's averages.
#   * A guard makes the cell safe to re-run without stacking duplicate rows.
to_fill = ["Life Ladder", "Log GDP per capita", "Social support", "Healthy life expectancy at birth",
           "Freedom to make life choices", "Generosity", "Perceptions of corruption"]

brazil_rows = df_all["Country name"] == "Brazil"

if not (brazil_rows & (df_all["year"] == 2006)).any():
    # NOTE(review): missing values were replaced with 0 earlier in the notebook;
    # if Brazil's 2005 or 2007 row had gaps, those zeros pull these means down.
    neighbour_means = df_all.loc[brazil_rows & df_all["year"].isin([2005, 2007]), to_fill].mean()

    newRow = {"Country name": "Brazil", "year": 2006, "View": "Brazil"}
    newRow.update(neighbour_means.to_dict())

    df_all = pd.concat([df_all, pd.DataFrame([newRow])], ignore_index=True)

In [None]:
# Order the combined table chronologically, then show Brazil's rows.
df_all = df_all.sort_values(by="year")
df_all.loc[df_all["View"] == "Brazil"]

In [None]:
# Yearly average of every numeric indicator, computed separately for Brazil and
# for the rest of the world (the "View" column).
# numeric_only=True keeps the string column "Country name" out of the
# aggregation; recent pandas raises a TypeError when asked to average strings.
df_plt = df_all.groupby(["year", "View"], as_index=False).mean(numeric_only=True)
df_plt.head()

In [None]:
# One panel per indicator, each comparing Brazil's yearly value with the world
# average. Each entry is (column in df_plt, y-axis label).
indicator_panels = [
    ("Life Ladder", "Life Ladder"),
    ("Log GDP per capita", "Log GDP per capita"),
    ("Social support", "Social support"),
    ("Healthy life expectancy at birth", "Healthy life expectancy"),
    ("Freedom to make life choices", "Freedom to make choices"),
    ("Generosity", "Generosity"),
    ("Perceptions of corruption", "Perceptions of corruption"),
]

figure, axes = plt.subplots(nrows=len(indicator_panels), figsize=(15, 20))

figure.suptitle("Brazil and the World average", fontweight="bold", fontsize=20)

# Brazil in green, everything else in light grey.
palette = {c: "#008A2D" if c == "Brazil" else "#D6D8DD" for c in df_plt["View"].unique()}

for k, (panel_ax, (column, ylabel)) in enumerate(zip(axes, indicator_panels)):
    sns.pointplot(x="year", y=column, data=df_plt, hue="View", palette=palette, ax=panel_ax)
    panel_ax.set_ylabel(ylabel, fontsize=15)

    if k == 0:
        # Keep a single legend, anchored just outside the first panel.
        panel_ax.legend(loc="upper right", bbox_to_anchor=(1.15, 1.1), ncol=1)
    else:
        # get_legend() returns None when no legend was drawn; guard before removing.
        panel_legend = panel_ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

plt.tight_layout()
plt.show()

Brazil's Life Ladder Score decreases from 2014 on. But, it's still above the World average. Currently, it's showing a little sign of increasing.

Our GDP per capita has remained constant over the past few years. We can observe a little decrease between 2019 and 2020. This was certainly affected by Covid-19.

Social support suffered a high decrease in 2020, that's also explained by the Covid-19 pandemic.

Healthy life expectancy was slightly increasing but in 2021 it has been a little lower than in 2020.

Brazil's Freedom to make life choices has behaved unstably over the years. On Generosity, Brazil is below the World average, which is bad.

Since 2014, Brazil has been on a high level on Perceptions of corruption, overcoming the World average.



# Comparison between South America's Countries

In [None]:
# Countries shown in the South America comparison.
south_american_countries = [
    "Argentina", "Bolivia", "Brazil", "Chile", "Colombia",
    "Ecuador", "Paraguay", "Peru", "Uruguay", "Venezuela",
]

# Columns removed before plotting; whatever remains besides "Country name" is
# stacked into the bars.
# NOTE(review): this list uses the renamed column names and the added "year"
# column, so the cell that renames df_hapiness_2021 must have been run first.
columns_to_drop = [
    "Regional indicator", "Life Ladder",
    "Standard error of ladder score", "upperwhisker", "lowerwhisker",
    "Log GDP per capita", "Social support",
    "Healthy life expectancy at birth", "Freedom to make life choices",
    "Generosity", "Perceptions of corruption", "Ladder score in Dystopia",
    "Dystopia + residual", "year",
]

in_south_america = df_hapiness_2021["Country name"].isin(south_american_countries)
south_america = (
    df_hapiness_2021[in_south_america]
    .sort_values(by="Country name")
    .drop(columns_to_drop, axis=1)
)

stacked_ax = south_america.plot(
    x="Country name",
    kind="bar",
    stacked=True,
    figsize=(15, 5),
)

stacked_ax.set_title("South America Explained Life Ladder", fontweight="bold", fontsize=20)
stacked_ax.set_ylabel("Explained Score")
stacked_ax.set_xlabel("Country Name")
plt.xticks(rotation=45)

# Legend goes below the chart, in three columns.
stacked_ax.legend(loc="lower center", bbox_to_anchor=(0.5, -0.7), ncol=3)

plt.show()

Firstly, we can easily notice that the explained features on Life Ladder are kinda balanced all over South America, with some caveats to consider:

- Our Life Ladder scores are mainly explained by GDP per capita and Social Support.

- Uruguay's Perceptions of corruption component stands out among the other countries because Uruguay has the lowest perceived corruption in South America. The same cannot be said for Peru and Paraguay, which have the highest values.

- About Generosity, the most prominent is Paraguay.

- Venezuela is the least free to make life choices.

- Bolivia has the lowest Healthy life expectancy and GDP per capita values in South America.

- Argentina, Chile and Uruguay are leading in GDP per capita.
