In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.graph_objects as go # graphic objects plots 
from plotly.subplots import make_subplots # create multiple plots side by side
import plotly.express as px # plotly express

from sklearn.preprocessing import OrdinalEncoder # Encode Countries and Regions
from sklearn.metrics import mean_squared_error # Compare Models using the same metric
from sklearn.model_selection import cross_val_score # Avoid Overfitting

# Data

In [None]:
# Load the 2021 report CSV from the Kaggle input directory and display it.
report21 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv"
)
report21

In [None]:
# Load the multi-year report CSV from the Kaggle input directory.
historic_report = pd.read_csv(
    filepath_or_buffer="/kaggle/input/world-happiness-report-2021/world-happiness-report.csv"
)

# Make sure the year is an integer before filtering on it.
historic_report['year'] = historic_report['year'].astype(int)

# Keep only 2006-2020; Series.between is inclusive on both ends by default.
historic_report = historic_report[historic_report['year'].between(2006, 2020)]

historic_report.describe()

# Feature Selection

In [None]:
# Mapping from the original CSV headers to short snake_case column names.
# The same mapping is applied to both data frames. Where the two files appear
# to use different headers for the same quantity, the short names carry a
# `_21` or `_h` suffix so they stay distinguishable.
new_names = {
    # identifiers
    'Country name': 'country',
    'Regional indicator': 'region',
    'year': 'year',

    # happiness score and its uncertainty columns
    'Ladder score': 'ladder_score',
    'Life Ladder': 'life_ladder',
    'Standard error of ladder score': 'ladder_score_std',
    'upperwhisker': 'upper_whisker',
    'lowerwhisker': 'lower_whisker',

    # indicators
    'Logged GDP per capita': 'log_gdp_21',
    'Log GDP per capita': 'log_gdp_h',
    'Social support': 'social_support',
    'Healthy life expectancy': 'life_expectancy_21',
    'Healthy life expectancy at birth': 'life_expectancy_h',
    'Freedom to make life choices': 'freedom',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'corruption',
    'Positive affect': 'positive_affect',
    'Negative affect': 'negative_affect',
}

In [None]:
# Switch the historical frame to the short snake_case column names.
historic_report = historic_report.rename(new_names, axis="columns")

In [None]:
# Keep only the first 12 columns (the slice end is exclusive), then switch to
# the short snake_case names.
report21 = report21.iloc[:, :12].rename(columns=new_names)

# The uncertainty columns of the ladder score are not used in this analysis.
# errors='ignore' keeps the cell re-runnable: without it, a second run raises
# KeyError because the columns are already gone.
report21 = report21.drop(
    columns=['ladder_score_std', 'upper_whisker', 'lower_whisker'],
    errors='ignore',
)

# Plot Time

## Histograms
Is the majority of the world happy?

In [None]:
# Plot one histogram per numeric column. Selecting by dtype (rather than
# excluding 'country' and 'region' by name) keeps this cell correct when it is
# re-run after further text columns have been added to report21.
# NOTE(review): assumes every indicator column was parsed as numeric — confirm.
columns_to_plot = list(report21.select_dtypes(include='number').columns)

n_cols = 2
# Ceiling division: enough rows for every column, without a spare empty row
# when the number of columns is even.
n_rows = (len(columns_to_plot) + n_cols - 1) // n_cols

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    # Titles are listed in the same order as columns_to_plot.
    subplot_titles=[
        'Ladder Score', 'Logged GDP per capita', 'Social Support', 'Healthy Life Expectancy', 'Freedom to make Life Choices',
        'Generosity', 'Perceptions of Corruption'
    ]
)

for i, column in enumerate(columns_to_plot):
    # Fill the grid row by row; plotly rows/cols are 1-based.
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Histogram(x=report21[column]),
        row=(i // n_cols) + 1,
        col=(i % n_cols) + 1,
    )

fig.update_layout(showlegend=False, height=1500)

fig.show()

## Scatter
How does each country behave individually?

In [None]:
# Scatter every numeric indicator against the ladder score. Selecting by dtype
# keeps this cell correct when it is re-run after further text columns have
# been added to report21.
# NOTE(review): assumes every indicator column was parsed as numeric — confirm.
columns_to_plot = [
    column
    for column in report21.select_dtypes(include='number').columns
    if column != 'ladder_score'
]

n_cols = 2
# Ceiling division: the original `len // 2` allocates one row too few when the
# number of columns is odd, and adding the last trace then fails.
n_rows = (len(columns_to_plot) + n_cols - 1) // n_cols

# Reverse lookup (short name -> original CSV header), built once for the axis titles.
original_names = {short: original for original, short in new_names.items()}

fig = make_subplots(rows=n_rows, cols=n_cols)

for i, column in enumerate(columns_to_plot):
    # Fill the grid row by row; plotly rows/cols are 1-based.
    row, col = (i // n_cols) + 1, (i % n_cols) + 1

    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Scatter(
            x=report21[column],
            y=report21['ladder_score'],
            hovertext=report21['country'],
            mode='markers',
            marker=dict(color=report21['ladder_score'])
        ),
        row=row,
        col=col,
    )

    # Fall back to the short name if a column has no entry in new_names.
    fig.update_xaxes(title_text=original_names.get(column, column), row=row, col=col)
    fig.update_yaxes(title_text="Ladder Score", row=row, col=col)

fig.update_layout(
    showlegend=False,
    height=1000,
)

fig.show()

In [None]:
# Countries whose 'corruption' value is above 0.7.
is_corrupt = report21['corruption'] > 0.7
corrupt_countries = report21.loc[is_corrupt]

# Share of such countries among all rows, truncated to a whole percentage.
corrupt_percentage = int((len(corrupt_countries) / len(report21)) * 100)
print(f"{corrupt_percentage}% of governments are perceived as corrupt")

In [None]:
# Is generosity linked to happiness?

# Pick the 25 highest and 25 lowest ladder scores explicitly instead of relying
# on the row order of the frame. The original `[-26:]` slice took 26 rows.
top25 = report21.nlargest(25, 'ladder_score')
bottom25 = report21.nsmallest(25, 'ladder_score')

# Round to 3 decimals. The original `round(x, )` rounded to a whole number,
# which hides any difference between the two means.
print(f"Average generosity in the top 25 countries: {round(top25.generosity.mean(), 3)}")
print(f"Average generosity in the bottom 25 countries: {round(bottom25.generosity.mean(), 3)}")

# Correlation


In [None]:
# Correlation matrix
# Restrict to numeric columns first: report21 also holds text columns
# ('country', 'region'), and pandas >= 2.0 raises on them instead of silently
# skipping them. select_dtypes works on old and new pandas alike.
cm = report21.select_dtypes(include='number').corr()

fig = px.imshow(cm)
fig.show()

In [None]:
# One more beautiful correlation matrix

# Flatten cm into long form: one [column label, row label, coefficient] record
# per cell, walking the matrix row by row.
records = [
    [col_label, row_label, coefficient]
    for row_label, row in cm.iterrows()
    for col_label, coefficient in row.items()
]

data = pd.DataFrame(records, columns=['Indicators X', 'Indicators Y', 'Correlation'])

# Bubble chart: colour carries the signed correlation, size its magnitude.
fig = px.scatter(
    data,
    x='Indicators X',
    y='Indicators Y',
    color='Correlation',
    size=data['Correlation'].abs(),
)
fig.show()

# Can we predict the ladder score?

In [None]:
# Features: everything except the two text identifiers and the target itself.
X = report21.drop(columns=['country', 'region', 'ladder_score']).copy()
# Target: the ladder score.
y = report21.loc[:, 'ladder_score'].copy()

## Model 1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# The `normalize` argument was dropped: it was passed with its default value
# (False) and has been removed from LinearRegression in current scikit-learn
# releases, where passing it raises a TypeError.
slrmodel = LinearRegression(
    fit_intercept = False, # default = True
    copy_X = True, # default = True
    n_jobs = -1, # default=None
    positive = False # default = False
)
slrmodel.fit(X, y)

# In-sample error: the model is scored on the same rows it was fitted on.
y_pred = slrmodel.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

# 'neg_root_mean_squared_error' returns the NEGATED RMSE of each fold (higher
# is better), so the sign is flipped before it is reported as an RMSE.
# NOTE(review): cv=10 uses unshuffled folds; if the rows are ordered by ladder
# score this makes every fold an extrapolation task — confirm whether intended.
slrscores = cross_val_score(slrmodel, X, y, cv=10, scoring='neg_root_mean_squared_error')

print(f"Model RMSE: {rmse} Cross-Validation Mean RMSE: {round(-slrscores.mean(), 2)}")

## Model 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Three arguments from the original call were dropped: criterion="mse",
# max_features="auto" and min_impurity_split=None. Each was passed with its
# default value, and current scikit-learn releases reject these spellings.
# Omitting them keeps the same configuration on old and new versions alike.
rfrmodel = RandomForestRegressor(
    n_estimators = 100, # default = 100
    max_depth = 20, # default = None
    min_samples_split = 2, # default = 2
    min_samples_leaf = 1, # default = 1
    min_weight_fraction_leaf = 0.0, # default = 0.0
    max_leaf_nodes = None, # default = None
    min_impurity_decrease = 0.0, # default = 0.0
    bootstrap = True, # default = True
    oob_score = False, # default = False
    n_jobs = -1, # default = None
    random_state = 0, # default = None; fixed so that the reported scores are reproducible
    verbose = 0, # default = 0
    warm_start = False, # default = False
    ccp_alpha = 0.0, # default = 0.0
    max_samples = None # default = None
)
rfrmodel.fit(X, y)

# In-sample error: the model is scored on the same rows it was fitted on.
y_pred = rfrmodel.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

# 'neg_root_mean_squared_error' returns the NEGATED RMSE of each fold (higher
# is better), so the sign is flipped before it is reported as an RMSE.
rfrscores = cross_val_score(rfrmodel, X, y, cv=10, scoring='neg_root_mean_squared_error')

print(f"Model RMSE: {rmse} Cross-Validation Mean RMSE: {round(-rfrscores.mean(), 2)}")

## Model 3. K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Hyper-parameters for the KNN regressor, gathered in one place.
# The trailing comments give the library defaults for comparison.
knn_params = dict(
    n_neighbors=10,      # default = 5
    weights="uniform",   # {"uniform", "distance"}, default = "uniform"
    algorithm="auto",    # {"auto", "ball_tree", "kd_tree", "brute"}, default = "auto"
    leaf_size=30,        # default = 30
    p=2,                 # default = 2
    metric="minkowski",  # default = "minkowski"
    metric_params=None,  # default = None
    n_jobs=-1,           # default = None
)

knnmodel = KNeighborsRegressor(**knn_params)
knnmodel.fit(X, y)

In [None]:
# Sweep over K and record the mean cross-validated RMSE for each value.
knn_scores = []
ks = list(range(1, 100, 3))

for k in ks:
    # Use a separate name for the candidate: the original loop reassigned
    # `knnmodel`, so the evaluation cell below silently scored the last K of
    # the sweep instead of the model configured above.
    # No .fit() is needed here; cross_val_score fits its own clones per fold.
    candidate = KNeighborsRegressor(n_neighbors=k)
    cvscores = cross_val_score(candidate, X, y, cv=10, scoring='neg_root_mean_squared_error')
    # The scorer returns negated RMSE; flip the sign to store a real RMSE.
    knn_scores.append(-cvscores.mean())

fig = px.line(x=ks, y=knn_scores)

# The plotted quantity is an error (lower is better), not an accuracy.
fig.update_layout(
    title="K value vs Cross-Validation RMSE",
    xaxis_title="K",
    yaxis_title="Mean RMSE (lower is better)",
)

fig.show()

In [None]:
# In-sample error of whichever fitted estimator `knnmodel` currently refers to.
y_pred = knnmodel.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

# 'neg_root_mean_squared_error' returns the NEGATED RMSE of each fold (higher
# is better), so the sign is flipped before it is reported as an RMSE.
knnscores = cross_val_score(knnmodel, X, y, cv=10, scoring='neg_root_mean_squared_error')

print(f"Model RMSE: {rmse} Cross-Validation Mean RMSE: {round(-knnscores.mean(), 2)}")

# What can we get from K-Means?
Do "third world" countries exist? Which countries actually fall into each group?

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Three clusters, seeded for reproducibility. n_init is pinned explicitly
# because its default has changed between scikit-learn releases; 10 is the
# long-standing value, so results stay the same across versions.
# NOTE(review): X is clustered unscaled, so features with larger numeric
# ranges will dominate the distances — confirm whether that is intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

In [None]:
import pycountry

def try_get_iso(country_name):
    """Best-effort lookup of a country's three-letter code via pycountry.

    Parameters
    ----------
    country_name : str
        Country name as it appears in the report.

    Returns
    -------
    str or None
        The ``alpha_3`` attribute of the first fuzzy-search match, or None
        when the input is not a string or pycountry finds no match.
    """
    # Missing values (e.g. NaN) cannot be searched; treat them as "no code".
    if not isinstance(country_name, str):
        return None
    try:
        return pycountry.countries.search_fuzzy(country_name)[0].alpha_3
    except LookupError:
        # Raised by pycountry when nothing matches. A bare `except:` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return None

# NOTE: this adds a column to report21 in place.
report21['iso_alpha'] = report21['country'].apply(try_get_iso)


In [None]:
# Keep only the rows for which an ISO code is present.
has_iso_code = report21['iso_alpha'].notna()
report21_with_isocode = report21.loc[has_iso_code].copy()

# Assign each remaining country to a cluster, using the feature columns of X.
y_pred = kmeans.predict(report21_with_isocode[X.columns])

fig = px.choropleth(
    data_frame=report21_with_isocode,
    locations="iso_alpha",
    color=y_pred,
    hover_name="country",
)

layout_options = dict(
    showlegend=False,
    coloraxis_showscale=False,
    title="What did k-means find? First, Second and Third world countries maybe.",
)
fig.update_layout(**layout_options)

fig.show()

# Historical Data

In [None]:
# One line per country: life ladder over the years.
line_options = dict(
    x="year",
    y="life_ladder",
    color="country",
    hover_name="country",
)
fig = px.line(historic_report, **line_options)
fig.show()

# Animations
How have the indicators changed over the years?

In [None]:
# Explicit frame order for the animations below (passed as category_orders).
# range() excludes its end value, so +1 is needed to include the last year;
# the original list stopped one year short.
first_year = historic_report["year"].min()
last_year = historic_report["year"].max()
years = list(range(first_year, last_year + 1))

In [None]:
# Animated scatter, one frame per year: log GDP (x) against life ladder (y).
x_col, y_col = "log_gdp_h", "life_ladder"

# Fix both axes to the overall min/max so they do not rescale between frames.
x_limits = [historic_report[x_col].min(), historic_report[x_col].max()]
y_limits = [historic_report[y_col].min(), historic_report[y_col].max()]

fig = px.scatter(
    historic_report,
    x=x_col,
    y=y_col,
    animation_frame="year",
    category_orders={"year": years},
    animation_group="country",
    color="country",
    hover_name="country",
    range_x=x_limits,
    range_y=y_limits,
)

fig.update_layout(
    title="How important has the gdp been for the happiness of citizens over the years?"
)

fig.show()

In [None]:
# Animated scatter, one frame per year: social support (x) against life ladder (y).
x_col, y_col = "social_support", "life_ladder"

# Fix both axes to the overall min/max so they do not rescale between frames.
x_limits = [historic_report[x_col].min(), historic_report[x_col].max()]
y_limits = [historic_report[y_col].min(), historic_report[y_col].max()]

fig = px.scatter(
    historic_report,
    x=x_col,
    y=y_col,
    animation_frame="year",
    category_orders={"year": years},
    animation_group="country",
    color="country",
    hover_name="country",
    range_x=x_limits,
    range_y=y_limits,
)

fig.update_layout(
    title="How important has the social support been for the happiness of citizens over the years?"
)

fig.show()

In [None]:
# Animated scatter, one frame per year: life expectancy (x) against life ladder (y).
x_col, y_col = "life_expectancy_h", "life_ladder"

# Fix both axes to the overall min/max so they do not rescale between frames.
x_limits = [historic_report[x_col].min(), historic_report[x_col].max()]
y_limits = [historic_report[y_col].min(), historic_report[y_col].max()]

fig = px.scatter(
    historic_report,
    x=x_col,
    y=y_col,
    animation_frame="year",
    category_orders={"year": years},
    animation_group="country",
    color="country",
    hover_name="country",
    range_x=x_limits,
    range_y=y_limits,
)

fig.update_layout(
    title="How important has the life expectancy been for the happiness of citizens over the years?"
)

fig.show()

In [None]:
# Animated scatter, one frame per year: freedom (x) against life ladder (y).
x_col, y_col = "freedom", "life_ladder"

# Fix both axes to the overall min/max so they do not rescale between frames.
x_limits = [historic_report[x_col].min(), historic_report[x_col].max()]
y_limits = [historic_report[y_col].min(), historic_report[y_col].max()]

fig = px.scatter(
    historic_report,
    x=x_col,
    y=y_col,
    animation_frame="year",
    category_orders={"year": years},
    animation_group="country",
    color="country",
    hover_name="country",
    range_x=x_limits,
    range_y=y_limits,
)

fig.update_layout(
    title="How important has the freedom been for the happiness of citizens over the years?"
)

fig.show()

In [None]:
# Animated scatter, one frame per year: generosity (x) against life ladder (y).
x_col, y_col = "generosity", "life_ladder"

# Fix both axes to the overall min/max so they do not rescale between frames.
x_limits = [historic_report[x_col].min(), historic_report[x_col].max()]
y_limits = [historic_report[y_col].min(), historic_report[y_col].max()]

fig = px.scatter(
    historic_report,
    x=x_col,
    y=y_col,
    animation_frame="year",
    category_orders={"year": years},
    animation_group="country",
    color="country",
    hover_name="country",
    range_x=x_limits,
    range_y=y_limits,
)

fig.update_layout(
    title="How important has the generosity been for the happiness of citizens over the years?"
)

fig.show()

In [None]:
# Animated scatter, one frame per year: corruption (x) against life ladder (y).
x_col, y_col = "corruption", "life_ladder"

# Fix both axes to the overall min/max so they do not rescale between frames.
x_limits = [historic_report[x_col].min(), historic_report[x_col].max()]
y_limits = [historic_report[y_col].min(), historic_report[y_col].max()]

fig = px.scatter(
    historic_report,
    x=x_col,
    y=y_col,
    animation_frame="year",
    category_orders={"year": years},
    animation_group="country",
    color="country",
    hover_name="country",
    range_x=x_limits,
    range_y=y_limits,
)

fig.update_layout(
    title="How important has the perception of corruption been for the happiness of citizens over the years?"
)

fig.show()

# This is a work in progress, next version will have:

* Future Predictions
* More ML Models 
* Better description of the data