# World Happiness Report 
### Exploratory Data Analysis
Madison Agatstein

## Overview

This project explores the happiness scores, as well as factors used to explain the score, for 153 countries from 2015 through 2019. 
The scores and rankings are based on answers to the Gallup World Poll. 

## What factors help to determine the Happiness Score? 
* GDP per capita
* Social support
* Healthy life expectancy
* Freedom to make life choices
* Generosity
* Perceptions of corruption

These factors come from the year 2019. There were additional factors in previous years, but these factors were common for every year. 


### Import Packages

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.metrics import r2_score

from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt

### Load Data

In [None]:
# read one CSV per report year (2015 through 2019) into its own data frame
happy_2015, happy_2016, happy_2017, happy_2018, happy_2019 = map(
    pd.read_csv,
    [
        "../input/world-happiness-report/2015.csv",
        "../input/world-happiness-report/2016.csv",
        "../input/world-happiness-report/2017.csv",
        "../input/world-happiness-report/2018.csv",
        "../input/world-happiness-report/2019.csv",
    ],
)

## Preparing the Data

### 2015

In [None]:
# preview the first five rows of the 2015 data
happy_2015.head(5)

In [None]:
# list the 2015 column names, one per line
for column_name in happy_2015.columns:
    print(column_name)

In [None]:
# count the missing values in each 2015 column
happy_2015.isnull().sum()

### 2016

In [None]:
# preview the first five rows of the 2016 data
happy_2016.head(5)

In [None]:
# list the 2016 column names, one per line
for column_name in happy_2016.columns:
    print(column_name)

In [None]:
# check for missing values in the 2016 data
# (this cell previously inspected happy_2019, so 2016 was never checked)
happy_2016.isna().sum(axis=0)

### 2017

In [None]:
# preview the first five rows of the 2017 data
happy_2017.head(5)

In [None]:
# list the 2017 column names, one per line
for column_name in happy_2017.columns:
    print(column_name)

In [None]:
# count the missing values in each 2017 column
happy_2017.isnull().sum()

### 2018

In [None]:
# preview the first five rows of the 2018 data
happy_2018.head(5)

In [None]:
# list the 2018 column names, one per line
for column_name in happy_2018.columns:
    print(column_name)

In [None]:
# count the missing values in each 2018 column
happy_2018.isnull().sum()

In [None]:
# discard every 2018 row that contains at least one missing value
happy_2018 = happy_2018.dropna(axis=0, how="any")

### 2019

In [None]:
# preview the first five rows of the 2019 data
happy_2019.head(5)

In [None]:
# (rows, columns) of the 2018 frame
# NOTE(review): this sits in the 2019 section but reports on 2018 - confirm
# whether happy_2019 was intended.
n_rows, n_cols = happy_2018.shape
print((n_rows, n_cols))

In [None]:
# list the 2019 column names, one per line
for column_name in happy_2019.columns:
    print(column_name)

In [None]:
# check for missing values in the 2019 data
# (this cell previously re-checked happy_2018, so 2019 was never checked)
happy_2019.isna().sum(axis=0)

## Scatter Plots

### Comparing Happiness Scores for each country and year

In [None]:
# keep only the first 20 rows of each year's data frame
top_20_2015 = happy_2015.head(20)
top_20_2016 = happy_2016.head(20)
top_20_2017 = happy_2017.head(20)
top_20_2018 = happy_2018.head(20)
top_20_2019 = happy_2019.head(20)

In [None]:
# Follow the 2015 top-20 countries through 2019.
#
# Every trace shares the 2015 top-20 country names on the x-axis, so each
# year's score has to be looked up *by country name*. Taking the first 20 rows
# of a later year instead pairs that year's scores with the wrong 2015 country
# labels whenever the row order differs between years.
countries = top_20_2015['Country'].tolist()


def _scores_for(frame, score_col, names):
    """Return ``frame[score_col]`` for each country in ``names``, in order.

    Countries that do not appear in ``frame`` yield NaN.
    """
    # The country column is 'Country' in the 2015 file and 'Country or region'
    # in the 2019 file. NOTE(review): confirm the other years use one of these.
    name_col = 'Country' if 'Country' in frame.columns else 'Country or region'
    # drop repeated names so the lookup index is unique before reindexing
    by_country = frame.drop_duplicates(subset=name_col).set_index(name_col)[score_col]
    return by_country.reindex(names).values


yearly_traces = [
    # (legend name, data frame, score column, marker colour)
    ("2015", happy_2015, 'Happiness Score', 'red'),
    ("2016", happy_2016, 'Happiness Score', 'green'),
    ("2017", happy_2017, 'Happiness.Score', 'blue'),
    ("2018", happy_2018, 'Score', 'black'),
    ("2019", happy_2019, 'Score', 'pink'),
]

# one marker trace per year, all plotted against the same country list
data = [
    go.Scatter(
        x=countries,
        y=_scores_for(frame, score_col, countries),
        mode="markers",
        name=year,
        marker=dict(color=colour),
        text=countries,
    )
    for year, frame, score_col, colour in yearly_traces
]
# keep the individual trace names available, as before
trace1, trace2, trace3, trace4, trace5 = data

layout = dict(title = 'Happiness Rate Changing 2015 to 2019 for Top 20 Countries',
              xaxis= dict(title= 'Country',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'Happiness',ticklen= 5,zeroline= False),
              hovermode="x unified"
             )
fig = dict(data = data, layout = layout)
iplot(fig)

### Exploring the relationships between variables
Looking at 2019 only. 


In [None]:
# pairs plot over every column from the third one onward:
# histograms on the diagonal, scatter plots everywhere else
pair_cols = happy_2019.columns[2:]
# size the subplot grid from the data instead of hard-coding 7 x 7, so the
# figure still works if the number of plotted columns changes
n_vars = len(pair_cols)

plt.figure(figsize = [16, 16])
for i, r in enumerate(pair_cols):
    for j, c in enumerate(pair_cols):
        # subplot indices are 1-based and run row by row
        plt.subplot(n_vars, n_vars, n_vars*i+j+1)
        if i == j:
            plt.hist(happy_2019[c], bins = 20, edgecolor = 'k', color = 'cornflowerblue')
        else:
            plt.scatter(happy_2019[c], happy_2019[r], s = 5, alpha = 0.5, color = 'cornflowerblue')
        plt.xlabel(c)
        plt.ylabel(r)
plt.tight_layout()
plt.savefig('pairs_plot.png')

## Map Plots
Looking at 2019 only. 


In [None]:
# choropleth trace: one entry per country, coloured by its 2019 score
data = {
    'type': 'choropleth',
    'colorscale': 'Viridis',
    'marker_line_width': 1,
    'locations': happy_2019['Country or region'],
    'locationmode': "country names",
    'z': happy_2019['Score'],
    'text': happy_2019['Country or region'],
    'colorbar': {'title': 'Score'},
}

# figure title and map (geo) settings
layout = {
    'title': 'Happiness Map 2019',
    'geo': {
        'projection': {'type': 'mercator'},
        'showocean': False,
        'showlakes': True,
        'showrivers': True,
    },
}

choromap = go.Figure(data=[data], layout=layout)

# fix the figure at 600 x 600 rather than letting it auto-size
choromap.update_layout(autosize=False, width=600, height=600)

iplot(choromap, validate=False)

## Bubble Plots
Looking at 2019 only. 

In [None]:
# single marker trace: position from two factors, colour from a third,
# marker size from the happiness score, hover text from the country name
data = [
    dict(
        y=happy_2019['GDP per capita'],
        x=happy_2019['Healthy life expectancy'],
        mode='markers',
        marker=dict(
            color=happy_2019['Freedom to make life choices'],
            size=happy_2019['Score'],
            showscale=True,
        ),
        text=happy_2019['Country or region'],
    )
]

layout = go.Layout(
    barmode='overlay',
    hovermode="x",
    title='Bubble Chart: x = Healthy life expectancy, y = GDP per capita, size = Happiness Score, Color = Freedom to make life choices, year = 2019',
    xaxis={'title': 'Healthy life expectancy'},
    yaxis={'title': 'GDP per capita'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Predictive Models
Looking at 2019 only. <br> <br> 
Because this is a particularly small data set, it doesn't lend itself to predictive modeling. However, we can use both Linear and Elastic Net models to understand the importance of factors. By looking at the coefficients, particularly on the Elastic Net model, we can determine which factors are the least and most important in predicting the Happiness Score. While this data set is already small, this can also be useful for dimensionality reduction.  


In [None]:
# shuffle the dataframe
# random_state pins the row order: the shuffle runs before any seed is set, so
# without it the unshuffled 10-fold splits used later change on every run
happy_2019 = happy_2019.sample(frac=1, random_state=1).reset_index(drop=True)

happy_2019.head()

In [None]:
# create 2D array of features (one column per explanatory factor)
feature_cols = [
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
X = happy_2019.loc[:, feature_cols].values

# create 1D array for labels
# (a one-column DataFrame makes LinearRegression report coef_ as a (1, 6)
# matrix, which does not line up with the elastic net's flat coef_)
y = happy_2019['Score'].values

In [None]:
# rescale every feature column to the [0, 1] range (MinMaxScaler's default)
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)

### Linear Regression

In [None]:
# seed NumPy's global random number generator
np.random.seed(1)

# ordinary least-squares fit on all six factors
# NOTE(review): this is fitted on the raw X; X_scaled is not used here
lr_model = LinearRegression().fit(X, y)
print(lr_model.coef_)

# 10-fold cross-validation with the estimator's default scorer
lm_result = cross_val_score(lr_model, X, y, cv=10, n_jobs=-1)

# average score over the folds
print(lm_result.mean())

### Elastic Net

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet 

In [None]:
%%time

elasticnet_model = ElasticNet(normalize = True)
parameters = {
    'l1_ratio' : [0.0, 0.5, 1.0],
    'alpha' : [0.003, 0.005, 0.0075, 0.01]
}

en_grid = GridSearchCV(elasticnet_model, parameters, cv = 10, refit = True, 
                       n_jobs = -1, verbose = 0, scoring = 'r2')
en_grid.fit(X, y)

In [None]:
# summarise the grid search: best setting, its CV score, and the training fit
grid = en_grid
en_model = grid.best_estimator_

print('Best Parameters:', grid.best_params_)
print('Best CV Score:  ', grid.best_score_)
print('Training r2:    ', grid.best_estimator_.score(X, y))

y1 = y
y2 = grid.best_estimator_.predict(X)
# This line is labelled MSE, so report the mean squared error; it previously
# printed r2_score, repeating the r2 line above. ravel() flattens both sides
# so a column-shaped y cannot broadcast against a flat prediction vector.
print('Training MSE:   ', np.mean((np.ravel(y1) - np.ravel(y2)) ** 2))

print()
# mean cross-validated score for every parameter combination tried
cv_res = grid.cv_results_
for params, score in zip(cv_res['params'], cv_res['mean_test_score']): 
    print(f'{str(params):<40} {score:.8f}')

In [None]:
# print the unregularized (linear) coefficients, then the regularized
# (elastic net) ones, for side-by-side comparison
for fitted_model in (lr_model, en_model):
    print(fitted_model.coef_)

Here, we can see that the coefficient for "Generosity" is the lowest. According to this model, this variable is the least important in predicting the Happiness Score. While this data set only has 6 predictor variables, if we wanted to reduce dimensionality, we might remove this column from our model. 

Conversely, "Freedom to make life choices" and "Healthy life expectancy" appear to be the most important variables in predicting Happiness Score.