# Import Libraries

In [60]:
%matplotlib notebook
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
import chart_studio.plotly as py 
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# import plotly.express as px
from mpl_toolkits import mplot3d
plt.style.use('seaborn-whitegrid')

# Load Data & Preprocess

## 2017 Data

In [2]:
# Load 2017 data
raw_input_data = pd.read_csv("2017.csv")

raw_input_data.head() # Glance of data
print('2017 Raw Data Shape :',raw_input_data.shape) # Shape of the data
raw_input_data.describe() # Statistics of the data

# Columns to be dropped
drop_columns = ['Country','Happiness.Rank','Whisker.high','Whisker.low']
input_data = raw_input_data.drop(columns = drop_columns)
input_data.columns = ['Happiness_Score','GDP','Family','Life_Expectancy','Freedom','Generosity','Trust_government_Corruption','Dystopia_Residual']

input_data.head(); # Processed 2017 data

2017 Raw Data Shape : (155, 12)


## 2016 Data

In [3]:
# Load 2016 data
raw_input_2016 = pd.read_csv("2016.csv")
raw_input_2016.head()

# Columns to be dropped
dropCol2016 = ['Region', 'Country', 'Happiness Rank', 'Lower Confidence Interval', 'Upper Confidence Interval']
input2016 = raw_input_2016.drop(columns = dropCol2016)
input2016 = input2016.reindex(columns = ['Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Generosity', 'Trust (Government Corruption)', 'Dystopia Residual'])
input2016.columns = ['Happiness_Score','GDP','Family','Life_Expectancy','Freedom','Generosity','Trust_government_Corruption','Dystopia_Residual']

input2016.head(); # Processed 2016 data

## 2015 Data

In [4]:
# Load 2015 data
raw_input_2015 = pd.read_csv("2015.csv")
raw_input_2015.head()

# Columns to be dropped
dropCol2015 = ['Region', 'Country', 'Happiness Rank', 'Standard Error']
input2015 = raw_input_2015.drop(columns = dropCol2015)
input2015 = input2015.reindex(columns = ['Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Generosity', 'Trust (Government Corruption)', 'Dystopia Residual'])
input2015.columns = ['Happiness_Score','GDP','Family','Life_Expectancy','Freedom','Generosity','Trust_government_Corruption','Dystopia_Residual']

input2015.head(); # Processed 2015 data

In [5]:
# Combine all datasets (2015, 2016, 2017)
input_data = input_data.append(input2016, ignore_index = True)
input_data = input_data.append(input2015, ignore_index = True)

print('Shape of Merged dataset : ',input_data.shape); # Shape of final dataset

Shape of Merged dataset :  (470, 8)


In [6]:
# Seperate data into X,Y (X - Features, Y - Label/Ground Truth/ Actual Value)

# X
X = input_data.drop(columns='Happiness_Score')
X_without_residual = X.drop(columns=['Dystopia_Residual'])

# Y - Happiness Score
Y = input_data['Happiness_Score']

# Data Analysis

In [7]:
# Function to plot 3D scatter
def plot_comp_data(x,y,z,pred,xlabel,ylabel,zlabel):
    
    fig1 = plt.figure()
    fig_title = "Predicted vs Actual Happiness Score for " + xlabel + ", " + ylabel;
    fig1.suptitle(fig_title, fontsize=16)
    ax = plt.axes(projection='3d')
    ax.scatter(x,y,z,c='Green',marker='x',alpha=0.5, label='Actual')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_zlabel(zlabel)
    ax.scatter(x,y,pred,c='Red',marker='<',alpha=0.5, label='Predicted')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_zlabel(zlabel)
    ax.legend()
    
    
    fig2 = plt.figure(figsize=plt.figaspect(0.5))

    ax1 = fig2.add_subplot(2, 1, 1, projection='3d')
    ax1.set_title('Actual')
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_zlabel(zlabel)
    ax1.scatter(x, y, z,c='Green',marker='x',alpha=0.9, label='Actual')
    ax1.legend()

    ax2 = fig2.add_subplot(2, 1, 2, projection='3d')
    ax2.set_title('Predicted')
    ax2.set_xlabel(xlabel)
    ax2.set_ylabel(ylabel)
    ax2.set_zlabel(zlabel)
    ax2.scatter(x, y, pred,c='Red',marker='x',alpha=0.9, label='Predicted')
    plt.tight_layout()
    ax2.legend()
    
    plt.show()

### Correlation Matrix

In [8]:
# Correlation matrix of features
corr_matrix = input_data.corr()
fig = go.Figure(data=go.Heatmap(
                    z=np.array(corr_matrix),
                      x=corr_matrix.columns,
                      y=corr_matrix.columns))
fig.show()

### Feature Selection

#### Visualize correlation between features and Happiness score

In [9]:
# Plot all features vs Happiness Score
fig, axs = plt.subplots(3, 2)
axs[0,0].set_title('GDP vs Happiness Score')
axs[0,0].scatter(input_data['GDP'],input_data['Happiness_Score'],s=1)
axs[0,0].set_xlabel('GDP')
axs[0,0].set_ylabel('Happiness_Score')

axs[0,1].set_title('Family vs Happiness Score')
axs[0,1].scatter(input_data['Family'],input_data['Happiness_Score'],s=1)
axs[0,1].set_xlabel('Family')
axs[0,1].set_ylabel('Happiness_Score')

axs[1,0].set_title('Life Expectancy vs Happiness Score')
axs[1,0].scatter(input_data['Life_Expectancy'],input_data['Happiness_Score'],s=1)
axs[1,0].set_xlabel('Life Expectancy')
axs[1,0].set_ylabel('Happiness_Score')

axs[1,1].set_title('Freedom vs Happiness Score')
axs[1,1].scatter(input_data['Freedom'],input_data['Happiness_Score'],s=1)
axs[1,1].set_xlabel('Freedom')
axs[1,1].set_ylabel('Happiness_Score')

axs[2,0].set_title('Generosity vs Happiness Score')
axs[2,0].scatter(input_data['Generosity'],input_data['Happiness_Score'],s=1)
axs[2,0].set_xlabel('Generosity')
axs[2,0].set_ylabel('Happiness_Score')

axs[2,1].set_title('Trust Government Corruption vs Happiness Score')
axs[2,1].scatter(input_data['Trust_government_Corruption'],input_data['Happiness_Score'],s=1)
axs[2,1].set_xlabel('Trust Government Corruption')
axs[2,1].set_ylabel('Happiness_Score')

fig.tight_layout()

<IPython.core.display.Javascript object>

In [10]:
fig = plt.figure()
plt.scatter(input_data['Dystopia_Residual'],input_data['Happiness_Score'],s=4)
plt.title('Dystopia vs Happiness Score')
plt.xlabel('Dystopia Residual')
plt.ylabel('Happiness Score')
plt.show()

<IPython.core.display.Javascript object>

In [11]:
# PDF of Dystopia Residual
fig = plt.figure()
dys_r = input_data['Dystopia_Residual']
ax_dr = dys_r.plot.kde()
ax_dr.set_xlabel('Dystopia Residual')

<IPython.core.display.Javascript object>

Text(0.5, 0, 'Dystopia Residual')

## Model on subset of features

### Features - GDP, Life Expectancy

In [12]:
# Linear Regression
regressor_sub = LinearRegression()
regressor_sub.fit(input_data[['GDP','Life_Expectancy']],Y)

# Predicted values
predicted_values = regressor_sub.predict(input_data[['GDP','Life_Expectancy']])

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor_sub.score(input_data[['GDP','Life_Expectancy']],Y))

Linear Regression RMSE: 0.6628353985350816
Linear Regression Score : 0.6594217386919982


In [13]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.2)
mlp.fit(input_data[['GDP','Life_Expectancy']],Y)

# Predicted values
predicted_values_mlp = mlp.predict(input_data[['GDP','Life_Expectancy']])

# Root Mean Square Error
RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(input_data[['GDP','Life_Expectancy']],Y);
print("MLP Score :",mlp_score)

RMSE : 0.6658716576606769
MLP Score : 0.6562944094118971


### Features - GDP, Family

In [14]:
# Linear Regression
regressor_sub = LinearRegression()
regressor_sub.fit(input_data[['GDP','Family']],Y)

# Predicted values
predicted_values = regressor_sub.predict(input_data[['GDP','Family']])

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor_sub.score(input_data[['GDP','Family']],Y))

Linear Regression RMSE: 0.6579420134745354
Linear Regression Score : 0.6644318175457147


In [15]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.2)
mlp.fit(input_data[['GDP','Life_Expectancy']],Y)

predicted_values_mlp = mlp.predict(input_data[['GDP','Life_Expectancy']])

RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(input_data[['GDP','Life_Expectancy']],Y);
print("MLP Score :",mlp_score)

RMSE : 0.6642297280636063
MLP Score : 0.6579873620710642


### Features - GDP, Generosity

In [16]:
# Linear Regression
regressor_sub = LinearRegression()
regressor_sub.fit(input_data[['GDP','Generosity']],Y)

# Predicted values
predicted_values = regressor_sub.predict(input_data[['GDP','Generosity']])

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor_sub.score(input_data[['GDP','Generosity']],Y))

Linear Regression RMSE: 0.6743796304096055
Linear Regression Score : 0.6474551096927383


In [17]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.2)
mlp.fit(input_data[['GDP','Generosity']],Y)

predicted_values_mlp = mlp.predict(input_data[['GDP','Generosity']])

RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(input_data[['GDP','Generosity']],Y);
print("MLP Score :",mlp_score)

RMSE : 0.6758789303011202
MLP Score : 0.6458857915864376


### Features - Generosity, Trust Government Corruption

In [18]:
# Linear Regression
regressor_sub = LinearRegression()
regressor_sub.fit(input_data[['Generosity','Trust_government_Corruption']],Y)

# Predicted values
predicted_values = regressor_sub.predict(input_data[['Generosity','Trust_government_Corruption']])

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor_sub.score(input_data[['Generosity','Trust_government_Corruption']],Y))

Linear Regression RMSE: 1.0363112547133748
Linear Regression Score : 0.16749669540767398


In [19]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.2)
mlp.fit(input_data[['Generosity','Trust_government_Corruption']],Y)

predicted_values_mlp = mlp.predict(input_data[['Generosity','Trust_government_Corruption']])

RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(input_data[['Generosity','Trust_government_Corruption']],Y);
print("MLP Score :",mlp_score)

RMSE : 1.0370591689008883
MLP Score : 0.16629461309437266


### Features - Family , Freedom

In [20]:
# Linear Regression
regressor_sub = LinearRegression()
regressor_sub.fit(input_data[['Family','Freedom']],Y)

# Predicted values
predicted_values = regressor_sub.predict(input_data[['Family','Freedom']])

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor_sub.score(input_data[['Family','Freedom']],Y))

Linear Regression RMSE: 0.7970282451093922
Linear Regression Score : 0.5075604100931427


In [21]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.2)
mlp.fit(input_data[['Family','Freedom']],Y)

predicted_values_mlp = mlp.predict(input_data[['Family','Freedom']])

RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(input_data[['Family','Freedom']],Y);
print("MLP Score :",mlp_score)

RMSE : 0.7972934317327185
MLP Score : 0.5072326673375365


### Features - Generosity, Dystopia_Residual

In [22]:
# Linear Regression
regressor_sub = LinearRegression()
regressor_sub.fit(input_data[['Generosity','Dystopia_Residual']],Y)

# Predicted values
predicted_values = regressor_sub.predict(input_data[['Generosity','Dystopia_Residual']])

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor_sub.score(input_data[['Generosity','Dystopia_Residual']],Y))

Linear Regression RMSE: 0.9578897394588786
Linear Regression Score : 0.28872658075452484


In [23]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.2)
mlp.fit(input_data[['Generosity','Dystopia_Residual']],Y)

predicted_values_mlp = mlp.predict(input_data[['Generosity','Dystopia_Residual']])

RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(input_data[['Generosity','Dystopia_Residual']],Y);
print("MLP Score :",mlp_score)

RMSE : 0.9580663460244349
MLP Score : 0.2884642809710156


#### Features with higher correlation with Happiness score performed better compared to features with lower correlation score.

# Linear Regression

In [24]:
regressor = LinearRegression()
regressor.fit(X,Y)

# Linear regression coefficients
print('Linear Regression coefficients :',regressor.coef_)

# Predicted values
predicted_values = regressor.predict(X)
comp_y = pd.DataFrame(np.transpose([Y,predicted_values]),columns=['Actual','Predicted'])
comp_y.head()

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor.score(X,Y))

Linear Regression coefficients : [1.00006954 0.99999328 0.99985107 0.99994278 1.00015935 0.99978973
 0.99997492]
Linear Regression RMSE: 0.0002850419545689709
Linear Regression Score : 0.9999999370170337


# Linear Regression Without "Dystopia Residual"

In [25]:
regressor_wo_residual = LinearRegression()
regressor_wo_residual.fit(X_without_residual,Y)

# Coefficients of linear regression
print('Coefficients :',regressor_wo_residual.coef_)

# Predicted Values
predicted_values_wo_residual = regressor_wo_residual.predict(X_without_residual)
comparision_y_wo_residual = pd.DataFrame(np.transpose([Y,predicted_values_wo_residual]),columns=['Actual','Predicted'])
comparision_y_wo_residual.head()

# RMSE
RMSE_wo_residual = np.sqrt(np.mean((predicted_values_wo_residual - Y)**2))
print('RMSE Without Residual',RMSE_wo_residual)
print('Score without residual',regressor_wo_residual.score(X_without_residual,Y))

Coefficients : [1.01575778 0.67251489 1.25198672 1.52407283 0.35853075 0.85852834]
RMSE Without Residual 0.5525850256050722
Score without residual 0.7632970010659248


# Multi-Layer Perceptron

In [28]:
# MLP 
mlp = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='identity',learning_rate='constant',learning_rate_init=0.02)
mlp.fit(X,Y)

predicted_values_mlp = mlp.predict(X)
comparision_y_mlp = pd.DataFrame(np.transpose([Y,predicted_values_mlp]),columns=['Actual','Predicted'])
comparision_y_mlp.head()

RMSE_mlp = np.sqrt(np.mean((predicted_values_mlp - Y)**2))
print("RMSE :",RMSE_mlp)
mlp_score = mlp.score(X,Y);
print("MLP Score :",mlp_score)

RMSE : 0.05153841171598315
MLP Score : 0.997940946657827


# Multi-Layer Perceptron without "Dystopia Residual"

In [70]:
mlp_wo_residual = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='tanh',learning_rate='constant',learning_rate_init=0.01)
mlp_wo_residual.fit(X_without_residual,Y)

predicted_values_wo_residual_mlp = mlp_wo_residual.predict(X_without_residual)
comparision_y_wo_residual_mlp = pd.DataFrame(np.transpose([Y,predicted_values_wo_residual_mlp]),columns=['Actual','Predicted'])
comparision_y_wo_residual_mlp.head()

RMSE_wo_mlp = np.sqrt(np.mean((predicted_values_wo_residual_mlp - Y)**2))
print("RMSE without residual :",RMSE_wo_mlp)

mlp_wo_residual_score = mlp_wo_residual.score(X_without_residual,Y);
print("MLP Score without residual :",mlp_wo_residual_score)

RMSE without residual : 0.5541729796109784
MLP Score without residual : 0.7619346277533314


# Linear Regression
### (Happiness Score modified as "Happiness Score - E[Dystopia Residual]")
### Features - GDP, Family, Life Expectancy, Trust Government Corruption, Generosity, Freedom

In [30]:
# Mean of Dystopia Residual
mean_dr = np.mean(input_data['Dystopia_Residual'])
Y_mod = Y - mean_dr

In [64]:
regressor = LinearRegression()
regressor.fit(X_without_residual,Y_mod)

# Linear regression coefficients
print('Linear Regression coefficients :',regressor.coef_)

# Predicted values
predicted_values = regressor.predict(X_without_residual)
comp_y = pd.DataFrame(np.transpose([Y_mod,predicted_values]),columns=['Actual','Predicted'])
comp_y.head()

# Root Mean Square Error
RMSE = np.sqrt(np.mean((predicted_values - Y_mod)**2))
print('Linear Regression RMSE:',RMSE)
print('Linear Regression Score :',regressor.score(X_without_residual,Y_mod))

regressor.intercept_

Linear Regression coefficients : [1.01575778 0.67251489 1.25198672 1.52407283 0.35853075 0.85852834]
Linear Regression RMSE: 0.5525850256050722
Linear Regression Score : 0.7632970010659248


0.12691473105055984

In [87]:
mlp_wo_residual = MLPRegressor(hidden_layer_sizes=(7),max_iter=2500,activation='tanh',learning_rate='constant',learning_rate_init=0.01)
mlp_wo_residual.fit(X_without_residual,Y_mod)

predicted_values_wo_residual_mlp = mlp_wo_residual.predict(X_without_residual)
comparision_y_wo_residual_mlp = pd.DataFrame(np.transpose([Y_mod,predicted_values_wo_residual_mlp]),columns=['Actual','Predicted'])
comparision_y_wo_residual_mlp.head()

RMSE_wo_mlp = np.sqrt(np.mean((predicted_values_wo_residual_mlp - Y_mod)**2))
print("RMSE :",RMSE_wo_mlp)

mlp_wo_residual_score = mlp_wo_residual.score(X_without_residual,Y_mod);
print("MLP residual :",mlp_wo_residual_score)

RMSE : 0.5426784999585395
MLP residual : 0.7717079603336311


# Results

## Predicted vs Actual (MLP without dystopia residual)

In [76]:
plot_comp_data(input_data['GDP'],input_data['Life_Expectancy'],input_data['Happiness_Score'],predicted_values_wo_residual_mlp,"GDP",'Life Expectancy','Happiness Score')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [75]:
 plot_comp_data(input_data['GDP'],input_data['Life_Expectancy'],input_data['Happiness_Score'],predicted_values_wo_residual_mlp,"GDP",'Life Expectancy','Happiness Score')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
 plot_comp_data(input_data['Generosity'],input_data['Life_Expectancy'],input_data['Happiness_Score'],predicted_values_wo_residual_mlp,"Generosity",'Life Expectancy','Happiness Score')