# Importing The Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Reading the Dataset (2018)
This notebook analyses the 2018 happiness data and builds models that predict the happiness score.

In [None]:
# Load the 2018 happiness dataset from disk and preview the first five rows.
df = pd.read_csv("data/happ.csv")
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,5.375917,0.891449,1.213237,0.597346,0.454506,0.181006,0.112449
std,45.177428,1.119506,0.391921,0.302372,0.247579,0.162424,0.098471,0.096343
min,1.0,2.905,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,4.45375,0.61625,1.06675,0.42225,0.356,0.1095,0.051
50%,78.5,5.378,0.9495,1.255,0.644,0.487,0.174,0.082
75%,117.25,6.1685,1.19775,1.463,0.77725,0.5785,0.239,0.139
max,156.0,7.632,2.096,1.644,1.03,0.724,0.598,0.457


In [None]:
# List the column labels so the target and feature names can be referenced exactly.
df.columns

Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')

# Splitting Training Set and Test Set
Since we are going to predict the happiness score, we define the output variable y as the score and the input X as all the remaining features.

In [None]:
# Features: every column except the target. Target: the happiness score.
X = df.drop(columns=["Score"])
y = df["Score"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Overall rank,Country or region,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
66,67,Moldova,0.657,1.301,0.62,0.232,0.171,0.0
33,34,Singapore,1.529,1.451,1.008,0.631,0.261,0.457
141,142,Angola,0.73,1.125,0.269,0.0,0.079,0.061
59,60,Kazakhstan,1.143,1.516,0.631,0.454,0.148,0.121
127,128,Georgia,0.853,0.592,0.643,0.375,0.038,0.215


In [None]:
# Preview the first rows of the training target; the index matches X_train.
y_train.head()

66     5.640
33     6.343
141    3.795
59     5.790
127    4.340
Name: Score, dtype: float64

# Identifying Missing Columns and Categorical Variables 

In [None]:
# Names of the training-feature columns that contain at least one null value.
# isnull().any() yields one boolean per column; keep the names flagged True.
cols_miss = [name for name, has_null in X_train.isnull().any().items() if has_null]
cols_miss

[]

We see there are no missing values in the dataset.

In [None]:
# Boolean mask over the training columns: True where the dtype is object.
s = X_train.dtypes.eq("object")
# Indexing the mask by itself keeps only the True entries; their labels are the column names.
object_cols = s[s].index.tolist()
object_cols

['Country or region']

We see that the only categorical feature is "Country or region". The dataset covers 156 countries, each appearing once, so one-hot encoding this column would not be effective and we can drop it. Moreover, the ranking of the countries is already given in the dataset.

We can also drop the ranking of the country, since it is derived solely from the happiness score. The happiness score is what we need to predict in the first place, so rather than keeping the rank in X we drop it too.

In [None]:
# Remove the country label and the rank from both splits so the models see
# only the numeric explanatory features.
X_train = X_train.drop(columns=["Country or region", "Overall rank"])
X_test = X_test.drop(columns=["Country or region", "Overall rank"])

Note that we need to predict the happiness score. For a given country its name does not define how happy the country actually is.

In [None]:
# Display the training features after dropping the country and rank columns.
X_train

Unnamed: 0,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
66,0.657,1.301,0.620,0.232,0.171,0.000
33,1.529,1.451,1.008,0.631,0.261,0.457
141,0.730,1.125,0.269,0.000,0.079,0.061
59,1.143,1.516,0.631,0.454,0.148,0.121
127,0.853,0.592,0.643,0.375,0.038,0.215
...,...,...,...,...,...,...
133,0.131,0.867,0.221,0.390,0.175,0.099
137,0.793,1.413,0.609,0.163,0.187,0.011
72,1.039,1.498,0.700,0.307,0.101,0.154
140,0.472,1.215,0.079,0.423,0.116,0.112


# Model
Several models are trained here. We then compare their errors on the test set that we split off earlier to conclude which of them performs best.

**Linear Regression**

In [None]:
# Linear regression with default settings, fitted on the training split.
model1 = LinearRegression()
model1.fit(X_train, y_train)

**Decision Tree**

In [None]:
# Decision tree regressor; random_state is fixed so the fitted tree is reproducible.
model2 = DecisionTreeRegressor(random_state = 0)
model2.fit(X_train, y_train)

**Random Forest**

In [None]:
# Random forest regressor; random_state is fixed so the fitted forest is reproducible.
model3 = RandomForestRegressor(random_state = 0)
model3.fit(X_train, y_train)

**XGBoost**

In [None]:
# XGBoost regressor with library-default hyperparameters, fitted on the training split.
model4 = XGBRegressor()
model4.fit(X_train, y_train)

# Prediction and Accuracy

In [None]:
# Evaluate every fitted model on the held-out test set with mean squared error
# (lower is better). Each mseN must be computed from that model's own predN.

# Linear Regression
pred1 = model1.predict(X_test)
mse1 = mean_squared_error(y_test, pred1)

# Decision Tree
pred2 = model2.predict(X_test)
mse2 = mean_squared_error(y_test, pred2)

# Random Forest
pred3 = model3.predict(X_test)
mse3 = mean_squared_error(y_test, pred3)

# XGBoost: score against pred4; scoring against pred3 would merely repeat the
# random-forest error and hide XGBoost's real performance.
pred4 = model4.predict(X_test)
mse4 = mean_squared_error(y_test, pred4)
print(mse1, mse2, mse3, mse4)

0.3464394873885773 0.5641519062500001 0.27611012194687534 0.27611012194687534
