In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Importing Happiness report for the year 2019

In [None]:
# Read the 2019 report into a DataFrame, then print its first rows
# followed by its column labels.
happiness = pd.read_csv('../input/world-happiness/2019.csv')
for preview in (happiness.head(), happiness.columns):
    print(preview)

# Finding missing values

In [None]:
# Count the missing entries in each column; any that exist would need to be
# treated or replaced before modelling.
missing_per_column = happiness.isna().sum()
print(missing_per_column)

In [None]:
# No missing values were found

# Finding Correlation

In [None]:
# Heatmap of pairwise correlations, used to see how strongly the input
# variables relate to each other and how the target variable 'Score'
# depends on them.
#
# Only numeric columns are correlated: 'Country or region' holds strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping
# them as older versions did.
numeric_columns = happiness.select_dtypes(include='number')

plt.figure(figsize=(17,8))
sns.heatmap(numeric_columns.corr(),square=True,annot=True,cmap=sns.diverging_palette(20,200))
plt.show()

From the heatmap, a strong positive correlation exists between 'GDP per capita' and 'Healthy life expectancy', with a value of 0.84. The target variable, 'Score', has a relatively high positive correlation of 0.79 with 'GDP per capita'. 'Healthy life expectancy' and 'Social support' are also well correlated with the target variable, with a coefficient of 0.78 each.

# Separating target and input variables

In [None]:
# 'Score' is the prediction target. The inputs are every remaining column
# except the country label and the overall rank.
non_feature_columns = ['Score', 'Country or region', 'Overall rank']
X_variables = happiness.drop(columns=non_feature_columns)
Y_variables = happiness.loc[:, 'Score']
print(Y_variables.shape)
print(X_variables.shape)

# Splitting dataset into test dataset and train dataset

In [None]:
# Hold out 25% of the rows as test data. random_state=0 fixes the shuffle so
# the same split can be reproduced when needed.
split_parts = train_test_split(
    X_variables,
    Y_variables,
    test_size=0.25,
    random_state=0,
)
x_train, x_test, y_train, y_test = split_parts

# Finding Outliers

In [None]:
# One horizontal box plot per input feature of the training split, used to
# spot values that fall outside the whiskers.
column_list = x_train.columns.values

# Size the grid from the number of features instead of a fixed 4x3 layout.
n_cols = 3
n_rows = max(1, -(-len(column_list) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, column_list):
    # Pass the Series as `x=` explicitly: a positional argument is treated as
    # `data` by newer seaborn releases, which changes the plot orientation.
    sns.boxplot(x=x_train[column], ax=ax)

# Hide any grid cells left over when the feature count is not a multiple of n_cols.
for unused_ax in flat_axes[len(column_list):]:
    unused_ax.set_visible(False)

# Lay the figure out once, after all panels are drawn.
fig.tight_layout()
plt.show()

Since the box plots show points outside the whiskers, we compute the z-score of each variable to check whether any values fall outside the range of -3 to +3. Also, the score value should vary between 0 and 10.

In [None]:
# Z-score check for 'GDP per capita'. The z-scores are stored on `happiness`
# as a new 'Zscore_GDP' column, then rows beyond +/-3 are printed.
gdp_zscores = stats.zscore(happiness['GDP per capita'])
happiness['Zscore_GDP'] = gdp_zscores
gdp_outliers = happiness.query('Zscore_GDP > 3 | Zscore_GDP < -3')
print(gdp_outliers)
# No outliers

In [None]:
# Z-score check for 'Social support'. The z-scores are stored on `happiness`
# as a new 'Zscore_Social' column, then rows beyond +/-3 are displayed.
social_zscores = stats.zscore(happiness['Social support'])
happiness['Zscore_Social'] = social_zscores
social_outlier_filter = 'Zscore_Social > 3 | Zscore_Social < -3'
happiness.query(social_outlier_filter)
# Central African Republic has Social support score of 0.0

In [None]:
# Z-score check for 'Healthy life expectancy'. The z-scores are stored on
# `happiness` as a new 'Zscore_Healthy' column, then rows beyond +/-3 are displayed.
healthy_zscores = stats.zscore(happiness['Healthy life expectancy'])
happiness['Zscore_Healthy'] = healthy_zscores
healthy_outlier_filter = 'Zscore_Healthy > 3 | Zscore_Healthy < -3'
happiness.query(healthy_outlier_filter)
# Swaziland has Healthy life expectancy score of 0.0

In [None]:
# Z-score check for 'Freedom to make life choices'. The z-scores are stored on
# `happiness` as a new 'Zscore_Freedom' column, then rows beyond +/-3 are displayed.
freedom_zscores = stats.zscore(happiness['Freedom to make life choices'])
happiness['Zscore_Freedom'] = freedom_zscores
freedom_outlier_filter = 'Zscore_Freedom > 3 | Zscore_Freedom < -3'
happiness.query(freedom_outlier_filter)
# No outliers

In [None]:
# Z-score check for 'Generosity'. The z-scores are stored on `happiness` as a
# new 'Zscore_Generosity' column, then rows beyond +/-3 are displayed.
generosity_zscores = stats.zscore(happiness['Generosity'])
happiness['Zscore_Generosity'] = generosity_zscores
generosity_outlier_filter = 'Zscore_Generosity > 3 | Zscore_Generosity < -3'
happiness.query(generosity_outlier_filter)
# Indonesia has 0.498 and Myanmar has 0.566

In [None]:
# Z-score check for 'Perceptions of corruption'. The z-scores are stored on
# `happiness` as a new 'Zscore_Perceptions' column, then rows beyond +/-3 are displayed.
perception_zscores = stats.zscore(happiness['Perceptions of corruption'])
happiness['Zscore_Perceptions'] = perception_zscores
perception_outlier_filter = 'Zscore_Perceptions > 3 | Zscore_Perceptions < -3'
happiness.query(perception_outlier_filter)
# Denmark has 0.410, Singapore has 0.453 and Rwanda has 0.411

7 of the 156 countries/regions have a z-score outside the limit. However, since the scores are allowed to range from 0.0 to 10.0, we choose not to remove these rows. Moreover, removing these rows would remove the country's record entirely.

# Linear Regression

In [None]:
# Multiple linear regression on the three variables that showed a high
# correlation with the target variable, Score.
# (Swap in list(x_train.columns) to train on every input column instead.)
feature_cols = ['GDP per capita', 'Social support', 'Healthy life expectancy']

X = x_train.loc[:, feature_cols]
happiness_model = LinearRegression().fit(X, y_train)

print('Intercept : ', happiness_model.intercept_)
print('Coefficients : ')
# Pair each feature name with its fitted coefficient.
[(name, weight) for name, weight in zip(feature_cols, happiness_model.coef_)]

In [None]:
# Predict Score for the held-out rows, using the same feature subset the
# model was fitted on.
Xtest = x_test.loc[:, feature_cols]
y_pred = happiness_model.predict(Xtest)
print(y_pred)

In [None]:
# Score the predictions against the true test targets with R-squared.
r2 = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print('R square : ', r2)

In [None]:
# Adjusted R-squared, to see whether the added variable is relevant enough.
#
#     adj = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
#
# k is the number of predictors and n is the number of observations that `r2`
# was computed on. `r2` is scored against y_test, so n is len(y_test).
# DataFrame.size must not be used for n: it is rows * columns, which inflates
# n and makes the penalty almost vanish.
k = len(feature_cols)
n = len(y_test)
adj2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))
print('Adjusted R square : ', adj2)

In [None]:
# Second model: add 'Freedom to make life choices' to the feature set to try
# to improve the R-squared value.
feature_cols2 = [
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
]

X = x_train.loc[:, feature_cols2]
happiness_model = LinearRegression().fit(X, y_train)

print('Intercept : ', happiness_model.intercept_)
print('Coefficients : ')
# Pair each feature name with its fitted coefficient.
coefficient_pairs = list(zip(feature_cols2, happiness_model.coef_))
coefficient_pairs

In [None]:
# Predict on the held-out rows and evaluate with R-squared.
Xtest = x_test[feature_cols2]
y_pred = happiness_model.predict(Xtest)
r2 = metrics.r2_score(y_test, y_pred)
print('R square : ', r2)

# Adjusted R-squared: adj = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
# k is the number of predictors and n is the number of observations behind
# `r2`, i.e. the test rows. DataFrame.size must not be used for n because it
# is rows * columns.
k = len(feature_cols2)
n = len(y_test)
adj2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))
print('Adjusted R square : ', adj2)

In [None]:
# Since there is an increase in adjusted R-squared, the added column is not
# irrelevant. So, add one more column ('Perceptions of corruption') to try to
# improve the fit further.
feature_cols3 = [
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Perceptions of corruption',
]

X = x_train.loc[:, feature_cols3]
happiness_model = LinearRegression().fit(X, y_train)

print('Intercept : ', happiness_model.intercept_)
print('Coefficients : ')
# Pair each feature name with its fitted coefficient.
[(name, weight) for name, weight in zip(feature_cols3, happiness_model.coef_)]

In [None]:

# Predict on the held-out rows and evaluate with R-squared.
Xtest = x_test[feature_cols3]
y_pred = happiness_model.predict(Xtest)
r2 = metrics.r2_score(y_test, y_pred)
print('R square : ', r2)

# Adjusted R-squared: adj = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
# k is the number of predictors and n is the number of observations behind
# `r2`, i.e. the test rows. DataFrame.size must not be used for n because it
# is rows * columns.
k = len(feature_cols3)
n = len(y_test)
adj2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))
print('Adjusted R square : ', adj2)

There is a decrease in adjusted R-squared, which suggests that the added column is irrelevant to Score. Of the three linear models, the second is the best, with a score of about 60% (an R-squared-based score rather than a classification accuracy).

 # Random Forest

As the saying goes, 'there is no free lunch in data science': no single algorithm is best for every problem. To build a prediction model with a different algorithm, we consider one of the popular ensemble methods, Random Forest. For regression problems, scikit-learn provides it as RandomForestRegressor.

In [None]:
# Fit a random forest on all input features of the training split.
# random_state is fixed (matching the seed used for the train/test split) so
# that the forest, and therefore the reported score, is reproducible across runs.
RFmodel = RandomForestRegressor(random_state=0)
RFmodel.fit(x_train, y_train)

In [None]:
# Predict Score for the held-out rows with the random forest and report the
# R-squared of those predictions.
y_predRF = RFmodel.predict(x_test)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_predRF)
print('R square : ', r2)

Here, the R-squared value is better for Random Forest than for any of the previous models. Further, in order to optimise the performance, we go ahead with hyperparameter tuning.

# Hyperparameter Tuning

In [None]:

# Exhaustive 5-fold cross-validated grid search over random-forest settings.
# random_state is fixed so the selected parameters and scores are reproducible.
estimator = RandomForestRegressor(random_state=0)
param_grid = {
    "n_estimators": [50, 100, 200],
    # None means "consider every feature at each split". It replaces the
    # former "auto" option, which had the same meaning for regressors and was
    # removed in scikit-learn 1.3.
    "max_features": [None, "sqrt", "log2"],
    "bootstrap": [True, False],
    "min_samples_split": [2, 4, 8],
}
grid = GridSearchCV(estimator, param_grid, cv=5)
grid.fit(x_train, y_train)
print(f"best parameters: {grid.best_params_}")
# Report the two scores separately: best_score_ is the mean cross-validated
# score of the winning parameter set, while grid.score() evaluates the refit
# best estimator on the held-out test split.
print(f"best cross-validation score: {grid.best_score_}")
print(f"test score: {grid.score(x_test,y_test)}")


So, the tuned model gives the best score, and the happiness score prediction model is finalised as follows:
* Algorithm : Random Forest
* n_estimators : 100
* min_samples_split :8