In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as stats
from statsmodels.formula.api import ols
import sklearn
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
happiness =  pd.read_csv('../happiness_score_dataset.csv')
happiness.head(5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [3]:
happiness = pd.read_csv('../happiness_score_dataset.csv')
happiness.drop(['Region','Standard Error'],axis=1,inplace=True) 
happiness.columns = ['Country','Rank','Score','GDP',
                'Support','Health',
                'Freedom','Corruption','Generosity', 'Dystopia Residual']
happiness.head()

Unnamed: 0,Country,Rank,Score,GDP,Support,Health,Freedom,Corruption,Generosity,Dystopia Residual
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [4]:
happiness.describe()

Unnamed: 0,Rank,Score,GDP,Support,Health,Freedom,Corruption,Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [5]:
drop_rank = happiness.drop("Rank", axis = 1)

In [6]:
corr_matrix_happy = drop_rank.corr()
trace_corr_happy = go.Heatmap(z=np.array(corr_matrix_happy), x=corr_matrix_happy.columns, y=corr_matrix_happy.columns)
data_happy=[trace_corr_happy]
iplot(data_happy)

In [7]:
dropped_happy = happiness.drop(["Country", "Rank"], axis=1)
dropped_happy.head()

Unnamed: 0,Score,GDP,Support,Health,Freedom,Corruption,Generosity,Dystopia Residual
0,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [8]:
from sklearn.linear_model import LinearRegression
X = dropped_happy.drop("Score", axis = 1)
lm = LinearRegression()
lm.fit(X, dropped_happy.Score)

LinearRegression()

In [9]:
print("Estimated Intercept is", lm.intercept_)
print("The number of coefficients in this model are", lm.coef_)

Estimated Intercept is 6.404859145714425e-05
The number of coefficients in this model are [1.0001014  0.99997035 0.99988261 0.99969531 0.99991914 1.00006126
 1.00003038]


In [10]:
coef = zip(X.columns, lm.coef_)
coef_df = pd.DataFrame(list(zip(X.columns, lm.coef_)), columns=['features', 'coefficients'])
coef_df

Unnamed: 0,features,coefficients
0,GDP,1.000101
1,Support,0.99997
2,Health,0.999883
3,Freedom,0.999695
4,Corruption,0.999919
5,Generosity,1.000061
6,Dystopia Residual,1.00003


In [11]:
lm.predict(X)[0:100]

array([7.58687306, 7.56086907, 7.5269951 , 7.5221512 , 7.42687605,
       7.40600634, 7.37809683, 7.36361171, 7.28599343, 7.2839794 ,
       7.27769703, 7.22570377, 7.19980857, 7.18729884, 7.11949892,
       6.98256953, 6.94617741, 6.94021052, 6.93733396, 6.90131819,
       6.86719164, 6.85292965, 6.81021897, 6.797729  , 6.78623169,
       6.75011207, 6.66967905, 6.61123979, 6.57503565, 6.57405074,
       6.50507315, 6.48510158, 6.47677038, 6.45467149, 6.41114045,
       6.32891816, 6.30240061, 6.29814877, 6.29475912, 6.26925717,
       6.16769691, 6.13015796, 6.12277968, 6.00271476, 5.99509138,
       5.98705124, 5.98374981, 5.97522081, 5.95964459, 5.94818659,
       5.88973714, 5.88900759, 5.87831141, 5.85520035, 5.84765004,
       5.83265734, 5.82827612, 5.82411745, 5.8125162 , 5.79059048,
       5.77040889, 5.75860389, 5.75439185, 5.71585921, 5.70885935,
       5.69529008, 5.68873893, 5.60463238, 5.58863927, 5.54813882,
       5.47703345, 5.47377669, 5.42855813, 5.3987171 , 5.36026

In [12]:
trace = go.Scatter(
    x = lm.predict(X),
    y = dropped_happy.Score,
    mode = 'lines+markers'
)
data = [trace]
layout = go.Layout(
    title='Happiness Score vs. Predicted Happiness Score',
    xaxis=dict(
        title='Happiness Score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Predicted Happiness Score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [13]:
msehappy = np.mean((dropped_happy.Score - lm.predict(X)) ** 2 ) 
print(msehappy)

7.554900759571514e-08


In [14]:
lm2=LinearRegression()
lm2.fit(X[['Support']], dropped_happy.Score)

LinearRegression()

In [15]:
msefamily = np.mean((dropped_happy.Score - lm2.predict(X[['Support']])) **2)
print(msefamily)

0.5881969569191737


In [16]:
print("Based on the above analysis, the Multiple Linear Regression Model is")
print("HappinessScore = 0.0001289+ (1.000041 * economy) + (1.000005*family) + (0.999869*health) + (0.999912*freedom)+ (1.000020*trust) + (1.000006*generosity) + (0.999972*DystopiaResidual)")

Based on the above analysis, the Multiple Linear Regression Model is
HappinessScore = 0.0001289+ (1.000041 * economy) + (1.000005*family) + (0.999869*health) + (0.999912*freedom)+ (1.000020*trust) + (1.000006*generosity) + (0.999972*DystopiaResidual)
