In [1]:
# Import Main Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import statsmodels.formula.api as stats
from statsmodels.formula.api import ols
import sklearn
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error
import chart_studio.plotly as py 
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
sourcedata=pd.read_csv('https://raw.githubusercontent.com/raju-kpr2/DT_Practice_Project_2/master/happiness_score_dataset.csv.txt')
sourcedata

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.03880,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
...,...,...,...,...,...,...,...,...,...,...,...,...
153,Rwanda,Sub-Saharan Africa,154,3.465,0.03464,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628,0.67042
154,Benin,Sub-Saharan Africa,155,3.340,0.03656,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260,1.63328
155,Syria,Middle East and Northern Africa,156,3.006,0.05015,0.66320,0.47489,0.72193,0.15684,0.18906,0.47179,0.32858
156,Burundi,Sub-Saharan Africa,157,2.905,0.08658,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727,1.83302


# Predicting World Happiness

In [3]:
# find out the null values 
sourcedata.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [4]:

#  IQR score technique to remove Outliers 
df=pd.DataFrame(data=sourcedata)
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
dfIQR = df[~((df < (Q1-1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
dfIQR.shape

Happiness Rank                   78.500000
Happiness Score                   1.717750
Standard Error                    0.015032
Economy (GDP per Capita)          0.612640
Family                            0.357582
Health (Life Expectancy)          0.371828
Freedom                           0.220762
Trust (Government Corruption)     0.118580
Generosity                        0.159330
Dystopia Residual                 0.703005
dtype: float64


(126, 12)

In [5]:
dfIQR

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
6,Netherlands,Western Europe,7,7.378,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.47610,2.46570
9,Australia,Australia and New Zealand,10,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646
10,Israel,Middle East and Northern Africa,11,7.278,0.03470,1.22857,1.22393,0.91387,0.41319,0.07785,0.33172,3.08854
...,...,...,...,...,...,...,...,...,...,...,...,...
148,Chad,Sub-Saharan Africa,149,3.667,0.03830,0.34193,0.76062,0.15010,0.23501,0.05269,0.18386,1.94296
149,Guinea,Sub-Saharan Africa,150,3.656,0.03590,0.17417,0.46475,0.24009,0.37725,0.12139,0.28657,1.99172
150,Ivory Coast,Sub-Saharan Africa,151,3.655,0.05141,0.46534,0.77115,0.15185,0.46866,0.17922,0.20165,1.41723
151,Burkina Faso,Sub-Saharan Africa,152,3.587,0.04324,0.25812,0.85188,0.27125,0.39493,0.12832,0.21747,1.46494


In [6]:
dfIQR.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,80.706349,5.343857,0.045018,0.846701,1.006476,0.635748,0.413728,0.11577,0.220932,2.104521
std,42.533952,1.025715,0.011101,0.365123,0.240096,0.232259,0.146472,0.082813,0.115686,0.499865
min,2.0,3.34,0.01848,0.0,0.35386,0.0,0.0,0.0,0.0,0.89991
25%,46.25,4.57625,0.037583,0.593558,0.865637,0.514817,0.319635,0.05845,0.130265,1.75941
50%,80.5,5.203,0.04394,0.910245,1.028915,0.695745,0.41953,0.09077,0.207305,2.053755
75%,116.5,5.98625,0.050585,1.134618,1.214405,0.790795,0.530975,0.153798,0.281142,2.448193
max,155.0,7.561,0.07446,1.55422,1.40223,0.99111,0.66246,0.35637,0.51912,3.26001


In [7]:
#  We can clearly see that countries in the European, and Americas region have a fairly high ranking than ones in the Asian and African regions.
happiness=dfIQR.copy()
data6 = dict(type = 'choropleth', 
           locations = happiness['Country'],
           locationmode = 'country names',
           z = happiness['Happiness Rank'], 
           text = happiness['Country'],
          colorscale = 'Viridis', reversescale = False)
layout = dict(title = 'Happiness Rank Across the World', 
             geo = dict(showframe = False, 
                       projection = {'type': 'mercator'}))
choromap6 = go.Figure(data = [data6], layout=layout)
iplot(choromap6)

In [8]:
data2 = dict(type = 'choropleth', 
           locations = happiness['Country'],
           locationmode = 'country names',
           z = happiness['Happiness Score'], 
           text = happiness['Country'],
           colorbar = {'title':'Happiness'})
layout = dict(title = 'Happiness Score Across the World', 
             geo = dict(showframe = False, 
                       projection = {'type': 'mercator'}))
choromap3 = go.Figure(data = [data2], layout=layout)
iplot(choromap3)

In [9]:
trace4 = go.Scatter(
    x = happiness['Happiness Score'],
    y = happiness['Happiness Rank'],
    mode = 'markers'
)
data4 = [trace4]
layout = go.Layout(
    title='Happiness Rank Determined by Score',
    xaxis=dict(
        title='Happiness Score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Happiness Rank',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

fig4 = go.Figure(data=data4, layout=layout)
iplot(fig4)

In [10]:
drop_rank = happiness.drop("Happiness Rank", axis = 1)
corr_matrix_happy = drop_rank.corr()
trace_corr_happy = go.Heatmap(z=np.array(corr_matrix_happy), x=corr_matrix_happy.columns, y=corr_matrix_happy.columns)
data_happy=[trace_corr_happy]
iplot(data_happy)

In [11]:
drop_rank

Unnamed: 0,Country,Region,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
1,Iceland,Western Europe,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
4,Canada,North America,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
6,Netherlands,Western Europe,7.378,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.47610,2.46570
9,Australia,Australia and New Zealand,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646
10,Israel,Middle East and Northern Africa,7.278,0.03470,1.22857,1.22393,0.91387,0.41319,0.07785,0.33172,3.08854
...,...,...,...,...,...,...,...,...,...,...,...
148,Chad,Sub-Saharan Africa,3.667,0.03830,0.34193,0.76062,0.15010,0.23501,0.05269,0.18386,1.94296
149,Guinea,Sub-Saharan Africa,3.656,0.03590,0.17417,0.46475,0.24009,0.37725,0.12139,0.28657,1.99172
150,Ivory Coast,Sub-Saharan Africa,3.655,0.05141,0.46534,0.77115,0.15185,0.46866,0.17922,0.20165,1.41723
151,Burkina Faso,Sub-Saharan Africa,3.587,0.04324,0.25812,0.85188,0.27125,0.39493,0.12832,0.21747,1.46494


In [12]:
x = happiness
x = happiness.drop(["Region","Country", "Happiness Rank","Happiness Score"], axis=1)
x.head()

Unnamed: 0,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
1,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
4,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
6,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.4761,2.4657
9,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646
10,0.0347,1.22857,1.22393,0.91387,0.41319,0.07785,0.33172,3.08854


In [26]:
y = happiness
y = happiness["Happiness Score"]
y=np.array(y).reshape(-1,1)


In [27]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=.22,random_state=42)
# default random random_state=42


In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_curve,auc
lm = LinearRegression()
lm.fit(x,y)
pre=lm.predict(test_x)


In [29]:
print("Estimated Intercept is", lm.intercept_)

Estimated Intercept is [-0.00011285]


In [30]:
print("The number of coefficients in this model are", lm.coef_)

The number of coefficients in this model are [[0.00111643 1.00010283 1.0000421  0.99987519 0.99967501 0.99991414
  0.9999704  1.00006883]]


In [32]:
msehappy = np.mean((y - lm.predict(x)) ** 2 ) 
print(msehappy)

7.609408962970158e-08


In [35]:
lm2=LinearRegression()
lm2.fit(x[['Family']], y)
msefamily = np.mean((y - lm2.predict(x[['Family']])) **2)
print(msefamily)

0.5371005463860277
