In [1]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from sklearn import datasets
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
! ls Data/

2015.csv  2016.csv  2017.csv


In [3]:
df1 = pd.read_csv("Data/2015.csv")
df2 = pd.read_csv("Data/2016.csv")
df3 = pd.read_csv("Data/2017.csv")

In [4]:
df1["Year"] = 2015
df2["Year"] = 2016
df3["Year"] = 2017

In [5]:
df3 = df3.rename(columns={'Happiness.Rank':'Happiness Rank','Happiness.Score':'Happiness Score','Economy..GDP.per.Capita.':'Economy (GDP per Capita)','Health..Life.Expectancy.':'Health (Life Expectancy)','Trust..Government.Corruption.':'Trust (Government Corruption)','Dystopia.Residual':'Dystopia Residual'})

In [6]:
df1 = df1.drop(['Standard Error'],axis=1)
df2 = df2.drop(['Lower Confidence Interval','Upper Confidence Interval'],axis=1)
df3 = df3.drop(['Whisker.high','Whisker.low'],axis=1)

In [7]:
df = pd.concat([df1, df2, df3])

In [8]:
df.head()

Unnamed: 0,Country,Dystopia Residual,Economy (GDP per Capita),Family,Freedom,Generosity,Happiness Rank,Happiness Score,Health (Life Expectancy),Region,Trust (Government Corruption),Year
0,Switzerland,2.51738,1.39651,1.34951,0.66557,0.29678,1,7.587,0.94143,Western Europe,0.41978,2015
1,Iceland,2.70201,1.30232,1.40223,0.62877,0.4363,2,7.561,0.94784,Western Europe,0.14145,2015
2,Denmark,2.49204,1.32548,1.36058,0.64938,0.34139,3,7.527,0.87464,Western Europe,0.48357,2015
3,Norway,2.46531,1.459,1.33095,0.66973,0.34699,4,7.522,0.88521,Western Europe,0.36503,2015
4,Canada,2.45176,1.32629,1.32261,0.63297,0.45811,5,7.427,0.90563,North America,0.32957,2015


In [9]:
# Basic Scatter Plot between Happiness Score and Economy: ( Positive correlation )

trace1 = go.Scatter(
    x = df["Happiness Score"],
    y = df["Economy (GDP per Capita)"],
    name = "Economy (GDP per Capita)",
    mode = 'markers'
)

trace2 = go.Scatter(
    x = df["Happiness Score"],
    y = df["Trust (Government Corruption)"],
    mode = 'markers',
    name = "Trust (Government Corruption)",
    yaxis = "y2"
)

layout = go.Layout(
    title= "Correlation of Economy in a Country with Happiness Score",
    xaxis= dict(
        title= "Happiness Score",
    ),
    yaxis=dict(
        title= "Economy (GDP per Capita)",
    ),
    yaxis2=dict(
        title="Trust (Government Corruption)",
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)

data = [trace1, trace2]
figure = go.Figure(data = data, layout= layout)
iplot(figure)

## Using Univariate Feature Selection:

In [13]:
features = list(df.columns)
features.remove("Happiness Score")
features.remove("Country")
features.remove("Region")
features.remove("Happiness Rank")

In [14]:
X = df[features]
y = df["Happiness Score"]

In [15]:
y = y.reshape(y.shape[0],1)


reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



In [16]:
X.shape,y.shape

((470, 8), (470, 1))

In [17]:
X_new = SelectKBest(f_regression, k=3).fit_transform(X, y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [18]:
X_new.shape

(470, 3)

In [19]:
X_new[0:10]

array([[ 1.39651,  1.34951,  0.94143],
       [ 1.30232,  1.40223,  0.94784],
       [ 1.32548,  1.36058,  0.87464],
       [ 1.459  ,  1.33095,  0.88521],
       [ 1.32629,  1.32261,  0.90563],
       [ 1.29025,  1.31826,  0.88911],
       [ 1.32944,  1.28017,  0.89284],
       [ 1.33171,  1.28907,  0.91087],
       [ 1.25018,  1.31967,  0.90837],
       [ 1.33358,  1.30923,  0.93156]])

In [20]:
X.head()

Unnamed: 0,Dystopia Residual,Economy (GDP per Capita),Family,Freedom,Generosity,Health (Life Expectancy),Trust (Government Corruption),Year
0,2.51738,1.39651,1.34951,0.66557,0.29678,0.94143,0.41978,2015
1,2.70201,1.30232,1.40223,0.62877,0.4363,0.94784,0.14145,2015
2,2.49204,1.32548,1.36058,0.64938,0.34139,0.87464,0.48357,2015
3,2.46531,1.459,1.33095,0.66973,0.34699,0.88521,0.36503,2015
4,2.45176,1.32629,1.32261,0.63297,0.45811,0.90563,0.32957,2015


## Thus, the important 3 features are Economy, Family and Health.

In [22]:
# Basic Scatter Plot between Happiness Score and Economy: ( Positive correlation )

trace1 = go.Scatter(
    x = df["Happiness Score"],
    y = df["Economy (GDP per Capita)"],
    name = "Economy (GDP per Capita)",
    mode = 'markers'
)

layout = go.Layout(
    title= "Correlation of Economy in a Country with Happiness Score",
    xaxis= dict(
        title= "Happiness Score",
    ),
    yaxis=dict(
        title= "Economy (GDP per Capita)",
    )
)

data = [trace1]
figure = go.Figure(data = data, layout= layout)
iplot(figure)

In [23]:
# Basic Scatter Plot between Happiness Score and Health: ( Positive correlation )

trace1 = go.Scatter(
    x = df["Happiness Score"],
    y = df["Health (Life Expectancy)"],
    name = "Health (Life Expectancy)",
    mode = 'markers'
)

layout = go.Layout(
    title= "Correlation of Health with Happiness Score",
    xaxis= dict(
        title= "Happiness Score",
    ),
    yaxis=dict(
        title= "Health (Life Expectancy)",
    )
)

data = [trace1]
figure = go.Figure(data = data, layout= layout)
iplot(figure)

In [24]:
# Basic Scatter Plot between Happiness Score and Family: ( Positive correlation )

trace1 = go.Scatter(
    x = df["Happiness Score"],
    y = df["Family"],
    name = "Family",
    mode = 'markers'
)

layout = go.Layout(
    title= "Correlation of Family with Happiness Score",
    xaxis= dict(
        title= "Happiness Score",
    ),
    yaxis=dict(
        title= "Family",
    )
)

data = [trace1]
figure = go.Figure(data = data, layout= layout)
iplot(figure)

## Now, lets look for specific countries which increased its Happiness Score:

In [25]:
CountryList = df.Country.unique()
df_temp2 = pd.DataFrame(columns=["Country","Increase"])

In [26]:
df_temp2_index = 0
err_cnt = 0
for country in CountryList:
    df_temp3 = list(df[df["Country"]==country]["Happiness Score"])
    try:
        inc_score = df_temp3[2] - df_temp3[0]
        df_temp2.loc[df_temp2_index] = [country, inc_score]
        df_temp2_index = df_temp2_index + 1
    except Exception:
        err_cnt = err_cnt + 1

In [27]:
df_temp2.sort_values(by="Increase", ascending=False)

Unnamed: 0,Country,Increase
84,Latvia,0.752
81,Romania,0.701
145,Togo,0.656
130,Senegal,0.631
131,Gabon,0.569
124,Egypt,0.541
138,Ivory Coast,0.525
94,Hungary,0.524
123,Bulgaria,0.496
143,Syria,0.456


In [28]:
CountryGrouped = df.groupby("Country")

In [29]:
df_temp = CountryGrouped.get_group("Yemen")

In [30]:
df_temp

Unnamed: 0,Country,Dystopia Residual,Economy (GDP per Capita),Family,Freedom,Generosity,Happiness Rank,Happiness Score,Health (Life Expectancy),Region,Trust (Government Corruption),Year
135,Yemen,1.92313,0.54649,0.68093,0.35571,0.09131,136,4.077,0.40064,Middle East and Northern Africa,0.07854,2015
146,Yemen,1.97295,0.57939,0.47493,0.2287,0.09821,147,3.724,0.31048,Middle East and Northern Africa,0.05892,2016
145,Yemen,1.345601,0.591683,0.935382,0.249464,0.104125,146,3.593,0.310081,,0.056767,2017


## Region Based Happiness Score:

In [31]:
df.head()

Unnamed: 0,Country,Dystopia Residual,Economy (GDP per Capita),Family,Freedom,Generosity,Happiness Rank,Happiness Score,Health (Life Expectancy),Region,Trust (Government Corruption),Year
0,Switzerland,2.51738,1.39651,1.34951,0.66557,0.29678,1,7.587,0.94143,Western Europe,0.41978,2015
1,Iceland,2.70201,1.30232,1.40223,0.62877,0.4363,2,7.561,0.94784,Western Europe,0.14145,2015
2,Denmark,2.49204,1.32548,1.36058,0.64938,0.34139,3,7.527,0.87464,Western Europe,0.48357,2015
3,Norway,2.46531,1.459,1.33095,0.66973,0.34699,4,7.522,0.88521,Western Europe,0.36503,2015
4,Canada,2.45176,1.32629,1.32261,0.63297,0.45811,5,7.427,0.90563,North America,0.32957,2015


In [32]:
RegionGrouped = df.groupby("Region")

In [33]:
RegionGrouped = RegionGrouped.agg(np.mean)

In [35]:
## Bar plot for average Happiness Score

trace1 = go.Bar(
        x= RegionGrouped.index.values,
        y= RegionGrouped["Happiness Score"],
        name = "Average Happiness Score"
)

trace2 = go.Bar(
        x= RegionGrouped.index.values,
        y= RegionGrouped["Health (Life Expectancy)"],
        name = "Average Health Value"
)

trace3 = go.Bar(
        x= RegionGrouped.index.values,
        y= RegionGrouped["Economy (GDP per Capita)"],
        name = "Average Economy Value"
)


data = [trace1, trace2, trace3]

layout = go.Layout(
    title = "Happiness Score across Regions",
    xaxis = dict(
        title = "Region",
    ),
    yaxis = dict(
        title = "Value",
    ),
    barmode = "group"
)

figure = go.Figure(data=data,layout=layout)
iplot(figure)

In [36]:
RegionGrouped.index.values

array(['Australia and New Zealand', 'Central and Eastern Europe',
       'Eastern Asia', 'Latin America and Caribbean',
       'Middle East and Northern Africa', 'North America',
       'Southeastern Asia', 'Southern Asia', 'Sub-Saharan Africa',
       'Western Europe'], dtype=object)