In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import plotly.express as px
import plotly.graph_objs as go
import statsmodels.api as sm
import math

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
%matplotlib inline

In [None]:
# Read the five yearly World Happiness CSV exports, one DataFrame per year.
df_2015, df_2016, df_2017, df_2018, df_2019 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/world-happiness/2015.csv',
        '../input/world-happiness/2016.csv',
        '../input/world-happiness/2017.csv',
        '../input/world-happiness/2018.csv',
        '../input/world-happiness/2019.csv',
    )
)

# Cleaning and Merging of data from each year:
We have five files, one for each year from 2015 to 2019. Let's look at the columns in every dataset and decide which ones are required for EDA. For each year we will add a column that identifies the year, and eventually merge the data into a final dataset for our analysis.

#### 1. Year 2015

In [None]:
df_2015.head()


In [None]:
df_2015.columns


In [None]:
# Restrict the 2015 data to the columns used in the analysis; the Year column is inserted in the next cell.

columns_2015 = [
    'Happiness Rank', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score',
]
df_2015_temp = df_2015.filter(items=columns_2015)


In [None]:
# Tag every 2015 row with its survey year as the first column.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2015_temp.columns:
    df_2015_temp.insert(0, "Year", 2015)


#### 2. Year 2016


In [None]:
df_2016.head()


In [None]:
df_2016.columns


In [None]:
# Restrict the 2016 data to the columns used in the analysis; the Year column is inserted in the next cell.

columns_2016 = [
    'Happiness Rank', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score',
]
df_2016_temp = df_2016.filter(items=columns_2016)


In [None]:
# Tag every 2016 row with its survey year as the first column.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2016_temp.columns:
    df_2016_temp.insert(0, "Year", 2016)


#### 3. Year 2017

In [None]:
df_2017.head()


In [None]:
df_2017.columns


In [None]:
# Map the dotted 2017 headers onto the column names used by the 2015/2016 files.
columns_2017_map = {
    'Happiness.Rank': 'Happiness Rank',
    'Happiness.Score': 'Happiness Score',
    'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
    'Health..Life.Expectancy.': 'Health (Life Expectancy)',
    'Trust..Government.Corruption.': 'Trust (Government Corruption)',
}
df_2017.rename(columns=columns_2017_map, inplace=True)

In [None]:
# Restrict the 2017 data to the columns used in the analysis (this selection has no Region);
# the Year column is inserted in the next cell.

columns_2017 = [
    'Happiness Rank', 'Country', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score',
]
df_2017_temp = df_2017.filter(items=columns_2017)


In [None]:
# Tag every 2017 row with its survey year as the first column.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2017_temp.columns:
    df_2017_temp.insert(0, "Year", 2017)

#### 4. Year 2018

In [None]:
df_2018.head()


In [None]:
df_2018.columns


In [None]:
# Map the 2018 headers onto the column names used by the 2015/2016 files.
columns_2018_map = {
    'Overall rank': 'Happiness Rank',
    'Country or region': 'Country',
    'Score': 'Happiness Score',
    'Social support': 'Family',
    'GDP per capita': 'Economy (GDP per Capita)',
    'Healthy life expectancy': 'Health (Life Expectancy)',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Trust (Government Corruption)',
}
df_2018.rename(columns=columns_2018_map, inplace=True)

In [None]:
# Restrict the 2018 data to the columns used in the analysis (this selection has no Region);
# the Year column is inserted in the next cell.

columns_2018 = [
    'Happiness Rank', 'Country', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score',
]
df_2018_temp = df_2018.filter(items=columns_2018)


In [None]:
# Tag every 2018 row with its survey year as the first column.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2018_temp.columns:
    df_2018_temp.insert(0, "Year", 2018)


#### 5. Year 2019

In [None]:
df_2019.head()


In [None]:
# Map the 2019 headers onto the column names used by the 2015/2016 files.
columns_2019_map = {
    'Overall rank': 'Happiness Rank',
    'Country or region': 'Country',
    'Score': 'Happiness Score',
    'Social support': 'Family',
    'GDP per capita': 'Economy (GDP per Capita)',
    'Healthy life expectancy': 'Health (Life Expectancy)',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Trust (Government Corruption)',
}
df_2019.rename(columns=columns_2019_map, inplace=True)

In [None]:
# Restrict the 2019 data to the columns used in the analysis (this selection has no Region);
# the Year column is inserted in the next cell.

columns_2019 = [
    'Happiness Rank', 'Country', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score',
]
df_2019_temp = df_2019.filter(items=columns_2019)


In [None]:
# Tag every 2019 row with its survey year as the first column.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2019_temp.columns:
    df_2019_temp.insert(0, "Year", 2019)

In [None]:
df_2016_temp


In [None]:
# Report the (rows, columns) shape of each yearly subset.
for shape_label, yearly_frame in (
    ('Numbers of rows and columns in year 2015 :', df_2015_temp),
    ('Numbers of rows and columns in year 2016 :', df_2016_temp),
    ('Numbers of rows and columns in year 2017 :', df_2017_temp),
    ('Numbers of rows and columns in year 2018 :', df_2018_temp),
    ('Numbers of rows and columns in year 2019 :', df_2019_temp),
):
    print(shape_label, yearly_frame.shape)

#### 6. Merge Data.

In [None]:
# Bring the Region column into the 2017-2019 frames, which lack it, by joining each of them
# to an earlier year's frame on Country. The other reference-year columns that come along
# (suffixed _y) are dropped in the cells that follow.
# NOTE(review): how='inner' discards every country that is absent from the reference year,
# and 2019 is joined against 2015 while 2017/2018 are joined against 2016 -- confirm both are intended.
# NOTE(review): this cell is not idempotent; re-running it merges the already-merged frames again.

df_2017_temp = df_2017_temp.merge(df_2016_temp,left_on = 'Country', right_on = 'Country', how = 'inner')
df_2018_temp = df_2018_temp.merge(df_2016_temp,left_on = 'Country', right_on = 'Country', how = 'inner')
df_2019_temp = df_2019_temp.merge(df_2015_temp,left_on = 'Country', right_on = 'Country', how = 'inner')


The 2017, 2018 and 2019 data do not have a Region column, which we need for EDA. So we take Region from the 2015 and 2016 data by merging the 2017, 2018 and 2019 data with them on the basis of Country.

In [None]:
print(df_2017_temp.columns)
print(df_2018_temp.columns)
print(df_2019_temp.columns)

Because we merged with an inner join on Country, the columns of the reference year (2016 for the 2017 and 2018 data, 2015 for the 2019 data) are joined as well. Since the column names are the same in both frames,
a suffix has been added to all of those columns.
`_x` represents the year being completed (e.g. 2017)
`_y` represents the reference year (e.g. 2016)

### Now, to retain our data, we will drop the `_y` columns and rename the `_x` columns.

In [None]:
# Discard the columns contributed by the reference year (suffix _y); only its Region is kept.
reference_columns_2017 = [
    'Year_y', 'Happiness Rank_y', 'Economy (GDP per Capita)_y',
    'Family_y', 'Health (Life Expectancy)_y', 'Freedom_y',
    'Trust (Government Corruption)_y', 'Generosity_y', 'Happiness Score_y',
]
df_2017_temp.drop(columns=reference_columns_2017, inplace=True)

In [None]:
# Discard the columns contributed by the reference year (suffix _y); only its Region is kept.
# Each label is listed exactly once ('Year_y' used to appear twice), matching the 2017 and 2019 cells.
df_2018_temp.drop(columns = ['Year_y', 'Happiness Rank_y','Economy (GDP per Capita)_y',
       'Family_y', 'Health (Life Expectancy)_y', 'Freedom_y',
       'Trust (Government Corruption)_y', 'Generosity_y', 'Happiness Score_y'], inplace = True)

In [None]:
# Discard the columns contributed by the reference year (suffix _y); only its Region is kept.
reference_columns_2019 = [
    'Year_y', 'Happiness Rank_y', 'Economy (GDP per Capita)_y',
    'Family_y', 'Health (Life Expectancy)_y', 'Freedom_y',
    'Trust (Government Corruption)_y', 'Generosity_y', 'Happiness Score_y',
]
df_2019_temp.drop(columns=reference_columns_2019, inplace=True)

In [None]:
# Strip the _x suffix that the merge added to the 2017 columns.
suffixed_2017 = [
    'Year', 'Happiness Rank', 'Happiness Score', 'Family',
    'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom',
    'Trust (Government Corruption)', 'Generosity',
]
df_2017_temp.rename(columns={f'{base}_x': base for base in suffixed_2017}, inplace=True)

In [None]:
# Strip the _x suffix that the merge added to the 2018 columns.
suffixed_2018 = [
    'Year', 'Happiness Rank', 'Happiness Score', 'Family',
    'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom',
    'Trust (Government Corruption)', 'Generosity',
]
df_2018_temp.rename(columns={f'{base}_x': base for base in suffixed_2018}, inplace=True)

In [None]:
# Strip the _x suffix that the merge added to the 2019 columns.
suffixed_2019 = [
    'Year', 'Happiness Rank', 'Happiness Score', 'Family',
    'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom',
    'Trust (Government Corruption)', 'Generosity',
]
df_2019_temp.rename(columns={f'{base}_x': base for base in suffixed_2019}, inplace=True)

Now rearranging every data frame.

In [None]:
# Rebuild the 2015 frame from the raw data in the common column order.
# filter() skips labels that are not present (the raw frame presumably has no 'Year',
# which is why Year is inserted again in a later cell).
final_column_order = [
    'Year', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score', 'Happiness Rank',
]
df_2015_temp = df_2015.filter(items=final_column_order)

In [None]:
# Rebuild the 2016 frame from the raw data in the common column order.
# filter() skips labels that are not present (the raw frame presumably has no 'Year',
# which is why Year is inserted again in a later cell).
final_column_order = [
    'Year', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score', 'Happiness Rank',
]
df_2016_temp = df_2016.filter(items=final_column_order)

In [None]:
# Put the merged 2017 frame into the common column order.
final_column_order = [
    'Year', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score', 'Happiness Rank',
]
df_2017_temp = df_2017_temp.filter(items=final_column_order)

In [None]:
# Put the merged 2018 frame into the common column order.
final_column_order = [
    'Year', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score', 'Happiness Rank',
]
df_2018_temp = df_2018_temp.filter(items=final_column_order)

In [None]:
# Put the merged 2019 frame into the common column order.
final_column_order = [
    'Year', 'Country', 'Region', 'Economy (GDP per Capita)', 'Family',
    'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity',
    'Happiness Score', 'Happiness Rank',
]
df_2019_temp = df_2019_temp.filter(items=final_column_order)

In [None]:
# Put the Year column back as the first column of the rebuilt 2015 frame.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2015_temp.columns:
    df_2015_temp.insert(0, "Year", 2015)


In [None]:
# Put the Year column back as the first column of the rebuilt 2016 frame.
# Guarded so that re-running this cell does not fail because "Year" already exists.
if "Year" not in df_2016_temp.columns:
    df_2016_temp.insert(0, "Year", 2016)


In [None]:
# Stack the five yearly frames into a single table with a fresh running index.

yearly_frames = [df_2015_temp, df_2016_temp, df_2017_temp, df_2018_temp, df_2019_temp]
df_final = pd.concat(yearly_frames, ignore_index=True, sort=False)

In [None]:
df_final


In [None]:
df_final.columns


In [None]:
df_final.isna().sum()


In [None]:
# Show the rows whose trust score is missing.

missing_trust = df_final['Trust (Government Corruption)'].isna()
df_final[missing_trust]

In [None]:
df_final.info()


In [None]:
# Collect every yearly row for the United Arab Emirates; the mean of its trust scores
# is what the missing value gets replaced with.

is_uae = df_final['Country'] == 'United Arab Emirates'
df_UAE = df_final[is_uae]
df_UAE

In [None]:
df_UAE['Trust (Government Corruption)'].mean()  #find mean 


In [None]:
# Replace each missing trust score with the mean of that same country's trust scores
# from the years where it is present (for the United Arab Emirates gap this is the mean
# computed in the previous cell). Only the trust column is filled: the earlier
# df_final.fillna(<hard-coded number>) wrote that number into every NaN of every column.

trust_col = 'Trust (Government Corruption)'
country_trust_mean = df_final.groupby('Country')[trust_col].transform('mean')  # NaNs are skipped by mean
df_final[trust_col] = df_final[trust_col].fillna(country_trust_mean)

# Exploratory Data Analysis




#### 1. Finding the correlation between the different columns and Happiness Score.

In [None]:
# Correlation heatmap of the numeric columns.
# Text columns (Country, Region) are excluded explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older versions dropped them silently.
plt.figure(figsize=(10,10))
numeric_corr = df_final.select_dtypes(include='number').corr()
corr_mat = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True)
corr_mat.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

From the above correlation, it can be inferred that Generosity and Trust in Government have less impact on individual happiness. GDP and Health programs in the country have the most impact on individual happiness.

#### 2. Pair Plot:

In [None]:
sns.pairplot(df_final)


#### 3. Relationship of Economy (GDP per Capita) and Happiness Score by Region

In [None]:
df_final['Region'].value_counts()


**3.1 GDP per Capita w.r.t. Asia Region**


We are interested in how GDP differs between countries, so we first look at the Southern Asia region only and then combine it with the Southeastern and Eastern Asia regions.

In [None]:
# Rows of the Southern Asia region only

in_southern_asia = df_final['Region'] == 'Southern Asia'
df_region_southasia = df_final[in_southern_asia]

In [None]:
# Grouped bars: GDP per capita by year, one bar colour per Southern Asian country.
fig = px.bar(
    data_frame=df_region_southasia,
    x='Year',
    y='Economy (GDP per Capita)',
    color='Country',
    barmode='group',
    opacity=0.5,
    template='plotly_dark',
    title='Economy per Capita For Southern Asian Countries',
)

fig.show()

Top 5 Asian countries in each year with the highest GDP.
From the graph we can see that Sri Lanka has the highest GDP for all 5 years.



 **Combining Southeastern and Eastern Regions from Asia**


In [None]:

# Rows of the three Asian regions combined.
asian_regions = ['Southeastern Asia', 'Eastern Asia', 'Southern Asia']
df_region_asia = df_final[df_final['Region'].isin(asian_regions)]

df_region_asia.head()

In [None]:
# Animated bar chart (one frame per year) of GDP per capita for the Asian countries,
# with each bar labelled by the country's happiness score.
fig = px.bar(
    data_frame=df_region_asia,
    x='Country',
    y='Economy (GDP per Capita)',
    color='Country',
    text='Happiness Score',
    barmode='group',
    orientation='v',
    animation_frame='Year',
    range_y=[0,2],
    template='plotly_dark',
    title='Economy per Capita For Eastern,Southern Asia and Southern Eastern Asia Countries',
)
bar_widths = [0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4]  # thickness of the bar
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', width=bar_widths)
fig.show()

Here we can see that even though the happiness score is the same for a few countries, their GDP is quite different. So even though Asian countries have the highest GDP, they do not have the highest happiness score.

**3.2 GDP per Capita w.r.t. African Countries.**

1. Different GDP for the countries, we look for Countries in Africa and their GDP per Capita.

In [None]:
# Rows of the Sub-Saharan Africa and the Middle East and Northern Africa regions

african_regions = ['Sub-Saharan Africa', 'Middle East and Northern Africa']
df_region_africa = df_final[df_final['Region'].isin(african_regions)]

df_region_africa.head()




In [None]:
df_final.describe()

In [None]:
df_final[df_final['Generosity']==0]

In [None]:
fig = px.bar(data_frame=df_region_africa,
            x = 'Year',
            y = 'Economy (GDP per Capita)',
            color = 'Country',
            barmode='group',
            orientation= 'v',
            template='plotly_dark', 
            text='Happiness Score',            
           
            labels={'Economy (GDP per Capita)':'Economy',
                    'Year':'Year'},           
            title='Economy per Capita For Sub - African Countries', 
            )



fig.show()

In 2018 the UAE overtook Qatar in GDP, but in 2019 Qatar regained its rank.
From the above graph, Africa does not follow the same trend as Asia: in Africa, happiness score and GDP are highly correlated.

**3.3 Economy (GDP per Capita) w.r.t. European Countries.**

In [None]:
df_region_europe = df_final[df_final['Region'].isin(['Western Europe', 'Central and Eastern Europe'])]


In [None]:
fig = px.bar(data_frame=df_region_europe,
            x = 'Year',
            y = 'Economy (GDP per Capita)',
            color = 'Country',
            text='Happiness Score',            

            opacity=0.5,
            template='plotly_dark',
            title='Economy per Capita For Europian Countries',
             
            barmode='group')
            
fig.show()

Luxembourg has the highest GDP for all 5 years.
Switzerland has the highest happiness score and the 3rd highest GDP.





#### 4. Health (Life Expectancy) and Happiness Score by each Region and Country

From the correlation table, the next most influential index for Happiness Score is the health of the country. Let's find out how the health of each country has changed over the years.

**4.1 Health (Life Expectancy) for Southern Asia Countries for different Years**

In [None]:
# Pie chart of how many rows each region contributes across all years.
region_counts = df_final.Region.value_counts()
go.Figure(data=[go.Pie(labels=region_counts.index, values=region_counts.values)])

In [None]:
 df_region_southasia.shape


In [None]:
fig = px.scatter(df_region_southasia,
                x = 'Health (Life Expectancy)',
                y = 'Happiness Score',
                color = 'Country',
                
                facet_row = 'Year',
                labels ={"Happiness Score":"H Score"},
                template='plotly_dark',

                title = 'Health vs Happiness Score for South Asia'
                )

fig.show()

Every year Sri Lanka and Bangladesh have the highest Health (Life Expectancy),
but their happiness scores range between 4 and 5.

Although Bhutan has the highest happiness score, it has only average Health (Life Expectancy).


**4.2 Life Expectancy vs Happiness for European Countries**

In [None]:
t= df_region_europe[df_region_europe['Region']=='Western Europe']
t.Country.unique()

In [None]:
# Health vs happiness for all European countries, one panel per year.
# Bubbles are sized by GDP per capita: the title announces this encoding and the two
# follow-up European charts use it, but no size column was passed here before.
fig = px.scatter(df_region_europe,
                x = 'Health (Life Expectancy)',
                y = 'Happiness Score',
                color = 'Country',
                size = 'Economy (GDP per Capita)',
                template='plotly_dark',
                facet_col = 'Year',
                facet_col_wrap = 5,
                labels ={"Health (Life Expectancy)":"Health"},
                title = 'Health vs Happiness Score for Europe, with Bubble size indication of GDP',
                )
fig.show()

There is a big difference within Europe: countries in Western Europe have higher Happiness scores than the Central and Eastern European countries.

In [None]:
# Six hand-picked Western European countries for a closer look.
western_europe_sample = ['Switzerland', 'Norway', 'Finland', 'Netherlands', 'Sweden', 'Austria']
df_region_europe1 = df_region_europe[df_region_europe['Country'].isin(western_europe_sample)]

In [None]:
fig = px.scatter(df_region_europe1,
                x = 'Health (Life Expectancy)',
                y = 'Happiness Score',
                color = 'Country',
                size = 'Economy (GDP per Capita)', 
                template='plotly_dark',
                facet_col = 'Year',
                facet_col_wrap = 5,
                labels ={"Health (Life Expectancy)":"Health"},
                title = 'Health vs Happiness Score for Western European Countries, with Bubble size indicating of GDP',
                )
fig.show()

From the above graph, we can see how Western European countries have increased their Health (Life Expectancy) over the years.

Austria had a dip from 2015 to 2017 because one of the factors, Health (Life Expectancy), was dropping; as the country's health improved, its Happiness score also rose.

Finland has the largest gain in Happiness score in Europe, as it constantly improved the country's health. Switzerland has a better health score and GDP than Finland but lags in overall Happiness Score.

Next, let's compare with the Central and Eastern European countries.

In [None]:
df_region_europe2 = df_final[df_final['Region']=='Central and Eastern Europe']

In [None]:
fig = px.scatter(df_region_europe2,
                x = 'Health (Life Expectancy)',
                y = 'Happiness Score',
                color = 'Country',
                size = 'Economy (GDP per Capita)', 
                template='plotly_dark',
                facet_col = 'Year',
                facet_col_wrap = 5,
                labels ={"Health (Life Expectancy)":"Health"},
                title = 'Health vs Happiness Score for Central and Eastern Europe, with Bubble size indicating of GDP',
                )
fig.show()

The Czech Republic has the highest happiness score and an above-average Health (Life Expectancy) that is gradually increasing.

Slovenia has the highest Health (Life Expectancy) for all 5 years, but only above-average GDP and happiness score.

Although Tajikistan has the lowest GDP for all 5 years, its Happiness score and Health (Life Expectancy) show an upward trend.

**4.3 Life Expectancy vs Happiness for North American Countries**

In [None]:
df_final['Happiness Score'].median()


In [None]:
df_region_america = df_final[df_final['Region']=='North America']


In [None]:
fig = px.scatter(df_region_america,
                x = 'Health (Life Expectancy)',
                y = 'Happiness Score',
                color = 'Country',
                template='plotly_dark',
                text = 'Year',
                size = 'Economy (GDP per Capita)',
                title = 'Health vs Happiness Score for America, with Bubble size indication of GDP'
                )

fig.show()

From the scatterplot above, Canada has a better Happiness Score and a better Life Expectancy than the United States. Understandably so, as Canada has free health care, but over the years Canada's Happiness score has gone down.

**4.4 Life Expectancy vs Happiness for selected countries, and how both change over the years.**

Hand-picked countries from all regions. We will try using animation over a scatter plot.



In [None]:
# Hand-picked countries from every region for the animated comparison.
# 'Israel' is listed once here; repeating a value does not change an isin() selection.
custom_countries = [
    'Switzerland', 'Norway', 'Austria', 'New Zealand', 'India', 'Bhutan', 'Israel',
    'Bangladesh', 'Mauritius', 'Nigeria', 'Zambia', 'Czech Republic',
    'Uzbekistan', 'Slovakia', 'Canada', 'United States',
    'Poland', 'Turkmenistan', 'Costa Rica', 'Mexico', 'Brazil',
    'United Arab Emirates', 'Qatar', 'Saudi Arabia', 'Singapore', 'Thailand',
    'Philippines', 'Malaysia', 'South Korea', 'Japan', 'Sri Lanka',
]
df_custom_countries = df_final[df_final['Country'].isin(custom_countries)]


In [None]:
# Animated scatter (one frame per year) of health vs happiness for the hand-picked countries.
# Bubbles are sized by GDP per capita so the chart matches its title; they were previously
# sized by Happiness Score, which only repeated the y axis.
fig = px.scatter(df_custom_countries,
                x = 'Health (Life Expectancy)',
                y = 'Happiness Score',
                color = 'Country',
                size = 'Economy (GDP per Capita)',
                template='plotly_dark',

                title = 'Health vs Happiness Score, with Bubble size indication of GDP',
                animation_frame = 'Year',
                range_x = [-0.05,1.3], # fixed x and y limits so that points stay inside the plot in every frame.
                range_y = [2,9]
                )

# Slow the animation down: time each frame is shown, and duration of the transition between frames (ms).
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1000
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 500

fig.show()

After selecting the US, Canada, Austria and Norway: in 2015 they are clustered closely together. By 2017 they have drifted apart. By 2019 the US not only falls back in Health (Life Expectancy) but also gets a lower Happiness Index score, whereas the other three countries have similar Happiness Index and Health (Life Expectancy) values by 2019.

After selecting Sri Lanka, Bangladesh, Bhutan and India: their happiness scores range from 3 to 5.5 for all 5 years. In 2016 and 2017 all 4 countries decreased a little, but after 2018 they increased gradually.



#### 5. Importance of Family Support in Happiness Index

Another important index for the dataset is Family Support. We will compare with various region and country to check its importance.

**5.1 Effect of Family Support on Happiness Index for countries.**

We look into southern Asia region.



In [None]:
# Southern Asia rows for the year 2019 only.
is_2019 = df_region_southasia['Year'] == 2019
df_region_southasia_2019 = df_region_southasia[is_2019]
df_region_southasia_2019

In [None]:
df_region_southasia_2019.groupby('Country')['Happiness Score'].max().sort_values(ascending=False).reset_index()

In [None]:
fig = px.pie(data_frame=df_region_southasia_2019,
             values='Family',
             names='Country',
             color='Country', 
            hover_name='Happiness Score',                        
             title='2019 Family Support Index for Southasia Country',
             template='plotly_dark',
             width=800,                          
             height=600,                         
             hole=0.5,                          
            )

fig.update_traces(textposition='inside', textinfo='percent+label')

fig.show()

From the above pie chart, we can see that for 2019 Bhutan has one of the larger shares of the Family Support Index. Bhutan is the second happiest country in the region for 2019.

**5.2 Comparing Family Support with Happiness Index for African Countries.**

For next pie chart we shall use African Region



In [None]:
df_region_africa_2019 =  df_region_africa[df_region_africa['Year'] == 2019] 

In [None]:
df_region_africa_2019.groupby('Country')['Happiness Score'].max().sort_values(ascending=False).reset_index()

In [None]:
fig = px.pie(data_frame=df_region_africa_2019,
             values='Family',
             names='Country',
             color='Country',                           
             hover_name='Happiness Score',              
             title='2019 Family Support Index for Sub-Sahara African Countries',     
              template='plotly_dark',            
             width=800,                       
             height=600                         
                                        
            )

fig.show()

From the pie chart above, Israel has the highest share of the Family Support Index and also the highest Happiness Index in the region.

**5.3 Combine North America, Australia and New Zealand for Family Index.**

For next pie chart we shall use North America, Australia and New Zealand Region



In [None]:
df_region_comb = df_final[df_final['Region'].isin(['North America', 'Australia and New Zealand'])]

In [None]:
df_region_comb.groupby('Country')['Happiness Score'].max().sort_values(ascending=False).reset_index()

In [None]:
# Donut chart of the Family index per country for North America, Australia and New Zealand.
# df_region_comb holds all years, so each slice is the country's Family values summed over 2015-2019.
fig = px.pie(data_frame=df_region_comb,
             values='Family',
             names='Country',
             color='Country',

             title='Family Index for America, Australia and New Zealand Countries for Years 2015 - 2019',
             template='plotly_dark',
             width=800,
             height=600,
             hole=0.5,
            )
# Label every slice with its percentage and country name (a stray trailing comma after
# this call, which wrapped the result in a throw-away tuple, has been removed).
fig.update_traces(textposition='inside', textinfo='percent+label')


fig.show()

Of the four countries above, the US has the lowest Family Support Index, and its Happiness Score is also lower than that of the other countries.

**5.4 Effect of Family Support for countries.**

We look into Eastern Asia region

In [None]:
# Eastern Asia rows, then the 2019 slice of them.
in_eastern_asia = df_final['Region'] == 'Eastern Asia'
df_region_eastasia = df_final[in_eastern_asia]
eastasia_is_2019 = df_region_eastasia['Year'] == 2019
df_region_eastasia_2019 = df_region_eastasia[eastasia_is_2019]
df_region_eastasia_2019

In [None]:
fig = px.pie(data_frame=df_region_eastasia_2019,
             values='Family',
             names='Country',
             color='Country',                      
             hover_name='Happiness Score',             
             template='plotly_dark',   
             title='2019 Family Support Index for East Asian Countries wiht Happiness Score.',     
                                                      
             width=800,                          
             height=600,                         
             hole=0.5,                           
            )

fig.update_traces(textposition='outside', textinfo='percent+label')



fig.show()

From the above pie chart, we can see that China has the lowest Family Support Index whereas Taiwan has better Family Support, and the same holds for Taiwan's rank among the East Asian countries.

# Splitting Data

In [None]:
df_final.columns

In [None]:
# Work on an independent copy: the modelling cells below rename the columns and label-encode
# the region on `df`, and with a plain alias those changes would also alter df_final, which
# the EDA cells above read by its original column names.
df = df_final.copy()
df.head()

In [None]:
# Normalise the headers: spaces become underscores and everything is lower-cased.
cleancolumn = [heading.replace(' ', '_').lower() for heading in df.columns]
df.columns = cleancolumn

In [None]:
df.columns

In [None]:
df.region.unique()

In [None]:
# performing label encoding on 'region' columns
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['region'] = labelencoder.fit_transform(df['region'])
df.head(10)



In [None]:
# Correlation heatmap of the numeric columns, now including the label-encoded region.
# The text column `country` is excluded explicitly: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, while older versions dropped them silently.
plt.figure(figsize=(10,10))
numeric_corr = df.select_dtypes(include='number').corr()
corr_mat = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True)
corr_mat.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
# Per-year grouping restricted to year, country and happiness score.
# The column subset must be a list: indexing a GroupBy with a bare tuple of labels
# (['year','country','happiness_score'] written with single brackets) is rejected by pandas 2.x.
happyall_score_top10 = df.groupby(['year'])[['year', 'country', 'happiness_score']]
# head() shows the first five rows of every year group (five, not ten, despite the name).
# NOTE(review): the former `happyall_score_top10.columns = [...]` assignment was removed --
# it only set an attribute on the GroupBy object and did not rename the displayed columns.
happyall_score_top10.head()

In [None]:
# Feature matrix: the six happiness factors plus the encoded region.
feature_columns = [
    'economy_(gdp_per_capita)', 'family', 'region',
    'health_(life_expectancy)', 'freedom', 'trust_(government_corruption)',
    'generosity',
]
X = df[feature_columns]
X.head()

In [None]:
y= df.happiness_score
y.head()

In [None]:
# load the libraries required for data processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
! pip install xgboost
import xgboost as xgb

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)
X_train.shape, X_test.shape, y_train.shape ,y_test.shape

# Model

### Linear Regression

In [None]:
model_lr = LinearRegression()

In [None]:
model_lr.fit(X_train,y_train)
model_lr.score(X_train,y_train)

In [None]:
model_lr.coef_  #value of m i.e slope

In [None]:
model_lr.intercept_  #Value of C

In [None]:
y_pred_test = model_lr.predict(X_test)
y_pred_test[:5]

In [None]:
# Test-set metrics for the full linear regression.
rmse_test_lr = np.sqrt(mean_squared_error(y_test, y_pred_test))  # root mean squared error
print(model_lr.score(X_test,y_test))  # scikit-learn's R2, for comparison with the manual value below
SSE = np.sum((y_test - y_pred_test)**2)  # sum of squared errors
# Total sum of squares around the mean of the evaluated sample (y_test). The training mean was
# used here before, which made this R2 differ from model_lr.score() and from the decision-tree cell.
SST = np.sum((y_test - np.mean(y_test))**2)
r2_test_lr = 1 - SSE/SST  # share of the variance in y_test explained by the model
print("Test RMSE LR : ", rmse_test_lr)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 LR: ", r2_test_lr)

Plotting the best-fit line for the most highly correlated feature vs Happiness Score

In [None]:
# Single-feature regression: happiness score explained by GDP per capita alone.
gdp_feature = ['economy_(gdp_per_capita)']
model_lr1 = LinearRegression()
model_lr1.fit(X_train[gdp_feature], y_train)
print(model_lr1.score(X_train[gdp_feature], y_train))
print(model_lr1.coef_)
print(model_lr1.intercept_)
# Predictions for every row of df (not only the training rows); used to draw the fitted line.
y_pred_train = model_lr1.predict(df[gdp_feature])

In [None]:
# Scatter of GDP per capita vs happiness score with the fitted regression line in red.
plt.scatter(df['economy_(gdp_per_capita)'], df['happiness_score'])
plt.plot(df['economy_(gdp_per_capita)'], y_pred_train, 'r')  # best-fit line
# Upper y limit raised from 7 to 8: the old limit cut off the happiest countries
# (scores presumably reach about 7.8 -- verify with df['happiness_score'].max()).
plt.axis([-1,2.5,2,8])
plt.xlabel('gdp_per_capita')
plt.ylabel('score')
plt.title('Linear Regression on GDP ~ Happiness Score');  # trailing ; hides the Text repr

#### RFE

In [None]:
from sklearn.feature_selection import RFE  #recursive feature elimination

In [None]:
model_rfe = LinearRegression()

In [None]:
rfe = RFE(estimator=model_rfe, step=1,verbose=2)
rfe.fit(X_train, y_train)
rfe.ranking_

In [None]:
selected_rfe_features = pd.DataFrame({'Feature':list(X_train.columns),
                                      'Ranking':rfe.ranking_})
selected_rfe_features.sort_values(by='Ranking')

In [None]:
X_train_rfe = rfe.transform(X_train) 
X_train_rfe.shape

In [None]:
model_rfe.fit(X_train_rfe,y_train)
model_rfe.score(X_train_rfe,y_train)

In [None]:
# Reduce the test matrix with the fitted RFE selector so it has exactly the
# columns (and column order) the model was trained on via rfe.transform(X_train),
# instead of a hard-coded list that may not match what RFE actually selected.
y_pred_rfe = model_rfe.predict(rfe.transform(X_test))

In [None]:
# Hold-out evaluation of the RFE-reduced linear model.
# The test matrix is reduced with the fitted selector (not a hard-coded column
# list) so it always matches the features the model was trained on.
X_test_rfe = rfe.transform(X_test)

rmse_test_rfe = np.sqrt(mean_squared_error(y_test, y_pred_rfe))
print(model_rfe.score(X_test_rfe, y_test))  # R^2 as reported by scikit-learn

# Manual R^2 against a baseline that predicts the training mean.
SSE = np.sum((y_pred_rfe - y_test) ** 2)
SST = np.sum((y_test - np.mean(y_train)) ** 2)
r2_test_rfe = 1 - SSE / SST

# Labels previously said "LR" although these are the RFE model's scores.
print("Test RMSE RFE : ", rmse_test_rfe)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 RFE: ", r2_test_rfe)

### Decision Tree

In [None]:
# Sweep the tree depth and compare train vs. test R^2 to see where the
# decision tree starts to over-fit.
#
# * Depths are plain ints: recent scikit-learn rejects the float values that
#   np.linspace produces for `max_depth`.
# * The estimator is not named `dt`, which would shadow the `datetime as dt`
#   import at the top of the notebook.
# * A fixed random_state keeps the curve reproducible across runs.
max_depths = list(range(1, 16))
train_results = []
test_results = []
for max_depth in max_depths:
    depth_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=10)
    depth_tree.fit(X_train, y_train)
    # r2_score is 1 - SSE/SST with SST taken around the mean of the scored split.
    train_results.append(r2_score(y_train, depth_tree.predict(X_train)))
    test_results.append(r2_score(y_test, depth_tree.predict(X_test)))

plt.plot(max_depths, train_results, 'b', label='Train')
plt.plot(max_depths, test_results, 'r', label='Test')
plt.legend()
plt.ylabel('R2 score')
plt.xlabel('Tree depth')
plt.title('Depth vs R2 Score')
plt.show()

A max_depth of 4 gives the smallest gap between the train and test scores.

If max_depth > 4, the model overfits: the graph shows the gap between the train and test scores widening.

If max_depth < 4, the model gives lower accuracy even on the training set.

In [None]:
model_dt = DecisionTreeRegressor(max_depth=4)

In [None]:
model_dt.fit(X_train,y_train)
model_dt.score(X_train,y_train)

In [None]:
# Hold-out evaluation of the depth-4 decision tree.
# (The former mid-cell `y_pred_test_ds[:5]` was a no-op: only the last
# expression of a cell is displayed, so it has been removed.)
y_pred_test_ds = model_dt.predict(X_test)

rmse_test_dt = np.sqrt(mean_squared_error(y_test, y_pred_test_ds))
print(model_dt.score(X_test, y_test))  # R^2 as reported by scikit-learn

# Manual R^2 against a baseline that predicts the training mean.
SSE = np.sum((y_pred_test_ds - y_test) ** 2)
SST = np.sum((y_test - np.mean(y_train)) ** 2)
r2_test_dt = 1 - SSE / SST

print("Test RMSE DT : ", rmse_test_dt)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 DT: ", r2_test_dt)

In [None]:
# Render the fitted decision tree as an image inside the notebook.
from sklearn.tree import export_graphviz
from IPython.display import Image
# Write the tree structure to a Graphviz .dot file, labelling splits with the
# training column names.
export_graphviz(model_dt,out_file='Regtree.dot',feature_names=X_train.columns)
# Shell escape: needs the Graphviz `dot` executable to be available on PATH.
! dot -Tpng Regtree.dot -o Regtree.png
# Last expression of the cell, so the PNG is displayed as the cell output.
Image("Regtree.png")

### Ensemble Method

#### Bagging

In [None]:
# Bagged ensemble of depth-4 regression trees with out-of-bag scoring.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases (and the old
# name later removed), while the first positional slot works on both.
model_bag = BaggingRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=100,
    oob_score=True,
    random_state=410,
    verbose=2,
)

In [None]:
model_bag.fit(X_train,y_train)

In [None]:
model_bag.score(X_train,y_train)

In [None]:
model_bag.oob_score_ 

In [None]:
y_pred_bag = model_bag.predict(X_test)

In [None]:
# Hold-out evaluation of the bagging ensemble.
print(model_bag.score(X_test, y_test))  # R^2 as reported by scikit-learn

# Root mean squared error of the test predictions.
rmse_test_bag = np.sqrt(mean_squared_error(y_test, y_pred_bag))

# Manual R^2 against a baseline that predicts the training mean.
residuals = y_test - y_pred_bag
baseline_errors = y_test - np.mean(y_train)
SSE = np.sum(residuals ** 2)
SST = np.sum(baseline_errors ** 2)
r2_test_bag = 1 - SSE / SST

print("Test RMSE Bag : ", rmse_test_bag)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 Bag: ", r2_test_bag)

#### Random Forest

In [None]:
# Sweep the maximum tree depth of a random forest and compare train vs. test R^2.
#
# * The loop keeps its scores in the two result lists only. It previously
#   re-assigned `rmse_test_dt` / `r2_test_dt`, silently replacing the decision
#   tree's test metrics that the Results table reads later on.
# * Depths are plain ints: recent scikit-learn rejects the float values that
#   np.linspace produces for `max_depth`.
# * The estimator is not named `dt`, which would shadow the `datetime as dt`
#   import at the top of the notebook.
# * A fixed random_state keeps the curve reproducible across runs.
max_depths = list(range(1, 16))
train_results = []
test_results = []
for max_depth in max_depths:
    depth_forest = RandomForestRegressor(max_depth=max_depth, random_state=10)
    depth_forest.fit(X_train, y_train)
    # r2_score is 1 - SSE/SST with SST taken around the mean of the scored split.
    train_results.append(r2_score(y_train, depth_forest.predict(X_train)))
    test_results.append(r2_score(y_test, depth_forest.predict(X_test)))

plt.plot(max_depths, train_results, 'b', label='Train')
plt.plot(max_depths, test_results, 'r', label='Test')
plt.legend()
plt.ylabel('R2 score')
plt.xlabel('Tree depth')
plt.title('Depth vs R2 Score')
plt.show()

In [None]:
# Training MSE of a random forest as the number of trees grows.
# Each forest is seeded so the curve is reproducible across runs.
list_nb_trees = [10, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 250, 300]

train_results = []
for nb_trees in list_nb_trees:
    rf = RandomForestRegressor(n_estimators=nb_trees, random_state=10)
    rf.fit(X_train, y_train)
    train_results.append(mean_squared_error(y_train, rf.predict(X_train)))

line1, = plt.plot(list_nb_trees, train_results, color="r", label="Training Score")
plt.legend()  # the label above is only shown once a legend is drawn
plt.ylabel('MSE')
plt.xlabel('n_estimators')
plt.show()

In [None]:
# Test MSE of a random forest as the number of trees grows.
# Each forest is seeded so the curve is reproducible across runs.
list_nb_trees = [10, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 250, 300]

test_results = []
for nb_trees in list_nb_trees:
    rf = RandomForestRegressor(n_estimators=nb_trees, random_state=10)
    rf.fit(X_train, y_train)
    test_results.append(mean_squared_error(y_test, rf.predict(X_test)))

line2, = plt.plot(list_nb_trees, test_results, color="g", label="Testing Score")
plt.legend()  # the label above is only shown once a legend is drawn
plt.ylabel('MSE')
plt.xlabel('n_estimators')
plt.show()

In [None]:
model_rf = RandomForestRegressor(n_estimators=150,max_features=4,random_state=10,oob_score=True,verbose=2,max_depth=3)

In [None]:
model_rf.fit(X_train,y_train)

In [None]:
model_rf.score(X_train,y_train) 

In [None]:
model_rf.feature_importances_ 

In [None]:
data = pd.Series(model_rf.feature_importances_,index=X_train.columns)

In [None]:
data.sort_values(ascending=True,inplace=True)

In [None]:
data.plot.barh()

In [None]:
y_pred_rf = model_rf.predict(X_test)

In [None]:
# Hold-out evaluation of the hand-configured random forest.
print(model_rf.score(X_test, y_test))  # R^2 as reported by scikit-learn

# Root mean squared error of the test predictions.
rmse_test_rf1 = np.sqrt(mean_squared_error(y_test, y_pred_rf))

# Manual R^2 against a baseline that predicts the training mean.
residuals = y_test - y_pred_rf
baseline_errors = y_test - np.mean(y_train)
SSE = np.sum(residuals ** 2)
SST = np.sum(baseline_errors ** 2)
r2_test_rf1 = 1 - SSE / SST

print("Test RMSE RF1 : ", rmse_test_rf1)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 RF1: ", r2_test_rf1)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
parameters = {'n_estimators':[100,120,150],'max_depth': [1,2,3,4],'max_features': [1,2,3,4]}

In [None]:
tune_model = GridSearchCV(model_rf, parameters,cv=5) 

In [None]:
tune_model.fit(X_train,y_train)

tune_model.best_params_

In [None]:
tune_model.score(X_train,y_train)

In [None]:
y_pred_cv = tune_model.predict(X_test)

In [None]:
# Hold-out evaluation of the grid-searched random forest.
print(tune_model.score(X_test, y_test))  # score of the refit best estimator

# Root mean squared error of the test predictions.
rmse_test_rf2 = np.sqrt(mean_squared_error(y_test, y_pred_cv))

# Manual R^2 against a baseline that predicts the training mean.
residuals = y_test - y_pred_cv
baseline_errors = y_test - np.mean(y_train)
SSE = np.sum(residuals ** 2)
SST = np.sum(baseline_errors ** 2)
r2_test_rf2 = 1 - SSE / SST

print("Test RMSE RF2 : ", rmse_test_rf2)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 RF2: ", r2_test_rf2)

#### Boosting

In [None]:
# Training MSE of an XGBoost regressor for an increasing number of boosting rounds.
list_nb_trees = [10, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 250, 300]
test_results = []   # reset here; only the training curve is drawn in this cell
train_results = []

for nb_trees in list_nb_trees:
    booster = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=nb_trees)
    booster.fit(X_train, y_train)
    train_mse = mean_squared_error(y_train, booster.predict(X_train))
    train_results.append(train_mse)

(line1,) = plt.plot(list_nb_trees, train_results, color="r", label="Training Score")
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.show()

In [None]:
# Test MSE of an XGBoost regressor for an increasing number of boosting rounds.
list_nb_trees = [10, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 250, 300]
train_results = []  # reset here; only the test curve is drawn in this cell
test_results = []

for nb_trees in list_nb_trees:
    booster = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=nb_trees)
    booster.fit(X_train, y_train)
    test_mse = mean_squared_error(y_test, booster.predict(X_test))
    test_results.append(test_mse)

(line2,) = plt.plot(list_nb_trees, test_results, color="g", label="Testing Score")
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.show()

In [None]:
model_xgb = xgb.XGBRegressor(objective='reg:squarederror',n_estimators=50)

In [None]:
model_xgb.fit(X_train,y_train)
model_xgb.score(X_train,y_train)

In [None]:
y_preds = model_xgb.predict(X_test)

In [None]:
# Hold-out evaluation of the 50-round XGBoost model.
print(model_xgb.score(X_test, y_test))  # R^2 as reported by the estimator

# Root mean squared error of the test predictions.
rmse_test_bost1 = np.sqrt(mean_squared_error(y_test, y_preds))

# Manual R^2 against a baseline that predicts the training mean.
residuals = y_test - y_preds
baseline_errors = y_test - np.mean(y_train)
SSE = np.sum(residuals ** 2)
SST = np.sum(baseline_errors ** 2)
r2_test_bost1 = 1 - SSE / SST

print("Test RMSE Bost1: ", rmse_test_bost1)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 Bost1: ", r2_test_bost1)

In [None]:
param_grid = {'max_depth':np.arange(1,4),'learning_rate':[0.1,0.01,0.001],'n_estimators':[100,120,150]}

In [None]:
tune_model1 = GridSearchCV(model_xgb,param_grid)

In [None]:
tune_model1.fit(X_train,y_train)

In [None]:
tune_model1.best_params_

In [None]:
tune_model1.best_score_

In [None]:
# XGBoost model with fixed hyper-parameters.
# NOTE(review): these values look like they were copied from
# tune_model1.best_params_ — confirm they still match the grid-search output.
model_xgb_cv = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
)

In [None]:
model_xgb_cv.fit(X_train,y_train)

In [None]:
model_xgb_cv.score(X_train,y_train)

In [None]:
y_preds_cv = model_xgb_cv.predict(X_test)

In [None]:
# Hold-out evaluation of the XGBoost model built with the fixed hyper-parameters.
print(model_xgb_cv.score(X_test, y_test))  # R^2 as reported by the estimator

# Root mean squared error of the test predictions.
rmse_test_bost2 = np.sqrt(mean_squared_error(y_test, y_preds_cv))

# Manual R^2 against a baseline that predicts the training mean.
residuals = y_test - y_preds_cv
baseline_errors = y_test - np.mean(y_train)
SSE = np.sum(residuals ** 2)
SST = np.sum(baseline_errors ** 2)
r2_test_bost2 = 1 - SSE / SST

print("Test RMSE Bost2: ", rmse_test_bost2)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 Bost2: ", r2_test_bost2)

### KNN

In [None]:
# Scale the features to [0, 1] for the distance-based KNN model.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-feature min/max from the training split only ...
X_train_scaled = scaler.fit_transform(X_train)

# ... and re-use those same parameters for the test split. Calling
# fit_transform here would refit the scaler on the test data, putting train and
# test on different scales and leaking test statistics into the evaluation.
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
model_knn = KNeighborsRegressor()   # n_neighbors:  (default = 5)

In [None]:
model_knn.fit(X_train_scaled,y_train)
model_knn.score(X_train_scaled,y_train)

In [None]:
# Hold-out evaluation of the default (k=5) KNN regressor on the scaled features.
y_pred_test = model_knn.predict(X_test_scaled)
rmse_test_knn = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(model_knn.score(X_test_scaled, y_test))  # R^2 as reported by scikit-learn

# Manual R^2 against a baseline that predicts the training mean.
SSE = np.sum((y_pred_test - y_test) ** 2)
SST = np.sum((y_test - np.mean(y_train)) ** 2)
r2_test_knn = 1 - SSE / SST

# Labels previously said "LR" although these are the KNN model's scores.
print("Test RMSE KNN : ", rmse_test_knn)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 KNN: ", r2_test_knn)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Test RMSE of a KNN regressor for every neighbourhood size k = 1..20.
rmse_val = []  # rmse_val[i] belongs to k = i + 1
for K in range(1, 21):
    model = KNeighborsRegressor(n_neighbors=K).fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)             # predictions on the test set
    error = sqrt(mean_squared_error(y_test, pred))  # RMSE for this k
    rmse_val.append(error)
    print('RMSE value for k= ' , K , 'is:', error)

In [None]:
# Elbow curve: test RMSE against the number of neighbours k.
# rmse_val[i] was computed for k = i + 1, so the index is set explicitly to
# start at 1; with the default 0-based index every point was drawn at k - 1.
curve = pd.DataFrame(
    {'RMSE': rmse_val},
    index=pd.RangeIndex(1, len(rmse_val) + 1, name='n_neighbors'),
)
curve.plot()


In [None]:
#hypertuning 
# Use cross validation for selecting optimum value of 'k'
parameter = {'n_neighbors':[1,2,3,4]}

In [None]:
model_knn_tune = GridSearchCV(model_knn, parameter, cv=5)

In [None]:
model_knn_tune.fit(X_train_scaled,y_train)
model_knn_tune.score(X_train_scaled,y_train)

In [None]:
model_knn_tune.best_params_

In [None]:
# Hold-out evaluation of the grid-searched KNN regressor on the scaled features.
y_pred_test = model_knn_tune.predict(X_test_scaled)
rmse_test_knn_tune = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(model_knn_tune.score(X_test_scaled, y_test))  # score of the refit best estimator

# Manual R^2 against a baseline that predicts the training mean.
SSE = np.sum((y_pred_test - y_test) ** 2)
SST = np.sum((y_test - np.mean(y_train)) ** 2)
r2_test_knn_tune = 1 - SSE / SST

# Labels previously said "LR" although these are the tuned KNN model's scores.
print("Test RMSE KNN tuned : ", rmse_test_knn_tune)
print("Test SSE : ", SSE)
print("Test SST : ", SST)
print("Test R2 KNN tuned: ", r2_test_knn_tune)

### Results

In [None]:
# Parallel lists, one entry per fitted model and all in the same order.
MODEL_NAME = [
    'model_lr',
    'model_rfe',
    'model_dt',
    'model_bag',
    'model_rf',
    'tune_model',
    'model_xgb',
    'model_xgb_cv',
    'model_knn',
    'model_knn_tune',
]

# Test-set RMSE collected in the evaluation cells above.
RMSE = [
    rmse_test_lr,
    rmse_test_rfe,
    rmse_test_dt,
    rmse_test_bag,
    rmse_test_rf1,
    rmse_test_rf2,
    rmse_test_bost1,
    rmse_test_bost2,
    rmse_test_knn,
    rmse_test_knn_tune,
]

# Manually computed test-set R^2 collected in the evaluation cells above.
R_square = [
    r2_test_lr,
    r2_test_rfe,
    r2_test_dt,
    r2_test_bag,
    r2_test_rf1,
    r2_test_rf2,
    r2_test_bost1,
    r2_test_bost2,
    r2_test_knn,
    r2_test_knn_tune,
]

# Training-set score of each model, recomputed here on the matrix it was fitted on.
Score = [
    model_lr.score(X_train, y_train),
    model_rfe.score(X_train_rfe, y_train),
    model_dt.score(X_train, y_train),
    model_bag.score(X_train, y_train),
    model_rf.score(X_train, y_train),
    tune_model.score(X_train, y_train),
    model_xgb.score(X_train, y_train),
    model_xgb_cv.score(X_train, y_train),
    model_knn.score(X_train_scaled, y_train),
    model_knn_tune.score(X_train_scaled, y_train),
]

In [None]:
list_of_tuples = list(zip(MODEL_NAME, RMSE,R_square,Score)) 
list_of_tuples

In [None]:
Results = pd.DataFrame(list_of_tuples,
                  columns = ['MODEL_NAME', 'RMSE','R_Square_Test','Score']
                  )

In [None]:
Results['Difference'] = Results['Score'] - Results['R_Square_Test']
Results

# Model Deployment 

In [None]:
X_test.head()

In [None]:
X_test.columns

In [None]:
y_test.head()

In [None]:
import pickle

# Persist the fitted decision tree. The context manager guarantees the file is
# flushed and closed; the bare open(...) used before never closed its handle.
with open('model_final.pkl', 'wb') as model_file:
    pickle.dump(model_dt, model_file)

In [None]:
# Reload the persisted model and score one hand-typed sample (row 1).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('model_final.pkl', 'rb') as model_file:  # closes the handle afterwards
    model = pickle.load(model_file)    # row 1
# Wrap the sample in a DataFrame carrying the training column names so the
# estimator, which was fitted on a DataFrame, receives matching feature names.
sample = pd.DataFrame([[0.06940,0.77265,8,0.29707,0.47692,0.15639,0.19387]], columns=X_test.columns)
print(model.predict(sample))

In [None]:
# Reload the persisted model and score one hand-typed sample (row 2).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('model_final.pkl', 'rb') as model_file:  # closes the handle afterwards
    model = pickle.load(model_file)    # row 2
# Wrap the sample in a DataFrame carrying the training column names so the
# estimator, which was fitted on a DataFrame, receives matching feature names.
sample = pd.DataFrame([[0.26200,0.90800,8,0.40200,0.22100,0.04900,0.15500]], columns=X_test.columns)
print(model.predict(sample))

In [None]:
# Reload the persisted model and score one hand-typed sample (row 3).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('model_final.pkl', 'rb') as model_file:  # closes the handle afterwards
    model = pickle.load(model_file)    # row 3
# Wrap the sample in a DataFrame carrying the training column names so the
# estimator, which was fitted on a DataFrame, receives matching feature names.
sample = pd.DataFrame([[1.13062,1.04993,1,0.63104,0.29091,0.17457,0.13942]], columns=X_test.columns)
print(model.predict(sample))

In [None]:
# Reload the persisted model and score one hand-typed sample (row 4).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('model_final.pkl', 'rb') as model_file:  # closes the handle afterwards
    model = pickle.load(model_file)    # row 4
# Wrap the sample in a DataFrame carrying the training column names so the
# estimator, which was fitted on a DataFrame, receives matching feature names.
sample = pd.DataFrame([[1.05266,0.83309,4,0.61804,0.21006,0.16157,0.07044]], columns=X_test.columns)
print(model.predict(sample))

In [None]:
# Reload the persisted model and score one hand-typed sample (row 5).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('model_final.pkl', 'rb') as model_file:  # closes the handle afterwards
    model = pickle.load(model_file)   # row 5
# Wrap the sample in a DataFrame carrying the training column names so the
# estimator, which was fitted on a DataFrame, receives matching feature names.
sample = pd.DataFrame([[1.27000,1.52500,9,0.88400,0.64500,0.14200,0.37600]], columns=X_test.columns)
print(model.predict(sample))