In [1]:
# Package imports

# Basics
import numpy as np
import pandas as pd

# Visualization
from plotnine import *
import matplotlib.pyplot as plt

# Fixed Effects
import linearmodels as lm
import statsmodels.formula.api as smf
#from statsmodels.regression.linear_model import OLS

# General ML
import sklearn
from sklearn.pipeline import Pipeline

# Splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV 

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Machine learning regression algorithms 
from sklearn.linear_model import LinearRegression as LR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RF

# Model Interpretation
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence
from sklearn.inspection import permutation_importance
from pdpbox import pdp

In [2]:
# Data import
df = pd.read_csv("Data/GeneratedData/Cleaned_Data_Restricted.csv", index_col = "Unnamed: 0")
df.sample(5)

Unnamed: 0,Country,Life_Satisfaction,Diag_Account,EDI,Suffrage,GDPpc,Ineq_Frac,AFI
41,Argentina,6.671114,0.907,0.764,100.0,19183.0,9.509527,0.888
1497,Thailand,6.201763,0.281,0.15,99.23,15020.0,18.63903,0.192
736,Japan,6.515817,0.927,0.857,100.0,34857.75582,13.246765,0.748
1587,United Kingdom,7.029364,0.949,0.869,100.0,34754.472261,10.293201,0.92
742,Japan,5.968216,0.928,0.858,100.0,35580.0,12.879483,0.748


In [3]:
len(df)

1240

In [4]:
# Data import 2
df_full = pd.read_csv("Data/GeneratedData/Cleaned_Data_Full.csv", index_col = "Unnamed: 0")
df_full.sample(5)

Unnamed: 0,Country,Year,Life_Satisfaction,EDI,Suffrage,Diag_Account,AFI,Democracy,GDPpc,Avg_Hours_Worked,U_Coverage,Ineq_Diff,Ineq_Frac
174,Bosnia and Herzegovina,2014,5.248954,0.58,100.0,0.879,0.719,1.0,9201.0,42.3732,,16.780696,11.598294
411,Denmark,2015,7.514425,0.902,100.0,0.976,0.94,1.0,44635.0,,68.6,9.658316,5.382833
1245,Russia,2014,6.036977,0.295,100.0,0.409,0.43,0.0,24387.0,38.8897,31.2,15.100248,6.547498
908,Malawi,2013,4.035084,0.599,100.0,0.875,0.726,1.0,1104.0,34.1332,5.5,59.180354,37.387067
1222,Qatar,2011,6.591604,0.084,100.0,0.22,,0.0,155533.0,,,26.834926,11.06744


In [5]:
len(df_full)

1696

In [6]:
# Data import 3
df_I = pd.read_csv("Data/GeneratedData/Cleaned_Data_Imputed.csv", index_col = "Unnamed: 0")
df_I.sample(5)

Unnamed: 0,Country,Life_Satisfaction,EDI,Suffrage,Diag_Account,AFI,GDPpc,Avg_Hours_Worked,U_Coverage,Ineq_Frac
644,Iceland,7.510035,0.858,100.0,0.94,0.848738,42085.0,36.228011,90.4,6.846441
210,Bulgaria,4.837561,0.713,100.0,0.873,0.863,17037.0,41.415136,13.7,5.471968
689,Iraq,4.541502,0.438,100.0,0.682,0.552812,12817.0,42.049323,16.608328,16.364711
763,Kazakhstan,5.718554,0.247,100.0,0.463,0.328,18181.059822,41.226567,20.735813,7.92754
645,Iceland,7.476214,0.89,100.0,0.948,0.917756,42978.234912,38.687921,26.528031,6.840148


In [7]:
len(df_I)

1696

# Methods
In this notebook, I will set up a machine learning pipeline, run various models, assess their results, and interpret these results.

## Logging Variables
Before we get started, we're going to log both `GDPpc_2011_prices` and `Inequality_Frac`.

In [8]:
# Logs GDP per capita
df['GDPpc'] =  np.where(df['GDPpc']==0,0,np.log(df['GDPpc']))
df_full['GDPpc'] =  np.where(df_full['GDPpc']==0,0,np.log(df_full['GDPpc']))

In [9]:
# Logs GDP per capita
df['Ineq_Frac'] =  np.where(df['Ineq_Frac']==0,0,np.log(df['Ineq_Frac']))
df_full['Ineq_Frac'] =  np.where(df_full['Ineq_Frac']==0,0,np.log(df_full['Ineq_Frac']))

## Models

## Restricted Fixed Effects
The following analysis does NOT use the union membership and average work hours variables

In [10]:
FE_ols = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + AFI + C(Country)', data = df).fit()
#print(FE_ols.summary())

In [11]:
# Converts the results to a dataframe
results = FE_ols.summary()
results_as_html = results.tables[1].as_html()
results = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [12]:
results.loc[results["P>|t|"] < 0.05]

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
1,C(Country)[T.Algeria],0.9878,0.222,4.443,0.000,0.552,1.424
2,C(Country)[T.Argentina],0.9277,0.174,5.334,0.000,0.586,1.269
3,C(Country)[T.Armenia],-0.4156,0.160,-2.598,0.009,-0.729,-0.102
4,C(Country)[T.Austria],1.1134,0.220,5.064,0.000,0.682,1.545
6,C(Country)[T.Bahrain],0.7059,0.256,2.758,0.006,0.204,1.208
...,...,...,...,...,...,...,...
125,C(Country)[T.Zimbabwe],1.4220,0.301,4.717,0.000,0.830,2.014
126,Diag_Account,-0.6363,0.298,-2.137,0.033,-1.220,-0.052
129,GDPpc,0.6646,0.093,7.160,0.000,0.482,0.847
130,Ineq_Frac,-0.3155,0.096,-3.299,0.001,-0.503,-0.128


## Full Fixed Effects
The following analysis uses the union membership and average work hours variables

In [13]:
FE_ols_full = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + AFI + Avg_Hours_Worked + U_Coverage + C(Country)', data = df_full).fit()
#print(FE_ols.summary())

In [14]:
# Converts the results to a dataframe
results_full = FE_ols_full.summary()
results_as_html = results_full.tables[1].as_html()
results_full = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [15]:
results_full.loc[results_full["P>|t|"] < 0.05]

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
14,C(Country)[T.Bhutan],9.289e-14,4.49e-14,2.069,0.042,3.62e-15,1.82e-13
17,C(Country)[T.Botswana],4.211e-14,1.79e-14,2.347,0.021,6.43e-15,7.78e-14
18,C(Country)[T.Brazil],1.0754,0.306,3.512,0.001,0.467,1.684
27,C(Country)[T.Chile],0.9102,0.414,2.2,0.031,0.087,1.733
29,C(Country)[T.Colombia],0.8699,0.306,2.84,0.006,0.261,1.479
34,C(Country)[T.Croatia],-1.0978,0.427,-2.571,0.012,-1.947,-0.249
56,C(Country)[T.Guatemala],1.2044,0.546,2.205,0.03,0.118,2.291
57,C(Country)[T.Guinea],1.462e-16,6.990000000000001e-17,2.092,0.039,7.25e-18,2.85e-16
69,C(Country)[T.Israel],1.2769,0.441,2.893,0.005,0.399,2.155
94,C(Country)[T.Mexico],1.8659,0.341,5.467,0.0,1.187,2.545


## Imputed Fixed Effects
The following analysis uses the union membership and average work hours variables

In [16]:
FE_ols_I = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + AFI + Avg_Hours_Worked + U_Coverage + C(Country)', data = df_I).fit()
#print(FE_ols.summary())

In [17]:
# Converts the results to a dataframe
results_I = FE_ols_I.summary()
results_as_html = results_I.tables[1].as_html()
results_I = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [18]:
results_I.loc[results_I["P>|t|"] < 0.05]

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
0,Intercept,4.444100,0.464000,9.570,0.000,3.533000,5.355000
1,C(Country)[T.Albania],0.498500,0.203000,2.452,0.014,0.100000,0.897000
2,C(Country)[T.Algeria],1.177900,0.210000,5.620,0.000,0.767000,1.589000
4,C(Country)[T.Argentina],1.578900,0.221000,7.142,0.000,1.145000,2.013000
6,C(Country)[T.Australia],1.793400,0.301000,5.954,0.000,1.203000,2.384000
...,...,...,...,...,...,...,...
157,C(Country)[T.Vietnam],1.091200,0.191000,5.713,0.000,0.717000,1.466000
159,C(Country)[T.Zambia],0.410600,0.188000,2.183,0.029,0.042000,0.780000
161,Diag_Account,-0.612900,0.241000,-2.545,0.011,-1.085000,-0.141000
164,GDPpc,0.000024,0.000005,5.268,0.000,0.000015,0.000033


In [19]:
len(results_I)

169