In [1]:
# Package imports

# Basics
import numpy as np
import pandas as pd

# Visualization
from plotnine import *
import matplotlib.pyplot as plt

# Fixed Effects
import linearmodels as lm
import statsmodels.formula.api as smf
#from statsmodels.regression.linear_model import OLS

# General ML
import sklearn
from sklearn.pipeline import Pipeline

# Splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV 

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Machine learning regression algorithms 
from sklearn.linear_model import LinearRegression as LR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RF

# Model Interpretation
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence
from sklearn.inspection import permutation_importance
from pdpbox import pdp

In [2]:
# Data import
df = pd.read_csv("../Data/GeneratedData/Cleaned_Data_Restricted.csv", index_col = "Unnamed: 0")
df.sample(5)

Unnamed: 0,Country,Life_Satisfaction,Diag_Account,EDI,Suffrage,GDPpc,Ineq_Frac
1692,Zimbabwe,3.703191,0.441,0.297,100.0,1560.0,36.120936
1091,Niger,4.26717,0.821,0.439,100.0,775.253443,697.387433
652,India,4.720147,0.859,0.68,100.0,4932.0,97.705538
764,Kazakhstan,5.88642,0.456,0.251,100.0,18665.765332,7.850014
786,Kenya,4.475654,0.805,0.467,100.0,3249.877118,29.887165


In [3]:
len(df)

1546

In [42]:
df.isnull().any(axis=1).sum()

0

In [4]:
# Data import 2
df_full = pd.read_csv("../Data/GeneratedData/Cleaned_Data_Full.csv", index_col = "Unnamed: 0")
df_full.sample(5)

Unnamed: 0,Country,Life_Satisfaction,EDI,Suffrage,Diag_Account,AFI,Democracy,GDPpc,Avg_Hours_Worked,U_Coverage,Ineq_Frac,Min_Wage,Gender_Pay_Gap,Mean_Earnings,Percent_Low_Income_Female
1358,South Africa,4.652429,0.753,100.0,0.946,0.768,1.0,11318.604549,44.3325,30.1,20.142587,284.65,,,49.83
1409,Sri Lanka,4.180569,0.463,100.0,0.529,0.352,0.0,8918.0,46.1582,,16.513093,,,137.73,
279,Chad,4.393482,0.275,100.0,0.549,,0.0,1661.0,,,273.471168,127.32,,,
848,Liberia,4.196063,0.653,100.0,0.943,0.779,1.0,854.234973,,,406.864091,,,4.66,
1215,Portugal,5.126912,0.868,100.0,0.948,0.967,1.0,24423.0,,17.1,6.212048,751.71,14.85,,56.2


In [5]:
len(df_full)

1696

In [43]:
df_full.isnull().any(axis=1).sum()

1690

In [6]:
# Data import 3
df_I = pd.read_csv("../Data/GeneratedData/Cleaned_Data_Imputed.csv", index_col = "Unnamed: 0")
df_I.sample(5)

Unnamed: 0,Country,Life_Satisfaction,EDI,Suffrage,Diag_Account,AFI,GDPpc,Avg_Hours_Worked,U_Coverage,Ineq_Frac,Min_Wage,Gender_Pay_Gap,Mean_Earnings,Percent_Low_Income_Female
1566,Ukraine,4.710803,0.451,100.0,0.794,0.727,10404.0,53.152892,13.366328,7.589133,152.38,-61.502706,408.48,21.574545
743,Japan,5.959362,0.833,100.0,0.911,0.748,36354.0,34.546778,17.7,12.845031,1356.89,23.985668,3029.85,63.323823
793,Kuwait,6.480031,0.31,100.0,0.625,0.483,75256.0,-5.884021,5.691644,11.591458,849.986716,250.319357,8070.743217,125.737297
1446,Syria,4.978971,0.15,100.0,0.088,0.097,6586.518348,54.135272,18.684727,21.606359,544.32,-102.442454,988.51,15.32252
1050,Nepal,4.910087,0.64,100.0,0.838,0.712,2727.423815,64.6441,24.78235,2.642749,866.449141,-111.771569,-1816.018017,9.419628


In [7]:
len(df_I)

1696

In [8]:
# Data import 3
df_U = pd.read_csv("../Data/GeneratedData/Cleaned_Data_Union.csv", index_col = "Unnamed: 0")
df_U.sample(5)

Unnamed: 0,Country,Life_Satisfaction,Diag_Account,EDI,Suffrage,GDPpc,U_Coverage,Ineq_Frac
1453,Taiwan,5.547682,0.926,0.758,100.0,34762.367306,38.5,13.604927
1191,Philippines,5.001965,0.924,0.582,100.0,6144.0,8.5,19.939938
1375,South Korea,5.767276,0.931,0.859,100.0,29716.57042,10.6,15.434059
1431,Sweden,7.434011,0.962,0.912,100.0,41811.0,67.7,6.720732
1586,United Kingdom,6.906547,0.951,0.867,100.0,34402.357979,27.4,9.730034


In [9]:
len(df_U)

572

# Methods
In this notebook, I will set up a machine learning pipeline, run various models, assess their results, and interpret these results.

## Logging Variables
Before we get started, we're going to log both `GDPpc_2011_prices` and `Inequality_Frac`.

In [10]:
# Logs GDP per capita
df['GDPpc'] =  np.where(df['GDPpc']==0,0,np.log(df['GDPpc']))
df_full['GDPpc'] =  np.where(df_full['GDPpc']==0,0,np.log(df_full['GDPpc']))

In [11]:
# Logs GDP per capita
df['Ineq_Frac'] =  np.where(df['Ineq_Frac']==0,0,np.log(df['Ineq_Frac']))
df_full['Ineq_Frac'] =  np.where(df_full['Ineq_Frac']==0,0,np.log(df_full['Ineq_Frac']))

## Models

## Restricted Fixed Effects
The following analysis does NOT use the union membership and average work hours variables

In [51]:
FE_ols = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + C(Country)', data = df).fit()
#print(FE_ols.summary())

In [52]:
# Converts the results to a dataframe
results = FE_ols.summary()
results_as_html = results.tables[1].as_html()
results = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [53]:
results.tail(10)

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
151,C(Country)[T.Venezuela],0.1522,0.369,0.412,0.68,-0.572,0.876
152,C(Country)[T.Vietnam],0.2029,0.286,0.71,0.478,-0.357,0.763
153,C(Country)[T.Yemen],-0.7577,0.236,-3.212,0.001,-1.22,-0.295
154,C(Country)[T.Zambia],0.377,0.173,2.183,0.029,0.038,0.716
155,C(Country)[T.Zimbabwe],-0.1149,0.246,-0.468,0.64,-0.597,0.367
156,Diag_Account,-0.0049,0.232,-0.021,0.983,-0.459,0.449
157,EDI,0.2416,0.25,0.968,0.333,-0.248,0.731
158,Suffrage,-0.008,0.013,-0.617,0.537,-0.033,0.017
159,GDPpc,0.6199,0.087,7.104,0.0,0.449,0.791
160,Ineq_Frac,-0.2672,0.092,-2.907,0.004,-0.447,-0.087


## Full Fixed Effects
The following analysis uses the union membership and average work hours variables

In [15]:
FE_ols_full = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + Min_Wage + U_Coverage + C(Country)', data = df_full).fit()
#print(FE_ols.summary())

In [16]:
# Converts the results to a dataframe
results_full = FE_ols_full.summary()
results_as_html = results_full.tables[1].as_html()
results_full = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [34]:
results_full.tail(10)

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
158,C(Country)[T.Yemen],0.0,0.0,,,0.0,0.0
159,C(Country)[T.Zambia],0.0,0.0,,,0.0,0.0
160,C(Country)[T.Zimbabwe],0.0,0.0,,,0.0,0.0
161,Diag_Account,-0.4147,0.722,-0.574,0.566,-1.836,1.007
162,EDI,1.0768,0.799,1.347,0.179,-0.496,2.65
163,Suffrage,0.2704,0.183,1.48,0.14,-0.089,0.63
164,GDPpc,1.4315,0.255,5.605,0.0,0.929,1.934
165,Ineq_Frac,0.1111,0.187,0.593,0.554,-0.258,0.48
166,Min_Wage,-1.2e-05,0.0,-0.084,0.933,-0.0,0.0
167,U_Coverage,0.0042,0.008,0.55,0.582,-0.011,0.019


## Imputed Fixed Effects
The following analysis uses the union membership and average work hours variables

In [30]:
FE_ols_I = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + Min_Wage + U_Coverage + C(Country)', data = df_I).fit()
#print(FE_ols.summary())

In [31]:
# Converts the results to a dataframe
results_I = FE_ols_I.summary()
results_as_html = results_I.tables[1].as_html()
results_I = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [37]:
results_I.tail(10)

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
157,C(Country)[T.Vietnam],1.2476,0.184,6.781,0.0,0.887,1.608
158,C(Country)[T.Yemen],-0.1047,0.183,-0.572,0.567,-0.463,0.254
159,C(Country)[T.Zambia],0.7349,0.165,4.457,0.0,0.412,1.058
160,C(Country)[T.Zimbabwe],0.0772,0.178,0.434,0.665,-0.272,0.426
161,Diag_Account,-0.2222,0.217,-1.022,0.307,-0.649,0.204
162,EDI,0.5208,0.244,2.133,0.033,0.042,1.0
163,Suffrage,-0.0039,0.003,-1.358,0.175,-0.009,0.002
164,GDPpc,2.2e-05,5e-06,4.87,0.0,1.3e-05,3.1e-05
165,Ineq_Frac,-0.0012,0.0,-3.413,0.001,-0.002,-0.001
166,U_Coverage,-0.0015,0.002,-0.998,0.319,-0.005,0.001


In [21]:
len(results_I)

167

## Union Fixed Effects
The following analysis uses the union membership and average work hours variables

In [22]:
FE_ols_U = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + U_Coverage + C(Country)', data = df_U).fit()
#print(FE_ols.summary())

In [23]:
# Converts the results to a dataframe
results_U = FE_ols_U.summary()
results_as_html = results_U.tables[1].as_html()
results_U = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [36]:
results_U.tail(10)

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
90,C(Country)[T.Venezuela],1.1089,0.286,3.871,0.0,0.546,1.672
91,C(Country)[T.Vietnam],0.5136,0.388,1.323,0.187,-0.249,1.277
92,C(Country)[T.Zambia],-0.1484,0.743,-0.2,0.842,-1.609,1.312
93,C(Country)[T.Zimbabwe],-0.2549,0.388,-0.658,0.511,-1.017,0.507
94,Diag_Account,-0.7375,0.621,-1.187,0.236,-1.958,0.483
95,EDI,0.8584,0.686,1.25,0.212,-0.491,2.207
96,Suffrage,0.4024,0.185,2.172,0.03,0.038,0.766
97,GDPpc,4e-05,8e-06,5.065,0.0,2.4e-05,5.5e-05
98,Ineq_Frac,-0.0031,0.006,-0.556,0.578,-0.014,0.008
99,U_Coverage,0.0071,0.006,1.174,0.241,-0.005,0.019


In [26]:
len(results_U)

100