In [1]:
# Package imports

# Basics
import numpy as np
import pandas as pd

# Visualization
from plotnine import *
import matplotlib.pyplot as plt

# Fixed Effects
import linearmodels as lm
import statsmodels.formula.api as smf
#from statsmodels.regression.linear_model import OLS

# General ML
import sklearn
from sklearn.pipeline import Pipeline

# Splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV 

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Machine learning regression algorithms 
from sklearn.linear_model import LinearRegression as LR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RF

# Model Interpretation
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence
from sklearn.inspection import permutation_importance
from pdpbox import pdp

In [2]:
# Data import
df = pd.read_csv("Data/GeneratedData/Cleaned_Data_Restricted.csv", index_col = "Unnamed: 0")
df.sample(5)

Unnamed: 0,Country,Life_Satisfaction,Diag_Account,EDI,Suffrage_%,GDPpc,Ineq_Frac,AFI
1259,Rwanda,3.108374,0.364,0.243,100.0,1819.083115,52.442,0.186
122,Belarus,5.225308,0.284,0.218,100.0,17918.0,6.670614,0.178
1006,Montenegro,5.074342,0.816,0.485,100.0,16064.0,7.11244,0.529
295,Chile,6.844238,0.958,0.902,100.0,21335.0,12.093797,0.953
360,Croatia,5.820908,0.903,0.785,100.0,19984.037431,4.300204,0.882


In [3]:
len(df)

1240

In [4]:
# Data import 2
df_full = pd.read_csv("Data/GeneratedData/Cleaned_Data_Full.csv", index_col = "Unnamed: 0")
df_full.sample(5)

Unnamed: 0,Country,Year,Life_Satisfaction,EDI,Suffrage_%,Diag_Account,AFI,Democracy,GDPpc,Avg_Hours_Worked,U_Coverage,Ineq_Diff,Ineq_Frac
654,India,2014,4.424379,0.688,100.0,0.807,0.675,1.0,5458.0,,,78.440422,96.519879
561,Ghana,2017,5.481311,0.754,100.0,0.929,0.817,1.0,4103.556071,,,67.956332,60.850571
787,Kenya,2018,4.655703,0.435,100.0,0.783,,0.0,3377.470366,,,,
860,Lithuania,2007,5.808285,0.811,100.0,0.921,0.951,1.0,20138.47241,,9.3,22.274336,8.704684
1097,Niger,2015,3.671454,0.624,100.0,0.93,,1.0,913.0,,,93.073129,632.503452


In [5]:
len(df_full)

1696

In [6]:
# Data import 3
df_I = pd.read_csv("Data/GeneratedData/Cleaned_Data_Imputed.csv", index_col = "Unnamed: 0")
df_I.sample(5)

Unnamed: 0,Country,Life_Satisfaction,EDI,Suffrage_%,Diag_Account,AFI,GDPpc,Avg_Hours_Worked,U_Coverage,Ineq_Frac
1017,Morocco,5.386307,0.299,100.0,0.707,0.468,8039.0,42.695793,13.84946,22.384369
1457,Taiwan,6.340344,0.817,100.0,0.927,0.887,39971.0,39.233176,23.332356,13.137982
1454,Taiwan,6.228531,0.76,100.0,0.918,0.889,37385.489358,38.771868,39.3,13.387112
39,Argentina,6.468387,0.816,100.0,0.905,0.885,19599.0,40.733657,30.0,10.460968
251,Cameroon,4.240441,0.354,100.0,0.726,0.403,2696.0,44.8946,6.9,90.786271


In [7]:
len(df_I)

1696

# Methods
In this notebook, I will set up a machine learning pipeline, run various models, assess their results, and interpret these results.

## Logging Variables
Before we get started, we're going to log both `GDPpc_2011_prices` and `Inequality_Frac`.

In [8]:
# Logs GDP per capita
df['GDPpc'] =  np.where(df['GDPpc']==0,0,np.log(df['GDPpc']))
df_full['GDPpc'] =  np.where(df_full['GDPpc']==0,0,np.log(df_full['GDPpc']))

In [9]:
# Logs GDP per capita
df['Ineq_Frac'] =  np.where(df['Ineq_Frac']==0,0,np.log(df['Ineq_Frac']))
df_full['Ineq_Frac'] =  np.where(df_full['Ineq_Frac']==0,0,np.log(df_full['Ineq_Frac']))

## Models

## Restricted Fixed Effects
The following analysis does NOT use the union membership and average work hours variables

In [10]:
FE_ols = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + AFI + C(Country)', data = df).fit()
#print(FE_ols.summary())

SyntaxError: invalid syntax (<unknown>, line 1)

In [None]:
# Converts the results to a dataframe
results = FE_ols.summary()
results_as_html = results.tables[1].as_html()
results = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [None]:
results.loc[results["P>|t|"] < 0.05]

## Full Fixed Effects
The following analysis uses the union membership and average work hours variables

In [None]:
FE_ols_full = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + AFI + Avg_Hours_Worked + U_Coverage + C(Country)', data = df_full).fit()
#print(FE_ols.summary())

In [None]:
# Converts the results to a dataframe
results_full = FE_ols_full.summary()
results_as_html = results_full.tables[1].as_html()
results_full = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [None]:
results_full.loc[results_full["P>|t|"] < 0.05]

## Imputed Fixed Effects
The following analysis uses the union membership and average work hours variables

In [None]:
FE_ols_I = smf.ols(formula='Life_Satisfaction ~ 1 + Diag_Account + EDI + Suffrage + GDPpc + Ineq_Frac + AFI + Avg_Hours_Worked + U_Coverage + C(Country)', data = df_I).fit()
#print(FE_ols.summary())

In [None]:
# Converts the results to a dataframe
results_I = FE_ols_I.summary()
results_as_html = results_I.tables[1].as_html()
results_I = pd.read_html(results_as_html, header=0, index_col=0)[0].reset_index()

In [None]:
results_I.loc[results_I["P>|t|"] < 0.05]

In [None]:
len(results_I)