## Experiments with real-world data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing


### Carlson (2015)

In [None]:
# Handles to R's `base` and `utils` packages, exposed through rpy2.
base = importr('base')
utils = importr('utils')
# Install FindIt from CRAN only when R cannot already find it. An unconditional
# install.packages() re-downloads the package on every "Run All" and makes the
# notebook depend on network access even when the package is already present.
robjects.r('''
if (!requireNamespace("FindIt", quietly = TRUE)) {
    install.packages("FindIt", repos = "https://CRAN.R-project.org/")
}
''')

In [118]:
# Import the FindIt R package through rpy2.
findit = importr('FindIt')
# Load the "Carlson" dataset that ships with FindIt into the R session.
robjects.r('data("Carlson", package = "FindIt")')
# Look the dataset up by name on the R side, then convert it to a pandas object.
carlson_rdf = robjects.r['Carlson']
carlson_df = pandas2ri.rpy2py(carlson_rdf)

In [126]:
# Preview the first 10 rows of the converted Carlson data.
carlson_df.head(10)

Unnamed: 0,won,newRecordF,promise,coeth_voting,relevantdegree,respcodeS,contestresp
1,1,6,1,2,1,1,1014310000.0
2,0,7,3,1,2,1,1014310000.0
3,1,7,3,1,2,1,1014210000.0
4,0,3,2,1,2,1,1014210000.0
5,0,7,1,1,1,1,1014110000.0
6,1,1,3,1,2,1,1014110000.0
7,0,6,3,1,1,2,1011310000.0
8,1,1,3,2,1,2,1011310000.0
9,0,3,3,2,2,2,1011210000.0
10,1,5,2,1,2,2,1011210000.0


In [121]:
# Convert Record, Coethnicity and Degree to binary
carlson = carlson_df.iloc[:, :5].to_numpy(dtype=int)
carlson[:, 1] = np.where(carlson[:, 1] < 4, 1, 0)
carlson[:, 3:] = np.where(carlson[:, 3:] == 2, 0, 1)

In [122]:
# Convert promises to one-hot representations
one_hot_promises = np.zeros((carlson.shape[0], 3))
one_hot_promises[np.arange(carlson.shape[0]), carlson[:, 2] - 1] = 1
carlson = np.delete(carlson, 2, axis=1)
carlson = np.insert(carlson, 2, one_hot_promises.T, axis=1)

In [124]:
# Show the first 10 encoded rows: column 0 is the first source column (unchanged),
# column 1 is binarised, columns 2-4 are the promise one-hot indicators, and
# columns 5-6 are binarised.
carlson[:10]

array([[1, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 1],
       [1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 1, 1],
       [1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 1, 0]])

### 5-bit DAC process (Liu and Huang, NIST)

In [51]:
# Load data and take treatment and response variables only
dac = np.loadtxt("data/LIU.DAT", skiprows=25)
dac = dac[:, :6]

In [52]:
# Spot-check a single row (index 11) of the trimmed DAC array.
dac[11]

array([0.207254, 1.      , 1.      , 1.      , 0.      , 0.      ])

### Hainmueller and Hopkins (2014)