# Setup

In [57]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

# Dataset selection: set DATA_PATH to the file to analyse. Files referenced by this notebook:
#   "data_by_country/ireland_treatment_2.csv"
#   "data_by_country/canada_treatment_2.csv"
#   "data_by_country/australia_treatment_2.csv"
#   "data_by_country/canada_treatment_1.csv"
DATA_PATH = "data_by_country/australia_treatment_2.csv"

# Fixed seed so the train/test split (and therefore every score below) is the
# same on each Restart & Run All.
SEED = 42

data = pd.read_csv(DATA_PATH)

# "T" is the treatment column, "outcome" the target; every other column is a covariate.
treatment = data["T"]
outcome = data["outcome"]
covariates = data.loc[:, ~data.columns.isin(['T', 'outcome'])]

# Standardise the outcome as a single-column 2-D array (the reshape gives it shape (n, 1)).
# NOTE(review): y_scaled is not used by the split below, which trains on the raw
# `outcome` -- confirm whether the scaled target was meant to be used instead.
outcome_column = outcome.to_numpy().reshape(-1, 1)
scaler = preprocessing.StandardScaler()
scaler.fit(outcome_column)
y_scaled = scaler.transform(outcome_column)

# Feature matrix = covariates with the treatment indicator appended as the last column.
x_t = np.column_stack([covariates, treatment])
X_train, X_test, y_train, y_test = train_test_split(
    x_t, outcome, test_size=0.2, random_state=SEED
)

# Demonstrating performance for XGBoost

In [61]:
def xgboost_learning_rate_tuning(X_train, X_test, y_train, y_test, learning_rates=None):
    """Fit one XGBoost regressor per learning rate and collect train/test R² scores.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Targets matching ``X_train`` and ``X_test``.
    learning_rates : iterable of float, optional
        Learning rates to evaluate, in order. Defaults to ``[0.001, 0.1, 1, 10]``.

    Returns
    -------
    tuple of (list, list)
        ``(train_score, test_score)``: R² on the training set and on the test set,
        one entry per learning rate, in the order the rates were given.
    """
    if learning_rates is None:
        # NOTE(review): XGBoost documents the valid range of eta / learning_rate as
        # [0, 1]; the 10 entry is outside it and is the likely cause of the huge
        # negative R² in the last position -- confirm it is there on purpose.
        learning_rates = [0.001, 0.1, 1, 10]

    test_score = []
    train_score = []
    for rate in learning_rates:
        # All hyper-parameters except the learning rate are held fixed across the sweep.
        xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                                  learning_rate=rate, max_depth=5, alpha=10, n_estimators=10)
        xg_reg.fit(X_train, y_train)
        test_score.append(r2_score(y_test, xg_reg.predict(X_test)))
        train_score.append(r2_score(y_train, xg_reg.predict(X_train)))
    return train_score, test_score

# Sweep the learning rate on the current X_train/X_test/y_train/y_test split
# and show the two returned score lists, one per line.
train, test = xgboost_learning_rate_tuning(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
for scores in (train, test):
    print(scores)

In [60]:
def xgboost_n_estimators_tuning(X_train, X_test, y_train, y_test):
    """Fit one XGBoost regressor per ``n_estimators`` value and collect train/test R² scores.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Targets matching ``X_train`` and ``X_test``.

    Returns
    -------
    tuple of (list, list)
        ``(train_score, test_score)``: R² on the training set and on the test set,
        one entry per value in ``[6, 8, 10, 20]``, in that order.
    """
    estimators = [6, 8, 10, 20]
    test_score = []
    train_score = []
    for val in estimators:
        # Only the number of boosting rounds varies; the other hyper-parameters stay fixed.
        xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                                  learning_rate=1, max_depth=5, alpha=10, n_estimators=val)
        xg_reg.fit(X_train, y_train)
        test_score.append(r2_score(y_test, xg_reg.predict(X_test)))
        train_score.append(r2_score(y_train, xg_reg.predict(X_train)))
    # Train first, then test: this is the order the caller unpacks (`train, test = ...`)
    # and the order xgboost_learning_rate_tuning returns.
    return train_score, test_score

# Sweep n_estimators on the current X_train/X_test/y_train/y_test split and
# print the two returned score lists in the order they were returned.
train, test = xgboost_n_estimators_tuning(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
for scores in (train, test):
    print(scores)

# Demonstrating performance for Lasso

In [None]:
def lasso_model(x_train, y_train, x_test, y_test, score='r2', folds=5):
    """Evaluate Lasso regression over a grid of alphas and print the scores.

    For each alpha in ``np.logspace(-3, 0, 4)`` the function computes the mean
    cross-validated score on the training data and the R² on the held-out test data.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and targets (used for cross-validation and the final fit).
    x_test, y_test : array-like
        Held-out features and targets used for the test R².
    score : str, default 'r2'
        Scoring name forwarded to ``cross_val_score``. The test score is always R².
    folds : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    None
        Prints the list of mean CV scores, then the list of test R² scores
        (one entry per alpha, in grid order).
    """
    cv_scores = []
    test_scores = []
    alphas = np.logspace(-3, 0, 4)
    for param in alphas:
        crossval_scores = cross_val_score(linear_model.Lasso(alpha=param, max_iter=10000),
                                          x_train, y_train, scoring=score, cv=folds)
        cv_scores.append(crossval_scores.mean())
        lasso = linear_model.Lasso(alpha=param, max_iter=10000)
        lasso.fit(x_train, y_train)
        pred = lasso.predict(x_test)
        # r2_score takes (y_true, y_pred); R² is not symmetric, so the ground truth
        # must come first or the reported number is not the model's R².
        test_scores.append(r2_score(y_test, pred))
    print(cv_scores)
    print(test_scores)

# lasso_model takes (x_train, y_train, x_test, y_test) -- a different order from the
# XGBoost helpers -- so the arguments are passed by name to make the pairing explicit.
lasso_model(x_train=X_train, y_train=y_train, x_test=X_test, y_test=y_test)

# Performance on the treated and the control groups

In [42]:
# Boolean row masks for the two arms, taken from the treatment column (1 vs 0).
is_treated = treatment == 1
is_control = treatment == 0

# Covariates and outcomes restricted to each arm.
treated, treated_outcome = covariates[is_treated], outcome[is_treated]
control, control_outcome = covariates[is_control], outcome[is_control]

In [43]:
# check model performance on treated
X_train, X_test, y_train, y_test = train_test_split(treated, treated_outcome, test_size=0.2)
train, test = xgboost_learning_rate_tuning(X_train, X_test, y_train, y_test)
print(train)
print(test)

[-31.87309997044204, -3.840533795311935, 0.40076336409748736, -3.9120797134461003e+20]
[-31.661107383334695, -3.7713236124752063, 0.05535531660080373, -3.9183531607277076e+20]


In [44]:
# check model performance on control
X_train, X_test, y_train, y_test = train_test_split(control, control_outcome, test_size=0.2)
train, test = xgboost_learning_rate_tuning(X_train, X_test, y_train, y_test)
print(train)
print(test)

[-32.41995071033531, -3.9295724598916006, 0.3573944496696332, -3.988108848630128e+20]
[-31.33074665769398, -3.7050736864274727, 0.0778178967974037, -3.9172927566738915e+20]
