In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn import linear_model
import xgboost as xgb


def data_preprocessing(country_data, treatment):
    """Build the covariate matrix, treatment indicator and outcome for one country.

    Parameters
    ----------
    country_data : pandas.DataFrame
        Student-level rows. Must contain 'CNT', 'StIDStd', 'ST05Q01',
        'ST11Q03', 'ST11Q04', 'PV1READ' and every covariate column listed in
        the body.
    treatment : str
        "Treatment 1": every row is kept and a row is treated when
        ST05Q01 > 1.
        Any other value: only rows with ST05Q01 > 1 are kept and a row is
        treated when ST05Q01 == 3.

    Returns
    -------
    X : numpy.ndarray
        Standard-scaled numerical covariates followed by the one-hot encoded
        categorical covariates, with the last one-hot column of every
        categorical variable removed.
    treatment : pandas.Series
        0/1 indicator, indexed like the retained rows of ``country_data``.
    outcome : pandas.Series
        The 'PV1READ' column of the retained rows.
    """
    data = country_data.drop(columns=['CNT', 'StIDStd'])
    # A student has siblings when either sibling question is answered with 1.
    data = data.assign(
        siblings=((data['ST11Q03'] == 1) | (data['ST11Q04'] == 1)).astype(int)
    )

    # Use a separate local name so the `treatment` argument (a label) is not
    # overwritten by the indicator that is derived from it.
    if treatment == "Treatment 1":
        treated = (data["ST05Q01"] > 1).astype(int)
    else:
        data = data[data["ST05Q01"] > 1].copy()
        treated = (data["ST05Q01"] == 3).astype(int)
    outcome = data['PV1READ']

    categorial_properties = ['ST01Q01', 'ST04Q01', 'ST03Q02', 'ST03Q01', 'ST11Q01',
                             'ST11Q02', 'siblings', 'ST28Q01', 'ST06Q01',
                             'ST25Q01', 'misced', 'fisced', "OCOD1", "OCOD2", 'STRATUM', 'IMMIG'
                             ]
    numerical_properties = ["ST21Q01", "BMMJ1", "BFMJ2", "ESCS", "WEALTH"]

    numerical_covariates = data[numerical_properties]
    categorial_covariates = data[categorial_properties]

    # scaling numerical properties
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(numerical_covariates)

    # encoding categorial covariates
    ohe = preprocessing.OneHotEncoder()
    categorial_encoding = ohe.fit_transform(categorial_covariates).toarray()

    # Remove one (linearly dependent) dummy per categorical variable: the last
    # column of each variable's block. The block widths are read from the
    # fitted encoder instead of `nunique()`, because `nunique()` ignores NaN
    # while the encoder may emit a dedicated column for it; a mismatch would
    # shift every following index and delete the wrong columns.
    levels_per_variable = [len(levels) for levels in ohe.categories_]
    last_level_columns = np.cumsum(levels_per_variable) - 1
    categorial_encoding = np.delete(categorial_encoding, last_level_columns, axis=1)

    X = np.concatenate([x_scaled, categorial_encoding], axis=1)

    return X, treated, outcome

In [6]:
def estimate_propensity(X, treatment):
    """Fit a logistic regression of ``treatment`` on ``X`` and score the same rows.

    Returns the second column of ``predict_proba`` evaluated on ``X`` itself
    (in-sample); for a 0/1 treatment indicator that column is the estimated
    probability of the label 1.
    """
    model = LogisticRegression(C=10, solver='lbfgs', max_iter=10000).fit(X, treatment)
    class_probabilities = model.predict_proba(X)
    return class_probabilities[:, 1]


def trim_common_support(X, treated_propensity_score, control_propensity_score, propensity_scores, treatment, outcome):
    """
    Trim samples that do not appear to maintain common support, using the
    min-max approach on the propensity score.

    The retained interval is
    [max(min treated, min control), min(max treated, max control)].
    Samples whose propensity score lies strictly outside it are dropped;
    scores equal to a bound are kept.

    Parameters
    ----------
    X : array-like supporting boolean-mask row selection
        Covariates, one row per sample.
    treated_propensity_score, control_propensity_score : array-like
        Propensity scores of the treated / control samples; only their
        minimum and maximum are used.
    propensity_scores : 1-D array-like
        Propensity score of every sample, in the same order as ``X``.
    treatment, outcome : array-like or pandas.Series
        Per-sample treatment indicator and outcome, in the same order as ``X``.

    Returns
    -------
    tuple
        ``(X, propensity_scores, treatment, outcome)`` restricted to the
        retained samples.

    Raises
    ------
    ValueError
        If the treated or the control group has no scores, in which case the
        common-support interval is undefined.
    """
    if np.size(treated_propensity_score) == 0 or np.size(control_propensity_score) == 0:
        raise ValueError(
            "common support is undefined: the treated or control group has no propensity scores"
        )

    lower_bound = np.maximum(np.min(control_propensity_score), np.min(treated_propensity_score))
    upper_bound = np.minimum(np.max(control_propensity_score), np.max(treated_propensity_score))

    # A plain ndarray mask selects by position, so it can be applied to numpy
    # arrays and pandas Series alike regardless of the Series' index labels.
    scores = np.asarray(propensity_scores)
    outside_support = (scores < lower_bound) | (scores > upper_bound)
    keep = ~outside_support

    return X[keep], propensity_scores[keep], treatment[keep], outcome[keep]
    
    
def sample_size_table(original_treatment, trimmed_treatment):
    """Tabulate total, treated and control counts before and after trimming.

    Both arguments are 0/1 treatment indicators: the treated count is their
    sum and the control count is the sum of ``1 - indicator``.
    """
    def _group_sizes(indicator):
        # Order matches the row labels below: everyone, treated, control.
        return [len(indicator), indicator.sum(), (1 - indicator).sum()]

    return pd.DataFrame(
        {
            'Before trimming': _group_sizes(original_treatment),
            'After trimming': _group_sizes(trimmed_treatment),
        },
        index=['Sample size', 'Treated group size', 'Control group size'],
    )

In [5]:
# Load PISA2012_data.csv; the path is relative to the notebook's working directory.
data = pd.read_csv("PISA2012_data.csv")

In [7]:
# Keep only rows whose CNT code is "CAN"; copy so later edits cannot touch `data`.
country = data[data["CNT"] == "CAN"].copy()
# Any label other than "Treatment 1" selects the second treatment definition
# in data_preprocessing (rows with ST05Q01 > 1, treated when ST05Q01 == 3).
X, original_treatment, outcome = data_preprocessing(country, "Treatment 2")
# Propensity scores are fitted and predicted on the same, untrimmed sample.
original_propensity = estimate_propensity(X, original_treatment)
treated_propensity_score = original_propensity[original_treatment == 1]
control_propensity_score = original_propensity[original_treatment == 0]
# NOTE: X and outcome are rebound to their trimmed versions here; the untrimmed
# indicator stays available as original_treatment for the comparison below.
X, propensity, treatment, outcome = trim_common_support(X, treated_propensity_score, 
                                                        control_propensity_score, original_propensity, 
                                                        original_treatment, outcome)
# Last expression of the cell: displayed as the before/after sample-size table.
sample_size_table(original_treatment, treatment)

Unnamed: 0,Before trimming,After trimming
Sample size,8223,8207
Treated group size,3911,3906
Control group size,4312,4301


In [48]:
# X is a plain array, so the covariate columns are written under integer headers.
covariates = pd.DataFrame(X)
treatment_outcome = pd.DataFrame({"T":treatment, "outcome":outcome})
# reset_index(drop=True) discards the row labels that treatment/outcome carry
# over from the source frame, so rows line up positionally with `covariates`.
# NOTE(review): to_csv does not create directories; data_by_country/ must exist.
pd.concat([covariates, treatment_outcome.reset_index(drop=True)], 
          axis=1).to_csv("data_by_country/canada12_treatment_2.csv", index=False)