### Import Libraries

In [1]:
# Libraries for data processing and math 
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pymalts2 as pymalts

# Library for file path manipulation 
import os

# Set seed to control randomness
np.random.seed(156)

In [2]:
# Utility functions 
def binarize_treatment(df, treat):
    """Binarize a continuous treatment column against per-day medians.

    For each distinct value of the ``'X'`` column, the median of ``treat``
    over all rows of ``df`` sharing that ``'X'`` is the cutoff. A row's
    treatment becomes 1 when its value is >= the cutoff for its ``'X'``
    and 0 otherwise (missing values compare as False and therefore give 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'City'``, ``'X'`` and ``treat``.
        The frame is not modified.
    treat : str
        Name of the column to binarize.

    Returns
    -------
    pandas.DataFrame
        A new frame with rows grouped by city (cities in sorted order,
        original row order kept inside each city), a fresh 0..n-1 index,
        ``treat`` replaced by 0/1 integers, and the ``'City'`` and ``'X'``
        columns dropped.
    """
    # Cutoffs are derived from the frame that was passed in (not from a
    # notebook-level global) and are aligned row-by-row on 'X', so the result
    # does not depend on how rows happen to be ordered within a city.
    cutoffs = df.groupby('X')[treat].transform('median')
    indicators = (df[treat] >= cutoffs).astype('int64')

    # assign() returns a new frame, leaving the caller's data untouched.
    binarized = df.assign(**{treat: indicators})

    # A stable sort reproduces the city-by-city layout (sorted city names)
    # while preserving the original order of rows within each city.
    binarized = binarized.sort_values('City', kind='mergesort')
    return binarized.reset_index(drop=True).drop(columns=['City', 'X'])

# Columns that are binarized and used, one at a time, as the treatment
# variable in the matching runs.
treatments = [
    'PRES',
    'TEMP',
    'HUM',
    'WSPD',
    'NO2',
    'O3',
    'PM2.5',
    'PM10',
    'SO2',
    'CO',
]

### Cluster 1 Overall - Matching Estimation

In [5]:
# Load the Cluster 1 data used by the matching runs below.
# The CSV is expected in a `data` directory that sits next to the current
# working directory (i.e. under the parent of the cwd).
root = os.path.dirname(os.getcwd())
data_dir = os.path.join(root, 'data')
### Cluster 1 Overall ###
c1_overall_path = os.path.join(data_dir, 'time_cluster_1.csv')
c1_overall = pd.read_csv(c1_overall_path)
all_vars = ['days', 'ACTV', 'TEMP', 'HUM', 'WSPD', 'PRES', 'NO2', 'O3', 'PM2.5', 'PM10', 'SO2', 'CO', 'Case', 'City', 'X']
# .copy() makes cluster1_df an independent frame, so later column writes on
# it (or on frames derived from it) cannot trigger SettingWithCopyWarning or
# be mistaken for writes back into c1_overall.
cluster1_df = c1_overall[all_vars].copy()

# Fit one matching model per treatment and record its ATE estimate.
# Each estimate is stored under the treatment name; previously `ate` was
# overwritten on every iteration, so no result survived the loop.
ate_by_treatment = {}
for treatment in treatments:
    binarized_treatment_df = binarize_treatment(cluster1_df, treatment)
    m = pymalts.malts_mf(outcome='Case', treatment=treatment, data=binarized_treatment_df,
                         estimator='linear')
    display(m.CATE_df)
    # ATE is taken as the mean of the 'avg.CATE' column of the model's CATE table.
    ate = m.CATE_df['avg.CATE'].mean()
    ate_by_treatment[treatment] = ate

KeyboardInterrupt: 

In [7]:
# Run the matching for the 'PRES' treatment alone, asking the estimator for
# its 'brief' output format.
treatment = 'PRES'
binarized_treatment_df = binarize_treatment(df=cluster1_df, treat=treatment)
m = pymalts.malts_mf(
    outcome='Case',
    treatment=treatment,
    data=binarized_treatment_df,
    estimator='linear',
    output_format='brief',
)

In [9]:
# Mean of the 'avg.CATE' column of the fitted model's CATE table
# (the per-treatment loop earlier in the notebook names this quantity `ate`).
m.CATE_df['avg.CATE'].mean()

11.858425203752844

In [10]:
# Mean of the 'std.CATE' column of the same CATE table
# (presumably the spread of the per-unit CATE estimates; confirm against pymalts).
m.CATE_df['std.CATE'].mean()

9.568477294399104