In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
# Read the raw training sample from disk and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="../data/training_sample.csv")
df.iloc[:3]

Unnamed: 0,unitID,weekID,outcome,treatment,X1,X2,X3,C1,C2,C3
0,UNIT01155,0,470,0.0,64.225447,88362,0,M,KF7,E_2
1,UNIT01155,1,534,0.0,64.225447,87892,0,M,KF7,E_2
2,UNIT01155,2,550,0.0,64.225447,87358,0,M,KF7,E_2


Covariates
1. X1 - static continuous
2. X2 - temporal continuous
3. X3 - temporal binary
4. C1 - static categorical (15 levels)
5. C2 - (Discard) static categorical (2495 levels)
6. C3 - static categorical (6 levels)

# Preprocessing

In [3]:
# One-hot encode C3, dropping its first level as the reference category.
# The dummies are cast to uint8 explicitly: pandas >= 2.0 returns bool columns
# by default, and mixing bool with float columns makes `df[...].values`
# an object array instead of a numeric one.
df = pd.concat(
    [df, pd.get_dummies(df["C3"], drop_first=True).astype(np.uint8)], axis=1
).drop("C3", axis=1)
df['outcome'] = np.log(df['outcome'] + 1) # Deskew and bring to same scale as other covariates
# NOTE(review): X1 and X2 are logged without an offset, which assumes both are
# strictly positive - confirm against the raw data.
df['X1'] = np.log(df['X1'])
df['X2'] = np.log(df['X2'])
# C1 and C2 are not used as model inputs, so both columns are removed.
df = df.drop(["C1", "C2"], axis=1)
df.head()

Unnamed: 0,unitID,weekID,outcome,treatment,X1,X2,X3,E_2,E_3,E_4,E_5,E_6
0,UNIT01155,0,6.154858,0.0,4.1624,11.389197,0,1,0,0,0,0
1,UNIT01155,1,6.282267,0.0,4.1624,11.383864,0,1,0,0,0,0
2,UNIT01155,2,6.311735,0.0,4.1624,11.37777,0,1,0,0,0,0
3,UNIT01155,3,6.33328,0.0,4.1624,11.371454,0,1,0,0,0,0
4,UNIT01155,4,7.393878,0.1,4.1624,11.364959,1,1,0,0,0,0


# Train-validation-test split (Train 80 - Val 10 - Test 10)

In [4]:
# Train - Val - Test split
# The split is made on unit IDs, so every row of a unit lands in exactly one set.
SPLIT_SEED = 42  # fixed seed so that re-running the notebook reproduces the same split

units = df['unitID'].unique()
num_units = len(units)
cnt_train_units = int(0.8 * num_units)
cnt_val_units = int(0.1 * num_units)
cnt_test_units = num_units - cnt_train_units - cnt_val_units

# Shuffle the units once with a seeded generator and slice the result into three
# disjoint parts. Sampling the validation units from `list(set(...))` would not be
# reproducible even with a seed, because set iteration order of strings can differ
# between interpreter runs.
shuffled_units = np.random.RandomState(SPLIT_SEED).permutation(np.asarray(units))
train_units = shuffled_units[:cnt_train_units]
val_units = shuffled_units[cnt_train_units:cnt_train_units + cnt_val_units]
test_units = list(shuffled_units[cnt_train_units + cnt_val_units:])

# Sort by unit and week so that each unit's rows are contiguous and in time order.
train = df[df['unitID'].isin(train_units)].sort_values(by=['unitID', 'weekID'])
val = df[df['unitID'].isin(val_units)].sort_values(by=['unitID', 'weekID'])
test = df[df['unitID'].isin(test_units)].sort_values(by=['unitID', 'weekID'])

In [5]:
# Number of units in each set, in the order (train, val, test)
tuple(len(split_units) for split_units in (train_units, val_units, test_units))

(3126, 390, 392)

In [6]:
# Number of rows in each set, in the order (train, val, test)
tuple(map(len, (train, val, test)))

(296970, 37050, 37240)

# Data Preparation for CRN

## Keys required in Dataset object
1. current_covariates
2. current_treatments
3. previous_treatments
4. outputs
5. active_entries

In [7]:
# Modeling parameters

# Columns stacked into the covariate tensor by process_data_encoder.
input_features = [
    'outcome',
    'X1', 'X2', 'X3',
    'E_2', 'E_3', 'E_4', 'E_5', 'E_6',
]

# Sizes used when reshaping the long-format frame into per-unit tensors.
num_time_steps = 95  # rows (time steps) per unit
num_treatments = 6   # width of the one-hot treatment encoding
num_outputs = 1      # size of the last axis of the outputs tensor

# Slicing of the time axis.
offset = 1   # Covariates (1:t-offset): the last `offset` steps are trimmed
horizon = 1  # Output (horizon:t): the first `horizon` steps are trimmed

In [8]:
def process_data_encoder(df):
    """Reshape a long-format panel into the dictionary of arrays used for the CRN encoder.

    The frame is reshaped row-wise into (units, time steps, features), so each
    unit's rows must be contiguous and in time order, with exactly
    ``num_time_steps`` rows per unit. The function reads the module-level
    settings ``input_features``, ``num_time_steps``, ``num_treatments``,
    ``num_outputs``, ``horizon`` and ``offset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``unitID``, ``treatment``, ``outcome`` and
        every column listed in ``input_features``.

    Returns
    -------
    dict
        ``current_covariates``  : (units, num_time_steps - offset, len(input_features))
        ``current_treatments``  : (units, num_time_steps - offset, num_treatments), one-hot
        ``previous_treatments`` : ``current_treatments`` without its last time step
        ``outputs``             : (units, num_time_steps - horizon, num_outputs)
        ``active_entries``      : (units, num_time_steps, 1), all ones

    Raises
    ------
    ValueError
        If the number of rows is not ``units * num_time_steps``, or if the
        one-hot treatment matrix cannot be reshaped to ``num_treatments``
        columns (e.g. a treatment level is absent from ``df``).
    """
    cnt_units = df['unitID'].nunique()
    expected_rows = cnt_units * num_time_steps
    if len(df) != expected_rows:
        raise ValueError(
            "Expected %d rows (%d units x %d time steps) but got %d"
            % (expected_rows, cnt_units, num_time_steps, len(df))
        )

    # Cast to float so that bool/int dummy columns never yield an object array.
    current_covariates = df[input_features].values.astype(np.float64).reshape(
        cnt_units, num_time_steps, len(input_features))
    current_covariates = current_covariates[:, :-offset, :] # (num_units, 1-94 timesteps, num_input_features)

    # One column per distinct treatment value found in df; cast to uint8 because
    # pandas >= 2.0 returns bool dummies by default.
    current_treatments = pd.get_dummies(df['treatment']).values.astype(np.uint8).reshape(
        cnt_units, num_time_steps, num_treatments)
    current_treatments = current_treatments[:, :-offset, :] # (num_units, 1-94, 6). One-Hot-encoded treatments
    previous_treatments = current_treatments[:, :-1, :] # (num_units, 1-93, 6)

    outputs = df['outcome'].values.reshape(cnt_units, num_time_steps, num_outputs)
    outputs = outputs[:, horizon:, :] # (num_units, 2-95 timesteps, 1)

    # Sized from the units in *this* frame (not the training-set count), so the
    # mask lines up with the other arrays for the validation and test sets too.
    # NOTE(review): the mask spans all num_time_steps while `outputs` spans
    # num_time_steps - horizon - confirm the consumer expects that.
    active_entries = np.ones((cnt_units, num_time_steps, 1)) # Each unit has data for all 95 time steps

    data = {"current_covariates": current_covariates,
            "current_treatments": current_treatments,
            "previous_treatments": previous_treatments,
            "outputs": outputs,
            "active_entries": active_entries}
    return data

In [9]:
# Build the encoder dataset dictionary for each split, in the order train, val, test.
train_obj, val_obj, test_obj = (
    process_data_encoder(split_frame) for split_frame in (train, val, test)
)

In [10]:
# Export pickle files

# Each split's dataset dictionary is written to its own file in the working directory.
for pickle_name, dataset in (('train.p', train_obj), ('val.p', val_obj), ('test.p', test_obj)):
    with open(pickle_name, 'wb') as f:
        pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Optional sanity check (left disabled): reload the exported training file and
# list the keys of the stored dictionary. Only unpickle files you created yourself.
# with open("train.p", "rb") as f:
#     obj = pickle.load(f)
# obj.keys()