In [1]:
import os

import pandas as pd
import pyreadr

from utils.data_utils import categorical_to_one_hot, standardize_column

In [2]:
# Filesystem layout. REPO_DIR is a placeholder: point it at the local
# checkout of the repository before running the notebook.
REPO_DIR = "/path/to/repo/directory"
RAW_DATA_FILENAME = "jtpa.RData"

# Raw and processed data both live in the "datasets" folder of the repo.
DATA_DIR = os.path.join(REPO_DIR, "datasets")
filepath = os.path.join(DATA_DIR, RAW_DATA_FILENAME)

In [3]:
# Read the RData file at `filepath`. The result is dict-like: a later cell
# takes its first key to obtain the DataFrame.
result = pyreadr.read_r(filepath)

In [4]:
# Take the first object stored in the loaded file as the working frame.
# Indexing with [0] raises IndexError if the file contained no objects.
first_object_name = list(result)[0]
df: pd.DataFrame = result[first_object_name]
df.head()

Unnamed: 0,bfid,ra_stat,demog,enroll,sex,age,race,site,sitenum,single,...,earn_16,earn_17,earn_18,earn_19,earn_20,earn_21,earn_22,earn_23,earn_24,sum_earn_1.18i
0,200015.0,0,1,0.0,1,54,1,PR,15.0,0.0,...,3720.0,3600.0,3720.0,3720.0,3360.0,3720.0,3600.0,3720.0,3600.0,61713.5
1,200023.0,0,1,0.0,1,32,3,IN,5.0,0.0,...,2270.0,1664.0,0.0,0.0,,,,,,36869.5
2,200049.0,0,1,0.0,1,52,1,PR,15.0,1.0,...,1300.0,1343.0,1343.0,1213.0,1343.0,1300.0,1343.0,1300.0,1343.0,21142.75
3,200061.0,0,1,0.0,1,26,1,IN,5.0,0.0,...,1590.0,1539.0,1590.0,1590.0,1436.0,1590.0,1539.0,1590.0,1539.0,31355.0
4,200067.0,0,1,0.0,1,43,1,PR,15.0,0.0,...,1414.0,1461.0,1414.0,1461.0,1461.0,1320.0,1461.0,1414.0,1461.0,25220.5


In [5]:
# Keep only the columns we care about: the IV, treatment and outcome
# (renamed to Z, X and Y respectively), followed by the covariates.
# Column order is: renamed columns first, then covariates as listed.
core_column_names = {"ra_stat": "Z", "enroll": "X", "sum_earn_1.18i": "Y"}
covariate_columns = ["demog", "sex", "age", "race", "site", "single"]

df = df[[*core_column_names, *covariate_columns]].rename(columns=core_column_names)

In [6]:
# Remove the ENP group (rows with Z == 0). We only want the RCT data.
#
# NOTE: this cell is not idempotent. After the recode below, control units
# carry Z == 0, so running this cell a second time would drop them as well.
# Re-run from the column-selection cell if this step has to be repeated.
#
# .copy() makes the filtered frame an independent object, so the assignments
# below write to it unambiguously (no SettingWithCopyWarning).
df = df[df["Z"] != 0].copy()

# Recode the IV so that Z = 0 is control and Z = 1 is treatment. Only the
# rows coded 2 need rewriting; rows coded 1 already hold the target value.
df.loc[df["Z"] == 2, "Z"] = 0

# Store the treatment indicator as an integer column. astype(int) will raise
# if X contains missing values.
df["X"] = df["X"].astype(int)

# Rows were dropped above, so rebuild a contiguous 0..n-1 index.
df = df.reset_index(drop=True)

In [7]:
# "single" is deliberately left out of the one-hot encoding: imputation
# produced some values strictly between 0 and 1. It is otherwise binary,
# so it should be fine to use in a regression as is.
categorical_columns = ["demog", "sex", "race", "site"]
for column_name in categorical_columns:
    df = categorical_to_one_hot(df, column_name)

# Standardize age and the outcome. Per the displayed output, the default call
# keeps "age" and adds "age_standardized", while rename=False leaves the
# standardized values under the original "Y" name.
df = standardize_column(df, "age")
df = standardize_column(df, "Y", rename=False)
df.head()

Unnamed: 0,Z,X,Y,age,single,demog_1,demog_2,sex_1,sex_2,race_1,race_2,race_3,race_4,race_5,site_CC,site_IN,site_JC,site_PR,age_standardized
0,1,1,0.502365,24,0.0,1,0,1,0,0,0,1,0,0,1,0,0,0,-1.015417
1,1,1,-0.999597,35,0.345352,1,0,1,0,0,1,0,0,0,1,0,0,0,0.408139
2,1,1,0.384344,31,1.0,1,0,1,0,0,0,1,0,0,1,0,0,0,-0.109518
3,1,1,2.620435,32,0.0,1,0,1,0,1,0,0,0,0,1,0,0,0,0.019896
4,1,0,-0.15835,22,0.0,1,0,1,0,1,0,0,0,0,1,0,0,0,-1.274246


In [100]:
# Persist the processed frame in the same directory as the raw data file.
processed_path = os.path.join(DATA_DIR, "jtpa_processed.pkl")
df.to_pickle(processed_path)