In [None]:
from pathlib import Path

import pandas as pd

from src.cohort import define_cohort
from src.cohort_covariates import select_model_columns, generate_covariates_for_cohort_at_timepoint
from src.preprocess_siab import preprocess_siab_data, filter_to_cohort_people

## Preprocess SIAB data

In [None]:
# Stream the raw SIAB Stata file in 200k-row chunks and assemble one frame.
# The reader is used as a context manager so the underlying file handle is
# closed as soon as the last chunk has been consumed (or if a chunk fails).
filtered_chunks = []
with pd.read_stata(
    'data/siab_r_7521_v1.dta',
    iterator=True,
    chunksize=200_000,
    convert_categoricals=False  # keep raw codes instead of value labels; adjust if labels are needed
) as reader:
    for chunk in reader:
        print("Reading chunk...")
        # Normalise `begorig` to datetime64 before the chunks are combined.
        chunk['begorig'] = pd.to_datetime(chunk['begorig'])
        filtered_chunks.append(chunk)

# Single concat at the end; `ignore_index=True` gives a fresh 0..n-1 index.
df_all = pd.concat(filtered_chunks, ignore_index=True)


In [None]:
# `df_all` is already built at the end of the chunked-read cell above, so it is
# not concatenated a second time here (that would re-copy the full dataset and
# raise peak memory without changing the result).

# Restrict to the 2012 entry window, then run the SIAB preprocessing on that subset.
df_filtered = filter_to_cohort_people(df_all, entry_month_start='2012-01', entry_month_end='2012-12')
df_processed = preprocess_siab_data(df_filtered)

# Cache the processed extract so the later sections can start from the CSV.
df_processed.to_csv('data/siab_processed_2012_entry.csv', index=False)

## Load SIAB data and construct jobseeker cohort

In [None]:
# Start from the processed extract written by the preprocessing section.
df = pd.read_csv(filepath_or_buffer='data/siab_processed_2012_entry.csv')

In [None]:
# Build the train / test cohorts for the 2012 entry window.
train_cohort, test_cohort = define_cohort(
    df,
    entry_month_start="2012-01",
    entry_month_end="2012-12",
)

## Generalization at $t = 0$

In [None]:
# Covariates at t = 0: no time offset, 14-day outcome horizon.
# The same pipeline is applied to the train cohort first, then the test cohort.
# (A variant with time_offset_days=60 was tried previously and is disabled.)
train_t_0, test_t_0 = (
    select_model_columns(
        generate_covariates_for_cohort_at_timepoint(
            df=df,
            outcome_horizon_days=14,
            cohort=cohort_at_t0,
            time_offset_days=0,
        )
    )
    for cohort_at_t0 in (train_cohort, test_cohort)
)

In [None]:
# Only referenced by the disabled `days_remaining_in_spell` filters noted below.
min_ue_days = 7

# --- train: keep rows with still_unemployed == 1, then restrict the cohort
#     to the person_ids that survived the filter.
train_t_0 = train_t_0.loc[train_t_0['still_unemployed'].eq(1)]
# disabled: train_t_0 = train_t_0[train_t_0['days_remaining_in_spell'] >= min_ue_days]
ids = train_t_0["person_id"].unique()
train_cohort = train_cohort.loc[train_cohort["person_id"].isin(ids)]

# --- test: same two steps.
test_t_0 = test_t_0.loc[test_t_0['still_unemployed'].eq(1)]
# disabled: test_t_0 = test_t_0[test_t_0['days_remaining_in_spell'] >= min_ue_days]
ids = test_t_0["person_id"].unique()
test_cohort = test_cohort.loc[test_cohort["person_id"].isin(ids)]


In [None]:
# Write the t = 0 train / test frames to disk.
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
output_dir = Path('data/train-test-data')
output_dir.mkdir(parents=True, exist_ok=True)

train_t_0.to_csv(output_dir / 'train_t0_3_7.csv', index=False)
test_t_0.to_csv(output_dir / 'test_t0_3_7.csv', index=False)

# Alternative file names used for an earlier variant: train_t0_30.csv / test_t0_30.csv

## Generalization at $t = 1$

In [None]:
# Covariates at t = 1: 14-day time offset, 14-day outcome horizon.
# The same pipeline is applied to the train cohort first, then the test cohort.
# (A variant with time_offset_days=74 was tried previously and is disabled.)
train_t_1, test_t_1 = (
    select_model_columns(
        generate_covariates_for_cohort_at_timepoint(
            df=df,
            outcome_horizon_days=14,
            cohort=cohort_at_t1,
            time_offset_days=14,
        )
    )
    for cohort_at_t1 in (train_cohort, test_cohort)
)

In [None]:
# Write the t = 1 train / test frames to disk.
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
output_dir = Path('data/train-test-data')
output_dir.mkdir(parents=True, exist_ok=True)

train_t_1.to_csv(output_dir / 'train_t1_3_7.csv', index=False)
test_t_1.to_csv(output_dir / 'test_t1_3_7.csv', index=False)