In [None]:
from src.training_pipeline import train_and_evaluate
from src.cohort import cohort_stability
from src.simulation import flip_top_positives
import pandas as pd
import numpy as np

### Load Data and Select Features

In [None]:
# Read the four pre-split CSV files. The targets on the left are unpacked in
# the same order as the paths below: train/test for t0, then train/test for t1.
train_t0, test_t0, train_t1, test_t1 = map(
    pd.read_csv,
    (
        'data/train-test-data/train_t0_30.csv',
        'data/train-test-data/test_t0_30.csv',
        'data/train-test-data/train_t1_30.csv',
        'data/train-test-data/test_t1_30.csv',
    ),
)

In [None]:
# Column names handed to the training helpers as `categorical_features`.
# Order is kept exactly as originally declared.
categorical_features = [
    # status / history flags
    'still_unemployed',
    'employed_before',
    'receipt_leh_before',
    'receipt_lhg_before',
    'se_before',
    'ASU_notue_seeking_before',
    'ASU_other_before',
    'break_before',
    # lastjob_* columns
    'lastjob_none',
    'lastjob_type',
    'lastjob_pt',
    'lastjob_niveau',
    'lastjob_leih',
    'lastjob_befrist',
    'lastjob_industry',
    # remaining person-level columns
    'female',
    'german_citizen',
    'education_level_imputed',
    'school_completed',
    'occupation_skill_level',
]

# Column names handed to the training helpers as `ordinal_features`
# (all carry the `_cat` suffix).
ordinal_features = [
    'lastjob_tot_dur_cat',
    'age_cat',
    'LHG_total_cat',
    'LHG_tot_dur_cat',
    'LHG_m_dur_cat',
    'LEH_total_cat',
    'LEH_tot_dur_cat',
    'LEH_m_dur_cat',
]

In [None]:
# Columns removed from all four splits before the stability analysis and
# model training.
# The original list named 'emp1_total_dur_cat' twice; the repeat had no effect
# on the result and is removed here.
# NOTE(review): if the second occurrence was meant to be a different column,
# add that column to the list.
cols_to_drop = [
  'commuter',
  'tsince_lm_contact_cat',
  'tsince_ft_lm_contact_cat',
  'tsince_ein_erw1_cat',
  'emp1_total_dur_cat',
  'emp1_total_cat',
  'emp1_m_dur_cat',
  'bula',
  'est_total_cat',
  'days_remaining_in_spell'
]

# drop() returns a new frame and raises KeyError when a listed column is
# missing. Because the results are bound back to the same names, this cell can
# only be executed once per load: re-running it without re-running the load
# cell fails on the already-removed columns.
train_t0 = train_t0.drop(columns=cols_to_drop)
test_t0 = test_t0.drop(columns=cols_to_drop)
train_t1 = train_t1.drop(columns=cols_to_drop)
test_t1 = test_t1.drop(columns=cols_to_drop)

### Analyze Feature and Outcome Stability over Time Points

In [None]:
# Run the project helper `cohort_stability` on the t0 and t1 training frames.
# The return value is bound to `result`, so the cell does not echo it; anything
# shown comes from the helper itself.
result = cohort_stability(train_t0, train_t1)

In [None]:
# Same helper applied to the t0 and t1 test frames. As the last expression of
# the cell, its return value (if any) is rendered as the cell output.
cohort_stability(test_t0, test_t1)

## Train Models

### Train $t = 0$, Test $t = 1$

In [None]:
# Train on the t0 training frame, evaluate on the t1 test frame.
model, results = train_and_evaluate(
    # run label and output directory
    run_name='train-t0-test-t1',
    save_dir='results/',
    # data
    train_df=train_t0,
    test_df=test_t1,
    # outcome column and feature typing
    outcome_col='remains_ue_horizon_days',
    categorical_features=categorical_features,
    ordinal_features=ordinal_features,
    # filter flags: off for the training frame, on for the test frame
    # (exact row selection is defined in src.training_pipeline)
    filter_train_to_unemployed=False,
    filter_test_to_unemployed=True,
)

### Train $t = 1$, Test $t = 1$

In [None]:
# Train on the t1 training frame, evaluate on the t1 test frame.
model, results = train_and_evaluate(
    # run label and output directory
    run_name='train-t1-test-t1',
    save_dir='results/',
    # data
    train_df=train_t1,
    test_df=test_t1,
    # outcome column and feature typing
    outcome_col='remains_ue_horizon_days',
    categorical_features=categorical_features,
    ordinal_features=ordinal_features,
    # filter flags: off for the training frame, on for the test frame
    # (exact row selection is defined in src.training_pipeline)
    filter_train_to_unemployed=False,
    filter_test_to_unemployed=True,
)

### Train $t = 1$, Test $t = 0$

In [None]:
# Train on the t1 training frame, evaluate on the t0 test frame.
model, results = train_and_evaluate(
    # run label and output directory
    run_name='train-t1-test-t0',
    save_dir='results/',
    # data
    train_df=train_t1,
    test_df=test_t0,
    # outcome column and feature typing
    outcome_col='remains_ue_horizon_days',
    categorical_features=categorical_features,
    ordinal_features=ordinal_features,
    # filter flags: off for the training frame, on for the test frame
    # (exact row selection is defined in src.training_pipeline)
    filter_train_to_unemployed=False,
    filter_test_to_unemployed=True,
)

## Simulation of Performative Effects

In [None]:
# Train on the t0 training frame, evaluate on the t1 test frame.
# NOTE(review): the arguments, including run_name and save_dir, are identical
# to the call under "Train $t = 0$, Test $t = 1$", and the following cell fits
# the same configuration again as `base_model`. This cell looks redundant;
# confirm whether it can be removed.
model, results = train_and_evaluate(
    # run label and output directory
    run_name='train-t0-test-t1',
    save_dir='results/',
    # data
    train_df=train_t0,
    test_df=test_t1,
    # outcome column and feature typing
    outcome_col='remains_ue_horizon_days',
    categorical_features=categorical_features,
    ordinal_features=ordinal_features,
    # filter flags: off for the training frame, on for the test frame
    # (exact row selection is defined in src.training_pipeline)
    filter_train_to_unemployed=False,
    filter_test_to_unemployed=True,
)

In [None]:
# Output directory shared by the baseline run and every sweep run.
save_dir = "results/performative-sweep"

# Sweep grid: 50 evenly spaced values from 0.01 to 0.5 inclusive (step 0.01).
p_values = np.linspace(start=0.01, stop=0.5, num=50)

# Baseline fit on the unmodified t0 training frame, evaluated on the t1 test
# frame. Its training-set entries in `base_results` feed the sweep below.
base_model, base_results = train_and_evaluate(
    run_name="base_model",
    save_dir=save_dir,
    train_df=train_t0,
    test_df=test_t1,
    outcome_col='remains_ue_horizon_days',
    categorical_features=categorical_features,
    ordinal_features=ordinal_features,
    filter_train_to_unemployed=False,
    filter_test_to_unemployed=True,
)

In [None]:
# Training-set scores and labels produced by the baseline run.
base_predictions = base_results['train']['probabilities']
y_train_orig = base_results['train']['y_true'].values

# The flipped labels are written back into train_t0 by position, so the
# baseline's training rows must correspond one-to-one with train_t0.
# Fail early with a clear message instead of part-way through the sweep.
if len(y_train_orig) != len(train_t0):
    raise ValueError(
        f"Baseline training labels ({len(y_train_orig)}) and train_t0 "
        f"({len(train_t0)}) differ in length; cannot write flipped outcomes back by position."
    )

# Run sweep: for each p, flip outcomes via flip_top_positives, refit on the
# modified training frame and evaluate on the t1 test frame.
for i, p in enumerate(p_values, start=1):
    print(f"\n[{i}/{len(p_values)}] p = {p:.4f}")

    # np.asarray strips any index so the assignment below is positional,
    # consistent with y_train_orig being a plain array.
    # NOTE(review): flip_top_positives' return type is not visible here.
    y_flipped = np.asarray(flip_top_positives(base_predictions, y_train_orig, p))

    # assign() returns a copy with the outcome column replaced; train_t0
    # itself is left untouched for the next iteration.
    train_t0_modified = train_t0.assign(remains_ue_horizon_days=y_flipped)

    print(f"   Flipped {(y_train_orig != y_flipped).sum()} outcomes")

    model, results = train_and_evaluate(
        train_df=train_t0_modified,
        test_df=test_t1,
        outcome_col='remains_ue_horizon_days',
        categorical_features=categorical_features,
        ordinal_features=ordinal_features,
        filter_train_to_unemployed=False,
        # Use the same test filter as the "base_model" run stored in the same
        # save_dir, so sweep metrics and baseline metrics are computed on the
        # same test population. The original passed False here.
        # NOTE(review): confirm this matches the intended evaluation protocol.
        filter_test_to_unemployed=True,
        run_name=f"p_{p:.4f}",
        save_dir=save_dir
    )

## Theorem 3.7.

In [None]:
# Read the four `_3_7` CSV files. The targets are unpacked in the same order
# as the paths below: train/test for t0, then train/test for t1.
# These names rebind the frames loaded at the top of the notebook, so cells
# above this point must not be re-run afterwards without reloading their data.
train_t0, test_t0, train_t1, test_t1 = map(
    pd.read_csv,
    (
        'data/train-test-data/train_t0_3_7.csv',
        'data/train-test-data/test_t0_3_7.csv',
        'data/train-test-data/train_t1_3_7.csv',
        'data/train-test-data/test_t1_3_7.csv',
    ),
)

In [None]:
# Columns removed from the `_3_7` splits.
# The original list named 'emp1_total_dur_cat' twice; the repeat had no effect
# on the result and is removed here.
# NOTE(review): if the second occurrence was meant to be a different column,
# add that column to the list.
# The last eight entries are also named in `categorical_features` /
# `ordinal_features`, so those lists refer to columns that no longer exist in
# these frames once this cell has run.
cols_to_drop = [
  'commuter',
  'tsince_lm_contact_cat',
  'tsince_ft_lm_contact_cat',
  'tsince_ein_erw1_cat',
  'emp1_total_dur_cat',
  'emp1_total_cat',
  'emp1_m_dur_cat',
  'bula',
  'est_total_cat',
  'days_remaining_in_spell',
  'still_unemployed',
  'ASU_notue_seeking_before',
  'ASU_other_before',
  'employed_before',
  'receipt_leh_before',
  'lastjob_industry',
  'lastjob_befrist',
  'LEH_total_cat'
]

# drop() returns a new frame and raises KeyError when a listed column is
# missing, so this cell can only be executed once per load of the data.
train_t0 = train_t0.drop(columns=cols_to_drop)
test_t0 = test_t0.drop(columns=cols_to_drop)
train_t1 = train_t1.drop(columns=cols_to_drop)
test_t1 = test_t1.drop(columns=cols_to_drop)

In [None]:
# Run `cohort_stability` on the reduced t0/t1 training frames, naming
# 'remains_ue_horizon_days' as the only outcome column. The return value is
# bound to `result` (replacing the earlier one), so the cell does not echo it.
result = cohort_stability(train_t0, train_t1, outcome_cols=['remains_ue_horizon_days'])

In [None]:
from src.training_pipeline import prepare_features_and_outcome
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Sort both cohorts by person_id and reset the index so rows are in a
# reproducible order and carry a fresh 0..n-1 index.
train_t0_sorted = train_t0.sort_values('person_id').reset_index(drop=True)
train_t1_sorted = train_t1.sort_values('person_id').reset_index(drop=True)

# The notebook-level feature lists still name columns that were dropped from
# these frames (e.g. 'still_unemployed', 'LEH_total_cat'). Pass only the
# features present in both frames so the helper is never asked for a missing
# column.
shared_columns = set(train_t0_sorted.columns) & set(train_t1_sorted.columns)
pca_categorical_features = [c for c in categorical_features if c in shared_columns]
pca_ordinal_features = [c for c in ordinal_features if c in shared_columns]

# Prepare features. The category/ordinal encodings returned for t0 are passed
# into the t1 call so both matrices are built with the same encodings.
# NOTE(review): confirm in src.training_pipeline that 'person_id' is excluded
# from the returned feature matrix; otherwise the identifier enters the PCA.
X_t0, y_t0, cat_categories, ord_mappings = prepare_features_and_outcome(
    train_t0_sorted, outcome_col='remains_ue_horizon_days',
    categorical_features=pca_categorical_features,
    ordinal_features=pca_ordinal_features,
    filter_to_unemployed=False
)

X_t1, y_t1, _, _ = prepare_features_and_outcome(
    train_t1_sorted, outcome_col='remains_ue_horizon_days',
    categorical_features=pca_categorical_features,
    ordinal_features=pca_ordinal_features,
    filter_to_unemployed=False,
    cat_categories=cat_categories,
    ord_mappings=ord_mappings
)

# Scale and PCA: both transformers are fitted on t0 only and then applied to
# t0 and t1, so the two cohorts are projected onto the same axes.
n_components = 4
pc_columns = [f'PC{k}' for k in range(1, n_components + 1)]

scaler = StandardScaler()
pca = PCA(n_components=n_components)

X_t0_scaled = scaler.fit_transform(X_t0)
X_t1_scaled = scaler.transform(X_t1)

pca.fit(X_t0_scaled)
X_t0_pca = pca.transform(X_t0_scaled)
X_t1_pca = pca.transform(X_t1_scaled)

print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# Build DataFrames of component scores, re-attaching person_id by position.
pca_t0 = pd.DataFrame(X_t0_pca, columns=pc_columns)
pca_t0['person_id'] = train_t0_sorted['person_id'].values

pca_t1 = pd.DataFrame(X_t1_pca, columns=pc_columns)
pca_t1['person_id'] = train_t1_sorted['person_id'].values

# Last expression of the cell: the helper's return value is shown as output.
cohort_stability(pca_t0, pca_t1)


In [None]:
from sklearn.preprocessing import MinMaxScaler

pc_columns = ['PC1', 'PC2', 'PC3', 'PC4']

# Below, row i of the t1 component scores is paired with label i of the t0
# outcome. That is only meaningful when both cohorts list the same persons in
# the same order, so verify it instead of pairing silently.
if not np.array_equal(pca_t0['person_id'].to_numpy(), pca_t1['person_id'].to_numpy()):
    raise ValueError(
        "person_id columns of pca_t0 and pca_t1 differ; t1 features cannot be "
        "paired with t0 outcomes by row position."
    )

# Min-max scale the component scores. The scaler is fitted on t0 only, so t0
# lands exactly in [0, 1] while t1 values may fall slightly outside that range.
minmax = MinMaxScaler()
pca_t0_scaled = minmax.fit_transform(pca_t0[pc_columns])
pca_t1_scaled = minmax.transform(pca_t1[pc_columns])

# Training data: x from t1, y from t0
pca_train = pd.DataFrame(pca_t1_scaled, columns=pc_columns)
pca_train['person_id'] = pca_t1['person_id'].values
pca_train['remains_ue_horizon_days'] = y_t0.values

# Train. The same frame is passed as train_df and test_df, so the reported
# "test" metrics for this run are in-sample.
model, results = train_and_evaluate(
    train_df=pca_train,
    test_df=pca_train,
    outcome_col='remains_ue_horizon_days',
    categorical_features=[],
    ordinal_features=[],
    filter_train_to_unemployed=False,
    filter_test_to_unemployed=False,
    run_name="pca_x2_y1",
    save_dir="results/pca-models"
)
