In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import train_model, prepare_model_data, create_learner, grid_search
from training.feature_selection import forward_feature_selection, correlation_feature_selection
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import PredictionTester, StatisticalTester

# Load the merged parquet file and unpack the three objects the rest of the
# notebook works with. The second argument is the string "lsoa"
# (presumably the spatial aggregation level -- TODO confirm against prepare_all_data).
MERGED_DATA_PATH = "../merged_data.parquet"
SPATIAL_LEVEL = "lsoa"
model_tuple, occupation_mappings, ward_idx_map = prepare_all_data(MERGED_DATA_PATH, SPATIAL_LEVEL)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# setup_reproducibility is called with a fixed seed and its return value is used
# as the compute device by the later prepare_model_data / grid_search calls.
SEED = 42
device = setup_reproducibility(SEED)

# Element 0 of model_tuple is the prepared data set; single_out_last splits it
# into the two parts used as training and testing data
# (presumably the last period is held out -- TODO confirm against single_out_last).
prepared_data = model_tuple[0]
training_data, testing_data = single_out_last(prepared_data)

In [None]:
# Automated feature selection.
# Two selectors are imported at the top of the notebook:
#   * correlation_feature_selection - computationally light (used here)
#   * forward_feature_selection     - computationally heavy, more accurate, but needs a GPU
#
# NOTE(review): this cell reads the feature lists from model_tuple and then
# overwrites model_tuple with the selection, so running it a second time only
# offers the previously selected features as candidates. Re-run the data-loading
# cell first whenever the selection has to be repeated.
FEATURE_GROUPS = ("static", "dynamic", "seasonal", "time_trend", "temporal", "spatial")

# Selection only sees an inner split of the training data.
inner_train, inner_val = single_out_last(training_data)

# Group name -> candidate feature list; the n-th group sits at model_tuple[n] (n starts at 1).
candidate_features = {
    group: model_tuple[position]
    for position, group in enumerate(FEATURE_GROUPS, start=1)
}

selected_feats = correlation_feature_selection(
    inner_train,
    candidate_features,
    max_features=30,
    print_progress=True,
)
print(f"Selected features: {selected_feats}")

# Rebuild the tuple in its original layout: element 0 is kept, the six feature
# lists are replaced by the selected ones.
model_tuple = (model_tuple[0], *(selected_feats[group] for group in FEATURE_GROUPS))

Selected temporal:roll_12_mean corr=0.62
Selected temporal:roll_6_mean corr=0.60
Selected temporal:roll_3_mean corr=0.57
Selected temporal:lag_1 corr=0.48
Selected dynamic:lag_1_x_n_neighbors corr=0.46
Selected spatial:lag_1_x_n_neighbors corr=0.46
Selected temporal:lag_2 corr=0.46
Selected temporal:lag_3 corr=0.44
Selected temporal:lag_4 corr=0.41
Selected dynamic:lag1_diff_neighbors corr=0.39
Selected features: {'static': [], 'dynamic': ['lag_1_x_n_neighbors', 'lag1_diff_neighbors'], 'seasonal': [], 'time_trend': [], 'temporal': ['roll_12_mean', 'roll_6_mean', 'roll_3_mean', 'lag_1', 'lag_2', 'lag_3', 'lag_4'], 'spatial': ['lag_1_x_n_neighbors']}


In [7]:
# Create the learner for burglary_model, turn the training split into model
# inputs using the feature lists currently stored in model_tuple[1:], and train
# for a fixed number of steps.
NUM_TRAINING_STEPS = 500

svi = create_learner(burglary_model)
train_data = prepare_model_data(
    training_data,
    *model_tuple[1:],
    device,
    ward_idx_map=ward_idx_map,
)
training_results = train_model(train_data, svi, num_steps=NUM_TRAINING_STEPS)

Training SVI: 100%|██████████| 500/500 [00:12<00:00, 39.86it/s]


In [8]:
# Prepare the held-out split with the same feature lists as the training split.
# The last two entries of training_results are forwarded positionally
# (presumably statistics fitted on the training data that must be reused for the
# test data -- TODO confirm against prepare_model_data / train_model).
second_to_last_result = training_results[-2]
last_result = training_results[-1]
test_data = prepare_model_data(
    testing_data,
    *model_tuple[1:],
    device,
    second_to_last_result,
    last_result,
    ward_idx_map,
)

# Tester used by the prediction cells below; it receives the trained guide and
# the second element of occupation_mappings.
prediction_tester = PredictionTester(
    test_data,
    burglary_model,
    svi.guide,
    occupation_mappings[1],
)

In [9]:
# Number passed to predict(); the table shown by get_all_predictions() has
# columns 0..4999, i.e. one column per draw.
N_PREDICTION_SAMPLES = 5_000
prediction_tester.predict(N_PREDICTION_SAMPLES)

In [10]:
# Full table of draws: one row per area code, one column per draw.
# The bare name on the last line keeps the rich notebook display.
all_predictions = prediction_tester.get_all_predictions()
all_predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
E01000001,2,0,0,2,0,0,1,2,1,2,...,1,0,2,2,0,0,1,1,0,1
E01000002,2,2,0,0,1,0,0,3,3,0,...,2,0,2,1,0,2,1,2,2,1
E01000003,0,3,2,1,1,1,1,2,1,1,...,2,2,0,1,0,1,2,1,2,0
E01000005,2,0,0,2,3,3,1,0,0,1,...,0,0,1,2,2,1,0,1,2,1
E01000006,2,0,1,1,0,0,1,3,1,1,...,0,2,0,0,0,1,0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E01035688,5,5,3,19,16,8,7,2,7,2,...,12,3,4,3,11,7,2,4,6,5
E01035689,0,1,1,1,2,1,0,2,0,1,...,1,3,2,2,1,4,1,0,1,2
E01035690,2,10,2,13,12,13,11,0,10,2,...,3,5,2,4,11,2,3,6,7,5
E01035691,1,2,2,1,0,0,1,1,1,1,...,3,2,1,3,1,1,3,2,1,0


In [11]:
# Lower/upper bound per area code for the given alpha
# (presumably a 95% interval for alpha = 0.05 -- TODO confirm how alpha is applied).
INTERVAL_ALPHA = 0.05
confidence_intervals = prediction_tester.get_confidence_intervals(alpha=INTERVAL_ALPHA)
confidence_intervals

Unnamed: 0,lower_bound,upper_bound
E01000001,0.0,3.0
E01000002,0.0,3.0
E01000003,0.0,4.0
E01000005,0.0,5.0
E01000006,0.0,3.0
...,...,...
E01035688,1.0,19.0
E01035689,0.0,4.0
E01035690,1.0,19.0
E01035691,0.0,4.0


In [12]:
# Mean prediction per area code; the bare name on the last line displays the table.
mean_predictions = prediction_tester.get_mean_predictions()
mean_predictions

Unnamed: 0,mean
E01000001,0.8350
E01000002,0.9578
E01000003,1.0352
E01000005,1.3960
E01000006,0.8430
...,...
E01035688,7.2328
E01035689,1.1480
E01035690,7.0494
E01035691,1.2254


In [13]:
# Median prediction per area code.
# NOTE(review): unlike the other getters this one is given a parquet file name;
# presumably the medians are also written to that path -- TODO confirm.
MEDIAN_PREDICTIONS_FILE = "sample_predictions.parquet"
median_predictions = prediction_tester.get_median_predictions(MEDIAN_PREDICTIONS_FILE)
median_predictions

Unnamed: 0,median
E01000001,1.0
E01000002,1.0
E01000003,1.0
E01000005,1.0
E01000006,1.0
...,...
E01035688,6.0
E01035689,1.0
E01035690,6.0
E01035691,1.0


In [14]:
# Pair each "b_*" key with the feature list of the matching group in model_tuple
# (the n-th key maps to model_tuple[n], n starting at 1). The same keys label the
# per-group tables returned by evaluate_all() further down.
COEFFICIENT_KEYS = (
    "b_static",
    "b_dynamic",
    "b_seasonal",
    "b_time_tr",
    "b_temporal",
    "b_spatial",
)
factors_map = {
    key: model_tuple[position]
    for position, key in enumerate(COEFFICIENT_KEYS, start=1)
}

# Built from the same test data, model and trained guide as the prediction tester.
statistical_tester = StatisticalTester(test_data, burglary_model, svi.guide, factors_map)

In [15]:
# Same count as used for the prediction tester earlier in the notebook.
N_STATISTICAL_SAMPLES = 5_000
statistical_tester.predict(N_STATISTICAL_SAMPLES)

In [16]:
# Summarise every coefficient group. evaluate_all() returns a dict keyed by group
# name ("b_static", "b_dynamic", ...) whose values are DataFrames with the columns
# col, mean, ci_lower, ci_upper, p_val and significant_CI.
factor_summaries = statistical_tester.evaluate_all()

# Print one table per group under its own heading. Printing the dict directly
# embeds every multi-line DataFrame repr inside the dict repr, which pushes the
# column headers out of alignment and makes the groups hard to tell apart.
for factor_name, factor_summary in factor_summaries.items():
    print(f"\n=== {factor_name} ===")
    print(factor_summary)

{'b_static': Empty DataFrame
Columns: [col, mean, ci_lower, ci_upper, p_val, significant_CI]
Index: [], 'b_dynamic':                    col      mean  ci_lower  ci_upper  p_val significant_CI
0  lag_1_x_n_neighbors -0.556057 -0.745857 -0.355546  0.000            Yes
1  lag1_diff_neighbors  0.026673 -0.151469  0.204743  0.784             No, 'b_seasonal': Empty DataFrame
Columns: [col, mean, ci_lower, ci_upper, p_val, significant_CI]
Index: [], 'b_time_tr': Empty DataFrame
Columns: [col, mean, ci_lower, ci_upper, p_val, significant_CI]
Index: [], 'b_temporal':             col      mean  ci_lower  ci_upper   p_val significant_CI
0  roll_12_mean  0.509937  0.333764  0.688182  0.0000            Yes
1         lag_1  0.323811  0.142058  0.499663  0.0000            Yes
2   roll_3_mean  0.179414 -0.008295  0.367736  0.0608             No
3   roll_6_mean -0.149312 -0.327170  0.028626  0.1028             No
4         lag_2 -0.133717 -0.315740  0.050528  0.1560             No
5         lag_4  0.0

In [17]:

import pandas as pd  # NOTE(review): not referenced in this cell; consider moving it to the import cell

# Hyper-parameter search over learning rate and guide type, evaluated on an
# inner train/validation split of the training data (the test split is not used).
# NOTE(review): the learner trained earlier in the notebook was built with
# create_learner(burglary_model) and no explicit hyper-parameters, so the winner
# found here is only reported, not applied.
inner_train, inner_val = single_out_last(training_data)
param_grid = {
    "lr": [1e-2, 5e-3, 1e-3],
    "guide_type": ["diag", "lowrank"],
}
results_df = grid_search(
    burglary_model,
    inner_train,
    inner_val,
    *model_tuple[1:],
    device,
    param_grid,
    ward_idx_map=ward_idx_map,
    num_steps=500,
)
print(results_df)

# Pick the best configuration by its validation loss rather than by the index
# label 0: .loc[0] is only correct while grid_search returns the frame already
# sorted by loss with a freshly reset index. Rows with a NaN loss sort last.
best_row = results_df.sort_values("val_loss", na_position="last").iloc[0]
print("Best parameters:", best_row.to_dict())

Training SVI: 100%|██████████| 500/500 [00:15<00:00, 32.38it/s]
Training SVI: 100%|██████████| 500/500 [00:18<00:00, 27.08it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.80it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 29.22it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.59it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.91it/s]

      lr guide_type      val_loss
0  0.010       diag   7335.596680
1  0.005       diag   7842.027100
2  0.010    lowrank   8578.933105
3  0.005    lowrank   8805.617188
4  0.001    lowrank  12174.836914
5  0.001       diag  13268.906250
Best parameters: {'lr': 0.01, 'guide_type': 'diag', 'val_loss': 7335.5966796875}



