In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import train_model, prepare_model_data, create_learner, grid_search
from training.feature_selection import forward_feature_selection, correlation_feature_selection
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import PredictionTester, StatisticalTester

model_tuple, occupation_mappings,ward_idx_map = prepare_all_data("../merged_data.parquet", "lsoa")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = setup_reproducibility(42)
training_data, testing_data = single_out_last(model_tuple[0])

In [None]:
#Automated Feature Selection Block 
#Correlation Feature Selection: Computationally Light
#FORWARD Feature Selection: Computationally Heavy, More Accurate but needs a GPU
inner_train,inner_val = single_out_last(training_data)
candidate_features = {
    "static": model_tuple[1],
    'dynamic': model_tuple[2],
    'seasonal': model_tuple[3],
    'time_trend': model_tuple[4],
    'temporal': model_tuple[5],
    'spatial': model_tuple[6],
    }
selected_feats = correlation_feature_selection(inner_train,candidate_features, max_features=12,print_progress=True)
print(f"Selected features: {selected_feats}")
#Forward Feature Selection:
#selected_feats = forward_feature_selection(burglary_model,inner_train,inner_val,candidate_features,device,num_steps=200,lr=1e-3,guide_type='diag',verbose=False max_features=10, print_progress=True)
model_tuple = (
    model_tuple[0],
    selected_feats['static'],
    selected_feats['dynamic'],
    selected_feats['seasonal'],
    selected_feats['time_trend'],
    selected_feats['temporal'],
    selected_feats['spatial'],
    )

Selected temporal:roll_12_mean corr=0.62
Selected temporal:roll_6_mean corr=0.60
Selected temporal:roll_3_mean corr=0.57
Selected temporal:lag_1 corr=0.48
Selected dynamic:lag_1_x_n_neighbors corr=0.46
Selected spatial:lag_1_x_n_neighbors corr=0.46
Selected temporal:lag_2 corr=0.46
Selected temporal:lag_3 corr=0.44
Selected temporal:lag_4 corr=0.41
Selected dynamic:lag1_diff_neighbors corr=0.39
Selected spatial:lag1_diff_neighbors corr=0.39
Selected dynamic:Mid-year Population Estimates|Working-age corr=0.27
Selected features: {'static': [], 'dynamic': ['lag_1_x_n_neighbors', 'lag1_diff_neighbors', 'Mid-year Population Estimates|Working-age'], 'seasonal': [], 'time_trend': [], 'temporal': ['roll_12_mean', 'roll_6_mean', 'roll_3_mean', 'lag_1', 'lag_2', 'lag_3', 'lag_4'], 'spatial': ['lag_1_x_n_neighbors', 'lag1_diff_neighbors']}


In [16]:
svi = create_learner(burglary_model)
train_data = prepare_model_data(training_data, *model_tuple[1:], device, ward_idx_map= ward_idx_map)
training_results = train_model(train_data, svi, num_steps=500)

Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.10it/s]


In [17]:
test_data = prepare_model_data(testing_data, *model_tuple[1:], device, training_results[-2], training_results[-1], ward_idx_map)
prediction_tester = PredictionTester(test_data, burglary_model, svi.guide, occupation_mappings[1])

In [18]:
prediction_tester.predict(5_000)

In [19]:
prediction_tester.get_all_predictions()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
E01000001,0,1,0,0,0,2,1,0,0,0,...,2,1,0,0,1,1,0,1,0,0
E01000002,0,1,1,3,0,0,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
E01000003,0,0,0,2,0,1,0,1,0,0,...,0,0,0,0,0,3,0,0,0,0
E01000005,1,1,2,1,1,0,2,1,1,0,...,1,1,0,1,0,0,0,1,1,1
E01000006,1,0,0,0,0,2,0,1,2,1,...,0,1,0,0,0,0,0,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E01035688,3,2,2,0,3,0,0,0,1,3,...,3,1,0,1,0,3,0,2,2,2
E01035689,3,0,0,0,1,0,0,0,2,0,...,0,0,1,0,0,2,1,0,1,0
E01035690,5,7,4,0,2,1,1,2,0,5,...,3,5,3,4,1,2,1,1,3,2
E01035691,1,1,3,2,0,0,3,0,2,4,...,5,0,0,1,3,3,1,2,1,0


In [20]:
prediction_tester.get_confidence_intervals(alpha=0.05)

Unnamed: 0,lower_bound,upper_bound
E01000001,0.0,2.0
E01000002,0.0,2.0
E01000003,0.0,2.0
E01000005,0.0,4.0
E01000006,0.0,2.0
...,...,...
E01035688,0.0,5.0
E01035689,0.0,3.0
E01035690,0.0,8.0
E01035691,0.0,5.0


In [21]:
prediction_tester.get_mean_predictions()

Unnamed: 0,mean
E01000001,0.5424
E01000002,0.3498
E01000003,0.4518
E01000005,1.1082
E01000006,0.4294
...,...
E01035688,1.3080
E01035689,0.7166
E01035690,2.4892
E01035691,1.4796


In [22]:
prediction_tester.get_median_predictions("sample_predictions.parquet")

Unnamed: 0,median
E01000001,0.0
E01000002,0.0
E01000003,0.0
E01000005,1.0
E01000006,0.0
...,...
E01035688,1.0
E01035689,1.0
E01035690,2.0
E01035691,1.0


In [23]:
factors_map ={
    "b_static": model_tuple[1],
    "b_dynamic": model_tuple[2],
    "b_seasonal": model_tuple[3],
    "b_time_tr": model_tuple[4],
    "b_temporal": model_tuple[5],
    "b_spatial": model_tuple[6],
}


statistical_tester = StatisticalTester(test_data, burglary_model, svi.guide, factors_map)

In [24]:
statistical_tester.predict(5_000)

In [25]:
factor_summaries= statistical_tester.evaluate_all()
print(factor_summaries)

{'b_static': Empty DataFrame
Columns: [col, mean, ci_lower, ci_upper, p_val, significant_CI]
Index: [], 'b_dynamic':                                          col      mean  ci_lower  ci_upper  \
0                        lag_1_x_n_neighbors -0.383274 -0.571647 -0.196798   
1  Mid-year Population Estimates|Working-age  0.265172  0.087088  0.442294   
2                        lag1_diff_neighbors  0.215128  0.031275  0.400172   

    p_val significant_CI  
0  0.0000            Yes  
1  0.0028            Yes  
2  0.0236            Yes  , 'b_seasonal': Empty DataFrame
Columns: [col, mean, ci_lower, ci_upper, p_val, significant_CI]
Index: [], 'b_time_tr': Empty DataFrame
Columns: [col, mean, ci_lower, ci_upper, p_val, significant_CI]
Index: [], 'b_temporal':             col      mean  ci_lower  ci_upper   p_val significant_CI
0  roll_12_mean -0.301205 -0.483005 -0.118353  0.0008            Yes
1         lag_3  0.266352  0.082727  0.445148  0.0040            Yes
2         lag_2  0.219712  0.05

In [26]:

import pandas as pd
inner_train, inner_val = single_out_last(training_data)
param_grid = {
      "lr": [1e-2, 5e-3, 1e-3],
      "guide_type": ["diag", "lowrank"]
      }
results_df = grid_search(
      burglary_model,
      inner_train,
      inner_val,
      *model_tuple[1:],
      device,
      param_grid,
      ward_idx_map=ward_idx_map,
      num_steps=500,
      )
print(results_df)
print("Best parameters:", results_df.loc[0].to_dict())

Training SVI: 100%|██████████| 500/500 [00:11<00:00, 44.28it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 32.14it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 49.36it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 33.34it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 45.60it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 31.51it/s]

      lr guide_type      val_loss
0  0.010       diag   7001.862549
1  0.010    lowrank   9260.123779
2  0.005    lowrank   9317.157471
3  0.001       diag  10940.034180
4  0.001    lowrank  15569.246582
5  0.005       diag  52088.027344
Best parameters: {'lr': 0.01, 'guide_type': 'diag', 'val_loss': 7001.862548828125}



