In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import train_model, prepare_model_data, create_learner
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import PredictionTester, StatisticalTester

# Inputs for the data-preparation step, named so they are easy to spot and change.
merged_data_path = "../merged_data.parquet"
# Second positional argument of prepare_all_data.
# NOTE(review): presumably the spatial granularity to aggregate on -- confirm in data_prep.
granularity = "lsoa"

# prepare_all_data returns a pair. Later cells read model_tuple[0] as the dataset
# handed to single_out_last and model_tuple[1:] as the per-group feature lists;
# occupation_mappings[1] is passed to PredictionTester.
prepared = prepare_all_data(merged_data_path, granularity)
model_tuple, occupation_mappings = prepared

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure the run with a fixed seed value (42); the helper's return value is
# kept as the device passed to prepare_model_data in later cells.
device = setup_reproducibility(42)

# First element of model_tuple is the dataset that gets split into a training
# part and a held-out testing part.
full_dataset = model_tuple[0]
training_data, testing_data = single_out_last(full_dataset)

# Learner used for the main training run; its .guide is later handed to the testers.
svi = create_learner(burglary_model)

In [3]:
# Features to drop, grouped by the coefficient block they belong to.
# The pruning cell removes every name listed here from the matching entry of
# model_tuple; an empty list means nothing is dropped for that group.
factors_cut = dict(
    b_static=[
        "Public Transport Accessibility Levels|Average Public Transport Access Score",
        "Shopping locations",
        "Emergency locations",
        "Ethnic Group|Mixed/multiple ethnic groups (%)",
        "area",
        "Car or van availability|4 or more cars or vans in household (%)",
        "Household Composition|% Lone parent household",
        "Public transport locations",
        "Household Composition|% Other multi person household",
        "Household Composition|% Couple household without dependent children",
        "Car or van availability|No cars or vans in household (%)",
        "Tenure|Owned outright (%)",
        "Ethnic Group|BAME (%)",
        "Tenure|Owned with a mortgage or loan (%)",
        "Food locations",
        "Public Transport Accessibility Levels|% 4-6 (good access)|Level3_66",
        "Ethnic Group|Other ethnic group (%)",
        "Dwelling type|Flat, maisonette or apartment (%)",
        "Leisure locations",
        "Entertainment locations",
        "Parking locations",
        "Car or van availability|1 car or van in household (%)",
    ],
    b_dynamic=[
        "Living Environment Rank (where 1 is most deprived)",
        "Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)",
        "Employment Rank (where 1 is most deprived)",
        "Education, Skills and Training Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Aged 16-29",
        "Health Deprivation and Disability Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Working-age",
        "Mid-year Population Estimates|Aged 65+",
        "Income Rank (where 1 is most deprived)",
        "Barriers to Housing and Services Rank (where 1 is most deprived)",
    ],
    b_seasonal=["month_sin"],
    b_time_tr=[],
    b_temporal=["lag_2", "lag_4", "lag_3"],
    b_spatial=[
        "lag1_median_neighbors",
        "lag1_sum_neighbors",
        "lag1_mean_neighbors",
    ],
)


In [None]:
# Drop the features listed in factors_cut from model_tuple.
# Skip this cell to train on the full feature set.
#
# NOTE(review): factors_keep below is not read by the pruning step, which
# filters on factors_cut only. It appears to serve as a record of the features
# expected to remain -- confirm, and keep the two lists in sync by hand.
factors_keep = dict(
    b_static=[
        "Household Composition|% One person household",
        "Public Transport Accessibility Levels|% 2-3 (average access)|Level3_66",
        "Ethnic Group|White (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1b",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|2",
        "Ethnic Group|Asian/Asian British (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|4",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|5",
        "Car or van availability|Cars per household",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6b",
        "Car or van availability|2 cars or vans in household (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|3",
        "Households|All households",
        "Ethnic Group|Black/African/Caribbean/Black British (%)",
        "Tenure|Social rented (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|0",
        "Tenure|Private rented (%)",
        "shared_length",
        "Car or van availability|3 cars or vans in household (%)",
    ],
    b_dynamic=[
        "Mid-year Population Estimates|All Ages",
        "Crime Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Aged 30-44",
    ],
    b_seasonal=["during_corona", "post_corona", "month_cos"],
    b_time_tr=["time_s"],
    b_temporal=["lag_1"],
    b_spatial=[],
)

# Feature groups in the order they occupy positions 1..6 of model_tuple
# (position 0 is left untouched).
group_order = ("b_static", "b_dynamic", "b_seasonal", "b_time_tr", "b_temporal", "b_spatial")

# Work on a list copy because tuples are immutable, then freeze the result again.
pruned_parts = list(model_tuple)
for offset, group_name in enumerate(group_order):
    slot = offset + 1
    dropped = factors_cut[group_name]
    pruned_parts[slot] = list(filter(lambda col: col not in dropped, pruned_parts[slot]))
model_tuple = tuple(pruned_parts)

In [5]:
# Every element of model_tuple after the first is forwarded positionally,
# followed by the device, to build the training inputs.
feature_columns = model_tuple[1:]
train_data = prepare_model_data(training_data, *feature_columns, device)

# Main training run: 500 SVI steps. The last two entries of the result are
# reused when the test data is prepared.
training_results = train_model(train_data, svi, num_steps=500)

Training SVI: 100%|██████████| 500/500 [00:18<00:00, 27.60it/s]


In [6]:
# The last two entries of training_results are passed on when preparing the
# test data. The grid-search cell unpacks train_model's return value as
# (svi, losses, means, stds), so these are the training means and stds.
# NOTE(review): presumably used to scale the test features like the training
# features -- confirm in prepare_model_data.
train_means = training_results[-2]
train_stds = training_results[-1]
test_data = prepare_model_data(
    testing_data,
    *model_tuple[1:],
    device,
    train_means,
    train_stds,
)

# Predictions are drawn from the model together with the trained guide.
prediction_tester = PredictionTester(
    test_data,
    burglary_model,
    svi.guide,
    occupation_mappings[1],
)

In [7]:
# Generate predictions with 5_000 as the sample count argument; the stored
# output of get_all_predictions() shows 5000 sample columns (0..4999) per area.
prediction_tester.predict(5_000)

In [8]:
# Display the full table of predictions: one row per area code, one column per sample.
prediction_tester.get_all_predictions()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
E01000001,0,2,0,1,0,0,3,0,1,0,...,1,0,0,0,0,0,1,5,2,0
E01000002,0,0,1,3,0,0,1,1,3,0,...,2,1,0,1,2,2,0,1,0,1
E01000003,1,0,3,0,3,2,2,2,3,1,...,0,0,0,2,3,5,0,1,0,1
E01000005,0,1,2,0,0,1,1,1,2,4,...,0,2,0,0,1,4,2,1,1,0
E01000006,0,0,1,1,1,1,0,0,1,0,...,1,1,3,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E01035688,1,1,0,2,4,1,1,1,5,3,...,2,1,3,3,7,4,0,3,5,1
E01035689,2,4,0,2,4,2,0,2,2,0,...,2,0,1,2,4,2,2,0,1,0
E01035690,2,0,0,1,0,2,3,3,2,0,...,2,4,0,1,1,3,3,0,1,1
E01035691,1,1,2,1,0,0,0,0,1,0,...,3,6,0,3,0,0,2,0,0,1


In [9]:
# Display lower_bound / upper_bound per area code.
# NOTE(review): alpha=0.05 presumably yields a 95% interval -- confirm in PredictionTester.
prediction_tester.get_confidence_intervals(alpha=0.05)

Unnamed: 0,lower_bound,upper_bound
E01000001,0.0,4.0
E01000002,0.0,5.0
E01000003,0.0,5.0
E01000005,0.0,5.0
E01000006,0.0,3.0
...,...,...
E01035688,0.0,7.0
E01035689,0.0,5.0
E01035690,0.0,6.0
E01035691,0.0,4.0


In [10]:
# Display the mean prediction per area code.
prediction_tester.get_mean_predictions()

Unnamed: 0,mean
E01000001,1.0070
E01000002,1.1192
E01000003,1.2004
E01000005,1.1896
E01000006,0.7578
...,...
E01035688,2.0170
E01035689,1.2346
E01035690,1.7642
E01035691,0.9278


In [11]:
# Display the median prediction per area code.
# NOTE(review): the string argument looks like an output path, so this call
# presumably also writes "sample_predictions.parquet" -- confirm in PredictionTester.
prediction_tester.get_median_predictions("sample_predictions.parquet")

Unnamed: 0,median
E01000001,1.0
E01000002,1.0
E01000003,1.0
E01000005,1.0
E01000006,1.0
...,...
E01035688,2.0
E01035689,1.0
E01035690,1.0
E01035691,1.0


In [12]:
# Map each coefficient-group name to its feature list. The groups sit at
# positions 1..6 of model_tuple, in this order.
factor_groups = (
    "b_static",
    "b_dynamic",
    "b_seasonal",
    "b_time_tr",
    "b_temporal",
    "b_spatial",
)
factors_map = {
    group: model_tuple[slot]
    for slot, group in enumerate(factor_groups, start=1)
}

# The statistical tester works on the same test data, model and trained guide
# as the prediction tester, plus the group -> feature-name mapping.
statistical_tester = StatisticalTester(
    test_data,
    burglary_model,
    svi.guide,
    factors_map,
)

In [13]:
# Run the statistical tester with 5_000 as its argument, the same value passed
# to prediction_tester.predict above.
statistical_tester.predict(5_000)

In [14]:
# Evaluate every factor group. The stored output shows the result as a dict
# keyed by group name (e.g. 'b_static') whose values are tables with
# col / mean / ci_lower columns.
factor_summaries = statistical_tester.evaluate_all()

print(factor_summaries)

{'b_static':                                                   col      mean  ci_lower  \
0   Car or van availability|3 cars or vans in hous...  0.384300  0.196811   
1                           Households|All households  0.447756  0.275247   
2   Car or van availability|2 cars or vans in hous... -0.286772 -0.465655   
3                Ethnic Group|Asian/Asian British (%)  0.179695  0.016450   
4   Public Transport Accessibility Levels|Number o... -0.170222 -0.356993   
5   Public Transport Accessibility Levels|Number o...  0.158343 -0.013332   
6   Public Transport Accessibility Levels|Number o...  0.149309 -0.013193   
7   Public Transport Accessibility Levels|Number o...  0.140158 -0.042453   
8   Public Transport Accessibility Levels|% 2-3 (a... -0.098035 -0.271484   
9   Public Transport Accessibility Levels|Number o... -0.104697 -0.293190   
10                             Ethnic Group|White (%)  0.074736 -0.087761   
11  Public Transport Accessibility Levels|Number o...  0.053051

In [15]:
# Simple hyperparameter tuning using a small grid search.
#
# Each (lr, guide_type) combination is trained for 200 steps on an inner
# training split and scored by the loss on an inner validation split;
# lower val_loss is better.
from itertools import product
import pandas as pd

# Split the original training data into an internal train/validation split.
# The held-out testing_data is never touched during tuning.
inner_train, inner_val = single_out_last(training_data)

# Define search space
param_grid = {
    "lr": [1e-2, 5e-3, 1e-3],
    "guide_type": ["diag", "lowrank"],
}
results = []

for lr, guide in product(param_grid["lr"], param_grid["guide_type"]):
    # Use a dedicated name for the candidate learner. The notebook-level `svi`
    # is the trained learner whose .guide backs prediction_tester and
    # statistical_tester; rebinding it here would make any re-run of those
    # cells silently pick up the last 200-step candidate instead.
    #
    # NOTE(review): candidates are trained one after another without resetting
    # the RNG state, so scores may depend on grid order. If create_learner does
    # not reset the global parameter store, candidates sharing a guide type
    # might also start from the previous candidate's parameters -- verify.
    candidate_svi = create_learner(burglary_model, lr=lr, guide_type=guide)

    # Rebuilt for every candidate on purpose: it is not visible from here
    # whether train_model modifies the dataset it receives.
    train_ds = prepare_model_data(inner_train, *model_tuple[1:], device)
    candidate_svi, losses, means, stds = train_model(train_ds, candidate_svi, num_steps=200)

    # Validation inputs are prepared with the means / stds returned by training.
    val_ds = prepare_model_data(inner_val, *model_tuple[1:], device, means, stds)
    val_loss = candidate_svi.evaluate_loss(
        val_ds["occupation_idx"],
        val_ds["X_static"],
        val_ds["X_dynamic"],
        val_ds["X_seasonal"],
        val_ds["X_time_trend"],
        val_ds["X_temporal"],
        val_ds["X_spatial"],
        val_ds["y"],
    )
    results.append({"lr": lr, "guide_type": guide, "val_loss": val_loss})

# Best configuration first; the index is reset so that row 0 is the winner.
results_df = pd.DataFrame(results).sort_values("val_loss").reset_index(drop=True)
print(results_df)
print("Best parameters:", results_df.loc[0].to_dict())


Training SVI: 100%|██████████| 200/200 [00:04<00:00, 47.98it/s]
Training SVI: 100%|██████████| 200/200 [00:08<00:00, 22.86it/s]
Training SVI: 100%|██████████| 200/200 [00:06<00:00, 30.66it/s]
Training SVI: 100%|██████████| 200/200 [00:08<00:00, 23.29it/s]
Training SVI: 100%|██████████| 200/200 [00:06<00:00, 30.18it/s]
Training SVI: 100%|██████████| 200/200 [00:08<00:00, 23.09it/s]

      lr guide_type     val_loss
0  0.005       diag  6441.858398
1  0.001       diag  6459.284912
2  0.010       diag  6716.772705
3  0.005    lowrank  7617.977051
4  0.001    lowrank  8065.766602
5  0.010    lowrank  8497.582764
Best parameters: {'lr': 0.005, 'guide_type': 'diag', 'val_loss': 6441.8583984375}



