In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import train_model, prepare_model_data, create_learner, grid_search
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import PredictionTester, StatisticalTester

model_tuple, occupation_mappings,ward_idx_map = prepare_all_data("../merged_data.parquet", "lsoa")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = setup_reproducibility(42)
training_data, testing_data = single_out_last(model_tuple[0])
svi = create_learner(burglary_model)

In [None]:
factors_cut = {
    "b_static": [
        "Tenure|Private rented (%)",
        "Public Transport Accessibility Levels|Number o...",  # (all the truncated PTAL cols where sig=No)
        "Household Composition|% One person household",
        "Household Composition|% Couple household with ...",
        "Household Composition|% Lone parent household",
        "Car or van availability|4 or more cars or vans in hous...",
        "Ethnic Group|Other ethnic group (%)",
        "Parking locations",
        "Ethnic Group|BAME (%)",
        "Ethnic Group|Mixed/multiple ethnic groups (%)",
        "Tenure|Owned with a mortgage or loan (%)",
        "Public Transport Accessibility Levels|% 2-3 (a...",
        "Leisure locations",
        "Education locations",
        "Emergency locations",
        "Public Transport Accessibility Levels|Number o...",
        "Car or van availability|2 cars or vans in hous...",
        "Car or van availability|1 car or van in househ...",
        "Household Composition|% Other multi person hou...",
        "Public Transport Accessibility Levels|Number o...",
        "Dwelling type|Flat, maisonette or apartment (%)",
        "Public transport locations",
        "Food locations",
        "Public Transport Accessibility Levels|% 4-6 (g...",
        "Shopping locations",
        "n_neighbors"
    ],
    "b_dynamic": [
        "lag_1_x_n_neighbors",
        "lag1_diff_neighbors",
        "Education, Skills and Training Rank (where 1 is best)",
        "Employment Rank (where 1 is most deprived)",
        "Crime Rank (where 1 is most deprived)",
        "Barriers to Housing and Services Rank (where 1 is most deprived)",
        "Living Environment Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Aged 45-64",
        "Mid-year Population Estimates|Aged 0-15"
    ],
    "b_seasonal": [
        "post_corona",
        "month_cos",
        "during_corona",
        "month_sin"
    ],
    "b_time_tr": [
        
    ],
    "b_temporal": [
        "roll_12_mean",
        "lag_3",
        "lag_4"
    ],
    "b_spatial": [
        "lag1_mean_neighbors",
        "lag1_median_neighbors",
        "lag1_sum_neighbors"
    ]
}

In [4]:
#Cut useless columns - You can ignore this block if you want to use all features
factors_keep = {
    "b_static": [
        "Public Transport Accessibility Levels|% 2-3 (average access)|Level3_66",
        "Ethnic Group|White (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1b",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|2",
        "Ethnic Group|Asian/Asian British (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|4",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|5",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6b",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|3",
        "Tenure|Social rented (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|0",
        "Tenure|Private rented (%)",
        "shared_length",
        "Car or van availability|3 cars or vans in household (%)"
    ],
    "b_dynamic": [
        "Mid-year Population Estimates|All Ages",
        "Mid-year Population Estimates|Aged 30-44"
    ],
    "b_seasonal": [],
    "b_time_tr": [],
    "b_temporal": ["lag_1"],
    "b_spatial": []
}
model_tuple = list(model_tuple)     
for idx, key in enumerate(
        ["b_static", "b_dynamic", "b_seasonal", "b_time_tr", "b_temporal", "b_spatial"],
        start=1):
    model_tuple[idx] = [c for c in model_tuple[idx] if c not in factors_cut[key]]
model_tuple = tuple(model_tuple)

In [3]:
train_data = prepare_model_data(training_data, *model_tuple[1:], device, ward_idx_map= ward_idx_map)
training_results = train_model(train_data, svi, num_steps=500)

Training SVI: 100%|██████████| 500/500 [00:16<00:00, 29.97it/s]


In [4]:
test_data = prepare_model_data(testing_data, *model_tuple[1:], device, training_results[-2], training_results[-1], ward_idx_map)
prediction_tester = PredictionTester(test_data, burglary_model, svi.guide, occupation_mappings[1])

In [5]:
prediction_tester.predict(5_000)

In [6]:
prediction_tester.get_all_predictions()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
E01000001,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
E01000002,0,0,0,2,2,1,0,0,0,0,...,2,0,1,0,0,0,0,0,2,0
E01000003,0,0,2,0,0,0,2,0,0,0,...,0,0,0,0,1,0,0,0,0,0
E01000005,0,0,0,0,0,6,1,2,0,1,...,0,0,0,0,1,0,0,0,0,0
E01000006,7,0,1,0,0,0,1,0,0,0,...,6,0,5,0,0,1,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E01035688,2,1,6,0,0,4,28,5,0,1,...,0,2,7,2,2,2,0,4,3,1
E01035689,1,1,5,0,1,0,10,7,0,1,...,0,1,8,0,0,5,1,1,1,1
E01035690,4,7,6,0,7,2,58,3,5,0,...,1,2,1,4,11,7,2,9,5,2
E01035691,2,2,1,2,12,0,12,4,1,5,...,1,2,1,0,2,1,5,3,1,1


In [7]:
prediction_tester.get_confidence_intervals(alpha=0.05)

Unnamed: 0,lower_bound,upper_bound
E01000001,0.0,1.000
E01000002,0.0,7.000
E01000003,0.0,4.000
E01000005,0.0,4.000
E01000006,0.0,5.000
...,...,...
E01035688,0.0,19.000
E01035689,0.0,7.000
E01035690,0.0,24.025
E01035691,0.0,8.025


In [8]:
prediction_tester.get_mean_predictions()

Unnamed: 0,mean
E01000001,0.1512
E01000002,1.0702
E01000003,0.6334
E01000005,0.6724
E01000006,1.2396
...,...
E01035688,3.8598
E01035689,1.5728
E01035690,5.0462
E01035691,1.9190


In [9]:
prediction_tester.get_median_predictions("sample_predictions.parquet")

Unnamed: 0,median
E01000001,0.0
E01000002,0.0
E01000003,0.0
E01000005,0.0
E01000006,1.0
...,...
E01035688,2.0
E01035689,1.0
E01035690,3.0
E01035691,1.0


In [10]:
factors_map ={
    "b_static": model_tuple[1],
    "b_dynamic": model_tuple[2],
    "b_seasonal": model_tuple[3],
    "b_time_tr": model_tuple[4],
    "b_temporal": model_tuple[5],
    "b_spatial": model_tuple[6],
}


statistical_tester = StatisticalTester(test_data, burglary_model, svi.guide, factors_map)

In [11]:
statistical_tester.predict(5_000)

In [12]:
factor_summaries= statistical_tester.evaluate_all()
print(factor_summaries)

{'b_static':                                                   col      mean  ci_lower  \
0   Car or van availability|3 cars or vans in hous... -0.469111 -0.655779   
1                              Ethnic Group|White (%)  0.377216  0.189209   
2   Ethnic Group|Black/African/Caribbean/Black Bri...  0.498507  0.321684   
3                Ethnic Group|Asian/Asian British (%)  0.325042  0.152770   
4                            Tenure|Social rented (%)  0.293166  0.111938   
5   Public Transport Accessibility Levels|Number o...  0.286901  0.102585   
6   Public Transport Accessibility Levels|Number o...  0.283377  0.104689   
7   Household Composition|% Couple household witho...  0.280619  0.103381   
8   Public Transport Accessibility Levels|Number o...  0.250675  0.076211   
9   Public Transport Accessibility Levels|Number o...  0.250903  0.066229   
10                          Households|All households -0.262349 -0.447613   
11                          Tenure|Owned outright (%) -0.245499

In [13]:
from training.training import grid_search
import pandas as pd
inner_train, inner_val = single_out_last(training_data)
param_grid = {
      "lr": [1e-2, 5e-3, 1e-3],
      "guide_type": ["diag", "lowrank"]
      }
results_df = grid_search(
      burglary_model,
      inner_train,
      inner_val,
      *model_tuple[1:],
      device,
      param_grid,
      ward_idx_map=ward_idx_map,
      num_steps=500,
      )
print(results_df)
print("Best parameters:", results_df.loc[0].to_dict())

Training SVI: 100%|██████████| 500/500 [00:23<00:00, 21.16it/s]
Training SVI: 100%|██████████| 500/500 [00:24<00:00, 20.37it/s]
Training SVI: 100%|██████████| 500/500 [00:18<00:00, 27.42it/s]
Training SVI: 100%|██████████| 500/500 [00:24<00:00, 20.45it/s]
Training SVI: 100%|██████████| 500/500 [00:16<00:00, 30.23it/s]
Training SVI: 100%|██████████| 500/500 [00:22<00:00, 22.22it/s]

      lr guide_type      val_loss
0  0.010       diag   9700.565430
1  0.010    lowrank  11198.565430
2  0.005    lowrank  14001.527832
3  0.001       diag  20118.119141
4  0.001    lowrank  22064.511719
5  0.005       diag  62263.409180
Best parameters: {'lr': 0.01, 'guide_type': 'diag', 'val_loss': 9700.5654296875}



