In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import train_model, prepare_model_data, create_learner
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import PredictionTester, StatisticalTester

model_tuple, occupation_mappings = prepare_all_data("../merged_data.parquet", "lsoa")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = setup_reproducibility(42)
training_data, testing_data = single_out_last(model_tuple[0])
svi = create_learner(burglary_model)

In [None]:
#Cut useless columns
factors_keep = {
    "b_static": [
        "Household Composition|% One person household",
        "Public Transport Accessibility Levels|% 2-3 (average access)|Level3_66",
        "Ethnic Group|White (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1b",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|2",
        "Ethnic Group|Asian/Asian British (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|4",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|5",
        "Car or van availability|Cars per household",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6b",
        "Car or van availability|2 cars or vans in household (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|3",
        "Households|All households",
        "Ethnic Group|Black/African/Caribbean/Black British (%)",
        "Tenure|Social rented (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|0",
        "Tenure|Private rented (%)",
        "shared_length",
        "Car or van availability|3 cars or vans in household (%)"
    ],
    "b_dynamic": [
        "Mid-year Population Estimates|All Ages",
        "Crime Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Aged 30-44"
    ],
    "b_seasonal": ["during_corona", "post_corona", "month_cos"],
    "b_time_tr": ["time_s"],
    "b_temporal": ["lag_1"],
    "b_spatial": []
}
model_tuple = list(model_tuple)     
for idx, key in enumerate(
        ["b_static", "b_dynamic", "b_seasonal", "b_time_tr", "b_temporal", "b_spatial"],
        start=1):
    model_tuple[idx] = [c for c in model_tuple[idx] if c in factors_keep[key]]
model_tuple = tuple(model_tuple)

In [4]:
train_data = prepare_model_data(training_data, *model_tuple[1:], device)
training_results = train_model(train_data, svi, num_steps=500)

Training SVI: 100%|██████████| 500/500 [00:11<00:00, 44.04it/s]


In [5]:
test_data = prepare_model_data(testing_data, *model_tuple[1:], device, training_results[-2], training_results[-1])
prediction_tester = PredictionTester(test_data, burglary_model, svi.guide, occupation_mappings[1])

In [6]:
prediction_tester.predict(5_000)

In [7]:
prediction_tester.get_all_predictions()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
E01000001,0,0,0,0,1,1,0,0,1,0,...,2,1,1,0,2,2,0,0,0,0
E01000002,0,0,2,0,0,1,0,0,0,0,...,0,0,1,0,0,0,2,1,0,2
E01000003,1,0,0,1,2,1,0,0,0,0,...,0,1,1,0,0,0,1,0,1,1
E01000005,1,2,0,0,1,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,2
E01000006,0,0,0,2,0,0,0,0,2,0,...,0,0,1,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E01035688,3,1,1,1,3,1,0,1,2,1,...,3,1,1,1,2,1,0,2,0,0
E01035689,1,1,1,1,0,0,0,3,2,0,...,2,0,2,1,0,0,0,1,0,0
E01035690,2,1,1,4,3,3,0,3,2,1,...,1,1,2,2,1,3,0,2,1,1
E01035691,4,0,1,3,0,1,3,1,1,2,...,3,3,1,1,1,1,1,1,3,1


In [8]:
prediction_tester.get_confidence_intervals(alpha=0.05)

Unnamed: 0,lower_bound,upper_bound
E01000001,0.0,2.0
E01000002,0.0,2.0
E01000003,0.0,2.0
E01000005,0.0,3.0
E01000006,0.0,2.0
...,...,...
E01035688,0.0,3.0
E01035689,0.0,2.0
E01035690,0.0,5.0
E01035691,0.0,4.0


In [9]:
prediction_tester.get_mean_predictions()

Unnamed: 0,mean
E01000001,0.5078
E01000002,0.5260
E01000003,0.5466
E01000005,0.6452
E01000006,0.4482
...,...
E01035688,1.0196
E01035689,0.5780
E01035690,1.8070
E01035691,1.2540


In [10]:
prediction_tester.get_median_predictions("sample_predictions.parquet")

Unnamed: 0,median
E01000001,0.0
E01000002,0.0
E01000003,0.0
E01000005,0.0
E01000006,0.0
...,...
E01035688,1.0
E01035689,0.0
E01035690,2.0
E01035691,1.0


In [11]:
factors_map ={
    "b_static": model_tuple[1],
    "b_dynamic": model_tuple[2],
    "b_seasonal": model_tuple[3],
    "b_time_tr": model_tuple[4],
    "b_temporal": model_tuple[5],
    "b_spatial": model_tuple[6],
}


statistical_tester = StatisticalTester(test_data, burglary_model, svi.guide, factors_map)

In [12]:
statistical_tester.predict(5_000)

In [13]:
factor_summaries= statistical_tester.evaluate_all()
print(factor_summaries)

{'b_static':                                                   col      mean  ci_lower  \
0   Car or van availability|2 cars or vans in hous...  0.291008  0.226556   
1   Public Transport Accessibility Levels|Number o...  0.375006  0.336238   
2   Public Transport Accessibility Levels|Number o...  0.381461  0.343778   
3   Public Transport Accessibility Levels|Number o...  0.429671  0.374289   
4   Public Transport Accessibility Levels|Number o...  0.445937  0.389051   
5   Public Transport Accessibility Levels|Number o...  0.225969  0.171797   
6   Public Transport Accessibility Levels|Number o...  0.339355  0.300232   
7   Public Transport Accessibility Levels|% 2-3 (a...  0.153825  0.093481   
8   Public Transport Accessibility Levels|Number o...  0.324017  0.275496   
9                           Tenure|Private rented (%)  0.199007  0.140500   
10       Household Composition|% One person household  0.183481  0.112969   
11                           Tenure|Social rented (%)  0.105172