In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import train_model, prepare_model_data, create_learner
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import PredictionTester, StatisticalTester

model_tuple, occupation_mappings = prepare_all_data("../merged_data.parquet", "lsoa")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = setup_reproducibility(42)
training_data, testing_data = single_out_last(model_tuple[0])
svi = create_learner(burglary_model)

In [None]:
#List of Cut Features
factors_cut = {
    "b_static": [
        "Public Transport Accessibility Levels|Average Public Transport Access Score",
        "Shopping locations",
        "Emergency locations",
        "Ethnic Group|Mixed/multiple ethnic groups (%)",
        "area",
        "Car or van availability|4 or more cars or vans in household (%)",
        "Household Composition|% Lone parent household",
        "Public transport locations",
        "Household Composition|% Other multi person household",
        "Household Composition|% Couple household without dependent children",
        "Car or van availability|No cars or vans in household (%)",
        "Tenure|Owned outright (%)",
        "Ethnic Group|BAME (%)",
        "Tenure|Owned with a mortgage or loan (%)",
        "Food locations",
        "Public Transport Accessibility Levels|% 4-6 (good access)|Level3_66",
        "Ethnic Group|Other ethnic group (%)",
        "Dwelling type|Flat, maisonette or apartment (%)",
        "Leisure locations",
        "Entertainment locations",
        "Parking locations",
        "Car or van availability|1 car or van in household (%)"
    ],
    "b_dynamic": [
        "Living Environment Rank (where 1 is most deprived)",
        "Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)",
        "Employment Rank (where 1 is most deprived)",
        "Education, Skills and Training Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Aged 16-29",
        "Health Deprivation and Disability Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Working-age",
        "Mid-year Population Estimates|Aged 65+",
        "Income Rank (where 1 is most deprived)",
        "Barriers to Housing and Services Rank (where 1 is most deprived)"
    ],
    "b_seasonal": ["month_sin"],
    "b_time_tr": [],
    "b_temporal": ["lag_2", "lag_4", "lag_3"],
    "b_spatial": [
        "lag1_median_neighbors",
        "lag1_sum_neighbors",
        "lag1_mean_neighbors"
    ]
}


In [None]:
#Cut useless columns - You can ignore this block if you want to use all features
factors_keep = {
    "b_static": [
        "Household Composition|% One person household",
        "Public Transport Accessibility Levels|% 2-3 (average access)|Level3_66",
        "Ethnic Group|White (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|1b",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|2",
        "Ethnic Group|Asian/Asian British (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|4",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|5",
        "Car or van availability|Cars per household",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6a",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|6b",
        "Car or van availability|2 cars or vans in household (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|3",
        "Households|All households",
        "Ethnic Group|Black/African/Caribbean/Black British (%)",
        "Tenure|Social rented (%)",
        "Public Transport Accessibility Levels|Number of people in each PTAL level:|0",
        "Tenure|Private rented (%)",
        "shared_length",
        "Car or van availability|3 cars or vans in household (%)"
    ],
    "b_dynamic": [
        "Mid-year Population Estimates|All Ages",
        "Crime Rank (where 1 is most deprived)",
        "Mid-year Population Estimates|Aged 30-44"
    ],
    "b_seasonal": ["during_corona", "post_corona", "month_cos"],
    "b_time_tr": ["time_s"],
    "b_temporal": ["lag_1"],
    "b_spatial": []
}
model_tuple = list(model_tuple)     
for idx, key in enumerate(
        ["b_static", "b_dynamic", "b_seasonal", "b_time_tr", "b_temporal", "b_spatial"],
        start=1):
    model_tuple[idx] = [c for c in model_tuple[idx] if c in factors_keep[key]]
model_tuple = tuple(model_tuple)

In [5]:
train_data = prepare_model_data(training_data, *model_tuple[1:], device)
training_results = train_model(train_data, svi, num_steps=500)

Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.24it/s]


In [8]:
test_data = prepare_model_data(testing_data, *model_tuple[1:], device, training_results[-2], training_results[-1])
prediction_tester = PredictionTester(test_data, burglary_model, svi.guide, occupation_mappings[1])

In [9]:
prediction_tester.predict(5_000)

In [10]:
prediction_tester.get_all_predictions()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
E01000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,0
E01000002,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2
E01000003,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
E01000005,0,0,0,0,2,0,0,0,1,0,...,1,0,0,1,1,0,0,1,0,1
E01000006,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E01035688,0,0,2,1,4,0,0,3,0,0,...,0,1,1,1,1,1,0,0,0,5
E01035689,0,4,0,0,0,0,0,2,0,0,...,1,1,0,0,1,1,0,0,0,0
E01035690,1,4,1,1,5,0,2,4,2,0,...,5,0,0,1,1,1,1,1,1,2
E01035691,0,3,1,5,1,1,0,6,0,2,...,1,2,1,1,2,1,0,1,1,0


In [11]:
prediction_tester.get_confidence_intervals(alpha=0.05)

Unnamed: 0,lower_bound,upper_bound
E01000001,0.0,1.0
E01000002,0.0,1.0
E01000003,0.0,2.0
E01000005,0.0,2.0
E01000006,0.0,2.0
...,...,...
E01035688,0.0,4.0
E01035689,0.0,2.0
E01035690,0.0,6.0
E01035691,0.0,4.0


In [12]:
prediction_tester.get_mean_predictions()

Unnamed: 0,mean
E01000001,0.2100
E01000002,0.1698
E01000003,0.2920
E01000005,0.2834
E01000006,0.3842
...,...
E01035688,0.9254
E01035689,0.4198
E01035690,1.9862
E01035691,1.2760


In [13]:
prediction_tester.get_median_predictions("sample_predictions.parquet")

Unnamed: 0,median
E01000001,0.0
E01000002,0.0
E01000003,0.0
E01000005,0.0
E01000006,0.0
...,...
E01035688,1.0
E01035689,0.0
E01035690,2.0
E01035691,1.0


In [14]:
factors_map ={
    "b_static": model_tuple[1],
    "b_dynamic": model_tuple[2],
    "b_seasonal": model_tuple[3],
    "b_time_tr": model_tuple[4],
    "b_temporal": model_tuple[5],
    "b_spatial": model_tuple[6],
}


statistical_tester = StatisticalTester(test_data, burglary_model, svi.guide, factors_map)

In [15]:
statistical_tester.predict(5_000)

In [16]:
factor_summaries= statistical_tester.evaluate_all()
print(factor_summaries)

{'b_static':                                                   col      mean  ci_lower  \
0                Ethnic Group|Asian/Asian British (%) -0.279886 -0.337358   
1   Household Composition|% Couple household with ... -0.312407 -0.378775   
2   Public Transport Accessibility Levels|% 0-1 (p...  0.482611  0.354949   
3                              Ethnic Group|White (%) -0.231163 -0.308196   
4   Public Transport Accessibility Levels|% 2-3 (a...  1.006114  0.917092   
5   Ethnic Group|Black/African/Caribbean/Black Bri... -0.144837 -0.226331   
6   Public Transport Accessibility Levels|Number o...  0.368385  0.297645   
7        Household Composition|% One person household  0.170154  0.088318   
8   Public Transport Accessibility Levels|Number o...  0.664504  0.584470   
9                           Households|All households -0.184477 -0.269171   
10  Public Transport Accessibility Levels|Number o...  0.445443  0.390085   
11  Public Transport Accessibility Levels|Number o...  0.900215