In [1]:
from data_prep.data_prep import prepare_all_data
from training.training import prepare_model_data, grid_search
from training.feature_selection import forward_feature_selection, correlation_feature_selection,projection_predictive_selection
from model import burglary_model
from utils.utils import single_out_last, setup_reproducibility
from testing.testing import StatisticalTester
from pipeline import train_and_evaluate_model, cross_validate_time_splits
model_tuple, occupation_mappings,ward_idx_map = prepare_all_data("../merged_data.parquet", "lsoa")

In [2]:
device = setup_reproducibility(42)
print(f'Using device: {device}')
training_data, testing_data = single_out_last(model_tuple[0])

Using device: cuda


In [3]:
# #Automated Feature Selection Block 
# #Correlation Feature Selection: Computationally Light
# #FORWARD Feature Selection: Computationally Heavy, More Accurate but needs a GPU
# inner_train,inner_val = single_out_last(training_data)
# candidate_features = {
#     "static": model_tuple[1],
#     'dynamic': model_tuple[2],
#     'seasonal': model_tuple[3],
#     'time_trend': model_tuple[4],
#     'temporal': model_tuple[5],
#     'spatial': model_tuple[6],
#     }
# #selected_feats = correlation_feature_selection(inner_train,candidate_features, max_features=12,print_progress=True)
# selected_feats=projection_predictive_selection(
#     burglary_model,
#     inner_train,
#     inner_val,
#     candidate_features,
#     device,
#     num_steps=200,
#     lr=1e-3,
#     guide_type='diag',
#     verbose=False,
#     max_features=10,
#     print_progress=True
# )
# print(f"Selected features: {selected_feats}")
# #Forward Feature Selection:
# #selected_feats = forward_feature_selection(burglary_model,inner_train,inner_val,candidate_features,device,num_steps=200,lr=1e-3,guide_type='diag',verbose=False max_features=10, print_progress=True)
# ['Car or van availability|1 car or van in household (%)', 'Car or van availability|Cars per household', 'Car or van availability|No cars or vans in household (%)', 'Ethnic Group|BAME (%)', 'Household Composition|% Other multi person household', 'Household Composition|% Couple household with dependent children', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|6a', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|2', 'Tenure|Owned outright (%)', 'n_neighbors', 'Public Transport Accessibility Levels|% 4-6 (good access)|Level3_67', 'Tenure|Owned with a mortgage or loan (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|1b', 'Dwelling type|Flat, maisonette or apartment (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|4', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|5', 'Household Composition|% One person household', 'Ethnic Group|Asian/Asian British (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|3', 'Shopping locations', 'Ethnic Group|Black/African/Caribbean/Black British (%)', 'Ethnic Group|Other ethnic group (%)', 'area', 'Public transport locations', 'Public Transport Accessibility Levels|% 2-3 (average access)|Level3_66', 'Ethnic Group|White (%)', 'Education locations', 'Car or van availability|4 or more cars or vans in household (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|1a', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|6b', 'Public Transport Accessibility Levels|% 0-1 (poor access)|Level3_65', 'Car or van availability|2 cars or vans in household (%)'],
# ['Barriers to Housing and Services Rank (where 1 is most deprived)', 'Mid-year Population Estimates|All Ages', 'Mid-year Population Estimates|Aged 45-64', 'Mid-year Population Estimates|Aged 0-15', 'lag_1_x_n_neighbors', 'Employment Rank (where 1 is most deprived)', 'Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)', 'Mid-year Population Estimates|Working-age', 'Health Deprivation and Disability Rank (where 1 is most deprived)', 'Mid-year Population Estimates|Aged 65+', 'Education, Skills and Training Rank (where 1 is most deprived)', 'Income Rank (where 1 is most deprived)'],
# ['during_corona', 'post_corona'],
# ['time_log'],
# ['lag_1', 'lag_3'],
# ['lag1_diff_neighbors', 'lag_1_x_n_neighbors', 'lag1_median_neighbors', 'lag1_mean_neighbors', 'lag1_sum_neighbors'],

# rmse          2.185602
# mae           1.073718
# crps          1.527997


# ['Ethnic Group|BAME (%)', 'Car or van availability|No cars or vans in household (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|4', 'n_neighbors', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|3', 'Public Transport Accessibility Levels|% 0-1 (poor access)|Level3_65', 'Household Composition|% One person household', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|6a', 'Ethnic Group|Asian/Asian British (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|2', 'Ethnic Group|Black/African/Caribbean/Black British (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|6b', 'Car or van availability|1 car or van in household (%)', 'Ethnic Group|Other ethnic group (%)', 'Tenure|Owned with a mortgage or loan (%)', 'Car or van availability|2 cars or vans in household (%)'],
# ['Mid-year Population Estimates|All Ages', 'Mid-year Population Estimates|Aged 0-15', 'Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)', 'Education, Skills and Training Rank (where 1 is most deprived)', 'Mid-year Population Estimates|Aged 45-64', 'Employment Rank (where 1 is most deprived)', 'Income Rank (where 1 is most deprived)'],
# ["post_corona", "month_sin"],
# ['time_log'],
# ['lag_1'],
# ['lag1_diff_neighbors', 'lag1_mean_neighbors', 'lag_1_x_n_neighbors']

# rmse          1.595894
# mae           0.933347
# crps          0.702800



# ['Tenure|Owned with a mortgage or loan (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|2', 'Car or van availability|No cars or vans in household (%)', 'Household Composition|% One person household', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|4', 'Ethnic Group|Black/African/Caribbean/Black British (%)'],
# ['Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)', 'Employment Rank (where 1 is most deprived)', 'Mid-year Population Estimates|Aged 0-15', 'Income Rank (where 1 is most deprived)'],
# ["during_corona", "month_sin"],
# ['time_log'],
# ['lag_1'],
# ['lag1_diff_neighbors', 'lag1_mean_neighbors']

# rmse          1.595894
# mae           0.933347
# crps          0.702800

#model_tuple = (
#    model_tuple[0],
#['Ethnic Group|BAME (%)', 'Car or van availability|No cars or vans in household (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|4', 'n_neighbors', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|3', 'Public Transport Accessibility Levels|% 0-1 (poor access)|Level3_65', 'Household Composition|% One person household', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|6a', 'Ethnic Group|Asian/Asian British (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|2', 'Ethnic Group|Black/African/Caribbean/Black British (%)', 'Public Transport Accessibility Levels|Number of people in each PTAL level:|6b', 'Car or van availability|1 car or van in household (%)', 'Ethnic Group|Other ethnic group (%)', 'Tenure|Owned with a mortgage or loan (%)', 'Car or van availability|2 cars or vans in household (%)'],
#['Mid-year Population Estimates|All Ages', 'Mid-year Population Estimates|Aged 0-15', 'Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)', 'Education, Skills and Training Rank (where 1 is most deprived)', 'Mid-year Population Estimates|Aged 45-64', 'Employment Rank (where 1 is most deprived)', 'Income Rank (where 1 is most deprived)'],
#["post_corona", "month_sin"],
#['time_log'],
#['lag_1'],
#['lag1_diff_neighbors', 'lag1_mean_neighbors', 'lag_1_x_n_neighbors']
#)

gdf = model_tuple[0]
split_point = gdf['time_s'].max() - 3
train_df = gdf[gdf['time_s'] < split_point]
val_df = gdf[gdf['time_s'] >= split_point]

candidate_features = {
    'static': model_tuple[1],
    'dynamic': model_tuple[2],
    'seasonal': model_tuple[3],
    'time_trend': model_tuple[4],
    'temporal': model_tuple[5],
    'spatial': model_tuple[6],
}

selected_feats = forward_feature_selection(
    burglary_model,
    train_df,
    val_df,
    candidate_features,
    device,
    max_features=10,
    debug_random=False,
)

print('Selected features (debug mode):')
for group, feats in selected_feats.items():
    print(group, feats)

model_tuple = (
    model_tuple[0],
    selected_feats['static'],
    selected_feats['dynamic'],
    selected_feats['seasonal'],
    selected_feats['time_trend'],
    selected_feats['temporal'],
    selected_feats['spatial'],
)
print(model_tuple)


Training SVI: 100%|██████████| 500/500 [00:16<00:00, 30.83it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 31.53it/s]
Training SVI: 100%|██████████| 500/500 [00:16<00:00, 30.90it/s]
Training SVI: 100%|██████████| 500/500 [00:21<00:00, 23.37it/s]
Training SVI: 100%|██████████| 500/500 [00:19<00:00, 25.24it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.41it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.28it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.87it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.74it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 38.07it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 49.19it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 37.35it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 35.10it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 37.51it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 35.52it/s]
Training SVI: 100%|██████████| 500/500 [

Selected dynamic:Mid-year Population Estimates|All Ages -> 1.88


Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.20it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 41.29it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 35.87it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 41.68it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 47.56it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 41.60it/s]
Training SVI: 100%|██████████| 500/500 [00:09<00:00, 50.60it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 48.35it/s]
Training SVI: 100%|██████████| 500/500 [00:09<00:00, 52.72it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 36.50it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 35.97it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 33.50it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 34.97it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 32.44it/s]
Training SVI: 100%|██████████| 500/500 [00:16<00:00, 30.39it/s]
Training SVI: 100%|██████████| 500/500 [

Selected spatial:lag1_diff_neighbors -> 1.87


Training SVI: 100%|██████████| 500/500 [00:12<00:00, 38.93it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 41.88it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 38.25it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.59it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.24it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 41.62it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 39.90it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 42.27it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.97it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.11it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 47.51it/s]
Training SVI: 100%|██████████| 500/500 [00:10<00:00, 46.74it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.00it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 33.61it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 36.37it/s]
Training SVI: 100%|██████████| 500/500 [

Selected dynamic:Barriers to Housing and Services Rank (where 1 is most deprived) -> 1.74


Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.05it/s]
Training SVI: 100%|██████████| 500/500 [00:16<00:00, 30.80it/s]
Training SVI: 100%|██████████| 500/500 [00:11<00:00, 43.41it/s]
Training SVI: 100%|██████████| 500/500 [00:14<00:00, 33.67it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 41.37it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 32.57it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 37.25it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 36.63it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 38.24it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 39.65it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 39.60it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 40.16it/s]
Training SVI: 100%|██████████| 500/500 [00:12<00:00, 39.86it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 37.17it/s]
Training SVI: 100%|██████████| 500/500 [00:13<00:00, 37.57it/s]
Training SVI: 100%|██████████| 500/500 [

Selected None:None -> 1.74
Selected features (debug mode):
static []
dynamic ['Mid-year Population Estimates|All Ages', 'Barriers to Housing and Services Rank (where 1 is most deprived)']
seasonal []
time_trend []
temporal []
spatial ['lag1_diff_neighbors']
(        Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)  \
12                                                29111.0                     
13                                                29111.0                     
14                                                29111.0                     
15                                                29111.0                     
16                                                29111.0                     
...                                                   ...                     
853969                                             8904.0                     
853970                                             8904.0                     
853971                        

In [None]:
import pandas as pd
pd.read_dataframe(model_tuple[0]).to_parquet("model_tuple.parquet")

In [4]:
results = cross_validate_time_splits(model_tuple, "time_s", 12, burglary_model,
                           occupation_mappings[1], device, ward_idx_map)

Training SVI: 100%|██████████| 500/500 [00:23<00:00, 21.65it/s]
Training SVI: 100%|██████████| 500/500 [00:22<00:00, 22.60it/s]
Training SVI: 100%|██████████| 500/500 [00:16<00:00, 30.31it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 31.32it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 31.65it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.27it/s]
Training SVI: 100%|██████████| 500/500 [00:18<00:00, 26.83it/s]
Training SVI: 100%|██████████| 500/500 [00:18<00:00, 27.53it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.25it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 31.71it/s]
Training SVI: 100%|██████████| 500/500 [00:15<00:00, 32.67it/s]
Training SVI: 100%|██████████| 500/500 [00:16<00:00, 29.50it/s]


In [5]:
results[0]

Unnamed: 0,train_time,test_time,rmse,mae,crps
0,-1.480323,1.500464,1.627805,0.865257,0.297773
1,-1.480323,1.520604,1.787672,0.843303,0.33308
2,-1.480323,1.540745,2.922669,0.931118,0.375426
3,-1.480323,1.560885,1.383983,1.013494,0.350308
4,-1.480323,1.581026,1.711698,1.07291,0.351761
5,-1.480323,1.601166,1.358573,0.850352,0.313447
6,-1.480323,1.621307,1.625554,0.872482,0.342948
7,-1.480323,1.641447,1.281729,0.897059,0.316734
8,-1.480323,1.661587,1.413074,0.871877,0.31774
9,-1.480323,1.681728,1.460396,0.878928,0.332208


In [6]:
results[0].mean()

train_time   -1.480323
test_time     1.611236
rmse          1.640664
mae           0.899920
crps          0.327253
dtype: float64

In [7]:
train_data = prepare_model_data(training_data, *model_tuple[1:], device, ward_idx_map=ward_idx_map)
test_data = prepare_model_data(testing_data, *model_tuple[1:], device, train_data["means"], train_data["stds"], ward_idx_map)

evaluation_metrics, svi, svi.guide, prediction_tester = train_and_evaluate_model(train_data, test_data, burglary_model, occupation_mappings[1])

Training SVI: 100%|██████████| 500/500 [00:13<00:00, 36.47it/s]


In [8]:
factors_map ={
    "b_static": model_tuple[1],
    "b_dynamic": model_tuple[2],
    "b_seasonal": model_tuple[3],
    "b_time_tr": model_tuple[4],
    "b_temporal": model_tuple[5],
    "b_spatial": model_tuple[6],
}

statistical_tester = StatisticalTester(test_data, burglary_model, svi.guide, factors_map)

In [9]:
statistical_tester.predict(5_000)

In [10]:
factor_summaries = statistical_tester.evaluate_all(alpha=0.1)

In [11]:
# import pandas as pd
# inner_train, inner_val = single_out_last(training_data)
# param_grid = {
#       "lr": [1e-2, 5e-3, 1e-3],
#       "guide_type": ["diag", "lowrank"]
#       }
# results_df = grid_search(
#       burglary_model,
#       inner_train,
#       inner_val,
#       *model_tuple[1:],
#       device,
#       param_grid,
#       ward_idx_map=ward_idx_map,
#       num_steps=500,
#       )
# print(results_df)
# print("Best parameters:", results_df.loc[0].to_dict())

In [12]:
for key, value in factor_summaries.items():
    print(value.query("significant_CI == 'Yes'")["col"].tolist())

[]
['Mid-year Population Estimates|All Ages']
[]
[]
[]
[]


In [13]:
model_tuple[0]["Crime Rank (where 1 is most deprived)"].value_counts()

KeyError: 'Crime Rank (where 1 is most deprived)'

In [None]:
factor_summaries["b_dynamic"]

In [None]:
factors_map ={
    "b_static": model_tuple[1],
    "b_dynamic": model_tuple[2],
    "b_seasonal": model_tuple[3],
    "b_time_tr": model_tuple[4],
    "b_temporal": model_tuple[5],
    "b_spatial": model_tuple[6],
}

In [None]:
check = prediction_tester.get_median_predictions()

In [None]:
check["true"] = prediction_tester.y
check["diff"] = check["median"] - check["true"]

In [None]:
check

In [None]:
import matplotlib.pyplot as plt
wtf = check.nlargest(200, columns=["diff"])
plt.figure(figsize=(15, 6))
plt.plot(wtf.index, wtf["median"], marker='o', label="Predicted Median")
plt.plot(wtf.index, wtf["true"], marker='s', label="Actual")

plt.xlabel("Date")
plt.ylabel("Value")
plt.title("Two Series on the Same Scale")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
check[["median", "true", "diff"]].describe()