#### Getting Predicted Data

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm



df = pd.read_csv('predicted_data/predicted_data.csv', index_col=0)
# Step 2: Compute the median across each row (i.e., across all 5000 columns)
median_series = df.median(axis=1)
# Step 3: Create a new DataFrame with 'median' as the only column
median_df = pd.DataFrame({'median': median_series})
predicted_data = median_df

list_of_lsoa = predicted_data.index.tolist()
predicted_data

Unnamed: 0,median
E01000001,0.0
E01000002,0.0
E01000003,0.0
E01000005,1.0
E01000006,0.0
...,...
E01035688,1.0
E01035689,0.0
E01035690,2.0
E01035691,1.0


#### Getting Observed Data

In [2]:
# Load the CSV files
metropolitan_df = pd.read_csv('observed_data/2025-03-metropolitan-street.csv')
city_of_london_df = pd.read_csv('observed_data/2025-03-city-of-london-street.csv')

# Combine both dataframes
combined_df = pd.concat([metropolitan_df, city_of_london_df], ignore_index=True)

# Filter to only include burglaries
burglary_df = combined_df[combined_df['Crime type'].str.lower() == 'burglary']

# Count the number of burglaries per LSOA
burglary_counts = burglary_df['LSOA code'].value_counts().reset_index()
burglary_counts.columns = ['LSOA', 'count']
burglary_counts.set_index('LSOA', inplace=True)
burglary_counts = burglary_counts.sort_index(ascending=True)

observed_data = burglary_counts
observed_data = observed_data.reset_index()
observed_data = observed_data.rename(columns={'index': 'LSOA'})

observed_data


Unnamed: 0,LSOA,count
0,E01000003,1
1,E01000005,1
2,E01000007,4
3,E01000009,1
4,E01000013,2
...,...,...
2320,E01035716,14
2321,E01035717,6
2322,E01035718,3
2323,E01035719,1


#### Function for Evaluation Metric

**RMSE** tells you: “How far off were your mean predictions?” \
**CRPS** tells you: “How well did your full predictive distribution reflect the reality, including its uncertainty?”

##### RMSE 

In [3]:

def compute_rmse(predictions, lsoa_codes, observed_df):
    """
    Compute CRPS between predicted samples and observed burglary counts.

    Parameters:
    - predictions: np.ndarray of shape (n_samples, n_LSOAs)
    - lsoa_codes: list of LSOA codes in the same order as columns of `predictions`
    - observed_df: DataFrame or Series indexed by LSOA with counts

    Returns:
    - average RMSE score over all LSOAs
    """
    def rmse_single(pred_samples, obs):
        pred_samples = np.asarray(pred_samples)
        term1 = np.mean((pred_samples - obs)**2)
        term2 = 0.5 * np.mean(np.abs(pred_samples[:, None] - pred_samples[None, :]))
        return term1 - term2

    all_lsoas = pd.Series(0, index=lsoa_codes)

    # Accept either a DataFrame with 'count' or a Series
    if isinstance(observed_df, pd.DataFrame):
        observed_series = observed_df['count']
    else:
        observed_series = observed_df

    observed_aligned = all_lsoas.copy()
    observed_aligned.update(observed_series)

    rmse_scores = []
    for i, lsoa in enumerate(lsoa_codes):
        obs = observed_aligned[lsoa]
        rmse = rmse_single(predictions[:, i], obs)
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)



In [4]:
# list_of_lsoa must match predicted_data.columns exactly
list_of_lsoa = predicted_data.columns.tolist()

# Compute RMSE
rmse_score = compute_rmse(predicted_data.to_numpy(), list_of_lsoa, observed_data)
print(f"rmse Score: {rmse_score:.4f}")


rmse Score: 1.6255


average predictive distribution is, on average, 1.625 units away from the true crime count (in a probabilistic sense).

- predicts crime 1.6255+- away from true prediction

##### CRPS

In order to calculate CRPS score, we need to find the probability distributions of each LSOA (estimated using the simulated data, under the assumption it is a gaussian distribution)

In [5]:
# Step 2: For each LSOA, fit a normal distribution to its 5000 samples
lsoa_pdfs = {}
means = []
stds = []
index_list = []
epsilon = 1e-6  # Tiny value to prevent divide-by-zero

for lsoa_code, row in predicted_data.iterrows():
    samples = row.values.astype(float)
    mu = np.mean(samples)
    sigma = np.std(samples)

    # If sigma is too small, adjust it
    if sigma < epsilon:
        sigma = epsilon

    lsoa_pdfs[lsoa_code] = norm(loc=mu, scale=sigma)
    means.append(mu)
    stds.append(sigma)
    index_list.append(lsoa_code)

# Step 3: Save stats
stats_df = pd.DataFrame({
    'mu': means,
    'sigma': stds
}, index=index_list)

stats_df = stats_df.reset_index()  # index becomes a column
stats_df = stats_df.rename(columns={'index': 'LSOA'})

stats_df

Unnamed: 0,LSOA,mu,sigma
0,E01000001,0.0,0.000001
1,E01000002,0.0,0.000001
2,E01000003,0.0,0.000001
3,E01000005,1.0,0.000001
4,E01000006,0.0,0.000001
...,...,...,...
4959,E01035688,1.0,0.000001
4960,E01035689,0.0,0.000001
4961,E01035690,2.0,0.000001
4962,E01035691,1.0,0.000001


In [6]:
def calculate_crps(mu, sigma, x_obs):
    """
    Compute the Continuous Ranked Probability Score (CRPS)
    for a normal distribution N(mu, sigma^2) and an observed value x_obs.

    Parameters:
    - mu (float): Mean of the normal distribution.
    - sigma (float): Standard deviation of the normal distribution. Must be > 0.
    - x_obs (float): The observed value.

    Returns:
    - crps (float): The CRPS score.
    """
    if sigma <= 0:
        raise ValueError("Standard deviation sigma must be greater than 0.")

    z = (x_obs - mu) / sigma
    pdf = norm.pdf(z)
    cdf = norm.cdf(z)
    crps = sigma * (z * (2 * cdf - 1) + 2 * pdf - 1 / np.sqrt(np.pi))
    return crps

Function to calculate CRPS

In [7]:
def compute_average_crps(stats_df, observation_data):
    """
    Computes the average CRPS for predictions in stats_df using observational values
    from observation_data (which has 'LSOA' as its index).

    Parameters:
    - stats_df (pd.DataFrame): Contains 'LSOA', 'mu', 'sigma'
    - observation_data (pd.DataFrame): Indexed by 'LSOA', must contain the observed value as a column

    Returns:
    - float: The average CRPS across all valid rows
    """
    crps_list = []

    for _, row in stats_df.iterrows():
        lsoa = row['LSOA']
        mu = row['mu']
        sigma = row['sigma']

        # Try to get observed value from the observation_data index
        try:
            x_obs = observation_data.loc[lsoa].values[0]
        except KeyError:
            x_obs = 0  # if LSOA not found, default to 0

        try:
            crps = calculate_crps(mu, sigma, x_obs)
            crps_list.append(crps)
        except ValueError:
            continue  # skip if sigma <= 0

    if not crps_list:
        return float('nan')

    return np.mean(crps_list)

In [9]:
compute_average_crps(stats_df,observed_data)

0.4696816290452512


score captures both bias (distance between predicted mean and true count) and sharpness (how spread or confident predictions were). 

score suggests that the predicted distributions are, on average, about 0.47 crimes away from the observed value, factoring in uncertainty.
