#### Getting Predicted Data

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import norm



# Load the simulated predictions: the first CSV column becomes the row index,
# every remaining column is treated as one simulated draw for that row.
original_data = pd.read_csv('predicted_data/predicted_data.csv', index_col=0)

# Point prediction per row = median taken across that row's columns
median_series = original_data.median(axis=1)

# Single-column frame named 'count' holding those medians
median_df = median_series.to_frame(name='count')

# Move the index into a regular column and label it 'LSOA'
predicted_data = median_df.reset_index().rename(columns={'index': 'LSOA'})
original_data = original_data.reset_index().rename(columns={'index': 'LSOA'})

predicted_data

Unnamed: 0,LSOA,count
0,E01000001,0.0
1,E01000002,0.0
2,E01000003,0.0
3,E01000005,1.0
4,E01000006,0.0
...,...,...
4959,E01035688,1.0
4960,E01035689,0.0
4961,E01035690,2.0
4962,E01035691,1.0


#### Getting Observed Data

In [13]:
# Read the two street-level crime CSV extracts
metropolitan_df = pd.read_csv('observed_data/2025-03-metropolitan-street.csv')
city_of_london_df = pd.read_csv('observed_data/2025-03-city-of-london-street.csv')

# Stack both extracts into one frame with a fresh 0..n-1 index
combined_df = pd.concat([metropolitan_df, city_of_london_df], ignore_index=True)

# Keep only the burglary rows (case-insensitive match on 'Crime type')
is_burglary = combined_df['Crime type'].str.lower().eq('burglary')
burglary_df = combined_df.loc[is_burglary]

# Tally burglaries per LSOA code; index is the LSOA code, sorted ascending
burglary_counts = (
    burglary_df['LSOA code']
    .value_counts()
    .rename_axis('LSOA')
    .to_frame('count')
    .sort_index(ascending=True)
)

# Flat two-column table: 'LSOA', 'count'
observed_data = burglary_counts.reset_index()

observed_data


Unnamed: 0,LSOA,count
0,E01000003,1
1,E01000005,1
2,E01000007,4
3,E01000009,1
4,E01000013,2
...,...,...
2320,E01035716,14
2321,E01035717,6
2322,E01035718,3
2323,E01035719,1


In [None]:
def Full_Data_In_Merged_RMSE(predicted_data, observed_data):
    """
    Left-joins observed counts onto predicted counts by LSOA.

    Both inputs are left unmodified; the join key 'LSOA' is compared as a
    string on both sides. A shared 'count' column comes back as 'count_pred'
    (from predicted_data) and 'count_obs' (from observed_data).

    Parameters:
    - predicted_data: DataFrame with an 'LSOA' column and a 'count' column
    - observed_data: DataFrame with an 'LSOA' column and a 'count' column

    Returns:
    - DataFrame with one row per predicted_data row; 'count_obs' is 0 where
      the LSOA has no match in observed_data
    """
    # assign() builds new frames, so the callers' objects are never touched
    pred_side = predicted_data.assign(LSOA=predicted_data['LSOA'].astype(str))
    obs_side = observed_data.assign(LSOA=observed_data['LSOA'].astype(str))

    joined = pred_side.merge(
        obs_side,
        how='left',
        on='LSOA',
        suffixes=('_pred', '_obs'),
    )

    # An LSOA absent from the observed side is treated as a zero count
    joined['count_obs'] = joined['count_obs'].fillna(0)
    return joined

def Full_Data_In_Merged_CRPS(stats_df, observed_data):
    """
    Left-joins observed data onto the fitted per-LSOA statistics.

    Parameters:
    - stats_df: DataFrame with an 'LSOA' column plus the fitted statistics
    - observed_data: DataFrame with an 'LSOA' column plus observed columns

    Returns:
    - DataFrame with one row per stats_df row. Columns that were not already
      in stats_df (i.e. the ones added by the merge) have missing values
      replaced by 0; columns that came from stats_df are left as they were.
    """
    merged_crps = pd.merge(stats_df, observed_data, on='LSOA', how='left')

    # Only zero-fill what the join added. A blanket fillna(0) would also turn a
    # missing mu/sigma into 0, masking an upstream problem and producing a
    # zero standard deviation further down the pipeline.
    observed_cols = [col for col in merged_crps.columns if col not in stats_df.columns]
    if observed_cols:
        merged_crps[observed_cols] = merged_crps[observed_cols].fillna(0)

    return merged_crps

### MERGED FOR RMSE

In [15]:
# Join predicted and observed counts per LSOA. The helper is named
# Full_Data_In_Merged_RMSE; the shorter name Full_Data_In_Merged is not
# defined, so calling it fails on a fresh kernel.
merged_rmse = Full_Data_In_Merged_RMSE(predicted_data, observed_data)
print(merged_rmse)

           LSOA  count_pred  count_obs
0     E01000001         0.0        0.0
1     E01000002         0.0        0.0
2     E01000003         0.0        1.0
3     E01000005         1.0        1.0
4     E01000006         0.0        0.0
...         ...         ...        ...
4959  E01035688         1.0        0.0
4960  E01035689         0.0        0.0
4961  E01035690         2.0        2.0
4962  E01035691         1.0        1.0
4963  E01035692         1.0        0.0

[4964 rows x 3 columns]


#### Function for Evaluation Metric

**RMSE** tells you: “How far off were your point predictions (here, the per-LSOA medians)?” \
**CRPS** tells you: “How well did your full predictive distribution reflect reality, including its uncertainty?”

##### RMSE 

In [None]:
def rmse_score(merged: pd.DataFrame) -> float:
    """
    Computes the RMSE within each LSOA and returns the mean of those values.

    Each LSOA's RMSE is sqrt(mean((count_pred - count_obs) ** 2)) over that
    LSOA's rows. When an LSOA has exactly one row, its RMSE reduces to the
    absolute error |count_pred - count_obs|, so on a one-row-per-LSOA frame
    the returned value equals the mean absolute error.

    Parameters:
    - merged: DataFrame with ['LSOA', 'count_pred', 'count_obs']

    Returns:
    - float: average of the per-LSOA RMSE values
    """
    squared_error = (merged['count_pred'] - merged['count_obs']) ** 2

    # Group the squared-error Series by the LSOA column instead of calling
    # DataFrame.groupby(...).apply(...): the latter passes the grouping column
    # to the callback, which pandas flags with a deprecation warning.
    mean_squared_error_by_lsoa = squared_error.groupby(merged['LSOA']).mean()

    rmse_by_lsoa = np.sqrt(mean_squared_error_by_lsoa)
    return rmse_by_lsoa.mean()


In [17]:
# Compute RMSE
# Keep the result under its own name: assigning it to `rmse_score` would
# replace the function with a float, so re-running this cell would fail.
rmse_value = rmse_score(merged_rmse)
print(f"rmse Score: {rmse_value:.4f}")

rmse Score: 0.7725


  rmse_by_lsoa = merged.groupby('LSOA').apply(compute_rmse)


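The median point predictions are, on average, 0.7725 burglaries away from the observed count in each LSOA. Because every LSOA contributes a single row, the per-LSOA RMSE equals the absolute error, so this figure is effectively the mean absolute error of the point predictions (it says nothing about the spread of the predictive distribution).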

- On average, the predicted count differs from the observed count by about ±0.77 burglaries per LSOA.

##### CRPS

To calculate the CRPS, we need a probability distribution for each LSOA. We estimate it from the simulated data, under the assumption that the samples follow a Gaussian distribution.

In [18]:
def fit_gaussian_per_lsoa(original_data: pd.DataFrame, id_col: str = 'LSOA', epsilon: float = 1e-6):
    """
    Estimates normal-distribution parameters from the predicted samples of each row.

    For every row, the mean and the population standard deviation (ddof=0) are
    taken across all columns other than `id_col`.

    Parameters:
    - original_data: DataFrame with one identifier column (e.g. 'LSOA'); every
      other column is read as a numeric sample
    - id_col: name of the identifier column (e.g., 'LSOA')
    - epsilon: minimum allowed std deviation to avoid a degenerate distribution

    Returns:
    - stats_df: DataFrame with columns [id_col, 'mu', 'sigma'], one row per
      input row, in the input order
    """
    # One (n_rows, n_samples) float matrix instead of a Python loop over rows
    samples = original_data.drop(columns=id_col).to_numpy(dtype=float)

    mu = samples.mean(axis=1)
    # Floor the spread so a row of identical samples never yields sigma == 0
    sigma = np.maximum(samples.std(axis=1), epsilon)

    stats_df = pd.DataFrame({
        id_col: original_data[id_col].to_numpy(),
        'mu': mu,
        'sigma': sigma
    })

    return stats_df


### MERGED FOR CRPS

In [34]:
# Estimate a per-LSOA mean ('mu') and standard deviation ('sigma') from the predicted samples
stats_df = fit_gaussian_per_lsoa(original_data)
# Left-join the observed counts onto the fitted parameters by LSOA
merged_crps = Full_Data_In_Merged_CRPS(stats_df,observed_data)
# Display the merged frame
merged_crps

Unnamed: 0,LSOA,mu,sigma,count
0,E01000001,0.5334,0.749189,0.0
1,E01000002,0.5808,0.770630,0.0
2,E01000003,0.4614,0.670306,1.0
3,E01000005,0.7356,0.861912,1.0
4,E01000006,0.5468,0.746599,0.0
...,...,...,...,...
4959,E01035688,1.0402,1.038356,0.0
4960,E01035689,0.6722,0.830631,0.0
4961,E01035690,1.9140,1.455543,2.0
4962,E01035691,1.1714,1.119295,1.0


In [22]:
def calculate_crps(mu, sigma, x_obs):
    """
    Closed-form CRPS of a normal forecast N(mu, sigma^2) against the observation x_obs.

    With z = (x_obs - mu) / sigma, and phi / Phi the standard normal pdf / cdf:
        CRPS = sigma * (z * (2 * Phi(z) - 1) + 2 * phi(z) - 1 / sqrt(pi))
    """
    standardized = (x_obs - mu) / sigma
    density = norm.pdf(standardized)
    cumulative = norm.cdf(standardized)
    bracket = standardized * (2 * cumulative - 1) + 2 * density - 1 / np.sqrt(np.pi)
    return sigma * bracket

def compute_average_crps_from_fitted(stats_df: pd.DataFrame) -> float:
    """
    Computes the average CRPS over all rows, using each row's fitted mu/sigma
    and the value in its 'count' column as the observation.

    Parameters:
    - stats_df: DataFrame with ['mu', 'sigma', 'count'] columns (other columns,
      such as 'LSOA', are ignored)

    Returns:
    - float: average CRPS
    """
    mu = stats_df['mu'].to_numpy(dtype=float)
    sigma = stats_df['sigma'].to_numpy(dtype=float)
    x_obs = stats_df['count'].to_numpy(dtype=float)

    # calculate_crps is built from elementwise numpy/scipy operations, so one
    # call on whole columns replaces the row-by-row loop.
    crps_scores = calculate_crps(mu, sigma, x_obs)

    return np.mean(crps_scores)


In [23]:
# Print the mean CRPS taken over every row of merged_crps
print(compute_average_crps_from_fitted(merged_crps))

0.5894143915877256



The CRPS captures both bias (the distance between the predicted mean and the true count) and sharpness (how spread out or confident the predictions were).

The score suggests that the predicted distributions are, on average, about 0.59 crimes away from the observed value, factoring in uncertainty.
