In [None]:
# print out correlation and rmse results for predicted P(T) and predicted ratings for real data for full model, reports only model, and ratings only model
import os
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import warnings
plt.rcParams['font.family'] = 'serif'

# Set the font used for math expressions to LaTeX
plt.rcParams["mathtext.fontset"] = "cm"

In [2]:
# file paths
base_file = '/share/garg/311_data/sb2377/clean_codebase/three_year_base.csv'
type_rating_observed_base_file = '/share/garg/311_data/sb2377/clean_codebase/three_year_type_rating_observed_base.csv'
results_dir = '/share/garg/311_data/sb2377/results'
demographics_file = '/share/garg/311_data/sb2377/clean_codebase/tract_demographics.csv'

# user specified arguments
types = {'Street': 'StreetConditionDOT',
         'Park': 'MaintenanceorFacilityDPR',
         'Rodent': 'RodentDOHMH',
         'Food': 'FoodDOHMH',
         'DCWP': 'ConsumerComplaintDCWP'}
models = {'Full model': {'job_ids':[3000] + [i * 3 + 3005 for i in range(12)]},
          'Ratings-only model': {'job_ids':[3002] + [i * 3 + 3007 for i in range(12)]},
          'Reports-only model': {'job_ids':[3001] + [i * 3 + 3006 for i in range(12)]}}
epoch = '59'
budget_k = 100

In [3]:
# load files
base_df = pd.read_csv(base_file)
type_rating_observed_base_df = pd.read_csv(type_rating_observed_base_file)
demographics_df = pd.read_csv(demographics_file)
base_node_df = base_df[['GEOID', 'node_idxs']].drop_duplicates()
demographics_df = pd.merge(demographics_df, base_node_df, on='GEOID', how='left')

In [4]:
# get type indices
# for df with all types
type_df = base_df[['typeagency', 'type_idxs']].drop_duplicates()
indices = {}
for type_name, type_id in types.items():
    idx = type_df[type_df['typeagency'] == type_id]['type_idxs'].iloc[0]
    indices[type_name] = idx

# for df with only types with observed ratings
type_df = type_rating_observed_base_df[['typeagency', 'type_idxs']].drop_duplicates()
type_rating_observed_indices = {}
for type_name, type_id in types.items():
    idx = type_df[type_df['typeagency'] == type_id]['type_idxs'].iloc[0]
    type_rating_observed_indices[type_name] = idx

In [5]:
def create_groups(race_white_nh_pct):
    """
    Create race group labels based on terciles of white non-Hispanic percentage.
    
    Parameters:
    -----------
    race_white_nh_pct : array-like
        Array of percent white non-Hispanic for each census tract (0-100 scale)
    
    Returns:
    --------
    labels : numpy array of strings
        Array of labels: 'predominantly_minority', 'mixed', or 'predominantly_white'
    
    Notes:
    ------
    - Predominantly minority: Bottom 33% (lowest % white)
    - Mixed/Diverse: Middle 33%
    - Predominantly white: Top 33% (highest % white)
    """
    race_white_nh_pct = np.array(race_white_nh_pct)
    
    # Calculate tercile thresholds (33rd and 67th percentiles)
    p33 = np.percentile(race_white_nh_pct, 33.33)
    p67 = np.percentile(race_white_nh_pct, 66.67)
    
    # Create labels
    labels = np.empty(len(race_white_nh_pct), dtype=object)
    labels[race_white_nh_pct < p33] = 0
    labels[(race_white_nh_pct >= p33) & (race_white_nh_pct < p67)] = 1
    labels[race_white_nh_pct >= p67] = 2
    
    return labels

In [6]:
# get predicted ratings for all jobs for types with observed ratings
checkpoint_file = '{}/job{}/model-epoch={}.ckpt'
results_file = '{}/job{}/epoch={}_test.pkl'
checkpoint_counters = {}
results_counters = {}
for m in models:
    checkpoint_counters[m] = 0
    results_counters[m] = 0
type_rating_observed_dfs = {}
for m in models:
    type_rating_observed_dfs[m] = []

for m in models:
    for i, job_idx in enumerate(models[m]['job_ids']):
        if os.path.exists(checkpoint_file.format(results_dir, job_idx, epoch)):
            checkpoint_counters[m] += 1
        if os.path.exists(results_file.format(results_dir, job_idx, epoch)):
            results_counters[m] += 1
            with open(results_file.format(results_dir, job_idx, epoch), 'rb') as file:
                pred_rating, true_rating, mask, node_embedding, type_embedding, node_idxs, type_idxs, demographics, pred_pt, true_t = pickle.load(file)

            raw_demographics_df = pd.DataFrame()
            raw_demographics_df['node_idxs'] = node_idxs
            raw_demographics_df = raw_demographics_df.merge(demographics_df, on='node_idxs', how='left')
            
            income_groups = create_groups(raw_demographics_df['income_median'].values)
            race_groups = create_groups(raw_demographics_df['race_white_nh_pct'].values)

            df = pd.DataFrame()
            df['pred_rating'] = pred_rating
            df['true_rating'] = true_rating
            df['node_idxs'] = node_idxs
            df['type_idxs'] = type_idxs
            df['pred_pt'] = pred_pt
            df['true_t'] = true_t
            df['mask'] = mask
            df['income_groups'] = income_groups
            df['race_groups'] = race_groups
            df['income_median'] = raw_demographics_df['income_median'].values
            df['race_white_nh_pct'] = raw_demographics_df['race_white_nh_pct'].values
            df['population'] = raw_demographics_df['population'].values

            type_rating_observed_dfs[m].append(df)

for m in models:
    print('{}: checkpoint files done = {}'.format(m, checkpoint_counters[m]))
    print('{}: results files done = {}'.format(m, results_counters[m]))

Full model: checkpoint files done = 13
Full model: results files done = 13
Ratings-only model: checkpoint files done = 13
Ratings-only model: results files done = 13
Reports-only model: checkpoint files done = 13
Reports-only model: results files done = 13


In [None]:
def evaluate_topk_coverage(true_ratings, predicted_ratings, k_values=[5, 10, 20, 50]):
    """
    Evaluate top-k coverage for incident prediction.
    
    Args:
        true_ratings: Ground truth ratings (lower = worse condition)
        predicted_ratings: Model predicted ratings
        k_values: List of k values to evaluate
    
    Returns:
        Dictionary of coverage scores for each k
    """
    results = {}
    
    for k in k_values:
        # Get indices of k worst neighborhoods (lowest ratings)
        true_topk = np.argsort(true_ratings)[:k]  # Lowest k ratings
        pred_topk = np.argsort(predicted_ratings)[:k]
        
        # Calculate coverage: intersection / k
        coverage = len(np.intersect1d(true_topk, pred_topk)) / k
        results[f'top_{k}_coverage'] = coverage
    
    return results

def expected_calibration_error(true_ratings, predicted_ratings, n_bins=10):
    """
    Expected Calibration Error for continuous predictions.
    
    ECE = weighted average of |predicted - actual| across bins
    Lower is better (0 = perfect calibration)
    """
    bin_edges = np.percentile(predicted_ratings, 
                               np.linspace(0, 100, n_bins + 1))
    
    ece = 0.0
    total_samples = len(predicted_ratings)
    
    for i in range(n_bins):
        in_bin = (predicted_ratings >= bin_edges[i]) & \
                 (predicted_ratings < bin_edges[i + 1])
        
        if in_bin.sum() > 0:
            bin_weight = in_bin.sum() / total_samples
            pred_mean = predicted_ratings[in_bin].mean()
            true_mean = true_ratings[in_bin].mean()
            
            ece += bin_weight * abs(pred_mean - true_mean)
    
    return ece

def calculate_correlation_by_group(predictions, actuals, group_labels):
    """
    Calculate correlation between predictions and actuals for each demographic group.
    
    Parameters:
    -----------
    predictions : array-like
        Predicted values (e.g., predicted ratings)
    actuals : array-like
        Actual/ground truth values (e.g., true ratings)
    group_labels : array-like
        Demographic group labels for each observation
    
    Returns:
    --------
    results_df : pandas DataFrame
        DataFrame with correlation statistics for each group
    """
    predictions = np.array(predictions)
    actuals = np.array(actuals)
    group_labels = np.array(group_labels)
    
    # Calculate overall correlation
    overall_corr = np.corrcoef(predictions, actuals)[0, 1]
    
    # Calculate correlation for each group
    results = []
    unique_groups = np.unique(group_labels)
    
    for group in unique_groups:
        # Get data for this group
        mask = group_labels == group
        group_pred = predictions[mask]
        group_actual = actuals[mask]
        
        # Calculate correlation
        group_corr = np.corrcoef(group_pred, group_actual)[0, 1]
        
        # Calculate correlation gap (group - overall)
        corr_gap = group_corr - overall_corr
        
        results.append({
            'group': group,
            'correlation_gap': corr_gap,
        })
    
    results_df = pd.DataFrame(results)
    return results_df

def calculate_calibration_by_group(predicted_probs, actuals, group_labels):
    """
    Calculate calibration error for each demographic group.
    
    Calibration measures whether predicted probabilities match empirical frequencies.
    Expected Calibration Error (ECE) = |mean(predicted_probs) - mean(actuals)|
    
    Parameters:
    -----------
    predicted_probs : array-like
        Predicted probabilities (e.g., P(report) from model)
    actuals : array-like
        Actual binary outcomes (e.g., 0/1 for whether report occurred)
    group_labels : array-like
        Demographic group labels for each observation
    
    Returns:
    --------
    results_df : pandas DataFrame
        DataFrame with calibration statistics for each group
    """
    predicted_probs = np.array(predicted_probs)
    actuals = np.array(actuals)
    group_labels = np.array(group_labels)
    
    # Calculate overall calibration error
    overall_mean_pred = np.mean(predicted_probs)
    overall_mean_actual = np.mean(actuals)
    overall_ece = np.abs(overall_mean_pred - overall_mean_actual)
    
    # Calculate calibration for each group
    results = []
    unique_groups = np.unique(group_labels)
    
    for group in unique_groups:
        # Get data for this group
        mask = group_labels == group
        group_pred = predicted_probs[mask]
        group_actual = actuals[mask]

        # Mean predicted probability
        mean_pred = np.mean(group_pred)
        
        # Empirical frequency (mean of binary actuals)
        mean_actual = np.mean(group_actual)
        
        # Expected Calibration Error (ECE)
        ece = np.abs(mean_pred - mean_actual)
        
        # Calibration gap (group ECE - overall ECE)
        ece_gap = ece - overall_ece
        
        results.append({
            'group': group,
            'calibration_gap': ece_gap
        })
  
    results_df = pd.DataFrame(results)
    
    return results_df

def compute_representation_ratio(predicted_ratings, demographic_data, population, budget_k):
    """
    Compute representation-based spatial equity using continuous demographic data (% white residents).
    """
    df = pd.DataFrame({
        'predicted_rating': predicted_ratings,
        'demographic_data': demographic_data,
        'pop': population
    })
    
    # Select tracts that receive the budget (lowest predicted ratings)
    served_idx = np.argsort(df['predicted_rating'])[:budget_k]
    df['served'] = False
    df.loc[served_idx, 'served'] = True
    
    # Weighted average % white among served tracts
    served_weighted = np.average(df[df['served']]['demographic_data'], weights=df[df['served']]['pop'])
    
    # Weighted average % white among all tracts
    all_weighted = np.average(df['demographic_data'], weights=df['pop'])
    
    # Ratio: representation of white residents in served vs. total
    representation_ratio = served_weighted / all_weighted 
    
    return representation_ratio

In [9]:
# print out correlation and rmse results for predicted ratings
for m in models:
    df_set = type_rating_observed_dfs[m]
    metric_vals = {'correlation': [],
               'rmse': [], 
               'top_5_coverage': [], 
               'top_10_coverage': [], 
               'top_20_coverage': [], 
               'top_50_coverage': [],
               'ece': [],
               'income_corr_gap': {0: [], 1: [], 2: []}, 
               'race_corr_gap': {0: [], 1: [], 2: []},
               'income_ece_gap': {0: [], 1: [], 2: []},
               'race_ece_gap': {0: [], 1: [], 2: []},
               'income_representation_ratio': [],
               'race_representation_ratio': []}
    for t in types:
        type_metric_vals = {'correlation': [],
               'rmse': [], 
               'top_5_coverage': [], 
               'top_10_coverage': [], 
               'top_20_coverage': [], 
               'top_50_coverage': [],
               'ece': [],
               'income_corr_gap': {0: [], 1: [], 2: []}, 
               'race_corr_gap': {0: [], 1: [], 2: []},
               'income_ece_gap': {0: [], 1: [], 2: []},
               'race_ece_gap': {0: [], 1: [], 2: []},
               'income_representation_ratio': [],
               'race_representation_ratio': []}
        idx = indices[t]
        type_rating_observed_idx = type_rating_observed_indices[t]
        for df in df_set:
            df_type = df[df['type_idxs'] == idx]
            if m == 'Ratings-only model':
                df_type = df[df['type_idxs'] == type_rating_observed_idx]
            else:
                df_type = df[df['type_idxs'] == idx]
            node_df = df_type.groupby(['node_idxs', 'type_idxs']).mean().reset_index()

            with warnings.catch_warnings():
                if m == 'Reports-only model':
                    # for reports-only model, we use -P(T) as a proxy for r
                    pred_rating = -1 * node_df['pred_pt']
                else:
                    pred_rating = node_df['pred_rating']
                # correlation
                corr = pearsonr(node_df['pred_rating'], node_df['true_rating'])
                type_metric_vals['correlation'].append(corr[0])
                # rmse
                rmse = np.sqrt(mean_squared_error(node_df['pred_rating'], node_df['true_rating']))
                type_metric_vals['rmse'].append(rmse)
                # top k coverage for predicted ratings
                coverages = evaluate_topk_coverage(node_df['true_rating'], pred_rating)
                for k in coverages:
                    type_metric_vals[k].append(coverages[k])
                # ECE
                ece = expected_calibration_error(node_df['true_rating'], pred_rating)
                type_metric_vals['ece'].append(ece)
                # correlation gap
                income_corr_gap = calculate_correlation_by_group(pred_rating, node_df['true_rating'], node_df['income_groups'])
                race_corr_gap = calculate_correlation_by_group(pred_rating, node_df['true_rating'], node_df['race_groups'])
                for i in range(3):
                    type_metric_vals['income_corr_gap'][i].append(income_corr_gap[income_corr_gap['group'] == i]['correlation_gap'].item())
                    type_metric_vals['race_corr_gap'][i].append(race_corr_gap[race_corr_gap['group'] == i]['correlation_gap'].item())
                # ECE gap
                income_ece_gap = calculate_calibration_by_group(pred_rating, node_df['true_rating'], node_df['income_groups'])
                race_ece_gap = calculate_calibration_by_group(pred_rating, node_df['true_rating'], node_df['race_groups'])
                for i in range(3):
                    type_metric_vals['income_ece_gap'][i].append(income_ece_gap[income_ece_gap['group'] == i]['calibration_gap'].item())
                    type_metric_vals['race_ece_gap'][i].append(race_ece_gap[race_ece_gap['group'] == i]['calibration_gap'].item())
                # representation ratio
                income_representation_ratio = compute_representation_ratio(pred_rating, node_df['income_median'], node_df['population'], budget_k)
                race_representation_ratio = compute_representation_ratio(pred_rating, node_df['race_white_nh_pct'], node_df['population'], budget_k)
                type_metric_vals['income_representation_ratio'].append(income_representation_ratio)
                type_metric_vals['race_representation_ratio'].append(race_representation_ratio)
                
        for k in type_metric_vals:
            if k in ['income_corr_gap', 'race_corr_gap', 'income_ece_gap', 'race_ece_gap']:
                for i in range(3):
                    metric_vals[k][i].append(type_metric_vals[k][i])
            else:
                metric_vals[k].append(type_metric_vals[k])
    
    for k in metric_vals:
        with warnings.catch_warnings():
            # calculate mean and 95% confidence interval over correlations
            warnings.simplefilter("ignore")
            if k in ['income_corr_gap', 'race_corr_gap', 'income_ece_gap', 'race_ece_gap']:
                for i in range(3):
                    metric_vals_k = np.array(metric_vals[k][i])
                    filtered_metric_vals = metric_vals_k[~np.isnan(metric_vals_k).any(axis=1)]
                    mean_for_each_job = filtered_metric_vals.mean(axis=0)
                    mean_overall = filtered_metric_vals.mean()
                    se_across_jobs = np.std(mean_for_each_job) / np.sqrt(len(mean_for_each_job) - 1)
                    print(f'Model: {m}, Group number: {i}, {k}: {mean_overall:.4f} \pm {1.96 * se_across_jobs:.4f}')
            else:
                metric_vals_k = np.array(metric_vals[k])
                filtered_metric_vals = metric_vals_k[~np.isnan(metric_vals_k).any(axis=1)]
                mean_for_each_job = filtered_metric_vals.mean(axis=0)
                mean_overall = filtered_metric_vals.mean()
                se_across_jobs = np.std(mean_for_each_job) / np.sqrt(len(mean_for_each_job) - 1)
                print('Model: {}, {}: {:.4f} \pm {:.4f}'.format(m, k, mean_overall, 1.96 * se_across_jobs))

Model: Full model, correlation: 0.5303 \pm 0.0194
Model: Full model, rmse: 0.5833 \pm 0.0123
Model: Full model, top_5_coverage: 0.1169 \pm 0.0287
Model: Full model, top_10_coverage: 0.1692 \pm 0.0257
Model: Full model, top_20_coverage: 0.2115 \pm 0.0189
Model: Full model, top_50_coverage: 0.2655 \pm 0.0148
Model: Full model, ece: 0.2072 \pm 0.0080
Model: Full model, Group number: 0, income_corr_gap: 0.0028 \pm 0.0030
Model: Full model, Group number: 1, income_corr_gap: -0.0023 \pm 0.0028
Model: Full model, Group number: 2, income_corr_gap: -0.0056 \pm 0.0042
Model: Full model, Group number: 0, race_corr_gap: -0.0005 \pm 0.0043
Model: Full model, Group number: 1, race_corr_gap: -0.0121 \pm 0.0081
Model: Full model, Group number: 2, race_corr_gap: -0.0006 \pm 0.0048
Model: Full model, Group number: 0, income_ece_gap: 0.0030 \pm 0.0030
Model: Full model, Group number: 1, income_ece_gap: -0.0012 \pm 0.0049
Model: Full model, Group number: 2, income_ece_gap: 0.0061 \pm 0.0040
Model: Full mo