In [3]:
import pandas as pd
import numpy as np
import random

# Load station data
stations_df = pd.read_csv("../data/results.csv")
udf_cols = [col for col in stations_df.columns if col.startswith("UDF(")]
stations_df[udf_cols] = stations_df[udf_cols].apply(pd.to_numeric, errors='coerce')

# Load time matrix and strip depot
time_matrix = pd.read_csv("../build/time_matrix.csv", index_col=0)
time_matrix = time_matrix.drop(index='depot', columns='depot')

# Constants
dur_per_move = 60  # seconds per bike move
total_time_sec = 5 * 5 * 3600  # 5 trucks, 5 hours
max_moves = total_time_sec // dur_per_move

# Sample sizes and repetitions
sample_sizes = [200, 400, 600, 800, 1000, 2109]
reps = 10

# Storage
results = []

for n in sample_sizes:
    for rep in range(reps):
        sample_stations = stations_df.sample(n)
        station_ids = sample_stations['StationID'].tolist()

        # Filter time matrix for these stations
        sub_time = time_matrix.loc[station_ids, station_ids]

        # Estimate reachable moves: take shortest durations and assume truck reaches those
        all_durations = sub_time.values[np.triu_indices_from(sub_time, k=1)]
        shortest_durations = np.sort(all_durations)[0:max_moves]
        estimated_moves = len(shortest_durations)

        # UDF range
        udf_max = sample_stations[udf_cols].max(axis=1).sum()
        udf_min = sample_stations[udf_cols].min(axis=1).sum()
        udf_range = udf_max - udf_min

        # Assume we can reduce some fraction of UDF_range (e.g., 40–70% based on moves)
        impact_fraction = random.uniform(0.4, 0.7)
        estimated_udf_reduction = udf_range * impact_fraction

        results.append({
            "Sample Size": n,
            "Rep": rep + 1,
            "Worst Case UDF": udf_max,
            "Best Case UDF": udf_min,
            "UDF Range": udf_range,
            "Reachable Moves": estimated_moves,
            "Est. Achievable UDF Reduction": estimated_udf_reduction
        })

# Turn into DataFrame
results_df = pd.DataFrame(results)

# Summary stats
summary = results_df.groupby("Sample Size").agg({
    "Worst Case UDF": ["mean", "std"],
    "Best Case UDF": ["mean", "std"],
    "Reachable Moves": ["mean"],
    "Est. Achievable UDF Reduction": ["mean", "std"]
}).round(2)

print(summary)


            Worst Case UDF         Best Case UDF         Reachable Moves  \
                      mean     std          mean     std            mean   
Sample Size                                                                
200                4174.93  299.91       3803.04  294.99          1500.0   
400                8448.97  341.77       7702.39  332.35          1500.0   
600               12539.80  777.40      11429.80  745.55          1500.0   
800               16858.12  474.23      15356.07  456.46          1500.0   
1000              21112.32  593.01      19242.43  569.31          1500.0   
2109              44160.13    0.00      40231.99    0.00          1500.0   

            Est. Achievable UDF Reduction          
                                     mean     std  
Sample Size                                        
200                                196.12   42.98  
400                                424.50   82.39  
600                                584.92   90.92  
800