In [24]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
import time

In [2]:
# Load the data
stations = pd.read_csv('station_data.csv')
pois = pd.read_csv('edinburgh_pois.csv')

In [3]:
print("="*70)
print("STEP 1: DATA LOADING & VALIDATION")
print("="*70)

STEP 1: DATA LOADING & VALIDATION


In [14]:
# Print basic details
print(f"Stations: {len(stations)} candidates")
print(f"POIs: {len(pois)} demand points")
print(f"Station columns: {stations.columns.tolist()}")
print(f"POI columns: {pois.columns.tolist()}")

Stations: 85 candidates
POIs: 33669 demand points
Station columns: ['station_id', 'name', 'address', 'rental_uris', 'lat', 'lon', 'capacity']
POI columns: ['name', 'geometry', 'lat', 'lon', 'category']


In [15]:
# Check for missing coordinates
missing_stations = stations[['lat', 'lon']].isnull().any(axis=1).sum()
missing_pois = pois[['lat', 'lon']].isnull().any(axis=1).sum()
print(f"Stations with missing coordinates: {missing_stations}")
print(f"POIs with missing coordinates: {missing_pois}")


Stations with missing coordinates: 0
POIs with missing coordinates: 0


In [22]:
# Check basic spatial bounds
print(f"Station lat range: {stations['lat'].min():.4f} - {stations['lat'].max():.4f}")
print(f"POI lat range: {pois['lat'].min():.4f} - {pois['lat'].max():.4f}")

Station lat range: 55.9084 - 55.9800
POI lat range: 55.8640 - 55.9835


In [20]:
# 5. Sample preview
print("\nFirst 3 stations:")
print(stations[['station_id', 'name', 'lat', 'lon', 'capacity']].head(3))
print("\nFirst 3 POIs:")
print(pois[['name', 'lat', 'lon', 'category']].head(3))


First 3 stations:
   station_id                      name        lat       lon  capacity
0        2268             Picardy Place  55.956535 -3.186248        31
1        2265  Musselburgh Brunton Hall  55.943961 -3.058307        29
2        2263          Musselburgh Lidl  55.943880 -3.066754        34

First 3 POIs:
                        name        lat       lon category
0      Wester Hailes Library  55.916229 -3.285146  library
1  Pirniehall Primary School  55.973706 -3.251074   school
2       Corstorphine Library  55.940710 -3.281036  library


In [13]:
print("\n" + "="*70)
print("✓✓✓ STEP 1 COMPLETE ✓✓✓")
print("="*70)


✓✓✓ STEP 1 COMPLETE ✓✓✓


In [23]:
print("="*70)
print("STEP 2: CALCULATE HAVERSINE DISTANCE MATRIX")
print("="*70)

STEP 2: CALCULATE HAVERSINE DISTANCE MATRIX


In [27]:
I = len(pois)      # Number of POIs (demand points)
J = len(stations)  # Number of stations (candidate locations)
print(f"Loaded {I} POIs and {J} stations")

Loaded 33669 POIs and 85 stations


In [28]:
# Define haversine distance function
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    Returns distance in kilometers
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

In [30]:
# Calculate distance matrix
print(f"\n Calculating distance matrix ({I} × {J})...")
print("  This may take 2-5 minutes...\n")

start_time = time.time()
distance_matrix = np.zeros((I, J))

# Progress tracking
for i in range(I):
    if (i + 1) % 5000 == 0:  # Print progress every 5000 POIs
        elapsed = time.time() - start_time
        print(f"  Progress: {i+1}/{I} POIs ({100*(i+1)/I:.1f}%) - {elapsed:.1f}s elapsed")
    
    poi_lat = pois.iloc[i]['lat']
    poi_lon = pois.iloc[i]['lon']
    
    for j in range(J):
        station_lat = stations.iloc[j]['lat']
        station_lon = stations.iloc[j]['lon']
        
        distance_matrix[i, j] = haversine(poi_lat, poi_lon, station_lat, station_lon)

elapsed = time.time() - start_time
print(f"✓ Distance matrix calculated in {elapsed:.1f} seconds")


 Calculating distance matrix (33669 × 85)...
  This may take 2-5 minutes...

  Progress: 5000/33669 POIs (14.9%) - 9.6s elapsed
  Progress: 10000/33669 POIs (29.7%) - 19.1s elapsed
  Progress: 15000/33669 POIs (44.6%) - 28.5s elapsed
  Progress: 20000/33669 POIs (59.4%) - 38.0s elapsed
  Progress: 25000/33669 POIs (74.3%) - 47.5s elapsed
  Progress: 30000/33669 POIs (89.1%) - 57.0s elapsed
✓ Distance matrix calculated in 63.9 seconds


In [31]:
# Save distance matrix
print("\n⏳ Saving distance matrix...")
np.savetxt('distance_matrix.csv', distance_matrix, delimiter=',', fmt='%.6f')
print("✓ Saved: distance_matrix.csv")



⏳ Saving distance matrix...
✓ Saved: distance_matrix.csv


In [32]:
 #Calculate and display statistics
print("\n" + "="*70)
print("DISTANCE MATRIX STATISTICS")
print("="*70)


DISTANCE MATRIX STATISTICS


In [33]:
min_dist = distance_matrix.min()
max_dist = distance_matrix.max()
mean_dist = distance_matrix.mean()
median_dist = np.median(distance_matrix)

print(f"\nShape: {distance_matrix.shape}")
print(f"Min distance: {min_dist:.4f} km")
print(f"Max distance: {max_dist:.4f} km")
print(f"Mean distance: {mean_dist:.4f} km")
print(f"Median distance: {median_dist:.4f} km")



Shape: (33669, 85)
Min distance: 0.0182 km
Max distance: 19.8792 km
Mean distance: 4.9976 km
Median distance: 4.4316 km


In [34]:
# Distribution by distance bands
print(f"\nDistance distribution:")
print(f"  < 0.5 km: {(distance_matrix < 0.5).sum():,} entries")
print(f"  0.5-1 km: {((distance_matrix >= 0.5) & (distance_matrix < 1)).sum():,} entries")
print(f"  1-2 km: {((distance_matrix >= 1) & (distance_matrix < 2)).sum():,} entries")
print(f"  2-5 km: {((distance_matrix >= 2) & (distance_matrix < 5)).sum():,} entries")
print(f"  > 5 km: {(distance_matrix >= 5).sum():,} entries")


Distance distribution:
  < 0.5 km: 18,371 entries
  0.5-1 km: 56,759 entries
  1-2 km: 246,717 entries
  2-5 km: 1,364,169 entries
  > 5 km: 1,175,849 entries


In [35]:
# Find minimum distance for each POI
min_distances_per_poi = distance_matrix.min(axis=1)
print(f"\nClosest station to each POI:")
print(f"  Min: {min_distances_per_poi.min():.4f} km")
print(f"  Max: {min_distances_per_poi.max():.4f} km")
print(f"  Mean: {min_distances_per_poi.mean():.4f} km")
print(f"  Median: {np.median(min_distances_per_poi):.4f} km")


Closest station to each POI:
  Min: 0.0182 km
  Max: 7.5563 km
  Mean: 1.1166 km
  Median: 0.8431 km


In [36]:
min_distances_per_poi

array([1.19359935, 1.60337914, 0.74818363, ..., 2.0022961 , 1.91384257,
       2.60432355], shape=(33669,))

In [37]:
# Save minimum distances per POI
min_distances_df = pd.DataFrame({
    'POI_index': range(I),
    'closest_station_distance_km': min_distances_per_poi
})
min_distances_df.to_csv('min_distances_per_poi.csv', index=False)
print("\n✓ Saved: min_distances_per_poi.csv")


✓ Saved: min_distances_per_poi.csv


In [39]:
# ============================================================================
# NEW: FIND CLOSEST STATION FOR EACH POI
# ============================================================================

print("\n⏳ Identifying closest stations to each POI...")

# Find index of closest station for each POI
closest_station_indices = np.argmin(distance_matrix, axis=1)
closest_distances = np.min(distance_matrix, axis=1)

# Create detailed results dataframe
closest_stations_df = pd.DataFrame({
    'POI_index': range(I),
    'POI_name': pois['name'].values,
    'POI_category': pois['category'].values,
    'closest_station_index': closest_station_indices,
    'closest_station_id': stations.iloc[closest_station_indices]['station_id'].values,
    'closest_station_name': stations.iloc[closest_station_indices]['name'].values,
    'distance_to_closest_km': closest_distances
})

# Save comprehensive results
print("⏳ Saving comprehensive results...")
closest_stations_df.to_csv('poi_closest_stations_without_lat_log.csv', index=False)
print("✓ Saved: poi_closest_stations.csv")


⏳ Identifying closest stations to each POI...
⏳ Saving comprehensive results...
✓ Saved: poi_closest_stations.csv


In [40]:
print("\n" + "="*70)
print("CLOSEST STATION TO EACH POI (Coverage Analysis)")
print("="*70)

print(f"\nClosest station distances:")
print(f"  Min: {closest_distances.min():.4f} km")
print(f"  Max: {closest_distances.max():.4f} km")
print(f"  Mean: {closest_distances.mean():.4f} km")
print(f"  Median: {np.median(closest_distances):.4f} km")
print(f"  Std Dev: {closest_distances.std():.4f} km")

print(f"\nCoverage by distance from closest station:")
print(f"  Within 0.5 km:  {(closest_distances < 0.5).sum():,} POIs ({100*(closest_distances < 0.5).sum()/I:.2f}%)")
print(f"  Within 1 km:    {(closest_distances < 1).sum():,} POIs ({100*(closest_distances < 1).sum()/I:.2f}%)")
print(f"  Within 2 km:    {(closest_distances < 2).sum():,} POIs ({100*(closest_distances < 2).sum()/I:.2f}%)")
print(f"  Within 5 km:    {(closest_distances < 5).sum():,} POIs ({100*(closest_distances < 5).sum()/I:.2f}%)")



CLOSEST STATION TO EACH POI (Coverage Analysis)

Closest station distances:
  Min: 0.0182 km
  Max: 7.5563 km
  Mean: 1.1166 km
  Median: 0.8431 km
  Std Dev: 1.1115 km

Coverage by distance from closest station:
  Within 0.5 km:  9,455 POIs (28.08%)
  Within 1 km:    19,858 POIs (58.98%)
  Within 2 km:    30,258 POIs (89.87%)
  Within 5 km:    32,697 POIs (97.11%)


In [41]:
print("\n" + "="*70)
print("STATION POPULARITY (Most coverage in baseline/1-NN)")
print("="*70)

# Count how many POIs have each station as closest
station_popularity = pd.Series(closest_station_indices).value_counts().sort_index()
station_popularity_df = pd.DataFrame({
    'station_index': station_popularity.index,
    'station_id': stations.iloc[station_popularity.index]['station_id'].values,
    'station_name': stations.iloc[station_popularity.index]['name'].values,
    'num_pois_closest': station_popularity.values,
    'capacity': stations.iloc[station_popularity.index]['capacity'].values
})

# Sort by number of POIs
station_popularity_df = station_popularity_df.sort_values('num_pois_closest', ascending=False)

print("\nTop 10 stations (by number of POIs with them as closest):")
print(station_popularity_df.head(10).to_string(index=False))

print("\nBottom 10 stations (by number of POIs with them as closest):")
print(station_popularity_df.tail(10).to_string(index=False))

# Save station popularity
station_popularity_df.to_csv('station_popularity_baseline.csv', index=False)
print("\n✓ Saved: station_popularity_baseline.csv")


STATION POPULARITY (Most coverage in baseline/1-NN)

Top 10 stations (by number of POIs with them as closest):
 station_index  station_id                    station_name  num_pois_closest  capacity
            37        1728         Portobello - Kings Road              2784        69
            40        1725                   Edinburgh Zoo              2476        20
            14        1809        Royal Edinburgh Hospital              2064        23
            24        1757                      Meggetland              1918        21
            74         253                Kings Building 2              1800        20
            13        1813 Milton Road - Edinburgh College              1799        24
            25        1756        Western General Hospital              1771        20
            11        1815   Sighthill - Edinburgh College              1393        17
            56        1038              South Trinity Road              1389        31
            17    

In [42]:
print("\n" + "="*70)
print("COVERAGE ANALYSIS BY POI CATEGORY")
print("="*70)

for category in pois['category'].unique():
    cat_mask = pois['category'] == category
    cat_distances = closest_distances[cat_mask]
    cat_count = cat_mask.sum()
    
    print(f"\n{category.upper()} ({cat_count} POIs):")
    print(f"  Avg distance to closest station: {cat_distances.mean():.4f} km")
    print(f"  Max distance to closest station: {cat_distances.max():.4f} km")
    print(f"  POIs within 0.5 km: {(cat_distances < 0.5).sum()} ({100*(cat_distances < 0.5).sum()/cat_count:.2f}%)")
    print(f"  POIs within 1 km: {(cat_distances < 1).sum()} ({100*(cat_distances < 1).sum()/cat_count:.2f}%)")
    print(f"  POIs within 2 km: {(cat_distances < 2).sum()} ({100*(cat_distances < 2).sum()/cat_count:.2f}%)")



COVERAGE ANALYSIS BY POI CATEGORY

LIBRARY (49 POIs):
  Avg distance to closest station: 1.1907 km
  Max distance to closest station: 7.2702 km
  POIs within 0.5 km: 27 (55.10%)
  POIs within 1 km: 32 (65.31%)
  POIs within 2 km: 41 (83.67%)

SCHOOL (188 POIs):
  Avg distance to closest station: 1.4555 km
  Max distance to closest station: 7.2347 km
  POIs within 0.5 km: 58 (30.85%)
  POIs within 1 km: 95 (50.53%)
  POIs within 2 km: 150 (79.79%)

UNIVERSITY (28 POIs):
  Avg distance to closest station: 0.4721 km
  Max distance to closest station: 6.7418 km
  POIs within 0.5 km: 25 (89.29%)
  POIs within 1 km: 26 (92.86%)
  POIs within 2 km: 27 (96.43%)

RESIDENTIAL (32837 POIs):
  Avg distance to closest station: 1.1199 km
  Max distance to closest station: 7.5563 km
  POIs within 0.5 km: 9149 (27.86%)
  POIs within 1 km: 19268 (58.68%)
  POIs within 2 km: 29511 (89.87%)

COMMERCIAL (551 POIs):
  Avg distance to closest station: 0.8358 km
  Max distance to closest station: 7.4368 km


In [43]:
print("\n" + "="*70)
print("SAMPLE POI-STATION MAPPINGS (First 10 POIs)")
print("="*70)
print("\n" + closest_stations_df.head(10)[['POI_name', 'POI_category', 
                                            'closest_station_name', 
                                            'distance_to_closest_km']].to_string(index=False))



SAMPLE POI-STATION MAPPINGS (First 10 POIs)

                         POI_name POI_category          closest_station_name  distance_to_closest_km
            Wester Hailes Library      library Sighthill - Edinburgh College                1.193599
        Pirniehall Primary School       school      Western General Hospital                1.603379
             Corstorphine Library      library                 Edinburgh Zoo                0.748184
Fettes College Preparatory School       school      Western General Hospital                0.659447
                         Haywired       school     Edinburgh Royal Infirmary                1.686661
              Musselburgh Library      library      Musselburgh Brunton Hall                0.182720
     St David's RC Primary School       school      Western General Hospital                1.586040
                 Playfair Library      library                 Surgeons Hall                0.108982
       Communications & Marketing   universit

In [44]:
df = pd.read_csv('poi_closest_stations.csv')
print((df['distance_to_closest_km'] < 1).sum())


19858


In [45]:
# ============================================================================
# NEW: FIND POIS CLOSEST TO EACH STATION
# ============================================================================

print("\n⏳ Identifying POIs closest to each station...")

# For each station, find all POIs and their distances
station_to_pois = {}

for j in range(J):
    distances_to_this_station = distance_matrix[:, j]
    closest_poi_index = np.argmin(distances_to_this_station)
    closest_distance = distances_to_this_station[closest_poi_index]
    
    station_to_pois[j] = {
        'closest_poi_index': closest_poi_index,
        'closest_poi_name': pois.iloc[closest_poi_index]['name'],
        'closest_poi_category': pois.iloc[closest_poi_index]['category'],
        'closest_distance': closest_distance
    }

# Create dataframe for station-centric view
station_centric_df = pd.DataFrame([
    {
        'station_index': j,
        'station_id': stations.iloc[j]['station_id'],
        'station_name': stations.iloc[j]['name'],
        'station_lat': stations.iloc[j]['lat'],
        'station_lon': stations.iloc[j]['lon'],
        'station_capacity': stations.iloc[j]['capacity'],
        'closest_poi_name': station_to_pois[j]['closest_poi_name'],
        'closest_poi_category': station_to_pois[j]['closest_poi_category'],
        'closest_poi_index': station_to_pois[j]['closest_poi_index'],
        'closest_distance_km': station_to_pois[j]['closest_distance']
    }
    for j in range(J)
])

# Save station-centric results
station_centric_df.to_csv('stations_closest_pois.csv', index=False)
print("✓ Saved: stations_closest_pois.csv")

# ============================================================================
# EXTENDED: TOP-K CLOSEST POIS TO EACH STATION
# ============================================================================

print("\n⏳ Identifying top-10 closest POIs to each station...")

top_k = 10  # Top 10 closest POIs per station

station_top_pois_list = []

for j in range(J):
    distances_to_this_station = distance_matrix[:, j]
    
    # Get indices of top-k closest POIs
    top_k_indices = np.argsort(distances_to_this_station)[:top_k]
    
    for rank, poi_idx in enumerate(top_k_indices, 1):
        station_top_pois_list.append({
            'station_index': j,
            'station_id': stations.iloc[j]['station_id'],
            'station_name': stations.iloc[j]['name'],
            'rank': rank,
            'poi_index': poi_idx,
            'poi_name': pois.iloc[poi_idx]['name'],
            'poi_category': pois.iloc[poi_idx]['category'],
            'poi_lat': pois.iloc[poi_idx]['lat'],
            'poi_lon': pois.iloc[poi_idx]['lon'],
            'distance_km': distance_matrix[poi_idx, j]
        })

station_top_pois_df = pd.DataFrame(station_top_pois_list)
station_top_pois_df.to_csv('stations_top_10_closest_pois.csv', index=False)
print("✓ Saved: stations_top_10_closest_pois.csv")

# ============================================================================
# DISPLAY SAMPLE: TOP STATIONS BY CLOSEST POI DISTANCE
# ============================================================================

print("\n" + "="*70)
print("SAMPLE: CLOSEST POI TO EACH STATION")
print("="*70)

print("\nFirst 15 stations and their closest POIs:")
print(station_centric_df.head(15)[['station_name', 'closest_poi_name', 
                                     'closest_poi_category', 
                                     'closest_distance_km']].to_string(index=False))

# ============================================================================
# EXTENDED ANALYSIS: CATCHMENT AREAS
# ============================================================================

print("\n" + "="*70)
print("STATION CATCHMENT AREAS (POIs within 1 km)")
print("="*70)

catchment_areas = []

for j in range(J):
    distances_to_this_station = distance_matrix[:, j]
    
    # POIs within various distances
    within_0_5km = (distances_to_this_station < 0.5).sum()
    within_1km = (distances_to_this_station < 1).sum()
    within_2km = (distances_to_this_station < 2).sum()
    
    catchment_areas.append({
        'station_index': j,
        'station_id': stations.iloc[j]['station_id'],
        'station_name': stations.iloc[j]['name'],
        'capacity': stations.iloc[j]['capacity'],
        'pois_within_0_5km': within_0_5km,
        'pois_within_1km': within_1km,
        'pois_within_2km': within_2km,
        'avg_distance_to_all_pois_km': distances_to_this_station.mean()
    })

catchment_df = pd.DataFrame(catchment_areas)
catchment_df = catchment_df.sort_values('pois_within_1km', ascending=False)
catchment_df.to_csv('station_catchment_areas.csv', index=False)

print("\nTop 10 stations by POIs within 1 km:")
print(catchment_df.head(10)[['station_name', 'pois_within_0_5km', 
                               'pois_within_1km', 'pois_within_2km', 
                               'avg_distance_to_all_pois_km']].to_string(index=False))

print("\n✓ Saved: station_catchment_areas.csv")



⏳ Identifying POIs closest to each station...
✓ Saved: stations_closest_pois.csv

⏳ Identifying top-10 closest POIs to each station...
✓ Saved: stations_top_10_closest_pois.csv

SAMPLE: CLOSEST POI TO EACH STATION

First 15 stations and their closest POIs:
                           station_name                            closest_poi_name closest_poi_category  closest_distance_km
                          Picardy Place                                         NaN          residential             0.109875
               Musselburgh Brunton Hall                                     Farnham          residential             0.061124
                       Musselburgh Lidl Spark of Genius Musselburgh Learning Centre               school             0.103727
                       Leith Walk North                                         NaN          residential             0.100510
                            Duke Street                                         NaN          residential        