In [1]:
import numpy as np
import pandas as pd
import copy

## Generating distance dicts

In [36]:
def gen_interstop_distance_list(arr):
    s_arr = sorted(arr)
    #print(arr)
    s_arr = [y-x for x, y in zip(s_arr[:-1], s_arr[1:])]
    return s_arr

def gen_mean(arr):
    return np.mean(np.array(arr))

def haversine_distance(lat1, lon1, lat2, lon2):
    r = 6371
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return np.round(res, 2)

def gen_distance_dicts(target_files, root_path):
    distance_dicts = {}
    dists = []
    for t in target_files:
        stop_times = pd.read_csv(root_path+"/"+t+"/stop_times.txt")
        stops = pd.read_csv(root_path+"/"+t+"/stops.txt")
        trips = pd.read_csv(root_path+"/"+t+"/trips.txt")
        dist_travelled = stop_times.merge(trips, on="trip_id").merge(stops, on='stop_id')[["trip_id","route_id","stop_id","shape_dist_traveled","stop_name"]]
        dist_travelled = dist_travelled.fillna(0.0)
        
        # For debugging
        dists.append(dist_travelled.groupby(["route_id","trip_id"])["shape_dist_traveled"].apply(lambda x: gen_interstop_distance_list(x)))
        grouped_by_routes_trips_df = dist_travelled.groupby(["route_id","trip_id"])["shape_dist_traveled"].apply(lambda x: gen_mean(gen_interstop_distance_list(x)))
        #print(grouped_by_routes_trips_df)
        grouped_by_routes_trips_df = grouped_by_routes_trips_df.reset_index()
        distance_dicts.update(grouped_by_routes_trips_df.groupby("route_id")["shape_dist_traveled"].mean().to_dict())
    return distance_dicts,dists


In [38]:
#bcdot_dd, dists_bcdot = gen_distance_dicts(target_files, "..")
#wmata_dd = gen_distance_dicts(["bus_gtfs_wmata","rail_gtfs_wmata"],"WMATA")
umd_shuttle_dd, dists_umddot = gen_distance_dicts(["20210826_gtfs"], ".")

In [40]:
### Interstop distances

def gen_interstop_distance(x):
    x = x.sort_values('stop_sequence')
    
    dists = []
    dists.append(0)
    for s1_lon, s1_lat, s2_lon, s2_lat in zip(x[:-1]["stop_lon"], x[:-1]["stop_lat"], x[1:]["stop_lon"],x[1:]["stop_lat"]):
        dists.append(haversine_distance(s1_lat, s1_lon, s2_lat, s2_lon))
    #dist_val = copy.deepcopy(dists)
    #print(dist_val)
    #x["interstop_dist"] = dists
    #print(x)
    #print(x[1:])
    return dists
    
def gen_mean(arr):
    return np.mean(np.array(arr))

def haversine_distance(lat1, lon1, lat2, lon2):
    r = 6371
    phi1 = np.radians(float(lat1))
    phi2 = np.radians(float(lat2))
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(float(lon2) - float(lon1))
    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return np.round(res, 2)

def gen_distance_dicts(target_files, root_path):
    distance_dicts = {}
    dists = []
    for t in target_files:
        stop_times = pd.read_csv(root_path+"/"+t+"/stop_times.txt",dtype={'trip_id':str, 'stop_id':str})
        stops = pd.read_csv(root_path+"/"+t+"/stops.txt",dtype={'stop_id':str})
        trips = pd.read_csv(root_path+"/"+t+"/trips.txt",dtype={'trip_id':str, 'route_id':str})

        #if(root_path =="WMATA"):
            #stops = stops[stops.stop_id.apply(lambda x: type(x)==int or x.isnumeric() )]
            #stops = stops.astype({'stop_id': 'int64'})

        dist_travelled = stop_times.merge(trips, on="trip_id").merge(stops, on='stop_id')[["trip_id","route_id","stop_id","stop_sequence","stop_name", "stop_lat","stop_lon"]]
        dist_travelled = dist_travelled.fillna(0.0)
        #print(dist_travelled["route_id"].unique())
        

        # For debugging
        dists.append(dist_travelled.groupby(["route_id","trip_id"])[["stop_lat","stop_lon","stop_sequence","route_id","trip_id"]].apply(lambda x: gen_interstop_distance(x)))
        dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
        dist_travelled = dist_travelled.reset_index()
        #distance_dicts={} 
        distance_dicts.update(dist_travelled.groupby("route_id")[0].mean().to_dict())
        #distance_dicts
        #distance_dicts.update(grouped_by_routes_trips_df.groupby("route_id")["shape_dist_traveled"].mean().to_dict())
    return distance_dicts,dists

#wmata_dd, dists_wmata = gen_distance_dicts(["bus_gtfs_wmata","rail_gtfs_wmata"],"WMATA")
bc_dot_target_files = ["mdotmta_gtfs_commuterbus",
                        "mdotmta_gtfs_localbus",
                        "mdotmta_gtfs_marc",
                        "mdotmta_gtfs_lightrail",
                        "mdotmta_gtfs_metro"]



In [41]:
wmata_dd, dists_wmata = gen_distance_dicts(["bus_gtfs_wmata","rail_gtfs_wmata"],"WMATA")
umd_shuttle_dd, dists_umddot = gen_distance_dicts(["20210826_gtfs"], ".")

  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))


In [43]:
bcdot_dd, dists_bcdot = gen_distance_dicts(bc_dot_target_files, "..")

  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))


In [42]:
marc_dd, dists_marc = gen_distance_dicts(["mdotmta_gtfs_marc"], "..")
thebus_dd, dists_thebus = gen_distance_dicts(["us-maryland-prince-georges-county-the-bus-gtfs-477",],".")
#keys = thebus_dd_df.keys().copy()

  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))
  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))


In [44]:
charm_dd, dists_charm = gen_distance_dicts(["CCC GTFS Feed - Fall 2024"], "..")

  dist_travelled = dist_travelled.groupby(["route_id","trip_id"]).apply(lambda x : gen_mean(gen_interstop_distance(x)))


In [45]:
len(thebus_dd)

24

In [46]:
umd_shuttle_dd_df = pd.DataFrame(umd_shuttle_dd.items(), columns=['route_id', 'avg_dist'])
bcdot_dd_df = pd.DataFrame(bcdot_dd.items(), columns=['route_id', 'avg_dist'])
wmata_dd_df = pd.DataFrame(wmata_dd.items(), columns=['route_id', 'avg_dist'])
thebus_dd_df = pd.DataFrame(thebus_dd.items(), columns=['route_id','avg_dist'])
thebus_dd_df["route_id"] = thebus_dd_df["route_id"].astype(str) + "TB"
marc_dd_df = pd.DataFrame(marc_dd.items(), columns=['route_id','avg_dist'])
bcdot_dd_df = pd.DataFrame(bcdot_dd.items(), columns=['route_id','avg_dist'])
charm_dd_df = pd.DataFrame(charm_dd.items(), columns=['route_id','avg_dist'])
charm_dd_df["route_id"] = charm_dd_df["route_id"].astype(str) + "CCC"

bcdot_combined_df = pd.concat([charm_dd_df, bcdot_dd_df])
wmata_combined_df = pd.concat([umd_shuttle_dd_df, wmata_dd_df, marc_dd_df, thebus_dd_df, bcdot_dd_df, charm_dd_df])

In [47]:
umd_shuttle_dd_df.to_csv("umd_shuttle_dd.csv")
bcdot_dd_df.to_csv("mdot_dd.csv")
wmata_dd_df.to_csv("wmata_dd.csv")
wmata_combined_df.to_csv("DC_combined_dd.csv")
bcdot_combined_df.to_csv("BC_combined_dd.csv")

In [48]:
charm_dd_df

Unnamed: 0,route_id,avg_dist
0,4962CCC,0.360025
1,4963CCC,0.366248
2,4964CCC,0.264899
3,4965CCC,0.330184
4,6139CCC,0.448894
