# Filter by APC

In [None]:
import pandas as pd
import os

# PATH is the output directory for the filtered feed; GTFS names the
# directory the source feed files are read from.
PATH = 'corrected-20211024'
GTFS = '20211024'
# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# directory already exists and cannot race with a concurrent creation.
os.makedirs(PATH, exist_ok=True)

# read APC data -----------------------------------------------------------
# One parquet file per month is read from ./APC/<month>.snappy.parquet.
months = ['202111', '202112', '202201', '202202', '202203', '202204']
# DataFrame.append was removed in pandas 2.0. Reading every month first and
# concatenating once also avoids re-copying the accumulated frame per month.
APC_df = pd.concat(
    [pd.read_parquet('./APC/{}.snappy.parquet'.format(month), engine='pyarrow')
     for month in months],
    ignore_index=True,
)
# Keep only records that carry an actual arrival time.
APC_df = APC_df[APC_df['time_actual_arrive'].notnull()]
# Trip ids as plain ints, for matching against trips.txt trip_id values.
APC_trip_ids = [int(trip_id) for trip_id in APC_df['gtfs_trip_id'].unique()]
# -------------------------------------------------------------------------

# trips.txt ---------------------------------------------------------------
trip_df = pd.read_csv('./{}/trips.txt'.format(GTFS))
# A trip is kept when APC recorded it and its shape_id does not contain
# 'shp-10C'. na=False treats a missing shape_id as "not 10C" instead of
# yielding NaN (which cannot be negated with ~); regex=False matches the
# text literally.
seen_in_apc = trip_df['trip_id'].isin(APC_trip_ids)
is_10c_shape = trip_df['shape_id'].str.contains('shp-10C', na=False, regex=False)
keep_trip = seen_in_apc & ~is_10c_shape
trip_df_filtered = trip_df[keep_trip]
# Exact complement of the kept trips.
trip_df_not_used = trip_df[~keep_trip]
trip_df_filtered.to_csv('./{}/trips.txt'.format(PATH), index=None)
# -------------------------------------------------------------------------

# stop_times.txt ----------------------------------------------------------
# Read from the configured source feed rather than a hard-coded directory.
stop_time_df = pd.read_csv('./{}/stop_times.txt'.format(GTFS))

# Stop ids APC recorded for each trip. The lookup below uses str(trip_id), so
# this presumably relies on APC storing gtfs_trip_id as a string -- TODO confirm.
apc_stops_by_trip = APC_df.groupby('gtfs_trip_id')['stop_id'].unique()
# Only trips written to the filtered trips.txt may keep stop_times rows;
# otherwise the output would reference trip_ids that trips.txt no longer has.
kept_trip_ids = set(trip_df_filtered['trip_id'])

# Collect the per-trip pieces and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending + re-sorting per trip is quadratic.
kept_pieces = []
for trip_id, stop_time_tmp_df in stop_time_df.groupby('trip_id', sort=False):
    if trip_id not in kept_trip_ids:
        continue
    apc_stop_ids = apc_stops_by_trip.get(str(trip_id))
    if apc_stop_ids is None:
        # No APC record for this trip: nothing to keep.
        continue
    kept_pieces.append(stop_time_tmp_df[stop_time_tmp_df['stop_id'].isin(apc_stop_ids)])

if kept_pieces:
    stop_time_df_filtered = (pd.concat(kept_pieces, ignore_index=True)
                             .sort_values(by=['stop_sequence'], kind='stable'))
else:
    # Keep the column layout so later lookups by column name still work.
    stop_time_df_filtered = stop_time_df.iloc[0:0]

stop_time_df_filtered.to_csv('./{}/stop_times.txt'.format(PATH), index=None)
# -------------------------------------------------------------------------

# shapes.txt --------------------------------------------------------------
# Keep only the shapes referenced by at least one retained trip.
shape_df = pd.read_csv('./{}/shapes.txt'.format(GTFS))
used_shape_ids = trip_df_filtered['shape_id'].unique()
shape_is_used = shape_df['shape_id'].isin(used_shape_ids)
shape_df_filtered = shape_df[shape_is_used]
shape_df_filtered.to_csv('./{}/shapes.txt'.format(PATH), index=None)
# -------------------------------------------------------------------------

# stops.txt ---------------------------------------------------------------
# Keep only the stops that still appear in the filtered stop_times.
stop_df = pd.read_csv('./{}/stops.txt'.format(GTFS))
used_stop_ids = stop_time_df_filtered['stop_id'].unique()
stop_is_used = stop_df['stop_id'].isin(used_stop_ids)
stop_df_filtered = stop_df[stop_is_used]
stop_df_filtered.to_csv('./{}/stops.txt'.format(PATH), index=None)
# -------------------------------------------------------------------------

# Correct GTFS

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString
import plotly.express as px
import random
import numpy as np
import geopy.distance
import osmnx as ox
# ox.config() is deprecated in OSMnx 1.x and removed in 2.0; assigning the
# settings-module attributes is the supported way to set these options.
ox.settings.use_cache = True
ox.settings.log_console = True
place_name = "Chattanooga, Tennessee, USA"
# Street network for the place; used for nearest-node lookups and routing.
graph = ox.graph_from_place(place_name)

In [None]:
def route_to_line_string(route, G=None):
    """Convert a node route into a list of ``[y, x]`` points.

    Each consecutive node pair ``(u, v)`` of ``route`` contributes two points,
    ``u`` then ``v``, so every interior node appears twice in the output.

    Args:
        route: Sequence of node ids, e.g. the result of ``ox.shortest_path``.
            ``None`` (returned by OSMnx when no path exists) is accepted.
        G: Object whose ``nodes[n]['x']`` / ``nodes[n]['y']`` hold the node
            coordinates. Defaults to the module-level ``graph``.

    Returns:
        List of ``[y, x]`` pairs (``[lat, lon]`` for an OSMnx graph); empty
        when ``route`` is ``None`` or has fewer than two nodes.
    """
    if route is None or len(route) <= 1:
        return []

    nodes = (graph if G is None else G).nodes
    points = []
    # Read the coordinates straight from the graph; no intermediate
    # DataFrame / geometry objects are needed to list edge end points.
    for u, v in zip(route[:-1], route[1:]):
        points.append([float(nodes[u]['y']), float(nodes[u]['x'])])
        points.append([float(nodes[v]['y']), float(nodes[v]['x'])])
    return points

In [None]:
# When True, one HTML map per corrected shape is written to ./<PATH>-html/.
WRITE_HTML = False
if WRITE_HTML:
    # exist_ok=True is a no-op when the directory is already there and
    # avoids the check-then-create race.
    os.makedirs('./{}-html'.format(PATH), exist_ok=True)

# trips and stop_times are read from the filtered output directory (PATH);
# shapes and stops are read from the source feed directory (GTFS).
trip_df = pd.read_csv('./{}/trips.txt'.format(PATH))
shape_df = pd.read_csv('./{}/shapes.txt'.format(GTFS))
stop_df = pd.read_csv('./{}/stops.txt'.format(GTFS))
stop_time_df = pd.read_csv('./{}/stop_times.txt'.format(PATH)).sort_values(by=['arrival_time'])
clever_df = pd.read_csv('./Clever/clever_0110to0116_sel.csv')

def _contiguous_runs(sorted_idxs):
    """Collapse ascending integers into inclusive ``[start, end]`` runs.

    Example: [0, 1, 2, 4, 6, 7] -> [[0, 2], [4, 4], [6, 7]].
    """
    runs = []
    for val in sorted_idxs:
        if runs and val == runs[-1][1] + 1:
            runs[-1][1] = val
        else:
            runs.append([val, val])
    return runs


def _stop_lat_lon(stops, stop_id):
    """Return ``[stop_lat, stop_lon]`` of the first row of ``stops`` matching ``stop_id``."""
    row = stops[stops['stop_id'] == stop_id].iloc[0]
    return [row['stop_lat'], row['stop_lon']]


# One frame per shape; concatenated once at the end (DataFrame.append was
# removed in pandas 2.0 and appending row by row is quadratic).
corrected_parts = []
for shape_id in shape_df.shape_id.unique():
    print(shape_id)
    trips_with_shape = trip_df[trip_df['shape_id'] == shape_id]
    if trips_with_shape.empty:
        # No retained trip uses this shape: it is dropped from the output.
        continue
    trip_ids_with_shape = [str(trip_id) for trip_id in trips_with_shape['trip_id'].unique()]
    APC_df_filtered_by_trip = APC_df[APC_df['gtfs_trip_id'].isin(trip_ids_with_shape) &
                                     APC_df['time_actual_arrive'].notnull()]

    original_shape = shape_df[shape_df['shape_id'] == shape_id]
    shape_df_filtered = pd.DataFrame({
        'lat': original_shape['shape_pt_lat'].values,
        'lon': original_shape['shape_pt_lon'].values,
        'type': 'trajectory',
        'id': original_shape['shape_pt_sequence'].values,
        'dist_traveled': original_shape['shape_dist_traveled'].values,
    })

    # The first trip using the shape serves as the representative stop pattern.
    trip_id = int(trip_ids_with_shape[0])
    # Order stops by stop_sequence: ordering by arrival_time is unreliable
    # within a trip when consecutive stops share a time or a time is blank.
    stop_time_df_filtered = (stop_time_df[stop_time_df['trip_id'] == trip_id]
                             .sort_values(by=['stop_sequence'], kind='stable'))
    # assign() returns a new frame, so no chained-assignment write on a slice.
    stop_df_filtered = (stop_df[stop_df['stop_id'].isin(stop_time_df_filtered['stop_id'].values)]
                        .assign(lat=lambda d: d['stop_lat'],
                                lon=lambda d: d['stop_lon'],
                                id=lambda d: d['stop_id'],
                                type='bus stop'))

    # NOTE(review): stop_time_df is loaded from the already APC-filtered feed,
    # so with the same APC data no stop should be flagged here -- confirm
    # whether the unfiltered stop_times was intended as input.
    APC_bus_stops = list(APC_df_filtered_by_trip.stop_id.unique())
    apc_stop_set = set(APC_bus_stops)
    bus_stops_in_order = list(stop_time_df_filtered['stop_id'].values)
    # Positions (in trip order) of stops APC did / did not record. Working on
    # positions handles a stop that occurs more than once in the trip.
    served_idxs = [i for i, stop_id in enumerate(bus_stops_in_order) if stop_id in apc_stop_set]
    anomaly_bus_stops = [stop_id for stop_id in bus_stops_in_order if stop_id not in apc_stop_set]

    if not anomaly_bus_stops or not served_idxs:
        # Either every stop is served (normal), or no stop is served and there
        # is nothing to anchor a correction on: keep the published shape.
        corrected_parts.append(original_shape)
        continue

    # Runs of consecutively served stops, e.g.
    # [[0, 6], [8, 8], [10, 78]] (stops 0..6, gap, stop 8, gap, stops 10..78)
    res = _contiguous_runs(served_idxs)

    # shape_dist_traveled of each stop, aligned with bus_stops_in_order.
    stop_distances = stop_time_df_filtered['shape_dist_traveled'].values
    shape_dist = shape_df_filtered['dist_traveled']

    line_strings = []
    for start_idx, end_idx in res:
        segment = []
        if start_idx != end_idx:
            start_distance = stop_distances[start_idx]
            end_distance = stop_distances[end_idx]
            # Shape points strictly inside the run, trimmed by a margin of 5
            # at each end (in shape_dist_traveled units; unit not visible here).
            inside_run = (shape_dist > start_distance + 5) & (shape_dist < end_distance - 5)
            segment = shape_df_filtered.loc[inside_run, ['lat', 'lon']].values.tolist()
        if not segment:
            # Isolated stop, or no shape point falls inside the run: fall back
            # to the stop coordinates so the segment is never empty.
            segment = [_stop_lat_lon(stop_df_filtered, bus_stops_in_order[start_idx])]
            if end_idx != start_idx:
                segment.append(_stop_lat_lon(stop_df_filtered, bus_stops_in_order[end_idx]))
        line_strings.append(segment)

    # Stitch consecutive segments with the shortest street-network path
    # between the end of one and the start of the next.
    corrected_line_string = []
    for segment, next_segment in zip(line_strings[:-1], line_strings[1:]):
        corrected_line_string.extend(segment[:-1])
        ori_lat, ori_lon = segment[-1]
        orig = ox.distance.nearest_nodes(graph, [ori_lon], [ori_lat])[0]
        des_lat, des_lon = next_segment[0]
        dest = ox.distance.nearest_nodes(graph, [des_lon], [des_lat])[0]
        route = ox.shortest_path(graph, orig, dest, weight="length")
        if route is None:
            # No path between the two nodes: keep the segment's own end point
            # and connect directly to the next segment.
            corrected_line_string.append(segment[-1])
        else:
            corrected_line_string.extend(route_to_line_string(route))
    corrected_line_string.extend(line_strings[-1])

    # shape_id, shape_pt_lat, shape_pt_lon, shape_pt_sequence, shape_dist_traveled
    corrected = pd.DataFrame(corrected_line_string, columns=['shape_pt_lat', 'shape_pt_lon'])
    corrected.insert(0, 'shape_id', shape_id)
    corrected['shape_pt_sequence'] = list(range(1, len(corrected) + 1))
    # -1 is a placeholder: distances along the corrected shape are not computed.
    corrected['shape_dist_traveled'] = -1
    corrected_parts.append(corrected)

    if not WRITE_HTML:
        continue

    # Vehicle positions for this trip; only needed for the map.
    df_real = clever_df[clever_df['origtatripno'] == trip_id // 1000]
    df_real = df_real[df_real['lat'] != 0]
    df_real = df_real.assign(date=df_real['tmstmp'].str[:8])
    df_real = df_real[['lat', 'lon', 'tmstmp', 'date']].sort_values(by=['tmstmp'])
    df_real_filtered = df_real[df_real['date'] == '20220111'].sort_values(by=['tmstmp'])
    df_real_filtered = df_real_filtered.drop_duplicates('tmstmp').reset_index()

    df_plt = pd.concat([
        shape_df_filtered[['lat', 'lon']].assign(source='Originial GTFS'),
        pd.DataFrame(corrected_line_string, columns=['lat', 'lon']).assign(source='Corrected Shape'),
        df_real_filtered[['lat', 'lon']].assign(source='Clever'),
    ], ignore_index=True)

    # The map is centred on the trajectories only (stops are added afterwards).
    lat_center = df_plt['lat'].mean()
    lng_center = df_plt['lon'].mean()
    center = {"lat": lat_center, "lon": lng_center}

    stop_is_used = stop_df_filtered['stop_id'].isin(APC_bus_stops)
    stop_df_filtered_used = stop_df_filtered[stop_is_used].assign(source='Used Bus Stops (By APC)')
    stop_df_filtered_unused = stop_df_filtered[~stop_is_used].assign(source='Unused Bus Stops (By APC)')
    df_plt = pd.concat([
        df_plt,
        stop_df_filtered_used[['lat', 'lon', 'source']],
        stop_df_filtered_unused[['lat', 'lon', 'source']],
    ], ignore_index=True)

    fig = px.scatter_mapbox(df_plt,
                            lat="lat",
                            lon="lon",
                            color='source',
                            zoom=9,
                            center=center,
                            height=1000,
                            width=1500)
    # Draw lines for the trajectories only; stops stay as markers.
    fig.add_traces(px.line_mapbox(df_plt[~df_plt['source'].isin(['Used Bus Stops (By APC)',
                                                                'Unused Bus Stops (By APC)'])],
                                  lat="lat", lon="lon", color='source').data)
    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_traces(marker={'size': 7})
    fig.write_html('./{}-html/{}.html'.format(PATH, shape_id))

if corrected_parts:
    corrected_shape_df = pd.concat(corrected_parts, ignore_index=True)
else:
    # Nothing to write: keep the shapes.txt header.
    corrected_shape_df = pd.DataFrame(columns=shape_df.columns)
corrected_shape_df.to_csv('./{}/shapes.txt'.format(PATH), index=None)