# Notebook Overview

1. **Importing Libraries and Configurations**:
    - Import necessary libraries such as `polars`, `datetime`, and `json`.
    - Configure `polars` settings for better performance and display.

2. **Data Loading and Preprocessing**:
    - Load route mappings to map route IDs to their respective names.
    - Read and preprocess the main dataset from a Parquet file, replacing route IDs with their mapped names.
    - Load and preprocess additional data from JSON files, including stops and patterns data.

3. **Data Merging and Transformation**:
    - Merge stops data with the main dataset to get stop names for `nextStopID` and `lastStopID`.
    - Filter the dataset to include only relevant records and add new columns such as `stopChanged` and `timeDiff`.

4. **Computing Permutations**:
    - Define a function to compute permutations for each route, calculating time differences between stops.
    - Save the resulting dataset to a Parquet file for further analysis.

5. **Data Analysis**:
    - Extract and display specific columns from the processed dataset for analysis.
    - Filter and analyze data for specific routes and stops.

6. **Alternative Approaches**:
    - Document an alternative method to compute time differences between stops using the `shift` method (commented out).

This notebook provides a comprehensive workflow for processing and analyzing transportation data, enabling detailed insights into bus routes and stop timings.

In [None]:
import polars as pl
from datetime import datetime
import json
import datetime
import math

# Turn on Polars' global string cache for the rest of the session.
pl.enable_string_cache()

# Let rendered tables show up to 100 columns and 25 rows.
pl.Config.set_tbl_cols(100)
pl.Config.set_tbl_rows(25)

In [None]:
# Make IPython display the value of a cell's final expression or assignment.
%config InteractiveShell.ast_node_interactivity = 'last_expr_or_assign'

In [None]:
#Map route id to correct route name
#Keys are the numeric routeID values in the raw entries; values are the route names
#that replace them. The "777" entry is the one compute_permutations() skips as inactive.
route_mapping = {
    3: "2L",
    4: "2R",
    33: "3",
    17: "10",
    18: "11",
    23: "12",
    12: "16",
    13: "17",
    14: "18",
    30: "19",
    29: "21",
    38: "21 Tripper",
    777: "777"
}

In [None]:
# Read the 2024-09 entries file and swap each numeric routeID for its route name.
# replace_strict has no default here, so a routeID missing from route_mapping raises.
df = (
    pl.read_parquet("./data/2024-09-entries.parquet")
    .with_columns(pl.col("routeID").replace_strict(route_mapping))
)

In [None]:
#Load stops data
#The context manager closes the file as soon as the JSON has been parsed
with open("./data/stops.json", "r", encoding="utf-8") as file:
    stopsData = json.load(file)

stops = pl.DataFrame(stopsData['get_stops'])

#Load patterns data
#Map pattern ids to the route names used in route_mapping (there is no "777" entry here)
pattern_mapping = {
    3: "2L",
    4: "2R",
    37: "3",
    17: "10",
    18: "11",
    23: "12",
    12: "16",
    13: "17",
    14: "18",
    33: "19",
    46: "21",
    45: "21 Tripper",
}

#Load patterns json
with open("./data/patterns.json", "r", encoding="utf-8") as file:
    patternsData = json.load(file)

patterns = pl.DataFrame(patternsData['get_patterns'])
#Pattern ids that are not in pattern_mapping are replaced by the string "None"
patterns = patterns.with_columns(pl.col("id").replace_strict(pattern_mapping, default="None"))

In [None]:
#One row per distinct (id, name, lat, lng) combination, shared by both joins below
stop_lookup = stops.select(["id", "name", "lat", "lng"]).unique()

#Left join stops to get stop names (and coordinates) for nextStopID and lastStopID
df = df.join(
    stop_lookup.rename({"id": "nextStopID"}),
    on="nextStopID",
    how="left",
).rename({"name": "nextStopName", "lat": "nextStopLat", "lng": "nextStopLng"})
df = df.join(
    stop_lookup.rename({"id": "lastStopID"}),
    on="lastStopID",
    how="left",
).rename({"name": "lastStopName", "lat": "lastStopLat", "lng": "lastStopLng"})

In [None]:
#Keep only the rows whose inService flag is true
df = df.filter(pl.col("inService"))

#stopChanged: within each equipmentID (rows ordered by receiveTime), this row's
#lastStopID equals the previous row's nextStopID
df = df.with_columns(
    stopChanged=pl.col("lastStopID")
    .eq(pl.col("nextStopID").shift(1))
    .over("equipmentID", order_by="receiveTime")
)

#Keep the stopChanged rows, then run the same comparison again on the reduced
#frame, where "previous row" now means the previous kept row
df = df.filter(pl.col("stopChanged"))
df = df.with_columns(
    nextToLast=pl.col("nextStopID")
    .shift(1)
    .eq(pl.col("lastStopID"))
    .over("equipmentID", order_by="receiveTime")
)

#timeDiff: receiveTime of the following row minus this row's receiveTime, per
#equipmentID. It is computed before the nextToLast filter so that the filter
#does not change which row counts as "following".
time_to_following_row = pl.col("receiveTime").diff(-1).over(
    "equipmentID", order_by="receiveTime"
)
df = df.with_columns(timeDiff=-time_to_following_row)
df = df.filter(pl.col("nextToLast"))


In [None]:
def compute_permutations(max_stops_ahead=5):
    """Build one table of stop-to-stop travel times for every active route.

    For each route name in ``route_mapping`` (except the "777" entry) and each
    stop in that route's pattern, every record whose ``lastStopID`` is that
    stop is paired with the same vehicle's first record at or after it in time
    whose ``lastStopID`` is the stop 1..``max_stops_ahead`` positions further
    along the pattern. Positions wrap around at the end of the stop list.

    Reads the module-level ``df``, ``patterns`` and ``route_mapping``.

    Args:
        max_stops_ahead: How many following pattern positions to pair each
            stop with. Defaults to 5, the value that was previously hard-coded.

    Returns:
        A DataFrame with all per-pair frames stacked vertically, or ``None``
        when no frame was produced. Besides the joined columns it carries
        ``receiveTime_right``, ``nextStopID_actual``, ``nextStopName_actual``,
        ``eta``, ``eta_seconds``, ``hour_of_day``, ``minute_of_hour`` and
        ``day_of_week``.

    Raises:
        IndexError: if ``patterns`` has no row for one of the routes.
    """
    # Right-hand duplicates created by the as-of join that are not wanted in
    # the output. strict=True makes a missing column an error.
    redundant_columns = [
        "scheduleNumber_right",
        "nextStopName_right",
        "nextStopID_right",
        "aID_right",
        "trainID_right",
        "onSchedule_right",
    ]

    frames = []
    for route in route_mapping.values():

        # Skip any inactive buses
        if route == "777":
            continue

        # join_asof expects both sides sorted by the `on` key. A stable sort
        # leaves already-sorted data exactly as it was.
        subset = df.filter(pl.col("routeID") == route).sort(
            "receiveTime", maintain_order=True
        )
        stop_ids = list(patterns.filter(pl.col("id") == route)["stopIDs"][0])
        stop_count = len(stop_ids)

        print(f"[!] Computing permutations for route: {route} with stop ids len: {stop_count}")

        # Right-hand side of every join for this route: the same records with
        # the arrival time and stop kept under names that survive the join.
        arrivals = subset.with_columns(
            pl.col("receiveTime").alias("receiveTime_right"),
            pl.col("lastStopID").alias("nextStopID_actual"),
            pl.col("lastStopName").alias("nextStopName_actual"),
        )

        for i, stop1 in enumerate(stop_ids):
            df1 = subset.filter(pl.col("lastStopID") == stop1)

            for j in range(1, max_stops_ahead + 1):
                stop2 = stop_ids[(i + j) % stop_count]
                df2 = arrivals.filter(pl.col("lastStopID") == stop2)

                joined_df = df1.join_asof(
                    df2, on="receiveTime", by="equipmentID", strategy="forward"
                )

                joined_df = joined_df.with_columns(
                    (pl.col("receiveTime_right") - pl.col("receiveTime")).alias("eta")
                )

                # DataFrame.drop returns a new frame, so the result has to be
                # kept for the columns to actually go away.
                joined_df = joined_df.drop(redundant_columns, strict=True)

                #Add Time Of Day and Day of Week columns
                joined_df = joined_df.with_columns(
                    pl.col("receiveTime").dt.hour().alias("hour_of_day"),
                    pl.col("receiveTime").dt.minute().alias("minute_of_hour"),
                    pl.col("receiveTime").dt.weekday().alias("day_of_week"),
                    pl.col("eta").dt.total_seconds().alias("eta_seconds"),
                )

                frames.append(joined_df)

    # Concatenate once at the end rather than growing a frame piece by piece.
    return pl.concat(frames) if frames else None

# Build the combined table for all routes; the later cells add to, filter and save it.
mega_df = compute_permutations()


In [None]:
#Calculate the approximate planar (Euclidean) distance between the two stop locations
def euclidean_distance(row):
    """Return the approximate distance in miles between a row's two stops.

    Uses the equirectangular approximation: the latitude/longitude differences
    are projected onto a plane and measured with Pythagoras. The east-west
    component is scaled by the cosine of the mean latitude, because a degree of
    longitude gets shorter away from the equator.

    Args:
        row: Mapping with ``nextStopLat``, ``nextStopLng``, ``lastStopLat`` and
            ``lastStopLng`` in degrees.

    Returns:
        The distance in miles as a float, or ``None`` when any coordinate is
        ``None`` (for example a stop that had no match in the stops table).
    """
    earth_radius_km = 6371.0
    km_to_miles = 0.621371

    coords = (
        row["nextStopLat"],
        row["nextStopLng"],
        row["lastStopLat"],
        row["lastStopLng"],
    )
    if any(value is None for value in coords):
        return None

    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = (math.radians(value) for value in coords)

    # Planar offsets in km; only the east-west one depends on latitude
    east_west_km = earth_radius_km * (lon2 - lon1) * math.cos((lat1 + lat2) / 2)
    north_south_km = earth_radius_km * (lat2 - lat1)

    # Convert to miles
    return math.hypot(east_west_km, north_south_km) * km_to_miles
  
#Add distance column
#The four coordinates are packed into a struct so euclidean_distance gets one dict per row
mega_df = mega_df.with_columns(
    distance=pl.struct(
        "nextStopLat", "nextStopLng", "lastStopLat", "lastStopLng"
    ).map_elements(euclidean_distance, return_dtype=pl.Float32)
)

In [None]:
# Save the full table (including the distance column) for later analysis
mega_df.write_parquet('./data/mega_df.parquet')

print("[X] Succesfully wrote mega_df to disk")

In [None]:
# Show only the columns of interest for a quick look at the result
mega_df.select(
    [
        'routeID',
        'equipmentID',
        'lastStopID',
        'nextStopID_actual',
        'lastStopName',
        'nextStopName_actual',
        'receiveTime',
        'receiveTime_right',
        'eta',
        'day_of_week',
        'hour_of_day',
        'minute_of_hour',
        'distance',
    ]
)

In [None]:
# Rows of the combined table that belong to route 2L
two_l_mega_df = mega_df.filter(pl.col('routeID').eq('2L'))

In [None]:
# Per-column non-null counts for the 2L rows whose eta is longer than 20 minutes
two_l_mega_df.filter(pl.col('eta_seconds').gt(60 * 20)).count()

In [None]:
# Save the route 2L subset; no eta cut-off has been applied to this frame
two_l_mega_df.write_parquet('./data/two_l_mega_df_variance.parquet')
print("[X] Succesfully wrote two_l_mega_df to disk")

In [None]:
# Route 12 rows whose eta is shorter than 35 minutes (both conditions must hold)
twelve_route_mega_df = mega_df.filter(
    (pl.col('routeID') == '12') & (pl.col('eta_seconds') < (60 * 35))
)

In [None]:
# Sanity check: twelve_route_mega_df only keeps eta_seconds < 60 * 35, so no rows
# should match here and every per-column count should be 0
twelve_route_mega_df.filter(pl.col('eta_seconds') > (60 * 35)).count()

In [None]:
# Save the route 12 subset (only twelve_route_mega_df is written by this statement)
twelve_route_mega_df.write_parquet('./data/twelve_route_mega_df.parquet')
print("[X] Succesfully wrote two_l_mega_df and twelve_route_mega_df to disk")

### Another way to compute n stops ahead, with some caveats

In [None]:
# # Approach #2 - Using the 'shift' method to calculate the time difference between two stops (Not recommended)
# # Calculate the time difference when going from Linchon hall to Stevenson South
# shiftVal = -3
# startingStop = 465
# subset = df.filter(pl.col("routeID") == "2L")

# # Ensure the data is sorted by 'equipmentID' and 'receiveTime'
# df = subset.sort(["equipmentID", "receiveTime"])

# # Group by 'equipmentID' and shift the 'receiveTime' column by 3 to get the time three stops ahead
# df = df.with_columns([
#     pl.col("receiveTime").shift(shiftVal).over("equipmentID").alias("receiveTime_3_stops_ahead"),
#     pl.col("nextStopID").shift(shiftVal).over("equipmentID").alias("nextStopID_3_stops_ahead"),
#     pl.col("lastStopName").shift(shiftVal).over("equipmentID").alias("lastStopName_3_stops_ahead")
# ])

# # Join with stops to get the name of the stop 3 stops ahead
# df = df.join(stops.rename({"id": "nextStopID_3_stops_ahead"}).select(["nextStopID_3_stops_ahead", "name"]).unique(), on="nextStopID_3_stops_ahead", how="left").rename({"name": "nextStopName_3_ahead"})

# # Filter the rows where 'nextStopID' is `startingStop` to calculate the time difference
# df_filtered = df.filter(pl.col("nextStopID") == startingStop)

# df_filtered = df_filtered.with_columns([
#     pl.col("receiveTime").dt.convert_time_zone("America/Chicago").alias("receiveTime"),
#     pl.col("receiveTime_3_stops_ahead").dt.convert_time_zone("America/Chicago").alias("receiveTime_3_stops_ahead")
# ])

# # Calculate the time difference between the current stop and three stops ahead
# df_filtered = df_filtered.with_columns([
#     (pl.col("receiveTime_3_stops_ahead") - pl.col("receiveTime")).alias("timeDiff_3_stops_ahead")
# ])

# # Display the result
# df_filtered['routeID', 'equipmentID', 'lastStopName', 'nextStopName_3_ahead', 'receiveTime', 'receiveTime_3_stops_ahead', 'timeDiff_3_stops_ahead']