# Notebook Overview

1. **Importing Libraries and Configurations**:
    - Import necessary libraries such as `polars`, `datetime`, and `json`.
    - Configure `polars` settings for better performance and display.

2. **Data Loading and Preprocessing**:
    - Load route mappings to map route IDs to their respective names.
    - Read and preprocess the main dataset from a Parquet file, replacing route IDs with their mapped names.
    - Load and preprocess additional data from JSON files, including stops and patterns data.

3. **Data Merging and Transformation**:
    - Merge stops data with the main dataset to get stop names for `nextStopID` and `lastStopID`.
    - Filter the dataset to include only relevant records and add new columns such as `stopChanged` and `timeDiff`.

4. **Computing Permutations**:
    - Define a function to compute permutations for each route, calculating time differences between stops.
    - Save the resulting dataset to a Parquet file for further analysis.

5. **Data Analysis**:
    - Extract and display specific columns from the processed dataset for analysis.
    - Filter and analyze data for specific routes and stops.

6. **Alternative Approaches**:
    - Document an alternative method to compute time differences between stops using the `shift` method (commented out).

This notebook provides a comprehensive workflow for processing and analyzing transportation data, enabling detailed insights into bus routes and stop timings.

In [1]:
import polars as pl
from datetime import datetime
import json
import datetime
import math

pl.enable_string_cache()
pl.Config().set_tbl_cols(100)
pl.Config().set_tbl_rows(25)

polars.config.Config

In [2]:
%config InteractiveShell.ast_node_interactivity = 'last_expr_or_assign'

In [3]:
#Map route id to correct route name
route_mapping = {
    3: "2L",
    4: "2R",
    33: "3",
    17: "10",
    18: "11",
    23: "12",
    12: "16",
    13: "17",
    14: "18",
    30: "19",
    29: "21",
    38: "21 Tripper",
    777: "777"
}

{3: '2L',
 4: '2R',
 33: '3',
 17: '10',
 18: '11',
 23: '12',
 12: '16',
 13: '17',
 14: '18',
 30: '19',
 29: '21',
 38: '21 Tripper',
 777: '777'}

In [4]:
df = pl.read_parquet("./data/2024-09-entries.parquet")
df = df.with_columns(pl.col("routeID").replace_strict(route_mapping))

routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64
"""777""",33,"""1001""",,41.93536,-88.77044,0,0,4,0,0,-1,0,197,491,11402,"""NIS""",false,,0,2024-08-31 23:59:49,"""35467605078ef4d""",2024-09-01 00:00:01,,-1,-1,-1,,false,13
"""777""",9998,"""1002""",,41.93484,-88.72824,0,0,9,0,0,-1,0,78,0,0,"""NIS""",false,,0,2024-08-31 15:14:57,"""3546760508796c6""",2024-09-01 00:00:01,,-1,-1,-1,,false,14
"""777""",0,"""1003""",,41.93277,-88.76676,0,0,0,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-30 04:47:18,"""3546760504518c0""",2024-09-01 00:00:01,,-1,-1,-1,,false,15
"""777""",9998,"""1004""",,41.93481,-88.7283,0,0,15,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-31 09:43:05,"""35467605056be89""",2024-09-01 00:00:01,,-1,-1,-1,,false,16
"""777""",9998,"""1101""",,41.93486,-88.7282,0,0,0,125,0,-1,0,351,0,0,"""NIS""",false,,0,2024-08-31 05:17:48,"""3546760504edeed""",2024-09-01 00:00:01,,-1,-1,-1,,false,17
"""17""",13,"""1102""",10058,41.93558,-88.76723,0,0,237,0,806,1139,10612,2,805,10611,"""8:00:00-17""",true,-658.0,11606,2024-08-31 11:36:01,"""35467605059a8e2""",2024-09-01 00:00:01,"""Outbound""",2,805,806,100.0,true,18
"""777""",9998,"""1103""",,41.93481,-88.72816,0,0,0,0,0,-1,0,47,0,0,"""NIS""",false,,0,2024-08-31 02:19:58,"""35467605044c796""",2024-09-01 00:00:01,,-1,-1,-1,,false,19
"""777""",0,"""1401""",,41.93539,-88.77016,0,0,0,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-01 19:06:17,"""3546760504ac22e""",2024-09-01 00:00:01,,-1,-1,-1,,false,20
"""777""",0,"""1701""",,41.9348,-88.72831,0,0,0,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-10 04:51:32,"""3546760504a404a""",2024-09-01 00:00:01,,-1,-1,-1,,false,21
"""777""",9998,"""1702""",,41.93459,-88.72733,0,0,0,0,0,-1,0,7,0,0,"""NIS""",false,,0,2024-08-30 19:58:30,"""3546760505ac39c""",2024-09-01 00:00:01,,-1,-1,-1,,false,22


In [6]:
#Load stops data
file = open("./data/stops.json", "r")
stopsData = json.load(file)

stops = pl.DataFrame(stopsData['get_stops'])

#Load patterns data
pattern_mapping = {
    3: "2L",
    4: "2R",
    37: "3",
    17: "10",
    18: "11",
    23: "12",
    12: "16",
    13: "17",
    14: "18",
    33: "19",
    46: "21",
    45: "21 Tripper",
}

#Load patterns json
file = open("./data/patterns.json", "r")
patternsData = json.load(file)

patterns = pl.DataFrame(patternsData['get_patterns'])
patterns = patterns.with_columns(pl.col("id").replace_strict(pattern_mapping, default="None"))

id,name,extID,type,length,color,encLine,decLine,routes,routeNames,stations,stopIDs
str,str,str,i64,f64,str,str,list[null],list[i64],list[str],list[null],list[i64]
"""2L""","""Route 2L (Full Service)""","""2""",2,5.156097,"""#097138""","""eq}~Fxix|OM?a@@}A??fE?nEAbE?h@…",[],[3],"[""Route 2L""]",[],"[433, 465, … 820]"
"""2R""","""Route 2R Full Service""","""1""",1,5.083211,"""#FFD600""","""qk}~Fn{x|Ow@C{BhAoBp@Em@G}G@_I…",[],[4],"[""Route 2R""]",[],"[431, 451, … 432]"
"""3""","""Route 3 Full Service""","""3""",2,2.793036,"""#58F964""","""kq}~Fxix|Oa@??@sA?@?C?A}C???aC…",[],[33],"[""Route 3""]",[],"[431, 493, … 431]"
"""10""","""Route 10 Full Service""","""10""",3,5.564219,"""#782BC9""","""{q}~Fxix|OuBACfF@jH@xClFkBj@Sj…",[],[17],"[""Route 10""]",[],"[477, 432, … 431]"
"""11""","""Route 11 Full Service""","""11""",2,12.793737,"""#A07D5C""","""mq}~F~ix|O}A?QECKEaPAoACmQGGeR…",[],[18],"[""Route 11""]",[],"[492, 431, … 491]"
"""12""","""Route 12 Full Service""","""12""",2,39.210159,"""#3C8DBC""","""{~|~Fzex|O?dCrIA@qE@}Bx@u}@@mF…",[],[23],"[""Route 12""]",[],"[809, 453, … 575]"
"""16""","""Route 16 Full Service""","""16""",3,4.9,"""#A64598""","""y~|~Ftcx|OAfErI@@}H?W~@cbAFmAD…",[],[12],"[""Route 16""]",[],"[809, 536, … 811]"
"""17""","""Route 17 Full Service""","""17""",2,11.542563,"""#3E5BA6""","""ap}~Fjkx|OHOAYQGu@B_B?K}]?_M`B…",[],[13],"[""Route 17""]",[],"[805, 806, … 805]"
"""18""","""Route 18 Full Service""","""18""",4,14.597877,"""#FFAA00""","""}i`_G|no|OoA}Aa@]gAKCB[~BUp@c@…",[],[14],"[""Route 18""]",[],"[676, 835, … 685]"
"""19""","""Route 19 Full Service""","""19""",4,21.596067,"""#34ADE0""","""cm}~Fj_y|Ol@CRUFW?_@??ESOQ??WI…",[],[30],"[""Route 19""]",[],"[819, 528, … 819]"


In [7]:
stops.rename({"id": "nextStopID"}).select(["nextStopID", "name"]).unique().sort("nextStopID")

#Left join stops to get stop names for nextStopID and lastStopID
df = df.join(stops.rename({"id": "nextStopID"}).select(["nextStopID", "name", "lat", "lng"]).unique(), on="nextStopID", how="left").rename({"name": "nextStopName", "lat": "nextStopLat", "lng": "nextStopLng"})
df = df.join(stops.rename({"id": "lastStopID"}).select(["lastStopID", "name", "lat", "lng"]).unique(), on="lastStopID", how="left").rename({"name": "lastStopName", "lat": "lastStopLat", "lng": "lastStopLng"})

routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64
"""777""",33,"""1001""",,41.93536,-88.77044,0,0,4,0,0,-1,0,197,491,11402,"""NIS""",false,,0,2024-08-31 23:59:49,"""35467605078ef4d""",2024-09-01 00:00:01,,-1,-1,-1,,false,13,,,,"""Normal Rd. at Northern Lane""",41.939995,-88.765862
"""777""",9998,"""1002""",,41.93484,-88.72824,0,0,9,0,0,-1,0,78,0,0,"""NIS""",false,,0,2024-08-31 15:14:57,"""3546760508796c6""",2024-09-01 00:00:01,,-1,-1,-1,,false,14,,,,,,
"""777""",0,"""1003""",,41.93277,-88.76676,0,0,0,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-30 04:47:18,"""3546760504518c0""",2024-09-01 00:00:01,,-1,-1,-1,,false,15,,,,,,
"""777""",9998,"""1004""",,41.93481,-88.7283,0,0,15,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-31 09:43:05,"""35467605056be89""",2024-09-01 00:00:01,,-1,-1,-1,,false,16,,,,,,
"""777""",9998,"""1101""",,41.93486,-88.7282,0,0,0,125,0,-1,0,351,0,0,"""NIS""",false,,0,2024-08-31 05:17:48,"""3546760504edeed""",2024-09-01 00:00:01,,-1,-1,-1,,false,17,,,,,,
"""17""",13,"""1102""",10058,41.93558,-88.76723,0,0,237,0,806,1139,10612,2,805,10611,"""8:00:00-17""",true,-658.0,11606,2024-08-31 11:36:01,"""35467605059a8e2""",2024-09-01 00:00:01,"""Outbound""",2,805,806,100.0,true,18,"""Lucinda Ave at Wirtz Dr Stop #…",41.936348,-88.764023,"""Holmes Student Center Stop #73…",41.935528,-88.767426
"""777""",9998,"""1103""",,41.93481,-88.72816,0,0,0,0,0,-1,0,47,0,0,"""NIS""",false,,0,2024-08-31 02:19:58,"""35467605044c796""",2024-09-01 00:00:01,,-1,-1,-1,,false,19,,,,,,
"""777""",0,"""1401""",,41.93539,-88.77016,0,0,0,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-01 19:06:17,"""3546760504ac22e""",2024-09-01 00:00:01,,-1,-1,-1,,false,20,,,,,,
"""777""",0,"""1701""",,41.9348,-88.72831,0,0,0,0,0,-1,0,0,0,0,"""NIS""",false,,0,2024-08-10 04:51:32,"""3546760504a404a""",2024-09-01 00:00:01,,-1,-1,-1,,false,21,,,,,,
"""777""",9998,"""1702""",,41.93459,-88.72733,0,0,0,0,0,-1,0,7,0,0,"""NIS""",false,,0,2024-08-30 19:58:30,"""3546760505ac39c""",2024-09-01 00:00:01,,-1,-1,-1,,false,22,,,,,,


In [8]:
#Filter out the subset of data we want to work with
df = df.filter(
    (pl.col("inService"))
)

#Add stopChanged column
df = df.with_columns(
    (
        (pl.col("lastStopID") == pl.col("nextStopID").shift(1)).over(
            "equipmentID", order_by="receiveTime"
        )
    ).alias("stopChanged")
)

df = df.filter(pl.col("stopChanged")).with_columns(
    (pl.col("nextStopID").shift(1) == pl.col("lastStopID"))
    .over("equipmentID", order_by="receiveTime")
    .alias("nextToLast")
)

#Add timeDiff column
df = df.with_columns(
    (-pl.col("receiveTime").diff(-1).over("equipmentID", order_by="receiveTime")).alias(
        "timeDiff"
    )
).filter(pl.col("nextToLast"))



routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns]
"""19""",33,"""603""",10252,41.94231,-88.77307,0,0,8,0,616,1143,11390,360,709,11389,"""33-0831_1""",true,-4.0,11681,2024-09-01 00:03:09,"""35467605059b849""",2024-09-01 00:03:17,"""Outbound""",4,709,616,0.0,true,357,"""Annie Glidden Rd. at Varsity D…",41.943466,-88.77301,"""Annie Glidden Rd. & Loren Dr. …",41.94175,-88.772987,true,true,20s
"""19""",33,"""603""",10252,41.94478,-88.77306,0,0,8,0,482,1144,11391,0,616,11390,"""33-0831_1""",true,-4.0,11681,2024-09-01 00:03:29,"""35467605059b849""",2024-09-01 00:03:32,"""Outbound""",5,616,482,26.68739,false,384,"""Blackhawk Rd. at Kimberly Dr.""",41.946342,-88.769913,"""Annie Glidden Rd. at Varsity D…",41.943466,-88.77301,true,true,50s
"""19""",33,"""603""",10252,41.94629,-88.76997,0,0,8,0,483,1144,11392,2,482,11391,"""33-0831_1""",true,-4.0,11681,2024-09-01 00:04:19,"""35467605059b849""",2024-09-01 00:04:33,"""Outbound""",6,482,483,0.0,true,492,"""Blackhawk Rd. at Edgebrook Dr.""",41.947201,-88.769913,"""Blackhawk Rd. at Kimberly Dr.""",41.946342,-88.769913,true,true,1m 10s
"""21""",46,"""72309""",,41.99158,-88.68532,0,0,0,0,764,1145,11448,9,763,11447,"""19:00:00-21""",true,-2.0,11696,2024-09-01 00:04:33,"""354676050c83bea""",2024-09-01 00:04:48,"""Outbound""",4,763,764,0.0,true,537,"""Lucas St. at Opportunity House""",41.993652,-88.684898,"""Maple St. at Opportunity House…",41.991978,-88.68515,true,true,1m 10s
"""2L""",3,"""Old_1501""",329,41.93639,-88.7736,0,0,33,0,465,1145,10482,271,433,10481,"""19:00:00-2""",true,-2.0,11414,2024-09-01 00:04:41,"""354676050595540""",2024-09-01 00:04:48,"""Outbound""",4,433,465,0.0,true,538,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s
"""16""",12,"""72307""",10046,41.93175,-88.75342,0,0,47,0,624,1146,10598,112,623,10597,"""12-0831_1""",true,-1.0,11594,2024-09-01 00:04:48,"""354676050c83bf0""",2024-09-01 00:05:03,"""Outbound""",4,623,624,0.0,true,563,"""Locust St. at 6th St.""",41.929585,-88.745903,"""Locust St. at 2nd St.""",41.931332,-88.752335,true,true,2m 20s
"""2L""",3,"""Old_1501""",329,41.93746,-88.7754,0,0,33,0,466,1146,10483,0,465,10482,"""19:00:00-2""",true,-2.0,11414,2024-09-01 00:05:21,"""354676050595540""",2024-09-01 00:05:33,"""Outbound""",5,465,466,0.0,true,619,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m
"""21""",46,"""72309""",,41.99374,-88.68572,0,0,0,0,765,1147,11449,100,764,11448,"""19:00:00-21""",true,-2.0,11696,2024-09-01 00:05:43,"""354676050c83bea""",2024-09-01 00:05:49,"""Outbound""",5,764,765,0.0,true,645,"""State St. at Sycamore Post Off…",41.988407,-88.684143,"""Lucas St. at Opportunity House""",41.993652,-88.684898,true,true,1m 40s
"""10""",17,"""904""",10159,41.93984,-88.77303,0,0,101,0,529,1146,10691,360,528,10690,"""17-0831_1""",true,-4.0,11166,2024-09-01 00:05:45,"""35467605084242c""",2024-09-01 00:05:49,"""Outbound""",4,528,529,0.0,true,633,"""Varsity Blvd. at Pappas""",41.943569,-88.774178,"""Annie Glidden Rd at Crane Dr""",41.93985,-88.77301,true,true,50s
"""19""",33,"""603""",10252,41.94709,-88.76576,0,0,8,0,824,1145,11396,181,486,11395,"""33-0831_1""",true,-3.0,11681,2024-09-01 00:05:59,"""35467605059b849""",2024-09-01 00:06:04,"""Outbound""",10,486,824,100.0,false,654,"""Normal Rd at NIU School of Nur…",41.947834,-88.765846,"""Ridge Rd. at Normal Rd.""",41.948471,-88.765907,true,true,2m 10s


In [35]:
def compute_permutations():
    mega_df = None
    for route in route_mapping.values():

        # Skip any inactive routes
        if(route == "777"):
            continue

        subset = df.filter(pl.col('routeID') == route)
        stop_ids = patterns.filter(pl.col("id") == route)['stopIDs'][0]

        print(f"[!] Computing permutations for route: {route} with stop ids len: {len(stop_ids)}")

        for i in range(len(stop_ids)):
            stop1 = stop_ids[i]
            df1 = subset.filter(pl.col('lastStopID') == stop1)
            
            for j in range(1, 6):
                stop2 = stop_ids[(i + j) % len(stop_ids)]
                df2 = subset.filter(pl.col('lastStopID') == stop2)
                
                df2 = df2.with_columns(
                    pl.col("receiveTime").alias("receiveTime_right"), 
                    pl.col('lastStopID').alias('nextStopID_actual'),
                    pl.col('lastStopName').alias('nextStopName_actual')
                )
       
                joined_df = df1.join_asof(df2, on="receiveTime", by='equipmentID', strategy='forward')
                
                joined_df = joined_df.with_columns(
                    (pl.col("receiveTime_right") - pl.col("receiveTime")).alias(f"eta")
                )
                
                joined_df.drop(["scheduleNumber_right", "nextStopName_right", "nextStopID_right", "aID_right", "trainID_right", "onSchedule_right"], strict=True)
                
                #Add Time Of Day and Day of Week columns
                joined_df = joined_df.with_columns(
                    pl.col("receiveTime").dt.hour().alias("hour_of_day"),
                    pl.col("receiveTime").dt.minute().alias("minute_of_hour"),
                    pl.col("receiveTime").dt.weekday().alias("day_of_week"),
                    pl.col("eta").dt.total_seconds().alias("eta_seconds")
                )

                if mega_df is None:
                    mega_df = joined_df
                else:
                    mega_df = mega_df.vstack(joined_df)
    
    return mega_df

mega_df = compute_permutations()


[!] Computing permutations for route: 2L with stop ids len: 22
[!] Computing permutations for route: 2R with stop ids len: 23
[!] Computing permutations for route: 3 with stop ids len: 20
[!] Computing permutations for route: 10 with stop ids len: 33
[!] Computing permutations for route: 11 with stop ids len: 64
[!] Computing permutations for route: 12 with stop ids len: 15
[!] Computing permutations for route: 16 with stop ids len: 16
[!] Computing permutations for route: 17 with stop ids len: 43
[!] Computing permutations for route: 18 with stop ids len: 36
[!] Computing permutations for route: 19 with stop ids len: 58
[!] Computing permutations for route: 21 with stop ids len: 29
[!] Computing permutations for route: 21 Tripper with stop ids len: 10


routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff,routeID_right,patternID_right,tripID_right,nextStopLat_right,nextStopLng_right,load_right,capacity_right,eLoad_right,blockID_right,nextStopID_right,nextStopETA_right,nextPatternStopID_right,h_right,lastStopID_right,lastPatternStopID_right,scheduleNumber_right,inService_right,onSchedule_right,trainID_right,aID_right,captureTime_right,direction_right,seq_right,lastStopExtID_right,nextStopExtID_right,nextStopPctProg_right,atStop_right,__index_level_0___right,nextStopName_right,lat_right_right,lng_right_right,lastStopName_right,lastStopLat_right,lastStopLng_right,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],str,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],datetime[ns],i64,str,duration[ns],i8,i8,i8,i64
"""2L""",3,"""Old_1501""",329,41.93639,-88.7736,0,0,33,0,465,1145,10482,271,433,10481,"""19:00:00-2""",true,-2.0,11414,2024-09-01 00:04:41,"""354676050595540""",2024-09-01 00:04:48,"""Outbound""",4,433,465,0.0,true,538,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s,"""2L""",3,329,41.93746,-88.7754,0,0,33,0,466,1146,10483,0,465,10482,"""19:00:00-2""",true,-2.0,11414,"""354676050595540""",2024-09-01 00:05:33,"""Outbound""",5,465,466,0.0,true,619,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m,2024-09-01 00:05:21,465,"""Stadium Dr. at New Hall""",40s,0,4,7,40
"""2L""",3,"""1103""",330,41.93639,-88.77435,0,0,0,0,465,1175,10482,270,433,10481,"""19:30:00-2""",true,-2.0,11415,2024-09-01 00:34:35,"""35467605044c796""",2024-09-01 00:34:47,"""Outbound""",4,433,465,0.0,true,3745,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,41s,"""2L""",3,330,41.93746,-88.7754,0,0,0,0,466,1176,10483,0,465,10482,"""19:30:00-2""",true,-2.0,11415,"""35467605044c796""",2024-09-01 00:35:18,"""Outbound""",5,465,466,0.0,true,3799,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 00:35:16,465,"""Stadium Dr. at New Hall""",41s,0,34,7,41
"""2L""",3,"""1103""",331,41.93638,-88.77397,0,0,8,0,465,1204,10482,270,433,10481,"""20:00:00-2""",true,-1.0,11416,2024-09-01 01:03:34,"""35467605044c796""",2024-09-01 01:03:47,"""Outbound""",4,433,465,0.0,true,6850,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,51s,"""2L""",3,331,41.93746,-88.7754,0,0,8,0,466,1205,10483,0,465,10482,"""20:00:00-2""",true,-1.0,11416,"""35467605044c796""",2024-09-01 01:04:32,"""Outbound""",5,465,466,0.0,true,6931,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m,2024-09-01 01:04:25,465,"""Stadium Dr. at New Hall""",51s,1,3,7,51
"""2L""",3,"""1103""",332,41.93639,-88.77435,0,0,18,0,465,1234,10482,270,433,10481,"""20:30:00-2""",true,-1.0,11417,2024-09-01 01:33:46,"""35467605044c796""",2024-09-01 01:34:02,"""Outbound""",4,433,465,0.0,true,10090,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,51s,"""2L""",3,332,41.93778,-88.77539,0,0,18,0,466,1235,10483,0,465,10482,"""20:30:00-2""",true,-1.0,11417,"""35467605044c796""",2024-09-01 01:34:47,"""Outbound""",5,465,466,8.476742,true,10171,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 01:34:37,465,"""Stadium Dr. at New Hall""",51s,1,33,7,51
"""2L""",3,"""1103""",338,41.9364,-88.77528,0,0,31,0,465,1265,10482,270,433,10481,"""21:00:00-2""",true,-2.0,11423,2024-09-01 02:04:05,"""35467605044c796""",2024-09-01 02:04:16,"""Outbound""",4,433,465,29.23959,true,13330,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,30s,"""2L""",3,338,41.9378,-88.77539,0,0,31,0,466,1265,10483,0,465,10482,"""21:00:00-2""",true,-1.0,11423,"""35467605044c796""",2024-09-01 02:04:46,"""Outbound""",5,465,466,0.0,true,13384,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 02:04:35,465,"""Stadium Dr. at New Hall""",30s,2,4,7,30
"""2L""",3,"""1103""",337,41.93639,-88.7746,0,0,35,0,465,1294,10482,270,433,10481,"""21:30:00-2""",true,-1.0,11422,2024-09-01 02:33:37,"""35467605044c796""",2024-09-01 02:33:45,"""Outbound""",4,433,465,6.066037,true,16489,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m,"""2L""",3,337,41.9378,-88.77539,0,0,35,0,466,1295,10483,270,465,10482,"""21:30:00-2""",true,-1.0,11422,"""35467605044c796""",2024-09-01 02:34:46,"""Outbound""",5,465,466,7.197193,true,16597,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,40s,2024-09-01 02:34:37,465,"""Stadium Dr. at New Hall""",1m,2,33,7,60
"""2L""",3,"""1103""",336,41.93638,-88.77397,0,0,43,0,465,1323,10482,270,433,10481,"""22:00:00-2""",true,0.0,11421,2024-09-01 03:02:47,"""35467605044c796""",2024-09-01 03:03:00,"""Outbound""",4,433,465,0.0,true,19621,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m 1s,"""2L""",3,336,41.93777,-88.77539,0,0,43,0,466,1324,10483,0,465,10482,"""22:00:00-2""",true,0.0,11421,"""35467605044c796""",2024-09-01 03:04:00,"""Outbound""",5,465,466,0.0,true,19729,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 03:03:48,465,"""Stadium Dr. at New Hall""",1m 1s,3,2,7,61
"""2L""",3,"""1103""",335,41.93638,-88.77394,0,0,57,0,465,1355,10482,270,433,10481,"""22:30:00-2""",true,-2.0,11420,2024-09-01 03:34:16,"""35467605044c796""",2024-09-01 03:34:30,"""Outbound""",4,433,465,0.0,true,22996,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m,"""2L""",3,335,41.93779,-88.77539,0,0,57,0,466,1356,10483,0,465,10482,"""22:30:00-2""",true,-2.0,11420,"""35467605044c796""",2024-09-01 03:35:31,"""Outbound""",5,465,466,0.0,true,23104,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 03:35:16,465,"""Stadium Dr. at New Hall""",1m,3,34,7,60
"""2L""",3,"""1103""",339,41.93638,-88.77397,0,0,68,0,465,1385,10482,270,433,10481,"""23:00:00-2""",true,-2.0,11424,2024-09-01 04:04:30,"""35467605044c796""",2024-09-01 04:04:45,"""Outbound""",4,433,465,0.0,true,26236,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m 10s,"""2L""",3,339,41.93786,-88.77622,0,0,68,0,466,1386,10483,270,465,10482,"""23:00:00-2""",true,-2.0,11424,"""35467605044c796""",2024-09-01 04:05:45,"""Outbound""",5,465,466,35.89457,true,26344,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,40s,2024-09-01 04:05:40,465,"""Stadium Dr. at New Hall""",1m 10s,4,4,7,70
"""2L""",3,"""1103""",340,41.93639,-88.77354,0,0,68,0,465,1414,10482,271,433,10481,"""23:30:00-2""",true,-1.0,11425,2024-09-01 04:33:34,"""35467605044c796""",2024-09-01 04:33:44,"""Outbound""",4,433,465,0.0,true,29341,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s,"""2L""",3,340,41.9378,-88.77539,0,0,68,0,466,1415,10483,0,465,10482,"""23:30:00-2""",true,-1.0,11425,"""35467605044c796""",2024-09-01 04:34:29,"""Outbound""",5,465,466,0.0,true,29422,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 04:34:14,465,"""Stadium Dr. at New Hall""",40s,4,33,7,40


In [36]:
#Calculate the euclidean distance between the two stop locations
def euclidean_distance(row):
    R = 6371.0  # Earth's radius in km

    lat1, lon1, lat2, lon2 = row["nextStopLat"], row["nextStopLng"], row["lastStopLat"], row["lastStopLng"]
  
    # Convert degrees to radians
    lat1_rad, lon1_rad = math.radians(lat1), math.radians(lon1)
    lat2_rad, lon2_rad = math.radians(lat2), math.radians(lon2)

    # Approximate Euclidean distance
    x1 = R * lat1_rad
    y1 = R * lon1_rad
    x2 = R * lat2_rad
    y2 = R * lon2_rad

    distance_km =  math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
    
    # Convert to miles
    return distance_km * 0.621371  
  
#Add distance column
mega_df = mega_df.with_columns(
    pl.struct(["nextStopLat", "nextStopLng", "lastStopLat", "lastStopLng"])
    .map_elements(euclidean_distance, return_dtype=pl.Float32)
    .alias("distance")
)

routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff,routeID_right,patternID_right,tripID_right,nextStopLat_right,nextStopLng_right,load_right,capacity_right,eLoad_right,blockID_right,nextStopID_right,nextStopETA_right,nextPatternStopID_right,h_right,lastStopID_right,lastPatternStopID_right,scheduleNumber_right,inService_right,onSchedule_right,trainID_right,aID_right,captureTime_right,direction_right,seq_right,lastStopExtID_right,nextStopExtID_right,nextStopPctProg_right,atStop_right,__index_level_0___right,nextStopName_right,lat_right_right,lng_right_right,lastStopName_right,lastStopLat_right,lastStopLng_right,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds,distance
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],str,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],datetime[ns],i64,str,duration[ns],i8,i8,i8,i64,f32
"""2L""",3,"""Old_1501""",329,41.93639,-88.7736,0,0,33,0,465,1145,10482,271,433,10481,"""19:00:00-2""",true,-2.0,11414,2024-09-01 00:04:41,"""354676050595540""",2024-09-01 00:04:48,"""Outbound""",4,433,465,0.0,true,538,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s,"""2L""",3,329,41.93746,-88.7754,0,0,33,0,466,1146,10483,0,465,10482,"""19:00:00-2""",true,-2.0,11414,"""354676050595540""",2024-09-01 00:05:33,"""Outbound""",5,465,466,0.0,true,619,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m,2024-09-01 00:05:21,465,"""Stadium Dr. at New Hall""",40s,0,4,7,40,0.054808
"""2L""",3,"""1103""",330,41.93639,-88.77435,0,0,0,0,465,1175,10482,270,433,10481,"""19:30:00-2""",true,-2.0,11415,2024-09-01 00:34:35,"""35467605044c796""",2024-09-01 00:34:47,"""Outbound""",4,433,465,0.0,true,3745,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,41s,"""2L""",3,330,41.93746,-88.7754,0,0,0,0,466,1176,10483,0,465,10482,"""19:30:00-2""",true,-2.0,11415,"""35467605044c796""",2024-09-01 00:35:18,"""Outbound""",5,465,466,0.0,true,3799,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 00:35:16,465,"""Stadium Dr. at New Hall""",41s,0,34,7,41,0.004875
"""2L""",3,"""1103""",331,41.93638,-88.77397,0,0,8,0,465,1204,10482,270,433,10481,"""20:00:00-2""",true,-1.0,11416,2024-09-01 01:03:34,"""35467605044c796""",2024-09-01 01:03:47,"""Outbound""",4,433,465,0.0,true,6850,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,51s,"""2L""",3,331,41.93746,-88.7754,0,0,8,0,466,1205,10483,0,465,10482,"""20:00:00-2""",true,-1.0,11416,"""35467605044c796""",2024-09-01 01:04:32,"""Outbound""",5,465,466,0.0,true,6931,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m,2024-09-01 01:04:25,465,"""Stadium Dr. at New Hall""",51s,1,3,7,51,0.029283
"""2L""",3,"""1103""",332,41.93639,-88.77435,0,0,18,0,465,1234,10482,270,433,10481,"""20:30:00-2""",true,-1.0,11417,2024-09-01 01:33:46,"""35467605044c796""",2024-09-01 01:34:02,"""Outbound""",4,433,465,0.0,true,10090,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,51s,"""2L""",3,332,41.93778,-88.77539,0,0,18,0,466,1235,10483,0,465,10482,"""20:30:00-2""",true,-1.0,11417,"""35467605044c796""",2024-09-01 01:34:47,"""Outbound""",5,465,466,8.476742,true,10171,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 01:34:37,465,"""Stadium Dr. at New Hall""",51s,1,33,7,51,0.004875
"""2L""",3,"""1103""",338,41.9364,-88.77528,0,0,31,0,465,1265,10482,270,433,10481,"""21:00:00-2""",true,-2.0,11423,2024-09-01 02:04:05,"""35467605044c796""",2024-09-01 02:04:16,"""Outbound""",4,433,465,29.23959,true,13330,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,30s,"""2L""",3,338,41.9378,-88.77539,0,0,31,0,466,1265,10483,0,465,10482,"""21:00:00-2""",true,-1.0,11423,"""35467605044c796""",2024-09-01 02:04:46,"""Outbound""",5,465,466,0.0,true,13384,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 02:04:35,465,"""Stadium Dr. at New Hall""",30s,2,4,7,30,0.061588
"""2L""",3,"""1103""",337,41.93639,-88.7746,0,0,35,0,465,1294,10482,270,433,10481,"""21:30:00-2""",true,-1.0,11422,2024-09-01 02:33:37,"""35467605044c796""",2024-09-01 02:33:45,"""Outbound""",4,433,465,6.066037,true,16489,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m,"""2L""",3,337,41.9378,-88.77539,0,0,35,0,466,1295,10483,270,465,10482,"""21:30:00-2""",true,-1.0,11422,"""35467605044c796""",2024-09-01 02:34:46,"""Outbound""",5,465,466,7.197193,true,16597,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,40s,2024-09-01 02:34:37,465,"""Stadium Dr. at New Hall""",1m,2,33,7,60,0.014962
"""2L""",3,"""1103""",336,41.93638,-88.77397,0,0,43,0,465,1323,10482,270,433,10481,"""22:00:00-2""",true,0.0,11421,2024-09-01 03:02:47,"""35467605044c796""",2024-09-01 03:03:00,"""Outbound""",4,433,465,0.0,true,19621,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m 1s,"""2L""",3,336,41.93777,-88.77539,0,0,43,0,466,1324,10483,0,465,10482,"""22:00:00-2""",true,0.0,11421,"""35467605044c796""",2024-09-01 03:04:00,"""Outbound""",5,465,466,0.0,true,19729,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 03:03:48,465,"""Stadium Dr. at New Hall""",1m 1s,3,2,7,61,0.029283
"""2L""",3,"""1103""",335,41.93638,-88.77394,0,0,57,0,465,1355,10482,270,433,10481,"""22:30:00-2""",true,-2.0,11420,2024-09-01 03:34:16,"""35467605044c796""",2024-09-01 03:34:30,"""Outbound""",4,433,465,0.0,true,22996,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m,"""2L""",3,335,41.93779,-88.77539,0,0,57,0,466,1356,10483,0,465,10482,"""22:30:00-2""",true,-2.0,11420,"""35467605044c796""",2024-09-01 03:35:31,"""Outbound""",5,465,466,0.0,true,23104,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 03:35:16,465,"""Stadium Dr. at New Hall""",1m,3,34,7,60,0.031344
"""2L""",3,"""1103""",339,41.93638,-88.77397,0,0,68,0,465,1385,10482,270,433,10481,"""23:00:00-2""",true,-2.0,11424,2024-09-01 04:04:30,"""35467605044c796""",2024-09-01 04:04:45,"""Outbound""",4,433,465,0.0,true,26236,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m 10s,"""2L""",3,339,41.93786,-88.77622,0,0,68,0,466,1386,10483,270,465,10482,"""23:00:00-2""",true,-2.0,11424,"""35467605044c796""",2024-09-01 04:05:45,"""Outbound""",5,465,466,35.89457,true,26344,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,40s,2024-09-01 04:05:40,465,"""Stadium Dr. at New Hall""",1m 10s,4,4,7,70,0.029283
"""2L""",3,"""1103""",340,41.93639,-88.77354,0,0,68,0,465,1414,10482,271,433,10481,"""23:30:00-2""",true,-1.0,11425,2024-09-01 04:33:34,"""35467605044c796""",2024-09-01 04:33:44,"""Outbound""",4,433,465,0.0,true,29341,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s,"""2L""",3,340,41.9378,-88.77539,0,0,68,0,466,1415,10483,0,465,10482,"""23:30:00-2""",true,-1.0,11425,"""35467605044c796""",2024-09-01 04:34:29,"""Outbound""",5,465,466,0.0,true,29422,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 04:34:14,465,"""Stadium Dr. at New Hall""",40s,4,33,7,40,0.058944


In [37]:
mega_df.write_parquet('./data/mega_df.parquet')

print("[X] Succesfully wrote mega_df to disk")

[X] Succesfully wrote mega_df to disk


In [38]:
mega_df['routeID', 'equipmentID', 'lastStopID', 'nextStopID_actual', 'lastStopName', 'nextStopName_actual',  'receiveTime', 'receiveTime_right', 'eta', 'day_of_week', 'hour_of_day', 'minute_of_hour', 'distance']

routeID,equipmentID,lastStopID,nextStopID_actual,lastStopName,nextStopName_actual,receiveTime,receiveTime_right,eta,day_of_week,hour_of_day,minute_of_hour,distance
str,str,i64,i64,str,str,datetime[ns],datetime[ns],duration[ns],i8,i8,i8,f32
"""2L""","""Old_1501""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 00:04:41,2024-09-01 00:05:21,40s,7,0,4,0.054808
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 00:34:35,2024-09-01 00:35:16,41s,7,0,34,0.004875
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 01:03:34,2024-09-01 01:04:25,51s,7,1,3,0.029283
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 01:33:46,2024-09-01 01:34:37,51s,7,1,33,0.004875
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 02:04:05,2024-09-01 02:04:35,30s,7,2,4,0.061588
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 02:33:37,2024-09-01 02:34:37,1m,7,2,33,0.014962
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 03:02:47,2024-09-01 03:03:48,1m 1s,7,3,2,0.029283
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 03:34:16,2024-09-01 03:35:16,1m,7,3,34,0.031344
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 04:04:30,2024-09-01 04:05:40,1m 10s,7,4,4,0.029283
"""2L""","""1103""",433,465,"""Lincoln Hall""","""Stadium Dr. at New Hall""",2024-09-01 04:33:34,2024-09-01 04:34:14,40s,7,4,33,0.058944


In [39]:
two_l_mega_df = mega_df.filter(
    pl.col('routeID') == '2L',
    pl.col('eta_seconds') < (60 * 20)
)

routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff,routeID_right,patternID_right,tripID_right,nextStopLat_right,nextStopLng_right,load_right,capacity_right,eLoad_right,blockID_right,nextStopID_right,nextStopETA_right,nextPatternStopID_right,h_right,lastStopID_right,lastPatternStopID_right,scheduleNumber_right,inService_right,onSchedule_right,trainID_right,aID_right,captureTime_right,direction_right,seq_right,lastStopExtID_right,nextStopExtID_right,nextStopPctProg_right,atStop_right,__index_level_0___right,nextStopName_right,lat_right_right,lng_right_right,lastStopName_right,lastStopLat_right,lastStopLng_right,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds,distance
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],str,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],datetime[ns],i64,str,duration[ns],i8,i8,i8,i64,f32
"""2L""",3,"""Old_1501""",329,41.93639,-88.7736,0,0,33,0,465,1145,10482,271,433,10481,"""19:00:00-2""",true,-2.0,11414,2024-09-01 00:04:41,"""354676050595540""",2024-09-01 00:04:48,"""Outbound""",4,433,465,0.0,true,538,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s,"""2L""",3,329,41.93746,-88.7754,0,0,33,0,466,1146,10483,0,465,10482,"""19:00:00-2""",true,-2.0,11414,"""354676050595540""",2024-09-01 00:05:33,"""Outbound""",5,465,466,0.0,true,619,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m,2024-09-01 00:05:21,465,"""Stadium Dr. at New Hall""",40s,0,4,7,40,0.054808
"""2L""",3,"""1103""",330,41.93639,-88.77435,0,0,0,0,465,1175,10482,270,433,10481,"""19:30:00-2""",true,-2.0,11415,2024-09-01 00:34:35,"""35467605044c796""",2024-09-01 00:34:47,"""Outbound""",4,433,465,0.0,true,3745,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,41s,"""2L""",3,330,41.93746,-88.7754,0,0,0,0,466,1176,10483,0,465,10482,"""19:30:00-2""",true,-2.0,11415,"""35467605044c796""",2024-09-01 00:35:18,"""Outbound""",5,465,466,0.0,true,3799,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 00:35:16,465,"""Stadium Dr. at New Hall""",41s,0,34,7,41,0.004875
"""2L""",3,"""1103""",331,41.93638,-88.77397,0,0,8,0,465,1204,10482,270,433,10481,"""20:00:00-2""",true,-1.0,11416,2024-09-01 01:03:34,"""35467605044c796""",2024-09-01 01:03:47,"""Outbound""",4,433,465,0.0,true,6850,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,51s,"""2L""",3,331,41.93746,-88.7754,0,0,8,0,466,1205,10483,0,465,10482,"""20:00:00-2""",true,-1.0,11416,"""35467605044c796""",2024-09-01 01:04:32,"""Outbound""",5,465,466,0.0,true,6931,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,1m,2024-09-01 01:04:25,465,"""Stadium Dr. at New Hall""",51s,1,3,7,51,0.029283
"""2L""",3,"""1103""",332,41.93639,-88.77435,0,0,18,0,465,1234,10482,270,433,10481,"""20:30:00-2""",true,-1.0,11417,2024-09-01 01:33:46,"""35467605044c796""",2024-09-01 01:34:02,"""Outbound""",4,433,465,0.0,true,10090,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,51s,"""2L""",3,332,41.93778,-88.77539,0,0,18,0,466,1235,10483,0,465,10482,"""20:30:00-2""",true,-1.0,11417,"""35467605044c796""",2024-09-01 01:34:47,"""Outbound""",5,465,466,8.476742,true,10171,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 01:34:37,465,"""Stadium Dr. at New Hall""",51s,1,33,7,51,0.004875
"""2L""",3,"""1103""",338,41.9364,-88.77528,0,0,31,0,465,1265,10482,270,433,10481,"""21:00:00-2""",true,-2.0,11423,2024-09-01 02:04:05,"""35467605044c796""",2024-09-01 02:04:16,"""Outbound""",4,433,465,29.23959,true,13330,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,30s,"""2L""",3,338,41.9378,-88.77539,0,0,31,0,466,1265,10483,0,465,10482,"""21:00:00-2""",true,-1.0,11423,"""35467605044c796""",2024-09-01 02:04:46,"""Outbound""",5,465,466,0.0,true,13384,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 02:04:35,465,"""Stadium Dr. at New Hall""",30s,2,4,7,30,0.061588
"""2L""",3,"""1103""",337,41.93639,-88.7746,0,0,35,0,465,1294,10482,270,433,10481,"""21:30:00-2""",true,-1.0,11422,2024-09-01 02:33:37,"""35467605044c796""",2024-09-01 02:33:45,"""Outbound""",4,433,465,6.066037,true,16489,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m,"""2L""",3,337,41.9378,-88.77539,0,0,35,0,466,1295,10483,270,465,10482,"""21:30:00-2""",true,-1.0,11422,"""35467605044c796""",2024-09-01 02:34:46,"""Outbound""",5,465,466,7.197193,true,16597,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,40s,2024-09-01 02:34:37,465,"""Stadium Dr. at New Hall""",1m,2,33,7,60,0.014962
"""2L""",3,"""1103""",336,41.93638,-88.77397,0,0,43,0,465,1323,10482,270,433,10481,"""22:00:00-2""",true,0.0,11421,2024-09-01 03:02:47,"""35467605044c796""",2024-09-01 03:03:00,"""Outbound""",4,433,465,0.0,true,19621,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m 1s,"""2L""",3,336,41.93777,-88.77539,0,0,43,0,466,1324,10483,0,465,10482,"""22:00:00-2""",true,0.0,11421,"""35467605044c796""",2024-09-01 03:04:00,"""Outbound""",5,465,466,0.0,true,19729,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 03:03:48,465,"""Stadium Dr. at New Hall""",1m 1s,3,2,7,61,0.029283
"""2L""",3,"""1103""",335,41.93638,-88.77394,0,0,57,0,465,1355,10482,270,433,10481,"""22:30:00-2""",true,-2.0,11420,2024-09-01 03:34:16,"""35467605044c796""",2024-09-01 03:34:30,"""Outbound""",4,433,465,0.0,true,22996,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m,"""2L""",3,335,41.93779,-88.77539,0,0,57,0,466,1356,10483,0,465,10482,"""22:30:00-2""",true,-2.0,11420,"""35467605044c796""",2024-09-01 03:35:31,"""Outbound""",5,465,466,0.0,true,23104,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 03:35:16,465,"""Stadium Dr. at New Hall""",1m,3,34,7,60,0.031344
"""2L""",3,"""1103""",339,41.93638,-88.77397,0,0,68,0,465,1385,10482,270,433,10481,"""23:00:00-2""",true,-2.0,11424,2024-09-01 04:04:30,"""35467605044c796""",2024-09-01 04:04:45,"""Outbound""",4,433,465,0.0,true,26236,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,1m 10s,"""2L""",3,339,41.93786,-88.77622,0,0,68,0,466,1386,10483,270,465,10482,"""23:00:00-2""",true,-2.0,11424,"""35467605044c796""",2024-09-01 04:05:45,"""Outbound""",5,465,466,35.89457,true,26344,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,40s,2024-09-01 04:05:40,465,"""Stadium Dr. at New Hall""",1m 10s,4,4,7,70,0.029283
"""2L""",3,"""1103""",340,41.93639,-88.77354,0,0,68,0,465,1414,10482,271,433,10481,"""23:30:00-2""",true,-1.0,11425,2024-09-01 04:33:34,"""35467605044c796""",2024-09-01 04:33:44,"""Outbound""",4,433,465,0.0,true,29341,"""Stadium Dr. at New Hall""",41.937874,-88.775536,"""Lincoln Hall""",41.936333,-88.774391,true,true,40s,"""2L""",3,340,41.9378,-88.77539,0,0,68,0,466,1415,10483,0,465,10482,"""23:30:00-2""",true,-1.0,11425,"""35467605044c796""",2024-09-01 04:34:29,"""Outbound""",5,465,466,0.0,true,29422,"""Grant North""",41.938599,-88.777321,"""Stadium Dr. at New Hall""",41.937874,-88.775536,true,true,50s,2024-09-01 04:34:14,465,"""Stadium Dr. at New Hall""",40s,4,33,7,40,0.058944


In [40]:
two_l_mega_df.filter(pl.col('eta_seconds') > (60 * 20)).count()

routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff,routeID_right,patternID_right,tripID_right,nextStopLat_right,nextStopLng_right,load_right,capacity_right,eLoad_right,blockID_right,nextStopID_right,nextStopETA_right,nextPatternStopID_right,h_right,lastStopID_right,lastPatternStopID_right,scheduleNumber_right,inService_right,onSchedule_right,trainID_right,aID_right,captureTime_right,direction_right,seq_right,lastStopExtID_right,nextStopExtID_right,nextStopPctProg_right,atStop_right,__index_level_0___right,nextStopName_right,lat_right_right,lng_right_right,lastStopName_right,lastStopLat_right,lastStopLng_right,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds,distance
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
two_l_mega_df.write_parquet('./data/two_l_mega_df.parquet')
print("[X] Succesfully wrote two_l_mega_df to disk")

[X] Succesfully wrote two_l_mega_df to disk


In [42]:
twelve_route_mega_df = mega_df.filter(
    pl.col('routeID') == '12',
    pl.col('eta_seconds') < (60 * 35)
)

routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff,routeID_right,patternID_right,tripID_right,nextStopLat_right,nextStopLng_right,load_right,capacity_right,eLoad_right,blockID_right,nextStopID_right,nextStopETA_right,nextPatternStopID_right,h_right,lastStopID_right,lastPatternStopID_right,scheduleNumber_right,inService_right,onSchedule_right,trainID_right,aID_right,captureTime_right,direction_right,seq_right,lastStopExtID_right,nextStopExtID_right,nextStopPctProg_right,atStop_right,__index_level_0___right,nextStopName_right,lat_right_right,lng_right_right,lastStopName_right,lastStopLat_right,lastStopLng_right,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds,distance
str,i64,str,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,datetime[ns],str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],str,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,bool,f64,i64,str,datetime[ns],cat,i64,i64,i64,f64,bool,i64,str,f64,f64,str,f64,f64,bool,bool,duration[ns],datetime[ns],i64,str,duration[ns],i8,i8,i8,i64,f32
"""12""",23,"""701""",9971,41.93278,-88.7668,0,0,24,127,536,690,10721,266,809,10306,"""11:47:00-12""",true,20.0,11525,2024-09-20 16:27:12,"""35467605080601a""",2024-09-20 16:27:19,"""Inbound""",2,809,536,0.6775288,true,200348,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,17m 32s,"""12""",23,9971,41.93113,-88.76569,0,0,24,127,809,714,10308,359,453,10307,"""11:47:00-12""",true,71.0,11525,"""35467605080601a""",2024-09-20 16:52:31,"""Inbound""",15,453,809,14.02523,true,202859,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,"""Psychology Building""",41.93119,-88.765167,true,true,51s,2024-09-20 16:52:25,453,"""Psychology Building""",25m 13s,16,27,5,1513,0.006204
"""12""",23,"""72307""",9968,41.93278,-88.76675,0,0,75,130,536,589,10721,270,809,10306,"""09:47:00-12""",true,1.0,11535,2024-09-01 14:46:45,"""354676050c83bf0""",2024-09-01 14:46:57,"""Inbound""",2,809,536,0.2612009,true,94928,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,3m 20s,"""12""",23,9968,41.93278,-88.76675,0,0,75,130,536,589,10721,270,809,10306,"""09:47:00-12""",true,1.0,11535,"""354676050c83bf0""",2024-09-01 14:46:57,"""Inbound""",2,809,536,0.2612009,true,94928,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,3m 20s,2024-09-01 14:46:45,809,"""Normal Rd at W Locust St Stop …",0ns,14,46,7,0,0.003213
"""12""",23,"""72307""",9992,41.93278,-88.76675,0,0,0,132,536,940,10721,266,809,10306,"""15:47:00-12""",true,10.0,11537,2024-09-01 20:37:25,"""354676050c83bf0""",2024-09-01 20:37:33,"""Inbound""",2,809,536,0.310181,true,132485,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,11m 51s,"""12""",23,9992,41.93278,-88.76675,0,0,0,132,536,940,10721,266,809,10306,"""15:47:00-12""",true,10.0,11537,"""354676050c83bf0""",2024-09-01 20:37:33,"""Inbound""",2,809,536,0.310181,true,132485,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,11m 51s,2024-09-01 20:37:25,809,"""Normal Rd at W Locust St Stop …",0ns,20,37,7,0,0.003213
"""12""",23,"""72307""",9993,41.93278,-88.76675,0,0,13,132,536,1050,10721,245,809,10306,"""17:47:00-12""",true,20.0,11538,2024-09-01 22:27:16,"""354676050c83bf0""",2024-09-01 22:27:26,"""Inbound""",2,809,536,0.318344,true,144257,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,21m 23s,"""12""",23,9993,41.93278,-88.76675,0,0,13,132,536,1050,10721,245,809,10306,"""17:47:00-12""",true,20.0,11538,"""354676050c83bf0""",2024-09-01 22:27:26,"""Inbound""",2,809,536,0.318344,true,144257,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,21m 23s,2024-09-01 22:27:16,809,"""Normal Rd at W Locust St Stop …",0ns,22,27,7,0,0.003213
"""12""",23,"""1003""",9994,41.93278,-88.76681,0,0,26,132,536,1290,10721,264,809,10306,"""21:47:00-12""",true,20.0,11540,2024-09-02 02:27:10,"""3546760504518c0""",2024-09-02 02:27:23,"""Inbound""",2,809,536,0.7265087,true,169953,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,21m 54s,"""12""",23,9994,41.93278,-88.76681,0,0,26,132,536,1290,10721,264,809,10306,"""21:47:00-12""",true,20.0,11540,"""3546760504518c0""",2024-09-02 02:27:23,"""Inbound""",2,809,536,0.7265087,true,169953,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,21m 54s,2024-09-02 02:27:10,809,"""Normal Rd at W Locust St Stop …",0ns,2,27,1,0,0.006855
"""12""",23,"""701""",9967,41.93278,-88.76717,0,0,30,125,536,341,10721,270,809,10306,"""05:37:00-12""",true,-1.0,11522,2024-09-02 10:38:20,"""35467605080601a""",2024-09-02 10:38:38,"""Inbound""",2,809,536,3.746928,true,222434,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,1m 20s,"""12""",23,9967,41.93278,-88.76717,0,0,30,125,536,341,10721,270,809,10306,"""05:37:00-12""",true,-1.0,11522,"""35467605080601a""",2024-09-02 10:38:38,"""Inbound""",2,809,536,3.746928,true,222434,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,1m 20s,2024-09-02 10:38:20,809,"""Normal Rd at W Locust St Stop …",0ns,10,38,1,0,0.031445
"""12""",23,"""701""",9967,41.93278,-88.76684,0,0,30,125,536,419,10721,270,809,10306,"""23-0902_M3""",true,0.0,11522,2024-09-02 11:56:23,"""35467605080601a""",2024-09-02 11:56:31,"""Inbound""",2,809,536,1.020387,true,230777,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,23m 53s,"""12""",23,9967,41.93278,-88.76684,0,0,30,125,536,419,10721,270,809,10306,"""23-0902_M3""",true,0.0,11522,"""35467605080601a""",2024-09-02 11:56:31,"""Inbound""",2,809,536,1.020387,true,230777,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,23m 53s,2024-09-02 11:56:23,809,"""Normal Rd at W Locust St Stop …",0ns,11,56,1,0,0.008844
"""12""",23,"""701""",9987,41.93278,-88.76654,0,0,30,126,536,537,10721,270,809,10306,"""08:47:00-12""",true,-7.0,11530,2024-09-02 13:54:51,"""35467605080601a""",2024-09-02 13:54:59,"""Inbound""",2,809,536,1.436764,true,243467,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,1h 7m 21s,"""12""",23,9987,41.93278,-88.76654,0,0,30,126,536,537,10721,270,809,10306,"""08:47:00-12""",true,-7.0,11530,"""35467605080601a""",2024-09-02 13:54:59,"""Inbound""",2,809,536,1.436764,true,243467,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,1h 7m 21s,2024-09-02 13:54:51,809,"""Normal Rd at W Locust St Stop …",0ns,13,54,1,0,0.012357
"""12""",23,"""701""",9971,41.93278,-88.76692,0,0,30,127,536,690,10721,268,809,10306,"""11:47:00-12""",true,20.0,11525,2024-09-02 16:27:15,"""35467605080601a""",2024-09-02 16:27:28,"""Inbound""",2,809,536,1.681614,true,259802,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,23m 34s,"""12""",23,9971,41.93278,-88.76692,0,0,30,127,536,690,10721,268,809,10306,"""11:47:00-12""",true,20.0,11525,"""35467605080601a""",2024-09-02 16:27:28,"""Inbound""",2,809,536,1.681614,true,259802,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,23m 34s,2024-09-02 16:27:15,809,"""Normal Rd at W Locust St Stop …",0ns,16,27,1,0,0.014264
"""12""",23,"""Old_1501""",9972,41.93278,-88.76678,0,0,18,128,536,948,10721,0,809,10306,"""15:47:00-12""",true,2.0,11526,2024-09-02 20:45:28,"""354676050595540""",2024-09-02 20:45:34,"""Inbound""",2,809,536,0.4897732,true,8638,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,4m 31s,"""12""",23,9972,41.93278,-88.76678,0,0,18,128,536,948,10721,0,809,10306,"""15:47:00-12""",true,2.0,11526,"""354676050595540""",2024-09-02 20:45:34,"""Inbound""",2,809,536,0.4897732,true,8638,"""Lincoln Hwy at Pearl St""",41.930813,-88.757736,"""Normal Rd at W Locust St Stop …",41.932812,-88.766716,true,true,4m 31s,2024-09-02 20:45:28,809,"""Normal Rd at W Locust St Stop …",0ns,20,45,1,0,0.004936


In [43]:
twelve_route_mega_df.filter(pl.col('eta_seconds') > (60 * 35)).count()

routeID,patternID,equipmentID,tripID,nextStopLat,nextStopLng,load,capacity,eLoad,blockID,nextStopID,nextStopETA,nextPatternStopID,h,lastStopID,lastPatternStopID,scheduleNumber,inService,onSchedule,trainID,receiveTime,aID,captureTime,direction,seq,lastStopExtID,nextStopExtID,nextStopPctProg,atStop,__index_level_0__,nextStopName,lat_right,lng_right,lastStopName,lastStopLat,lastStopLng,stopChanged,nextToLast,timeDiff,routeID_right,patternID_right,tripID_right,nextStopLat_right,nextStopLng_right,load_right,capacity_right,eLoad_right,blockID_right,nextStopID_right,nextStopETA_right,nextPatternStopID_right,h_right,lastStopID_right,lastPatternStopID_right,scheduleNumber_right,inService_right,onSchedule_right,trainID_right,aID_right,captureTime_right,direction_right,seq_right,lastStopExtID_right,nextStopExtID_right,nextStopPctProg_right,atStop_right,__index_level_0___right,nextStopName_right,lat_right_right,lng_right_right,lastStopName_right,lastStopLat_right,lastStopLng_right,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds,distance
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
twelve_route_mega_df.write_parquet('./data/twelve_route_mega_df.parquet')
print("[X] Succesfully wrote two_l_mega_df and twelve_route_mega_df to disk")

[X] Succesfully wrote two_l_mega_df and twelve_route_mega_df to disk


### Another way to compute n stops ahead with some caviates

In [45]:
# # Approach #2 - Using the 'shift' method to calculate the time difference between two stops (Not recommended)
# # Calcuate the time difference between going to Linchon hall to Stevenson South
# shiftVal = -3
# startingStop = 465
# subset = df.filter(pl.col("routeID") == "2L")

# # Ensure the data is sorted by 'equipmentID' and 'receiveTime'
# df = subset.sort(["equipmentID", "receiveTime"])

# # Group by 'equipmentID' and shift the 'receiveTime' column by 3 to get the time three stops ahead
# df = df.with_columns([
#     pl.col("receiveTime").shift(shiftVal).over("equipmentID").alias("receiveTime_3_stops_ahead"),
#     pl.col("nextStopID").shift(shiftVal).over("equipmentID").alias("nextStopID_3_stops_ahead"),
#     pl.col("lastStopName").shift(shiftVal).over("equipmentID").alias("lastStopName_3_stops_ahead")
# ])

# # Join with stops to get the name of the stop 3 stops ahead
# df = df.join(stops.rename({"id": "nextStopID_3_stops_ahead"}).select(["nextStopID_3_stops_ahead", "name"]).unique(), on="nextStopID_3_stops_ahead", how="left").rename({"name": "nextStopName_3_ahead"})

# # Filter the rows where 'nextStopID' is 433 to calculate the time difference
# df_filtered = df.filter(pl.col("nextStopID") == startingStop)

# df_filtered = df_filtered.with_columns([
#     pl.col("receiveTime").dt.convert_time_zone("America/Chicago").alias("receiveTime"),
#     pl.col("receiveTime_3_stops_ahead").dt.convert_time_zone("America/Chicago").alias("receiveTime_3_stops_ahead")
# ])

# # Calculate the time difference between the current stop and three stops ahead
# df_filtered = df_filtered.with_columns([
#     (pl.col("receiveTime_3_stops_ahead") - pl.col("receiveTime")).alias("timeDiff_3_stops_ahead")
# ])

# # Display the result
# df_filtered['routeID', 'equipmentID', 'lastStopName', 'nextStopName_3_ahead', 'receiveTime', 'receiveTime_3_stops_ahead', 'timeDiff_3_stops_ahead']