In [None]:
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import json
import datetime

# Table rendering limits for every frame shown below: up to 100 columns, 20 rows.
# The setters are classmethods, so they are called on the Config class itself.
pl.Config.set_tbl_cols(100)
pl.Config.set_tbl_rows(20)

In [None]:
# Lookup table: numeric route id -> route name.
route_mapping = {
    3: "2L", 4: "2R", 33: "3",
    17: "10", 18: "11", 23: "12",
    12: "16", 13: "17", 14: "18",
    30: "19", 29: "21", 38: "21 Tripper",
    777: "777",
}

In [None]:
# Read the entries file and, in the same chained expression, replace the
# numeric routeID values with the route names from route_mapping.
df = pl.read_parquet("2024-09-entries-start.parquet").with_columns(
    pl.col("routeID").replace_strict(route_mapping)
)

df

In [None]:
# Load stops json
# The context manager closes the file handle as soon as the JSON is parsed.
with open("stops.json", "r") as file:
    stopsData = json.load(file)

# The stop records are read from the top-level "get_stops" key.
stops = pl.DataFrame(stopsData['get_stops'])
stops

In [None]:
# Lookup table: numeric pattern id -> route name.
pattern_mapping = {
    3: "2L", 4: "2R", 37: "3",
    17: "10", 18: "11", 23: "12",
    12: "16", 13: "17", 14: "18",
    33: "19", 46: "21", 45: "21 Tripper",
}

# Load patterns json
# The context manager closes the file handle as soon as the JSON is parsed.
with open("patterns.json", "r") as file:
    patternsData = json.load(file)

# The pattern records are read from the top-level "get_patterns" key.
patterns = pl.DataFrame(patternsData['get_patterns'])
# Replace pattern ids with route names; ids that are not in pattern_mapping
# become the literal string "None" (not a null).
patterns = patterns.with_columns(pl.col("id").replace_strict(pattern_mapping, default="None"))
patterns

In [None]:
# Left-join the stops table once per stop-ID column so that nextStopID and
# lastStopID each get a matching name column (nextStopName / lastStopName).
for id_column, name_column in (
    ("nextStopID", "nextStopName"),
    ("lastStopID", "lastStopName"),
):
    stop_lookup = stops.rename({"id": id_column}).select([id_column, "name"])
    df = df.join(stop_lookup, on=id_column, how="left").rename({"name": name_column})

df

In [None]:
def printStops(routeId):
    """Print the "stopIDs" value of the first pattern whose "id" equals routeId.

    The lookup is done against the module-level `patterns` frame.
    """
    matching_patterns = patterns.filter(pl.col("id") == routeId)
    stop_ids = matching_patterns.get_column("stopIDs")
    # Only the first matching pattern is printed.
    print(stop_ids[0])

printStops("2L")

In [None]:
# Stop IDs handed to calcTimeBetweenStops for route "2L". Order matters:
# each adjacent pair in this list is treated as one segment.
twol_stops = [
    431, 820, 433, 465, 466, 467, 468,
    436, 437, 438, 439, 440, 441, 442,
    443, 444, 445, 446, 447, 448, 449,
    450,
]

def calcTimeBetweenStops(stops, routeId, equipmentId):
    """Compute the median time between consecutive stops for one vehicle on one route.

    Reads the module-level `df` frame. Note that the `stops` parameter shadows
    the module-level `stops` frame inside this function.

    Args:
        stops: Ordered, sliceable sequence of stop IDs. Each adjacent pair
            (stops[i], stops[i + 1]) produces one output row.
        routeId: Value compared against the "routeID" column.
        equipmentId: Value compared against the "equipmentID" column.

    Returns:
        A polars DataFrame with columns "stopA", "stopB" and "timeBetween".
        "timeBetween" is the median of (receiveTime of the next switch to
        stopB) - (receiveTime of a switch to stopA).
    """
    # In-service rows for the requested route and vehicle. Everything below
    # (diff, shift, join_asof) assumes chronological order, so enforce it here
    # with a stable sort instead of relying on the order of the input file.
    subset = df.filter(
        (pl.col("routeID") == routeId) &
        (pl.col("equipmentID") == equipmentId) &
        (pl.col("inService") == True)
    ).sort("receiveTime", maintain_order=True)

    # Flag rows whose nextStopID differs from the previous row of the same vehicle.
    subset = subset.with_columns(
        (pl.col("nextStopID").diff().over("equipmentID") != 0).alias("stopChanged")
    )

    # Keep only the stop-change rows and measure the time since the previous one.
    changes = subset.filter(pl.col("stopChanged") == True).with_columns(
        (pl.col("receiveTime") - pl.col("receiveTime").shift(1)).alias("timeDiff")
    )

    # Discard changes that follow a gap of 30 minutes or more. The first change
    # row has a null timeDiff and is dropped by this filter as well.
    changes = changes.filter(pl.col("timeDiff") < datetime.timedelta(minutes=30))

    data = {
        "stopA": [],
        "stopB": [],
        "timeBetween": []
    }

    for stop_a, stop_b in zip(stops, stops[1:]):
        # `changes` already contains stop-change rows only, so filtering on the
        # stop ID is sufficient here.
        changes_a = changes.filter(pl.col("nextStopID") == stop_a)
        # Only the join keys and a copy of the timestamp are needed on the
        # right side; the copy survives the join under the name
        # "receiveTime_right".
        changes_b = changes.filter(pl.col("nextStopID") == stop_b).select([
            pl.col("equipmentID"),
            pl.col("receiveTime"),
            pl.col("receiveTime").alias("receiveTime_right"),
        ])

        # For every switch to stop_a, find the next switch to stop_b at or
        # after it.
        paired = changes_a.join_asof(
            changes_b, on="receiveTime", by="equipmentID", strategy="forward"
        ).with_columns(
            (pl.col("receiveTime_right") - pl.col("receiveTime")).alias("timeDiffFinal")
        )

        data["stopA"].append(stop_a)
        data["stopB"].append(stop_b)
        data["timeBetween"].append(paired["timeDiffFinal"].median())

    avgDf = pl.DataFrame(data)
    return avgDf

avgDf = calcTimeBetweenStops(twol_stops, "2L", "902")

# Attach the stop name for both ends of every segment via left joins on `stops`.
for endpoint_column, endpoint_name_column in (
    ("stopA", "stopA_Name"),
    ("stopB", "stopB_Name"),
):
    endpoint_lookup = stops.rename({"id": endpoint_column}).select([endpoint_column, "name"])
    avgDf = avgDf.join(endpoint_lookup, on=endpoint_column, how="left").rename(
        {"name": endpoint_name_column}
    )

avgDf.select(["stopA", "stopA_Name", "stopB", "stopB_Name", "timeBetween"])

In [None]:
# In-service rows of route 2L (all vehicles), with a per-vehicle flag marking
# rows whose nextStopID differs from that vehicle's previous row.
is_in_service_2l = (pl.col("routeID") == "2L") & (pl.col("inService") == True)
stop_changed_flag = (pl.col("nextStopID").diff().over("equipmentID") != 0).alias("stopChanged")

subset = df.filter(is_in_service_2l).with_columns(stop_changed_flag)

subset.select([
    "routeID", "equipmentID", "lat", "lng",
    "nextStopID", "lastStopID", "nextStopName", "lastStopName",
    "receiveTime", "captureTime", "stopChanged",
])

In [None]:
#Display round trip time from and to the student center
# Rows where a vehicle's next stop has just switched to stop 431
# (presumably the student center stop -- confirm against stops.json).
graphSet = subset.filter((pl.col("stopChanged") == True) & (pl.col("nextStopID") == 431))

# Time since the SAME vehicle's previous such row. `subset` holds every vehicle
# on the route, so the shift is windowed per equipmentID; an un-windowed shift
# would subtract timestamps belonging to different vehicles.
graphSet = graphSet.with_columns(
    (pl.col("receiveTime") - pl.col("receiveTime").shift(1).over("equipmentID")).alias("timeDiff")
)

# Drop intervals of an hour or more (null intervals are dropped as well).
graphSet = graphSet.filter((pl.col("timeDiff") < datetime.timedelta(hours=1)))

# Not the last expression of the cell, so it has to be displayed explicitly;
# display() is provided by the IPython/Jupyter kernel.
display(graphSet['timeDiff'].describe())

graphSet[["routeID", "equipmentID", "lat", "lng", "nextStopID", "lastStopID", "nextStopName", "lastStopName", "receiveTime", "captureTime", "timeDiff"]]

In [None]:
# Scatter of the graphSet positions (red) plus one fixed reference coordinate (blue).
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(graphSet['lng'], graphSet['lat'], c='red', marker='o')
ax.scatter(-88.76437, 41.93644, c='blue', marker='o')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
plt.show()

In [None]:
# Pair every switch to next stop 431 with the same vehicle's next switch to
# next stop 820, then keep only the pairs that are more than 3 minutes apart.
is_stop_change = pl.col("stopChanged") == True

stopA = subset.filter(is_stop_change & (pl.col("nextStopID") == 431))
# Copy the timestamp so it is still available after the as-of join.
stopB = subset.filter(is_stop_change & (pl.col("nextStopID") == 820)).with_columns(
    pl.col("receiveTime").alias("receiveTime_right")
)

final = (
    stopA.join_asof(stopB, on="receiveTime", by="equipmentID", strategy="forward")
    .with_columns(
        (pl.col("receiveTime_right") - pl.col("receiveTime")).alias("timeDiffFinal")
    )
    .filter(pl.col("timeDiffFinal") > datetime.timedelta(minutes=3))
)

final.select([
    "routeID", "equipmentID", "nextStopName", "lastStopName",
    "receiveTime", "receiveTime_right", "timeDiffFinal",
])

In [None]:
# Summary statistics of the coordinates in graphSet.
graphSet.select(["lat", "lng"]).describe()

In [None]:

# Summary statistics of the intervals. This is not the last expression of the
# cell, so it is displayed explicitly; display() is provided by the
# IPython/Jupyter kernel.
display(graphSet["timeDiff"].describe())

# Distribution of the intervals, converted to seconds.
plt.hist(graphSet["timeDiff"].dt.total_seconds(), bins=30)
plt.xlabel("timeDiff (seconds)")
plt.ylabel("Count")
plt.show()

In [None]:

# Rows where a vehicle's next stop has just switched to stop 431.
# NOTE: this rebinds graphSet WITHOUT the timeDiff column, so cells above that
# read graphSet["timeDiff"] fail if they are re-run after this one.
graphSet = subset.filter(pl.col("stopChanged") == True).filter(pl.col("nextStopID") == 431)
graphSet.select([
    "routeID", "equipmentID", "lat", "lng",
    "nextStopID", "lastStopID", "nextStopName", "lastStopName",
    "receiveTime", "captureTime",
])