In [None]:

# Re-import libraries after reset
import pandas as pd
from collections import defaultdict
from bisect import bisect_right
from functions import stop_templates
import numpy as np

# Path to the forecast-log CSV. Deliberately blank: set it before running this cell.
file = ""  # insert file of choice
if not file:
    raise ValueError("Set `file` to the path of the forecast-log CSV before running this cell.")

# "DateTime" is parsed up front because later cells use the .dt accessor on it.
df = pd.read_csv(file, parse_dates=["DateTime"])



In [3]:
# Keep only the logs recorded on 2020-01-23, then narrow those to destination "BRO".
df_day = df.loc[df["DateTime"].dt.date.eq(pd.Timestamp("2020-01-23").date())]
df_day_bro = df_day.loc[df_day["Destination"].eq("BRO")]



In [None]:
def print_station_journey(df, station, service_day):
    """Print every forecast log for one station on one service day, journey by journey.

    Journeys (rows sharing a ``StationJourneyID``) are listed in order of their
    earliest ``DateTime``; the logs inside each journey are listed chronologically.

    Parameters:
        df (DataFrame): Forecast logs with ``Origin``, ``ServiceDay``,
            ``StationJourneyID``, ``DateTime``, ``Minutes`` and ``Destination`` columns.
        station: Value matched against ``Origin``.
        service_day: Value matched against ``ServiceDay``.

    Prints:
        A header per journey followed by one line per log, or a "No data" notice
        when nothing matches. Returns None.
    """
    # Filter by station and service day
    day_data = df[(df["Origin"] == station) & (df["ServiceDay"] == service_day)]

    if day_data.empty:
        print(f"No data for station {station} on {service_day}")
        return

    # One multi-key sort replaces groupby(...).apply(sort_values): whether apply
    # keeps the grouping column depends on the pandas version, and the per-journey
    # lookup below needs "StationJourneyID" to still be a column.
    journeys = day_data.sort_values(["StationJourneyID", "DateTime"])
    by_journey = journeys.groupby("StationJourneyID")

    # Journey IDs ordered by the first DateTime of each journey
    ordered_ids = by_journey["DateTime"].min().sort_values().index.tolist()

    for journey_id in ordered_ids:
        # get_group avoids rescanning the whole frame for every journey
        group = by_journey.get_group(journey_id)
        print(f"\n StationJourney {int(journey_id)} at {station} on {service_day}")
        for _, row in group.iterrows():
            print(f"  {row['DateTime']}  |  Due in: {row['Minutes']} min | Destination: {row['Destination']}")

#print_station_journey(df, 'HAR', '2020-01-23')

In [None]:
def print_journey(df, journey_id, stop_templates):
    """Print the forecast logs of one stitched tram journey, stop by stop.

    Parameters:
        df (DataFrame): Stitched logs with ``TramJourneyID``, ``Origin``,
            ``DateTime`` and ``Minutes`` columns.
        journey_id (str): The ``TramJourneyID`` to print.
        stop_templates (dict): Maps ``(origin, destination)`` to the ordered
            list of stops for that route.

    Prints:
        A journey header, then each stop that has logs (in template order, or
        alphabetical order when no template matches the ID) with its logs in
        chronological order. Returns None.
    """
    rows = df.loc[df["TramJourneyID"] == journey_id].copy()

    if rows.empty:
        print(f" No journey found with ID {journey_id}")
        return

    # The route is the first template key whose two codes both occur in the ID,
    # with the origin code appearing before the destination code.
    origin, destination = next(
        (
            (first_stop, last_stop)
            for (first_stop, last_stop) in stop_templates
            if first_stop in journey_id
            and last_stop in journey_id
            and journey_id.index(first_stop) < journey_id.index(last_stop)
        ),
        (None, None),
    )

    has_template = bool(origin and destination) and (origin, destination) in stop_templates
    if not has_template:
        print(f" No stop template for journey ID '{journey_id}' — using alphabetical stop order.\n")
        stop_order = sorted(rows["Origin"].unique())
    else:
        stop_order = stop_templates[(origin, destination)]

    print(f" TramJourney {journey_id} ({origin or '???'} → {destination or '???'})\n")

    for stop in stop_order:
        at_stop = rows.loc[rows["Origin"] == stop].sort_values("DateTime")
        if at_stop.empty:
            # Stops without any log are skipped silently
            continue
        print(f" {stop}")
        for stamp, minutes in zip(at_stop["DateTime"], at_stop["Minutes"]):
            logged_at = stamp.strftime("%Y-%m-%d %H:%M:%S")
            print(f"  {logged_at} | Due in: {minutes:.1f} min")
        print()


In [None]:
from collections import defaultdict
import pandas as pd

def build_clean_tram_journeys(df, stop_templates, max_gap_min=6, allow_missing=1, min_stops=6, verbose=True):
    """Stitch per-station forecast journeys into end-to-end tram journeys.

    Every ``StationJourneyID`` is summarised by its first ``Origin`` and
    ``Destination``, the calendar date of its first log and its latest estimated
    arrival (``DateTime`` + ``Minutes``). For each template and date, every
    summary at the template's first stop seeds a chain. The chain is extended
    greedily along the template: at each following stop it takes the earliest
    not-yet-used summary whose estimated arrival lies within ``max_gap_min``
    minutes after the previously matched stop.

    Parameters:
        df (DataFrame): Forecast logs with ``StationJourneyID``, ``Origin``,
            ``Destination``, ``DateTime`` (datetime) and ``Minutes`` columns.
            The caller's frame is not modified.
        stop_templates (dict): Maps ``(start, end)`` to the ordered stop list.
        max_gap_min (int): Maximum minutes between consecutive matched stops.
        allow_missing (int): Number of template stops that may have no match
            before the chain is abandoned. A missing stop does not advance the
            reference time used for the next stop's window.
        min_stops (int): Minimum number of matched StationJourneyIDs (including
            the starting one) for a chain to be kept.
        verbose (bool): Print one line per journey built.

    Returns:
        DataFrame: All log rows of the matched StationJourneyIDs, with the added
        ``EstimatedArrival``, ``ServiceDate`` and ``TramJourneyID`` columns.
        ``TramJourneyID`` has the form ``{start}_{end}{NN}_{YYYY}_{MM}_{DD}``
        where ``NN`` counts journeys per template and date. When nothing is
        built the frame is empty but still carries those columns.
    """
    df = df.copy()
    df["EstimatedArrival"] = df["DateTime"] + pd.to_timedelta(df["Minutes"], unit="m")
    df["ServiceDate"] = df["DateTime"].dt.date

    def _no_journeys():
        # Empty result that keeps the output schema, so callers can still select
        # "TramJourneyID" (a bare pd.DataFrame() would raise KeyError there).
        return df.iloc[:0].assign(TramJourneyID=pd.Series(dtype="object"))

    # Step 1: Group logs by StationJourneyID and extract metadata
    sjid_info = []
    for sjid, group in df.groupby("StationJourneyID"):
        sjid_info.append({
            "StationJourneyID": sjid,
            "Station": group["Origin"].iloc[0],
            "Destination": group["Destination"].iloc[0],
            "EstimatedArrival": group["EstimatedArrival"].max(),
            "ServiceDate": group["ServiceDate"].iloc[0]
        })

    sjid_df = pd.DataFrame(sjid_info)
    if sjid_df.empty:
        # No groups means no columns in sjid_df; bail out before indexing it.
        return _no_journeys()

    used_ids = set()
    journeys = []
    journey_counts = defaultdict(lambda: defaultdict(int))  # (origin, dest) → date → counter

    # Loop-invariant values hoisted out of the template loop
    max_gap = pd.Timedelta(minutes=max_gap_min)
    service_dates = sjid_df["ServiceDate"].unique()

    # Step 2: Loop over templates
    for (start, end), template in stop_templates.items():
        for date in service_dates:
            candidates = sjid_df[
                (sjid_df["Destination"] == end) &
                (sjid_df["ServiceDate"] == date)
            ]

            starters = candidates[candidates["Station"] == start].sort_values("EstimatedArrival")

            for _, start_row in starters.iterrows():
                if start_row["StationJourneyID"] in used_ids:
                    continue

                block = [start_row["StationJourneyID"]]
                current_time = start_row["EstimatedArrival"]
                missing = 0
                valid = True

                for stop in template[1:]:
                    possible = candidates[
                        (candidates["Station"] == stop) &
                        (~candidates["StationJourneyID"].isin(used_ids))
                    ]
                    possible = possible[possible["EstimatedArrival"].between(
                        current_time, current_time + max_gap
                    )].sort_values("EstimatedArrival")

                    if possible.empty:
                        missing += 1
                        if missing > allow_missing:
                            valid = False
                            break
                    else:
                        # Greedy choice: earliest arrival inside the window
                        next_row = possible.iloc[0]
                        block.append(next_row["StationJourneyID"])
                        current_time = next_row["EstimatedArrival"]

                if valid and len(block) >= min_stops:
                    journey_counts[(start, end)][date] += 1
                    jid = f"{start}_{end}{journey_counts[(start, end)][date]:02d}_{date.year}_{date.month:02d}_{date.day:02d}"
                    matched_rows = df[df["StationJourneyID"].isin(block)].copy()
                    matched_rows["TramJourneyID"] = jid
                    journeys.append(matched_rows)
                    # IDs are consumed only once the whole chain is accepted
                    used_ids.update(block)

                    if verbose:
                        print(f" Built journey {jid} with {len(block)} stops")

    if not journeys:
        return _no_journeys()

    return pd.concat(journeys, ignore_index=True)


In [None]:
# Stitch the single-day logs into tram journeys; one progress line is printed per
# journey because `verbose` defaults to True.
faster = build_clean_tram_journeys(df_day, stop_templates)

✅ Built journey BRI_BRO01_2020_01_23 with 31 stops
✅ Built journey BRI_BRO02_2020_01_23 with 31 stops
✅ Built journey BRI_BRO03_2020_01_23 with 31 stops
✅ Built journey BRI_BRO04_2020_01_23 with 31 stops
✅ Built journey BRI_BRO05_2020_01_23 with 31 stops
✅ Built journey BRI_BRO06_2020_01_23 with 31 stops
✅ Built journey BRI_BRO07_2020_01_23 with 31 stops
✅ Built journey BRI_BRO08_2020_01_23 with 31 stops
✅ Built journey BRI_BRO09_2020_01_23 with 31 stops
✅ Built journey BRI_BRO10_2020_01_23 with 31 stops
✅ Built journey BRI_BRO11_2020_01_23 with 31 stops
✅ Built journey BRI_BRO12_2020_01_23 with 31 stops
✅ Built journey BRI_BRO13_2020_01_23 with 31 stops
✅ Built journey BRI_BRO14_2020_01_23 with 31 stops
✅ Built journey BRI_BRO15_2020_01_23 with 31 stops
✅ Built journey BRI_BRO16_2020_01_23 with 31 stops
✅ Built journey BRI_BRO17_2020_01_23 with 31 stops
✅ Built journey BRI_BRO18_2020_01_23 with 31 stops
✅ Built journey SAN_BRO01_2020_01_23 with 22 stops
✅ Built journey SAN_BRO02_2020_

In [9]:
# Print the stop-by-stop forecast logs for two journey IDs from the stitched result.
print_journey(faster, 'SAN_BRO33_2020_01_23', stop_templates)

print_journey(faster, 'SAN_BRO12_2020_01_23', stop_templates)


🚋 TramJourney SAN_BRO33_2020_01_23 (SAN → BRO)

📍 SAN
  2020-01-23 18:32:07 | Due in: 17.0 min
  2020-01-23 18:34:06 | Due in: 15.0 min
  2020-01-23 18:36:06 | Due in: 13.0 min
  2020-01-23 18:38:06 | Due in: 11.0 min
  2020-01-23 18:40:06 | Due in: 9.0 min
  2020-01-23 18:42:07 | Due in: 7.0 min
  2020-01-23 18:44:06 | Due in: 5.0 min
  2020-01-23 18:46:07 | Due in: 3.0 min
  2020-01-23 18:48:06 | Due in: 1.0 min
  2020-01-23 18:50:06 | Due in: 0.0 min

📍 STI
  2020-01-23 18:32:07 | Due in: 19.0 min
  2020-01-23 18:34:06 | Due in: 17.0 min
  2020-01-23 18:36:06 | Due in: 15.0 min
  2020-01-23 18:38:06 | Due in: 13.0 min
  2020-01-23 18:40:06 | Due in: 11.0 min
  2020-01-23 18:42:07 | Due in: 8.0 min
  2020-01-23 18:44:06 | Due in: 6.0 min
  2020-01-23 18:46:06 | Due in: 4.0 min
  2020-01-23 18:48:06 | Due in: 2.0 min
  2020-01-23 18:50:06 | Due in: 0.0 min

📍 KIL
  2020-01-23 18:34:05 | Due in: 19.0 min
  2020-01-23 18:36:06 | Due in: 17.0 min
  2020-01-23 18:38:06 | Due in: 15.0 min


In [23]:
import pandas as pd

def validate_journey_duration(journey_df, min_minutes=35, max_minutes=75):
    """Report whether a journey's span of estimated arrivals is plausible.

    Parameters:
        journey_df (DataFrame): Rows of one journey with an ``EstimatedArrival`` column.
        min_minutes, max_minutes: Inclusive bounds on the duration, in minutes.

    Returns:
        tuple: ``(within_bounds, duration)`` where ``duration`` is the time between
        the earliest and latest ``EstimatedArrival`` in minutes.
    """
    arrivals = journey_df["EstimatedArrival"]
    elapsed = arrivals.max() - arrivals.min()
    duration = elapsed.total_seconds() / 60  # in minutes
    within_bounds = min_minutes <= duration <= max_minutes
    return within_bounds, duration

def check_for_origin_conflicts(journey_df, valid_starts=("SAN", "BRI")):
    """Flag a journey that contains a valid start station alongside any other station.

    Returns a ``(flag, origins)`` tuple. ``origins`` is the list of distinct
    ``Origin`` values in ``journey_df``. ``flag`` is True when at least one of
    ``valid_starts`` appears among them AND there is more than one distinct origin.

    NOTE(review): as written, the flag is True for any multi-station journey that
    includes a valid start, not only when several start stations (e.g. both SAN
    and BRI) are present — confirm whether this is the intended check.
    """
    origins = journey_df["Origin"].unique()
    return any(start in origins for start in valid_starts) and len(origins) > 1, list(origins)

def audit_sample_journeys(df, n=25):
    """Run basic validation checks on the first n distinct journeys.

    The journeys audited are the first ``n`` non-null ``TramJourneyID`` values in
    order of appearance (not a random sample).

    Returns:
        DataFrame: One row per audited journey with its start/end estimated
        arrival, duration and duration verdict, distinct origin stations,
        origin-conflict flag and number of distinct stops.
    """
    def _audit_one(jid):
        # All rows of this journey, ordered by estimated arrival
        legs = df.loc[df["TramJourneyID"] == jid].sort_values("EstimatedArrival")
        duration_ok, minutes = validate_journey_duration(legs)
        conflict, stations = check_for_origin_conflicts(legs)
        arrivals = legs["EstimatedArrival"]
        return {
            "TramJourneyID": jid,
            "Start": arrivals.min(),
            "End": arrivals.max(),
            "DurationMin": minutes,
            "ValidDuration": duration_ok,
            "OriginStations": stations,
            "OriginConflict": conflict,
            "NumStops": legs["Origin"].nunique()
        }

    first_ids = df["TramJourneyID"].dropna().unique()[:n]
    return pd.DataFrame([_audit_one(jid) for jid in first_ids])


In [24]:
# Run the duration/origin checks on the stitched journeys (first 25 journey IDs by default).
audit_sample_journeys(faster)

Unnamed: 0,TramJourneyID,Start,End,DurationMin,ValidDuration,OriginStations,OriginConflict,NumStops
0,BRI_BRO01_2020_01_23,2020-01-23 00:00:06,2020-01-23 00:59:04,58.966667,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
1,BRI_BRO02_2020_01_23,2020-01-23 05:29:03,2020-01-23 06:29:03,60.0,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
2,BRI_BRO03_2020_01_23,2020-01-23 05:49:02,2020-01-23 06:50:04,61.033333,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
3,BRI_BRO04_2020_01_23,2020-01-23 06:09:03,2020-01-23 07:22:04,73.016667,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
4,BRI_BRO05_2020_01_23,2020-01-23 10:58:06,2020-01-23 11:59:04,60.966667,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
5,BRI_BRO06_2020_01_23,2020-01-23 17:40:04,2020-01-23 18:43:03,62.983333,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, KIL, GLE, ...",True,31
6,BRI_BRO07_2020_01_23,2020-01-23 18:19:04,2020-01-23 19:20:04,61.0,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
7,BRI_BRO08_2020_01_23,2020-01-23 18:43:04,2020-01-23 19:48:03,64.983333,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
8,BRI_BRO09_2020_01_23,2020-01-23 19:19:05,2020-01-23 20:19:03,59.966667,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31
9,BRI_BRO10_2020_01_23,2020-01-23 19:43:04,2020-01-23 20:43:04,60.0,True,"[BRI, CHE, LAU, CCK, BAW, LEO, GAL, GLE, CPK, ...",True,31


In [None]:
def simulate_platform_view(df, journey_df, station, time, direction="Outbound", window_min=20):
    """
    Simulate the platform screen at a given station and time.

    Forecast logs are selected when their ``Origin`` and ``Direction`` match and
    their log timestamp (``DateTime``) lies between ``time`` and
    ``time + window_min`` minutes, inclusive. Each selected log is printed once,
    ordered by estimated arrival (``DateTime`` + ``Minutes``).

    Parameters:
        df (DataFrame): The full raw forecast logs dataframe.
        journey_df (DataFrame): The DataFrame of stitched tram journeys. May be
            None, or lack the ``StationJourneyID``/``TramJourneyID`` columns, in
            which case every journey is shown as ``???``.
        station (str): The station code to simulate the screen at.
        time (datetime): The moment you're 'standing' on the platform.
        direction (str): 'Inbound' or 'Outbound'.
        window_min (int): Length of the window after ``time``, in minutes.

    Prints:
        A list of trams due at that station in the forecast window, along with their TramJourneyID and destination.
    """
    time = pd.to_datetime(time)
    cutoff = time + pd.Timedelta(minutes=window_min)

    screen_view = df[
        (df["Origin"] == station) &
        (df["Direction"] == direction) &
        (df["DateTime"] >= time) &
        (df["DateTime"] <= cutoff)
    ].copy()

    if screen_view.empty:
        print(f"📭 No upcoming trams from {station} between {time.time()} and {cutoff.time()} ({direction})")
        return

    screen_view["EstimatedArrival"] = screen_view["DateTime"] + pd.to_timedelta(screen_view["Minutes"], unit="m")

    # Optional: match journey ID if we’ve stitched journeys
    joined = screen_view
    mapping_cols = ["StationJourneyID", "TramJourneyID"]
    if journey_df is not None and set(mapping_cols).issubset(journey_df.columns):
        # journey_df holds one row per log, so reduce it to one row per
        # StationJourneyID first; otherwise the merge multiplies screen lines.
        id_map = journey_df[mapping_cols].dropna().drop_duplicates("StationJourneyID")
        if not id_map.empty:
            joined = pd.merge(screen_view, id_map, how="left", on="StationJourneyID")

    joined = joined.sort_values("EstimatedArrival")

    print(f"\n Platform screen at {station} — {time.strftime('%Y-%m-%d %H:%M:%S')} ({direction})")
    print("────────────────────────────────────────────")
    for _, row in joined.iterrows():
        eta = row["EstimatedArrival"].strftime("%H:%M:%S")
        mins = f"{int(row['Minutes'])} min" if row["Minutes"] > 0 else "DUE"
        tid = row.get("TramJourneyID", "???")
        if pd.isna(tid):
            # Unmatched rows come out of the left merge as NaN
            tid = "???"
        print(f" {eta:>8} | {mins:>5} → {row['Destination']:>4} (Journey: {tid})")


In [29]:
from datetime import datetime

simulate_platform_view(
    df=df,  # your full raw logs DataFrame
    journey_df=faster,  # your stitched journey DataFrame
    station="SAN",  # pick your station
    time="2020-01-23 16:15:00",  # sample time you're at the station
    direction="Inbound",  # or 'Outbound'
    window_min=30  # include logs recorded in the 30 minutes after `time`
)


📭 No upcoming trams from SAN between 16:15:00 and 16:45:00 (Inbound)
