In [None]:

# Re-import libraries after reset
import pandas as pd
from collections import defaultdict
from bisect import bisect_right
from functions import stop_templates
import numpy as np

# Path to the forecast-log CSV to analyse. The file must contain a "DateTime"
# column, which is parsed to datetimes on load.
file = ""  # insert file of choice

# Fail with a clear message instead of a confusing read_csv error when the
# placeholder above has not been filled in.
if not file:
    raise ValueError("Set `file` to the path of the input CSV before running this cell.")

df = pd.read_csv(file, parse_dates=["DateTime"])



In [3]:
def print_station_journey(df, station, service_day):
    """Print every stitched journey seen at one station on one service day.

    Parameters
    ----------
    df : pandas.DataFrame
        Forecast logs with the columns ``Origin``, ``ServiceDay``,
        ``DateTime``, ``Minutes``, ``Destination`` and ``StationJourneyID``.
    station : value compared against ``df["Origin"]``.
    service_day : value compared against ``df["ServiceDay"]``.

    Returns
    -------
    None
        Output goes to stdout. Journeys are printed in order of their first
        ``DateTime``; the logs inside a journey are printed chronologically.
        Rows whose ``StationJourneyID`` is missing are not printed. If no row
        matches the station/day at all, a single "No data" line is printed.
    """
    # Filter by station and service day
    day_data = df[(df["Origin"] == station) & (df["ServiceDay"] == service_day)]

    if day_data.empty:
        print(f"No data for station {station} on {service_day}")
        return

    # Logs that were never stitched into a journey carry no ID; drop them
    # explicitly rather than relying on groupby silently skipping NA keys.
    stitched_rows = day_data.dropna(subset=["StationJourneyID"])

    # A plain groupby + get_group replaces the former
    # groupby(...).apply(sort_values) round trip, which pandas deprecates
    # because the applied function operated on the grouping column.
    by_journey = stitched_rows.groupby("StationJourneyID")

    # Journey IDs ordered by the time of their first log. A stable sort keeps
    # journeys that start at the same instant in ascending ID order.
    ordered_ids = by_journey["DateTime"].min().sort_values(kind="stable").index

    for journey_id in ordered_ids:
        logs = by_journey.get_group(journey_id).sort_values("DateTime")
        print(f"\n StationJourney {int(journey_id)} at {station} on {service_day}")
        for when, minutes, destination in zip(
            logs["DateTime"], logs["Minutes"], logs["Destination"]
        ):
            print(f"  {when}  |  Due in: {minutes} min | Destination: {destination}")


In [4]:
import pandas as pd
from datetime import timedelta

def stitch_forecasts_by_station(df, max_gap_minutes=8, min_logs=5):
    """Group countdown forecast logs into numbered "station journeys".

    Logs are handled separately for every (Origin, ServiceDay, Destination)
    combination, in chronological order. Each log is offered to the currently
    open journeys, oldest first, and joins the first one that accepts it;
    otherwise it opens a new journey. A journey accepts a log when

    * it holds no log with the same timestamp, and
    * the log is 0..``max_gap_minutes`` minutes after the journey's latest
      log, and
    * the countdown fell by at most 5 minutes or rose by at most 2 minutes
      compared with the journey's latest log (if either ``Minutes`` value is
      missing the change is treated as 0).

    While scanning (before a match is found) a journey that rejects the log
    is closed when the log is more than ``max_gap_minutes`` newer than the
    journey's latest log, or when the journey has already counted down to
    1 minute or less and the log's countdown is more than 3 minutes higher.
    Closed journeys with at least ``min_logs`` logs receive the next ID;
    shorter ones are discarded and their rows stay unassigned. Journeys still
    open at the end of a group are closed in the order they were opened.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Origin``, ``ServiceDay``, ``Destination``,
        ``DateTime`` (datetime-like) and ``Minutes`` (numeric, may be NaN).
        The input frame is not modified.
    max_gap_minutes : int or float, default 8
        Largest gap between consecutive logs of one journey.
    min_logs : int, default 5
        Minimum number of logs for a journey to receive an ID.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by Origin, ServiceDay, Destination, DateTime
        with a fresh 0..n-1 index and an object-dtype ``StationJourneyID``
        column holding integer IDs (starting at 1) or ``pd.NA``.
    """
    df = df.sort_values(['Origin', 'ServiceDay', 'Destination', 'DateTime']).reset_index(drop=True)

    # reset_index() produced labels 0..n-1, so a row label is also a valid
    # position in this list. Collecting IDs here and attaching the column once
    # at the end avoids one DataFrame write per row.
    assigned_ids = [pd.NA] * len(df)
    journey_id = 1

    def _close(journey):
        """Give `journey` the next ID if it is long enough, else drop it."""
        nonlocal journey_id
        if len(journey) >= min_logs:
            for row_idx in journey:
                assigned_ids[row_idx] = journey_id
            journey_id += 1

    for _, group in df.groupby(['Origin', 'ServiceDay', 'Destination']):
        # Kept even though df is already sorted: the order of logs sharing a
        # timestamp decides which journey each one joins, so the exact same
        # sort call as before is preserved.
        group = group.sort_values("DateTime")

        # Parallel lists, one entry per open journey.
        active_journeys = []      # row labels belonging to the journey
        last_seen = []            # (DateTime, Minutes) of its latest log
        journey_timestamps = []   # set of DateTime values already in it
        journey_min_minutes = []  # smallest Minutes seen so far (inf if none)

        # Iterate plain column values instead of iterrows(): the loop only
        # needs the label, the timestamp and the countdown of each log.
        for idx, when, minutes in zip(group.index, group['DateTime'], group['Minutes']):
            has_minutes = pd.notna(minutes)
            assigned = False
            to_remove = []

            for i in range(len(active_journeys)):
                last_when, last_minutes = last_seen[i]

                time_diff = (when - last_when).total_seconds() / 60
                # Positive = countdown went down, negative = it went up.
                if has_minutes and pd.notna(last_minutes):
                    minute_diff = last_minutes - minutes
                else:
                    minute_diff = 0
                forecast_jump = -minute_diff
                min_seen = journey_min_minutes[i]

                # Prevent duplicate timestamp inclusion
                if when in journey_timestamps[i]:
                    continue

                # Allow continuation if time and countdown flow logically.
                # A rise of more than 2 minutes is never accepted, so no
                # separate "jump > 3" guard is needed on this branch.
                if 0 <= time_diff <= max_gap_minutes and -2 <= minute_diff <= 5:
                    active_journeys[i].append(idx)
                    last_seen[i] = (when, minutes)
                    journey_timestamps[i].add(when)
                    if has_minutes:
                        journey_min_minutes[i] = min(min_seen, minutes)
                    assigned = True
                    break

                # Mark to finalize if it's too old or clearly done
                if time_diff > max_gap_minutes or (min_seen <= 1 and forecast_jump > 3):
                    to_remove.append(i)

            # Finalize any closed journeys. Highest index first so earlier
            # pops do not shift the positions still to be removed.
            for i in sorted(to_remove, reverse=True):
                _close(active_journeys.pop(i))
                last_seen.pop(i)
                journey_timestamps.pop(i)
                journey_min_minutes.pop(i)

            # Start new journey if not assigned
            if not assigned:
                active_journeys.append([idx])
                last_seen.append((when, minutes))
                journey_timestamps.append({when})
                journey_min_minutes.append(minutes if has_minutes else float('inf'))

        # Finalize remaining journeys
        for journey in active_journeys:
            _close(journey)

    df['StationJourneyID'] = pd.Series(assigned_ids, index=df.index, dtype=object)
    return df


In [5]:
# Run the stitching pass over the loaded logs with the default
# max_gap_minutes / min_logs; the result carries a StationJourneyID column.
stitched = stitch_forecasts_by_station(df)


In [6]:
# Spot-check the stitching: print the journeys found at station HAR on 2020-01-26.
print_station_journey(stitched, 'HAR', '2020-01-26')


  .apply(lambda g: g.sort_values("DateTime"))



 StationJourney 104682 at HAR on 2020-01-26
  2020-01-26 06:44:03  |  Due in: 17.0 min | Destination: BRI
  2020-01-26 06:46:02  |  Due in: 15.0 min | Destination: BRI
  2020-01-26 06:48:05  |  Due in: 13.0 min | Destination: BRI
  2020-01-26 06:50:03  |  Due in: 11.0 min | Destination: BRI
  2020-01-26 06:52:02  |  Due in: 9.0 min | Destination: BRI
  2020-01-26 06:54:02  |  Due in: 7.0 min | Destination: BRI
  2020-01-26 06:56:02  |  Due in: 5.0 min | Destination: BRI
  2020-01-26 06:58:02  |  Due in: 3.0 min | Destination: BRI
  2020-01-26 07:00:02  |  Due in: 1.0 min | Destination: BRI
  2020-01-26 07:02:02  |  Due in: 0.0 min | Destination: BRI

 StationJourney 104758 at HAR on 2020-01-26
  2020-01-26 07:00:02  |  Due in: 18.0 min | Destination: BRO
  2020-01-26 07:02:02  |  Due in: 16.0 min | Destination: BRO
  2020-01-26 07:04:02  |  Due in: 14.0 min | Destination: BRO
  2020-01-26 07:06:04  |  Due in: 12.0 min | Destination: BRO
  2020-01-26 07:08:04  |  Due in: 10.0 min | Des

In [7]:
# Number of logs whose StationJourneyID is missing, i.e. logs that did not end
# up in any journey that received an ID.
num_unassigned = stitched['StationJourneyID'].isna().sum()

# Total number of logs, used as the denominator for the percentage below.
total_logs = len(stitched)

# Report the unassigned share as a count and as a percentage with two decimals.
print(f"⚠️ {num_unassigned} out of {total_logs} logs were unassigned ({(num_unassigned/total_logs)*100:.2f}%)")


⚠️ 50641 out of 2162268 logs were unassigned (2.34%)


In [8]:
# # View some unassigned logs
# unassigned_logs = stitched[stitched['StationJourneyID'].isna()]
# print(unassigned_logs)

In [9]:
# Save the stitched logs (including StationJourneyID) to CSV, omitting the index column.
stitched.to_csv('january_2020_stations.csv', index=False)

In [10]:
# Number of logs per Destination value.
stitched['Destination'].value_counts()

Destination
TPT    383017
BRI    354635
TAL    305621
BRO    301423
SAG    243850
PAR    228271
SAN    185960
CON    132961
BEL     21357
RED      3855
HEU      1318
Name: count, dtype: int64