In [3]:
import io
import json
import logging
import os
import re
import shutil
import sys
import uuid
import zipfile
from datetime import datetime
from typing import List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.transit import gtfs_realtime_pb2

with open("./config.json", "r") as config:
    GTFS_RT_API_KEY = json.load(config)["gtfs_rt_api_key"]

data_dir = "./data/gtfs_fp2024_2024-07-11"

DEV_MODE = True

transport_name_mapping = pd.read_csv(
    "./transport_name_mapping.csv", index_col="Abbreviation"
)
transport_abbr2french = transport_name_mapping["FR"].to_dict()

logger = logging.getLogger(__name__)

logger.addHandler(logging.StreamHandler(stream=sys.stdout))

In [18]:
gtfs_dataset_homepage = "https://opentransportdata.swiss/de/dataset/timetable-2024-gtfs2020"
data_base_dir = "./data"

In [19]:
homepage = gtfs_dataset_homepage

In [20]:
table_name2dtypes = {
    "agency": {
        "agency_id": str,
        "agency_name": str,
        "agency_url": str,
        "agency_timezone": str,
        "agency_lang": str,
        "agency_phone": str,
    },
    "calendar": {
        "service_id": str,
        "monday": int,
        "tuesday": int,
        "wednesday": int,
        "thursday": int,
        "friday": int,
        "saturday": int,
        "sunday": int,
        "start_date": int,
        "end_date": int,
    },
    "calendar_dates": {
        "service_id": str,
        "date": int,
        "exception_type": int,
    },
    "feed_info": {
        "feed_publisher_name": str,
        "feed_publisher_url": str,
        "feed_lang": str,
        "feed_start_date": str,
        "feed_end_date": str,
        "feed_version": str,
    },
    "routes": {
        "route_id": str,
        "agency_id": str,
        "route_short_name": str,
        "route_long_name": str,
        "route_desc": str,
        "route_type": str,
    },
    "stops": {
        "stop_id": str,
        "stop_name": str,
        "stop_lat": float,
        "stop_lon": float,
        "location_type": str,
        "parent_station": str,
    },
    "stop_times": {
        "trip_id": str,
        "arrival_time": str,
        "departure_time": str,
        "stop_id": str,
        "stop_sequence": int,
        "pickup_type": str,
        "drop_off_type": str,
    },
    "trips": {
        "route_id": str,
        "service_id": str,
        "trip_id": str,
        "trip_headsign": str,
        "trip_short_name": str,
        "direction_id": str,
        "block_id": str,
    },
    "transfers": {
        "from_stop_id": str,
        "to_stop_id": str,
        "transfer_type": str,
        "min_transfer_time": str,
    },
}

table_name2index_col = {
    "agency": "agency_id",
    "calendar": "service_id",
    "calendar_dates": "service_id",
    "feed_info": None,
    "routes": "route_id",
    "stops": "stop_id",
    "stop_times": None,
    "trips": "trip_id",
    "transfers": None,
}

In [38]:
def convert_csv_to_feather(csv_file, output_filename, dtypes=None, index_col=None):
    pd.read_csv(
        csv_file,
        index_col=index_col,
        dtype=dtypes,
        keep_default_na=False,
        engine="pyarrow",
    ).to_feather(output_filename, compression="lz4")

dataset_url_format = re.compile(r".*/.*\d{4}-\d{2}-\d{2}.zip")

def get_latest_dataset_url(homepage):
    request = requests.get(homepage)
    if request.status_code != 200:
        logger.error(
            f"Error while requestion the dataset homepage, got status code {request.status_code}"
        )
        return None

    soup = BeautifulSoup(request.content, "html.parser")
    resource_list = soup.find("ul", {"class": "resource-list"})
    if resource_list is None:
        logger.error("Could not find resource list in dataset page")
        return None
    resource = resource_list.find("li")
    if resource is None:
        logger.error("Could not find resource in resources list in dataset page")
        return None
    link = resource.find("a", attrs={"href": True, "download": True})
    if link is None:
        logger.error("Could not find link in resource in dataset page")
        return None
    url = link["href"]
    if not dataset_url_format.match(url):
        return None
    return url


def get_date_from_dataset_url(url):
    return url.split("/")[-1].split("_")[-1].split(".")[0].replace("-", "")


def update_symlink(source, target):
    source = os.path.abspath(source)
    target = os.path.abspath(target)
    if os.path.exists(target) and not os.path.islink(target):
        logger.error(f"Cannot update the symlink for {target}, it is not a link")
        return False
    temp_target = target + str(uuid.uuid4)
    try:
        os.symlink(source, temp_target)
        os.replace(temp_target, target)
        return True
    except:
        if os.path.exists(temp_target):
            os.remove(temp_target)
        return False


def download_and_extract_zip(url, extract_to, overwrite=False):
    path_exists = os.path.exists(extract_to)
    if overwrite:
        if path_exists:
            shutil.rmtree(extract_to)
    else:
        if path_exists:
            logger.error(
                f"Error: the path {extract_to} already exists and overwrite is false."
            )
            return False
    response = requests.get(url)

    if response.status_code == 200:
        zip_file = io.BytesIO(response.content)

        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(extract_to)
        return True
    else:
        return False


dataset_regex = re.compile(r"\d{8}")


def get_local_dataset_folders(base_dir, dataset_regex=dataset_regex):
    local_dataset_folders = []
    for folder in os.listdir(base_dir):
        basename = os.path.basename(folder)
        if dataset_regex.match(basename):
            local_dataset_folders.append(basename)
    return local_dataset_folders


def get_latest_dataset_folder(base_dir, dataset_regex=dataset_regex):
    local_dataset_folders = get_local_dataset_folders(base_dir, dataset_regex)
    return (
        sorted(local_dataset_folders)[-1]
        if len(local_dataset_folders) > 0
        else "19700101"
    )


def get_gtfs_data_if_needed(
    base_dir,
    dataset_homepage,
    force=False,
    cleanup_old_datasets=True,
    table_extension=".txt",
):
    latest_dataset_url = get_latest_dataset_url(dataset_homepage)
    if latest_dataset_url is None:
        return False
    latest_remote_dataset_date = get_date_from_dataset_url(latest_dataset_url)
    latest_local_dataset_date = get_latest_dataset_folder(base_dir)

    if latest_remote_dataset_date <= latest_local_dataset_date and not force:
        logger.info(
            f"Local dataset {latest_local_dataset_date} is same or newer than remote {latest_remote_dataset_date}"
        )
        update_symlink(
            os.path.join(base_dir, latest_local_dataset_date),
            os.path.join(base_dir, "latest"),
        )
        return True

    download_dir = os.path.join(base_dir, latest_remote_dataset_date)
    if not download_and_extract_zip(latest_dataset_url, download_dir, force):
        logger.error("Error while download the dataset")
        return False

    downloaded_tables = set(
        [
            os.path.basename(path).replace(table_extension, "")
            for path in os.listdir(download_dir)
        ]
    )
    extra_tables = downloaded_tables.difference(table_name2dtypes.keys())
    if len(extra_tables) > 0:
        extra_tables_string = []
        for table in extra_tables:
            os.remove(os.path.join(download_dir, table + table_extension))
            extra_tables_string.append(f"- {table}")
        logger.warn(
            "Removed the following extra tables: " + "\n".join(extra_tables_string)
        )

    missing_tables = set(table_name2dtypes.keys()).difference(downloaded_tables)
    if len(missing_tables) > 0:
        logger.error(
            "Missing the tables: "
            + "\n".join([f"- {table}" for table in missing_tables])
        )
        return False

    for table, dtypes in table_name2dtypes.items():
        basename = os.path.join(download_dir, table)
        convert_csv_to_feather(
            basename + table_extension, basename + ".feather", dtypes=dtypes
        )
        os.remove(basename + table_extension)

    update_symlink(
        os.path.join(base_dir, latest_remote_dataset_date),
        os.path.join(base_dir, "latest"),
    )

    if cleanup_old_datasets:
        for dataset in set(get_local_dataset_folders(base_dir)).difference(
            [latest_remote_dataset_date]
        ):
            logger.info(f"Removing old dataset {dataset}")
            shutil.rmtree(os.path.join(base_dir, dataset))
    return True

def open_dataset_tables(base_dir, dataset_name="latest"):
    tables = {}
    for table_name, index_col in table_name2index_col.items():
        table_path = os.path.join(base_dir, dataset_name, table_name + ".feather")
        if not os.path.exists(table_path):
            logger.error(f"Missing table {table_path}")
            return None
        table = pd.read_feather(table_path)
        if index_col is not None:
            table.set_index(index_col, inplace=True)
        tables[table_name] = table
    return tables

In [44]:
get_gtfs_data_if_needed(data_base_dir, gtfs_dataset_homepage)
dataset = open_dataset_tables(data_base_dir)

In [31]:
url = get_latest_dataset_url(gtfs_dataset_homepage)

In [50]:
url.split('/')[-1].split('_')[-1].split('.')[0].replace('-', '')

'20240711'

In [35]:
download_and_extract_zip(url, "./data/test", True) 

7.69 s ± 561 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
soup.find("ul", {"class": "resource-list"}).find("li")

<li class="resource-item" data-id="7d1d80fe-336b-4b5c-81d3-ac0b3b6c869a">
<a class="heading" href="/de/dataset/timetable-2024-gtfs2020/resource/7d1d80fe-336b-4b5c-81d3-ac0b3b6c869a" title="GTFS_FP2024_2024-07-11.zip">
    GTFS_FP2024_2024-07-11.zip<span class="format-label" data-format="zip" property="dc:format">ZIP</span>
</a>
<p class="description">
</p>
<div class="dropdown btn-group">
<a aria-expanded="false" class="btn btn-primary dropdown-toggle" data-bs-toggle="dropdown" href="#" id="dropdownExplorer" type="button">
<i class="fa fa-share"></i>
      Entdecke
      <span class="caret"></span>
</a>
<ul aria-labelledby="dropdownExplorer" class="dropdown-menu">
<li>
<a class="dropdown-item" href="/de/dataset/timetable-2024-gtfs2020/resource/7d1d80fe-336b-4b5c-81d3-ac0b3b6c869a">
<i class="fa fa-info-circle"></i>
        Mehr Information
        
    </a>
</li>
<li>
<a class="dropdown-item resource-url-analytics" download="GTFS_FP2024_2024-07-11.zip" href="https://opentransportdata.s

In [72]:
local_stops_ids = tuple([
    "8501124:0:1", # Cully, voie 1?
    "8501124:0:2", # Cully, voie 2?
    "8501124:0:3", # Cully, voie 3?
    "8570559", # Cully gare Post?
    "8501125:0:1", # Epesses CFF, voie 1?
    "8501125:0:2", # Epesses CFF, voie 2?
    "8510137", # Epesses, gare Post?
    "8570561", # Epesses, village
])

local_stops_ids_merge = {
    "8501124:0:1": "Cully",
    "8501124:0:2": "Cully",
    "8501124:0:3": "Cully",
    "8570559": "Cully, gare",
    "8501125:0:1": "Epesses",
    "8501125:0:2": "Epesses",
    "8510137": "Epesses",
    "8570561": "Epesses, village"
}

In [40]:
{k: str for k in "from_stop_id,to_stop_id,transfer_type,min_transfer_time".split(',')}


{'from_stop_id': str,
 'to_stop_id': str,
 'transfer_type': str,
 'min_transfer_time': str}

In [25]:
agency = pd.read_csv(
    os.path.join(data_dir, "agency.txt"),
    dtype={
        "agency_id": str,
        "agency_name": str,
        "agency_url": str,
        "agency_timezone": str,
        "agency_lang": str,
        "agency_phone": str,
    },
    keep_default_na=False,
    engine="pyarrow",
)
calendar = pd.read_csv(
    os.path.join(data_dir, "calendar.txt"),
    index_col="service_id",
    dtype={
        "service_id": str,
        "monday": int,
        "tuesday": int,
        "wednesday": int,
        "thursday": int,
        "friday": int,
        "saturday": int,
        "sunday": int,
        "start_date": int,
        "end_date": int,
    },
    keep_default_na=False,
    engine="pyarrow",
)
calendar_dates = pd.read_csv(
    os.path.join(data_dir, "calendar_dates.txt"),
    index_col="service_id",
    dtype={
        "service_id": str,
        "date": int,
        "exception_type": int,
    },
    keep_default_na=False,
    engine="pyarrow",
)
routes = pd.read_csv(
    os.path.join(data_dir, "routes.txt"),
    index_col="route_id",
    dtype={
        "route_id": str,
        "agency_id": str,
        "route_short_name": str,
        "route_long_name": str,
        "route_desc": str,
        "route_type": str,
    },
    keep_default_na=False,
    engine="pyarrow",
)
stops = pd.read_csv(
    os.path.join(data_dir, "stops.txt"),
    index_col="stop_id",
    dtype={
        "stop_id": str,
        "stop_name": str,
        "stop_lat": float,
        "stop_lon": float,
        "location_type": str,
        "parent_station": str,
    },
    keep_default_na=False,
    engine="pyarrow",
)
stop_times = pd.read_csv(
    os.path.join(data_dir, "stop_times.txt"),
    dtype={
        "trip_id": str,
        "arrival_time": str,
        "departure_time": str,
        "stop_id": str,
        "stop_sequence": int,
        "pickup_type": str,
        "drop_off_type": str,
    },
    keep_default_na=False,
    engine="pyarrow",
)
trips = pd.read_csv(
    os.path.join(data_dir, "trips.txt"),
    index_col="trip_id",
    dtype={
        "route_id": str,
        "service_id": str,
        "trip_id": str,
        "trip_headsign": str,
        "trip_short_name": str,
        "direction_id": str,
        "block_id": str,
    },
    keep_default_na=False,
    engine="pyarrow",
)

In [41]:
agency.to_feather('./data/latest/agency.feather', compression='lz4')
calendar.to_feather('./data/latest/calendar.feather', compression='lz4')
calendar_dates.to_feather('./data/latest/calendar_dates.feather', compression='lz4')
routes.to_feather('./data/latest/routes.feather', compression='lz4')
stops.to_feather('./data/latest/stops.feather', compression='lz4')
stop_times.to_feather('./data/latest/stop_times.feather', compression='lz4')
trips.to_feather('./data/latest/trips.feather', compression='lz4')

NameError: name 'agency' is not defined

In [131]:
stop_times.dtypes

trip_id           object
arrival_time      object
departure_time    object
stop_id           object
stop_sequence      int64
pickup_type       object
drop_off_type     object
dtype: object

In [42]:
stop_times = pd.read_feather('./data/latest/stop_times.feather')

In [43]:
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type
0,1.TA.91-10-A-j24-1.1.H,17:27:00,17:27:00,8503054:0:1,1,0,0
1,1.TA.91-10-A-j24-1.1.H,17:28:00,17:28:00,8503053:0:1,2,0,0
2,1.TA.91-10-A-j24-1.1.H,17:29:00,17:29:00,8503052:0:1,3,0,0
3,1.TA.91-10-A-j24-1.1.H,17:31:00,17:32:00,8503051:0:1,4,0,0
4,1.TA.91-10-A-j24-1.1.H,17:34:00,17:34:00,8503090:0:1,5,0,0
...,...,...,...,...,...,...,...
18908627,999.TA.96-199-8-j24-1.13.R,17:51:00,17:52:00,8580623,18,0,0
18908628,999.TA.96-199-8-j24-1.13.R,17:53:00,17:53:00,8580622,19,0,0
18908629,999.TA.96-199-8-j24-1.13.R,17:55:00,17:55:00,8580621,20,0,0
18908630,999.TA.96-199-8-j24-1.13.R,17:57:00,17:57:00,8573257,21,0,0


In [26]:
trip_id2route_id = trips['route_id'].to_dict()
trip_id2service_id = trips['service_id'].to_dict()
trip_id2headsign = trips['trip_headsign'].to_dict()
stop_id2stop_name = stops['stop_name'].to_dict()
route_id2short_name = routes['route_short_name'].to_dict()
route_id2desc = {k: transport_abbr2french.get(v) for k,v in routes['route_desc'].to_dict().items()}

In [54]:
def get_local_trip_ids(local_stops_ids):
    return stop_times.loc[stop_times['stop_id'].isin(local_stops_ids), 'trip_id'].unique().tolist()

In [119]:
def get_todays_service_ids(
    calendar,
    calendar_dates,
    today: datetime,
) -> List[int]:
    today_int = int(today.strftime("%Y%m%d"))
    weekday_str = today.strftime("%A").lower()
    calendar_today = calendar.where(
        (calendar[weekday_str] == 1)
        & (today_int > calendar.start_date)
        & (today_int < calendar.end_date)
    ).dropna()

    calendar_dates_today_remove = (
        calendar_dates.where(
            (calendar_dates.date == today_int) & (calendar_dates.exception_type == 2)
        )
        .dropna()
        .index
    )
    calendar_dates_today_added = (
        calendar_dates.where(
            (calendar_dates.date == today_int) & (calendar_dates.exception_type == 1)
        )
        .dropna()
        .index
    )

    service_ids = set(calendar_today.index)
    service_ids.update(set(calendar_dates_today_added))
    service_ids = service_ids.difference(set(calendar_dates_today_remove))
    return list(service_ids)


def get_trips_last_max_stop_sequence(stop_times, selected_trip_ids=None):
    if selected_trip_ids is not None:
        stop_times = stop_times.loc[stop_times["trip_id"].isin(selected_trip_ids)]
    return stop_times.groupby("trip_id")["stop_sequence"].max().to_dict()


def get_selected_trip_ids(
    trips, stop_times, selected_stop_ids=None, selected_service_ids=None
) -> List[int]:
    trip_ids = set(trips.index)

    if selected_stop_ids is not None:
        trip_ids = set(
            stop_times.loc[stop_times["stop_id"].isin(selected_stop_ids), "trip_id"]
            .unique()
            .tolist()
        )

    if selected_service_ids is not None:
        trip_ids.intersection_update(
            trips.loc[trips["service_id"].isin(selected_service_ids)].index
        )

    return trip_ids


def get_selected_stop_times(stop_times, selected_stop_ids=None, selected_trip_ids=None):
    selection = stop_times["stop_id"].apply(lambda _: True)
    if selected_stop_ids is not None:
        selection = selection & stop_times["stop_id"].isin(selected_stop_ids)

    if selected_trip_ids is not None:
        selection = selection & stop_times["trip_id"].isin(selected_trip_ids)

    return stop_times.loc[selection]


def set_stop_times_to_today(stop_times, today: datetime):
    stop_times = stop_times.copy()
    today_str = today.strftime("%Y-%m-%d")
    stop_times["departure_time"] = pd.to_datetime(
        today_str + " " + stop_times["departure_time"],
        format="%Y-%m-%d %H:%M:%S",
        errors="coerce",
    ).dt.tz_localize("Europe/Zurich")
    stop_times["departure_timestamp"] = stop_times["departure_time"].apply(
        lambda time: int(time.timestamp()) if pd.notna(time) else -1
    )
    stop_times["arrival_time"] = pd.to_datetime(
        today_str + " " + stop_times["arrival_time"],
        format="%Y-%m-%d %H:%M:%S",
        errors="coerce",
    ).dt.tz_localize("Europe/Zurich")
    stop_times["arrival_timestamp"] = stop_times["arrival_time"].apply(
        lambda time: int(time.timestamp()) if pd.notna(time) else -1
    )
    return stop_times.dropna(subset=["departure_time", "arrival_time"])


def get_local_todays_stop_times(
    calendar, calendar_dates, trips, stop_times, today: datetime, local_stops_ids
):
    todays_service_ids = get_todays_service_ids(calendar, calendar_dates, today)

    local_trip_ids = get_selected_trip_ids(trips, stop_times, local_stops_ids)
    tripid2max_stop_sequence = get_trips_last_max_stop_sequence(stop_times, local_trip_ids)

    todays_local_trip_ids = get_selected_trip_ids(
        trips, stop_times, local_stops_ids, todays_service_ids
    )

    todays_local_stop_times = get_selected_stop_times(
        stop_times, local_stops_ids, todays_local_trip_ids
    )

    # remove last stops
    todays_local_stop_times = todays_local_stop_times.loc[
        todays_local_stop_times['trip_id'].apply(tripid2max_stop_sequence.get) != todays_local_stop_times['stop_sequence']
    ]

    return set_stop_times_to_today(todays_local_stop_times, today)

In [120]:
now = datetime.now()
todays_local_stop_times = get_local_todays_stop_times(calendar, calendar_dates, trips, stop_times, now, local_stops_ids)

In [121]:
def request_gtfs_rt_from_server_and_save():
    feed = gtfs_realtime_pb2.FeedMessage()
    r = requests.get('https://api.opentransportdata.swiss/gtfsrt2020', headers={'Authorization': GTFS_RT_API_KEY})
        
    if bytes('disallowed', 'utf-8') in r.content:
        print('ERROR: Please check API key. Server returned {}'.format(r.content))
        return
    with open('./data/sample_rt/latest.protobuf', 'wb') as outfile:
        outfile.write(r.content)

def request_gtfs_rt_from_server():
    feed = gtfs_realtime_pb2.FeedMessage()
    if DEV_MODE:
        with open('./data/sample_rt/latest.protobuf', 'rb') as infile:
            feed.ParseFromString(infile.read())
    else:
        r = requests.get('https://api.opentransportdata.swiss/gtfsrt2020', headers={'Authorization': GTFS_RT_API_KEY})
        
        if bytes('disallowed', 'utf-8') in r.content:
            exit()

        feed.ParseFromString(r.content)
    return feed

In [122]:
def format_if_datetime(value, strfmt="%H:%M"):
    return value.strftime(strfmt) if isinstance(value, datetime) else value


def update_delays(stop_times, feed, minimum_arrival_delay=0, minimum_departure_delay=0):
    # reset delays
    stop_times["arrival_delay"] = 0
    stop_times["departure_delay"] = 0

    trip_ids = set(stop_times["trip_id"].unique().tolist())
    stops_ids = set(stop_times["stop_id"].unique().tolist())

    stop_times.set_index(["trip_id", "stop_id", "stop_sequence"], inplace=True)
    stop_times.sort_index(inplace=True)

    for entity in feed.entity:
        if (
            entity.HasField("trip_update")
            and entity.trip_update.trip.trip_id in trip_ids
        ):
            trip_id = entity.trip_update.trip.trip_id
            for stop_update in entity.trip_update.stop_time_update:
                if stop_update.stop_id in stops_ids:
                    stop_id = stop_update.stop_id
                    stop_sequence = str(stop_update.stop_sequence)
                    arrival_delay = (
                        stop_update.arrival.delay
                        if stop_update.HasField("arrival")
                        and stop_update.arrival.HasField("delay")
                        else 0
                    )
                    departure_delay = (
                        stop_update.departure.delay
                        if stop_update.HasField("departure")
                        and stop_update.departure.HasField("delay")
                        else 0
                    )
                    if (
                        arrival_delay >= minimum_arrival_delay
                        and departure_delay >= minimum_departure_delay
                    ):

                        arrival_delay = round(arrival_delay / 60)
                        departure_delay = round(departure_delay / 60)
                        stop_times.loc[
                            (trip_id, stop_id, stop_sequence), "arrival_delay"
                        ] = arrival_delay
                        stop_times.loc[
                            (trip_id, stop_id, stop_sequence), "departure_delay"
                        ] = departure_delay
    stop_times.reset_index(inplace=True)


def map_stop_times_row_to_infos(row, min_delay=2):
    route_id = trip_id2route_id.get(row["trip_id"])
    arrival_delay = (
        f"+{row['arrival_delay']}"
        if "arrival_delay" in row and row["arrival_delay"] >= min_delay
        else ""
    )
    departure_delay = (
        f"+{row['departure_delay']}"
        if "departure_delay" in row and row["departure_delay"] >= min_delay
        else ""
    )
    return {
        "Name": route_id2short_name.get(route_id),
        "Stop": stop_id2stop_name.get(row["stop_id"]),
        "Direction": trip_id2headsign.get(row["trip_id"]),
        "Arrival time": f"{format_if_datetime(row['arrival_time'])}{arrival_delay}",
        "Departure time": f"{format_if_datetime(row['departure_time'])}{departure_delay}",
        "Type": route_id2desc.get(route_id),
    }


def map_stop_times_to_infos(stop_times, min_delay=2):
    if stop_times.size == 0:
        return pd.DataFrame(columns=["Name", "Stop", "Direction", "Arrival time", "Departure time", "Type"])
    return (
        stop_times.apply(map_stop_times_row_to_infos, axis=1)
        .apply(pd.Series)
        .sort_values(by="Arrival time")
    )


def get_now_timestamp():
    return int(datetime.now().timestamp())


def get_next_departures(stop_times, time_delta=300):
    if not "departure_timestamp" in stop_times.columns:
        return None
    now = get_now_timestamp()
    return stop_times.loc[
        (stop_times["departure_timestamp"] >= now)
        & (stop_times["departure_timestamp"] <= now + time_delta)
    ]


def get_nth_next_departure(stop_times, stop_id_merging_dict={}, n=5):
    if not "departure_timestamp" in stop_times.columns:
        return None
    now = get_now_timestamp()
    return (
        stop_times.loc[stop_times["departure_timestamp"] >= now]
        .sort_values("arrival_time")
        .groupby(
            stop_times["stop_id"].apply(
                lambda stop_id: stop_id_merging_dict.get(stop_id, stop_id)
            )
        )
        .head(n)
    )

In [123]:
#request_gtfs_rt_from_server_and_save()

In [124]:
feed = request_gtfs_rt_from_server()
update_delays(todays_local_stop_times, feed)

  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[
  stop_times.loc[


In [125]:
map_stop_times_to_infos(get_next_departures(todays_local_stop_times, 3600))

Unnamed: 0,Name,Stop,Direction,Arrival time,Departure time,Type
35,R3,Cully,St-Maurice,10:37,10:37,Train régional
36,R3,Epesses,St-Maurice,10:39,10:39,Train régional
83,R1,Cully,Grandson,10:42,10:42,Train régional
102,R4,Cully,Le Day,10:50,10:50,Train régional
6,R4,Cully,Aigle,11:09,11:09,Train régional
62,382,"Cully, gare","Palézieux, gare",11:12,11:12,Autobus
108,R2,Cully,Grandson,11:13,11:13,Train régional
170,R3,Epesses,Vallorbe,11:16,11:16,Train régional
169,R3,Cully,Vallorbe,11:18,11:18,Train régional


In [126]:
map_stop_times_to_infos(get_nth_next_departure(todays_local_stop_times, local_stops_ids_merge))

Unnamed: 0,Name,Stop,Direction,Arrival time,Departure time,Type
35,R3,Cully,St-Maurice,10:37,10:37,Train régional
36,R3,Epesses,St-Maurice,10:39,10:39,Train régional
83,R1,Cully,Grandson,10:42,10:42,Train régional
102,R4,Cully,Le Day,10:50,10:50,Train régional
6,R4,Cully,Aigle,11:09,11:09,Train régional
62,382,"Cully, gare","Palézieux, gare",11:12,11:12,Autobus
108,R2,Cully,Grandson,11:13,11:13,Train régional
170,R3,Epesses,Vallorbe,11:16,11:16,Train régional
38,R3,Epesses,St-Maurice,11:39,11:39,Train régional
151,R3,Epesses,Vallorbe,12:16,12:16,Train régional


In [127]:
get_nth_next_departure(todays_local_stop_times, local_stops_ids_merge)

Unnamed: 0,trip_id,stop_id,stop_sequence,arrival_time,departure_time,pickup_type,drop_off_type,departure_timestamp,arrival_timestamp,arrival_delay,departure_delay
35,1270.TA.91-3-T-j24-1.89.R,8501124:0:1,14,2024-07-12 10:37:00+02:00,2024-07-12 10:37:00+02:00,0,0,1720773000.0,1720773000.0,0.0,0.0
36,1270.TA.91-3-T-j24-1.89.R,8501125:0:1,15,2024-07-12 10:39:00+02:00,2024-07-12 10:39:00+02:00,0,0,1720774000.0,1720774000.0,0.0,0.0
83,237.TA.91-1-M-j24-1.30.H,8501124:0:2,1,2024-07-12 10:42:00+02:00,2024-07-12 10:42:00+02:00,0,0,1720774000.0,1720774000.0,0.0,0.0
102,369.TA.91-4-L-j24-1.19.H,8501124:0:3,8,2024-07-12 10:50:00+02:00,2024-07-12 10:50:00+02:00,0,0,1720774000.0,1720774000.0,0.0,0.0
6,1003.TA.91-4-L-j24-1.176.R,8501124:0:1,12,2024-07-12 11:09:00+02:00,2024-07-12 11:09:00+02:00,0,0,1720775000.0,1720775000.0,0.0,0.0
62,16.TA.96-203-j24-1.13.H,8570559,1,2024-07-12 11:12:00+02:00,2024-07-12 11:12:00+02:00,0,0,1720776000.0,1720776000.0,0.0,0.0
108,402.TA.91-2-R-j24-1.34.R,8501124:0:2,1,2024-07-12 11:13:00+02:00,2024-07-12 11:13:00+02:00,0,0,1720776000.0,1720776000.0,0.0,0.0
170,579.TA.91-3-T-j24-1.32.H,8501125:0:2,15,2024-07-12 11:16:00+02:00,2024-07-12 11:16:00+02:00,0,0,1720776000.0,1720776000.0,0.0,0.0
38,1271.TA.91-3-T-j24-1.89.R,8501125:0:1,15,2024-07-12 11:39:00+02:00,2024-07-12 11:39:00+02:00,0,0,1720777000.0,1720777000.0,0.0,0.0
151,569.TA.91-3-T-j24-1.32.H,8501125:0:2,15,2024-07-12 12:16:00+02:00,2024-07-12 12:16:00+02:00,0,0,1720779000.0,1720779000.0,0.0,0.0


In [128]:
stop_times.loc[stop_times['trip_id'] == '67.TA.96-206-j24-1.24.H'].groupby('trip_id')

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type
18661521,67.TA.96-206-j24-1.24.H,12:04:00,12:04:00,8570440,1,0,0
18661522,67.TA.96-206-j24-1.24.H,12:08:00,12:08:00,8582200,2,0,0
18661523,67.TA.96-206-j24-1.24.H,12:13:00,12:13:00,8579251,3,0,0
18661524,67.TA.96-206-j24-1.24.H,12:16:00,12:16:00,8557076,4,0,0
18661525,67.TA.96-206-j24-1.24.H,12:18:00,12:18:00,8582772,5,0,0
18661526,67.TA.96-206-j24-1.24.H,12:19:00,12:19:00,8582773,6,0,0
18661527,67.TA.96-206-j24-1.24.H,12:21:00,12:21:00,8582774,7,0,0
18661528,67.TA.96-206-j24-1.24.H,12:22:00,12:22:00,8582776,8,0,0
18661529,67.TA.96-206-j24-1.24.H,12:23:00,12:23:00,8582777,9,0,0
18661530,67.TA.96-206-j24-1.24.H,12:24:00,12:24:00,8582778,10,0,0


In [349]:
todays_local_stop_times.apply(map_stop_times_row_to_infos, axis=1).apply(pd.Series)

Unnamed: 0,Name,Stop,Direction,Arrival time,Departure time,Type
0,382,"Cully, gare","Puidoux, gare",12:48,12:48,Autobus
1,382,"Cully, gare","Palézieux, gare",17:12,17:12,Autobus
2,381,"Cully, gare","Grandvaux, Pra Grana",18:12,18:12,Autobus
3,381,"Epesses, village","Grandvaux, Pra Grana",18:16,18:16,Autobus
4,R4,Cully,Aigle,08:09,08:09,Train régional
...,...,...,...,...,...,...
253,R4,Cully,Aigle,16:09,16:09,Train régional
254,R4,Cully,Aigle,18:09,18:09,Train régional
255,R4,Cully,Aigle,06:09,06:09,Train régional
256,R4,Cully,Aigle,09:09,09:09,Train régional
