In [41]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import duckdb

### Stop names and info

In [None]:
base_url = 'https://api.golemio.cz/v2/gtfs/stops'

# Stops in the Letná district that we want to look up by name.
stop_names = [
    "Hradčanská", "Sparta", "Korunovační", "Letenské náměstí",
    "Kamenická", "Strossmayerovo náměstí", "Nábřeží Kapitána Jaroše",
    "Vltavská", "Výstaviště", "Veletržní palác"
]

# The list under 'names[]' is encoded by requests as a repeated query parameter.
params = {
    'names[]': stop_names,
    'offset': 0
}

# The API token is read from the environment so it never lands in the notebook.
headers = {
    'accept': 'application/json',
    'X-Access-Token': os.getenv('X-Access-Token-Golemio')
    }

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(base_url, params=params, headers=headers, timeout=30)
# Fail right here on a 4xx/5xx answer (e.g. missing or invalid token) instead of
# hitting a confusing KeyError when the payload is parsed in a later cell.
response.raise_for_status()

In [None]:
# Keep the raw feature list for inspection and collect the name of every
# returned stop record (one entry per record, so names may repeat).
feat = response.json()['features']
stops = [feature['properties']['stop_name'] for feature in feat]

In [21]:
# Sanity check: the number of distinct stop names returned matches the number
# of names requested (this compares counts only, not the names themselves).
len(stop_names) == len(set(stops))

True

In [22]:
# Display the raw feature records taken from the stops response.
feat

[{'geometry': {'coordinates': [14.438508, 50.100229], 'type': 'Point'},
  'properties': {'location_type': 1,
   'parent_station': None,
   'platform_code': None,
   'stop_id': 'U100S1',
   'stop_name': 'Vltavská',
   'wheelchair_boarding': 1,
   'zone_id': 'P',
   'level_id': None},
  'type': 'Feature'},
 {'geometry': {'coordinates': [14.438122, 50.099228], 'type': 'Point'},
  'properties': {'location_type': 0,
   'parent_station': None,
   'platform_code': 'A',
   'stop_id': 'U100Z1P',
   'stop_name': 'Vltavská',
   'wheelchair_boarding': 1,
   'zone_id': 'P',
   'level_id': None},
  'type': 'Feature'},
 {'geometry': {'coordinates': [14.438596, 50.100229], 'type': 'Point'},
  'properties': {'location_type': 0,
   'parent_station': 'U100S1',
   'platform_code': '1',
   'stop_id': 'U100Z101P',
   'stop_name': 'Vltavská',
   'wheelchair_boarding': 1,
   'zone_id': 'P',
   'level_id': 'U100L2'},
  'type': 'Feature'},
 {'geometry': {'coordinates': [14.438395, 50.100227], 'type': 'Point'},


### Stop times for each stop
These can be used as a sanity check for the actual scheduled times.

In [None]:
url = 'https://api.golemio.cz/v2/gtfs/stoptimes/U163Z6P' # test for Hradčanská

# Query one morning window on a single date and embed the stop record in each row.
params = {
    'date': '2024-12-02',
    'from': '07:12:33',
    'to': '12:12:33',
    'includeStop': 'true',
    'limit': 10,
    'offset': 0
}

# The API token is read from the environment so it never lands in the notebook.
headers = {
    'accept': 'application/json',
    'X-Access-Token': os.getenv('X-Access-Token-Golemio')
    }

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params=params, headers=headers, timeout=30)
# Surface HTTP errors (e.g. missing or invalid token) instead of displaying an
# error payload as if it were stop-time data.
response.raise_for_status()
response.json()

[{'arrival_time': '10:09:00',
  'departure_time': '10:09:00',
  'drop_off_type': '3',
  'pickup_type': '3',
  'shape_dist_traveled': 0,
  'stop_headsign': None,
  'stop_id': 'U163Z6P',
  'stop_sequence': 1,
  'trip_id': '131_1549_241104',
  'stop': {'location_type': 0,
   'parent_station': None,
   'platform_code': 'C',
   'stop_id': 'U163Z6P',
   'stop_lat': 50.098118,
   'stop_lon': 14.405217,
   'stop_name': 'Hradčanská',
   'wheelchair_boarding': 0,
   'zone_id': 'P',
   'level_id': None}},
 {'arrival_time': '10:19:00',
  'departure_time': '10:19:00',
  'drop_off_type': '3',
  'pickup_type': '3',
  'shape_dist_traveled': 0,
  'stop_headsign': None,
  'stop_id': 'U163Z6P',
  'stop_sequence': 1,
  'trip_id': '131_1508_241104',
  'stop': {'location_type': 0,
   'parent_station': None,
   'platform_code': 'C',
   'stop_id': 'U163Z6P',
   'stop_lat': 50.098118,
   'stop_lon': 14.405217,
   'stop_name': 'Hradčanská',
   'wheelchair_boarding': 0,
   'zone_id': 'P',
   'level_id': None}},


### Get actual delays 
These are example parquet files provided by Golemio; if the project is approved, Golemio will provide the parquet files needed for our use case.

The important columns are `current_stop_dep_delay`, `current_stop_arr_delay` and `current_stop_departure`:
- the scheduled departure is `current_stop_departure`
- the actual departure is `current_stop_departure` + `current_stop_dep_delay`, or `current_stop_departure` + `current_stop_arr_delay` if the departure delay is NULL
- we can use either delay value, because regular public transport spends effectively no time at a stop (dwell time is 0)
  
This data will help us determine the real travel times and create a basis for the delay prediction.

In [36]:
# Open a DuckDB connection with default settings (no database path is given).
db = duckdb.connect()

In [38]:
# Query every file matching the relative glob '*.parquet' and materialise the
# result via .df() (presumably a pandas DataFrame — it is indexed like one below).
df = db.sql("SELECT * FROM read_parquet('*.parquet')").df()

In [40]:
# Preview the first rows whose stop_name is Hradčanská.
df.loc[df['stop_name'] == 'Hradčanská'].head()

Unnamed: 0,rt_trip_id,gtfs_date,gtfs_trip_id,gtfs_direction_id,gtfs_route_short_name,gtfs_route_type,run_number,vehicle_registration_number,gtfs_stop_sequence,gtfs_stop_id,...,create_batch_id,created_at,created_by,update_batch_id,updated_at,updated_by,origin_route_name,stop_name,lat,lng
2867,2024-10-19T22:07:00+02:00_8_10717_240902_9095,2024-10-19,8_10717_240902,1,8,0,1.0,9095.0,25,U163Z2P,...,,2024-10-20 00:18:00.179319+02:00,,,2024-10-20 00:18:00.179319+02:00,,8,Hradčanská,50.09721,14.403853
2969,2024-10-19T22:09:00+02:00_2_1342_240928_8268,2024-10-19,2_1342_240928,0,2,0,73.0,8268.0,10,U163Z1P,...,,2024-10-20 00:18:00.179319+02:00,,,2024-10-20 00:18:00.179319+02:00,,93,Hradčanská,50.097298,14.404873
3019,2024-10-19T22:10:00+02:00_26_11296_240928_9281,2024-10-19,26_11296_240928,1,26,0,10.0,9281.0,33,U163Z2P,...,,2024-10-20 00:18:00.179319+02:00,,,2024-10-20 00:18:00.179319+02:00,,26,Hradčanská,50.09721,14.403853
3323,2024-10-19T22:15:00+02:00_2_10645_240928_8263,2024-10-19,2_10645_240928,1,2,0,71.0,8263.0,18,U163Z2P,...,,2024-10-20 00:18:00.179319+02:00,,,2024-10-20 00:18:00.179319+02:00,,91,Hradčanská,50.09721,14.403853
3518,2024-10-19T22:16:00+02:00_26_16705_240928_8532,2024-10-19,26_16705_240928,0,26,0,5.0,8532.0,17,U163Z1P,...,,2024-10-20 00:18:00.179319+02:00,,,2024-10-20 00:18:00.179319+02:00,,26,Hradčanská,50.097298,14.404873
