In [196]:
from bs4 import BeautifulSoup
import requests
import os
from pathlib import Path
import pandas as pd
import datetime

path_prefix = "dump/"
base_url = "http://www.streamlinerschedules.com/"

In [2]:
def download_track_pages():
    track_pages = [f"concourse/track{i}" for i in range(1, 13)]
    for page in track_pages:
        Path(path_prefix + page).mkdir(exist_ok=True, parents=True)
        with open(f"{path_prefix}{page}/index.html", "wb+") as f:
            f.write(requests.get(f"{base_url}{page}/index.html").content)

check_file = Path(f"{path_prefix}/concourse/track1/index.html")

if check_file.exists() and check_file.stat().st_size > 0:
    print("Already downloaded")
else:
    # download_track_pages()
    pass

Already downloaded


In [3]:
track_pages = [f"concourse/track{i}/index.html" for i in range(1, 13)]

In [4]:
with open(path_prefix + track_pages[0], "rb") as f:
    soup = BeautifulSoup(f, "html.parser")


In [5]:
# Returns a list like "birmspecial194112.html", "carolinaspecial196410.html", etc
def get_train_links(soup):
    relative_links = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if href is not None and ".." not in href and "http://" not in href:
            relative_links.append(href)
    return relative_links

def get_link(relative_link, page):
    page = page.rstrip("index.html")
    return f"{base_url}{page}{relative_link}"


In [6]:
def read_child_pages(page):
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            with open(f"{path_prefix}{page.rstrip('index.html')}/{link}", "wb+") as f:
                f.write(requests.get(get_link(link, page)).content)

In [7]:
all_train_files = []
for page in track_pages:
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            file_location = f"{path_prefix}{page.rstrip('index.html')}{link}"
            all_train_files.append(file_location)


In [294]:
def maybe_get_schedule_from_row(stops, row, train_nums):
    if train_nums is None:
        return False
    try:
        # Don't differentiate between departure and arrival time, since they can be on different lines.
        if row.find("td", class_="times") is None or len(row.find_all("td", class_="times")) != len(train_nums):
            return False
        
        possible_miles = row.find_all("td", class_="miles")
        if len(possible_miles) == 2:
            miles = row.find_all("td", class_="miles")[1]
        elif len(possible_miles) == 1:
            miles = row.find("td", class_="miles")
        else:
            return False
        station = miles.findNext('td')

        times = row.find_all("td", class_="times")
        for num, time in zip(train_nums, times):
            if len(time) == 0:
                continue
            stops.loc[len(stops)] = [num, station.text.strip(), time.text.replace("D", "").replace("R", "").strip(), miles.text.strip()]
        return True
    except Exception as e:
        print(row, e)
        raise e

def parse_schedule(schedule_filename):
    with open(schedule_filename, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    train_name = soup.find("h1").text

    # TODO: Date
    description_element = soup.find('h3', id='rrdate')
    table = soup.find('table')


    stops = pd.DataFrame(columns=["train_number", "station", "time", "miles"])

    rows = table.find_all("tr")


    train_nums = None
    num_rows_skipped = 0
    for row in rows:
        if row.find("td", class_="trainnum") is not None:
            # TODO: Map train number since they continue off each other
            train_nums = [t.text.strip() for t in row.find_all("td", class_="trainnum")]
            continue

        added_schedule = maybe_get_schedule_from_row(stops, row, train_nums)
        if not added_schedule:
            num_rows_skipped += 1
    print(train_name, num_rows_skipped, "rows skipped")
    
    return train_name, stops




# parse_schedule("dump/concourse/track1/birmspecial194112.html")[1]
# parse_schedule("dump/concourse/track1/carolinaspecial196410.html")
# parse_schedule("dump/concourse/track1/cityneworl194706.html")
# parse_schedule("dump/concourse/track1/cityneworl197104.html")
# parse_schedule("dump/concourse/track1/georgewash196706.html")
# parse_schedule("dump/concourse/track1/gulfcoastrebel195008.html")
# parse_schedule("dump/concourse/track3/erieltd195103.html")
# parse_schedule("dump/concourse/track11/indianpacific198805.html")[1]

In [295]:
file_name_to_name_and_schedule = {}
for file in all_train_files:
    if "track10" in file or "track11" in file:
        # This is Mexico and/or Europe
        continue
    try:
        train_name, stops = parse_schedule(file)
        file_name_to_name_and_schedule[file] = (train_name, stops)
    except Exception as e:
        print(f"Error parsing {file}: {e}")
        continue
    

The Birmingham Special 9 rows skipped
The Carolina Special 9 rows skipped
The City of Memphis 1 rows skipped
The City of New Orleans 10 rows skipped
The City of New Orleans 9 rows skipped
The Crescent 14 rows skipped
The Southern Crescent 12 rows skipped
The Southern Crescent 7 rows skipped
The George Washington
The F. F. V.
The Sportsman 27 rows skipped
The Georgian 13 rows skipped
The Gulf Coast Rebel 8 rows skipped
The Humming Bird 6 rows skipped
The Humming Bird 27 rows skipped
The Pan-American 11 rows skipped
The Pan-American 10 rows skipped
The Piedmont
Trains 7 & 8
Trains 3 & 4 6 rows skipped
The Powhatan Arrow 5 rows skipped
The Rebel 8 rows skipped
The Rebel 6 rows skipped
The Silver Comet 7 rows skipped
The Southern Belle 4 rows skipped
The Southern Belle 5 rows skipped
The Southerner 8 rows skipped
The Tennessean 16 rows skipped
The Tamiami Champion (East Coast) 10 rows skipped
The Tamiami Champion (West Coast) 10 rows skipped
The Champion 9 rows skipped
The City of Miami 2 

In [296]:
for file, (_, stops) in file_name_to_name_and_schedule.items():
    print(file, set(stops["station"]))

dump/concourse/track1/birmspecial194112.html {'Manassas, VA', 'Amherst, VA', 'Greeneville, TN', 'Attalla, AL', 'Christiansburg, VA', 'Sheffield, AL (Muscle Shoals)', 'Charlottesville, VA', 'Memphis, TN (Union Sta.) (CT)', 'Sweetbriar, VA', 'Chattanooga, TN', 'Washington, DC', 'North Philadelphia, PA', 'Hattiesburg, MS', 'New Orleans, LA (Terminal\r\nSta.) (CT)', 'Trenton, NJ', 'Philadelphia,\r\nPA (30th\r\nSt. Sta.)', 'Birmingham, AL', 'Bedford, VA', 'Lynchburg, VA (Kemper St.)', 'Pulaski, VA', 'Bristol, TN-VA (ET)', 'Sweetwater, TN', 'Loudon, TN', 'Orange, VA', 'Knoxville,\r\nTN', 'Newark, NJ', 'Monroe, VA', 'Radford, VA', 'Chattanooga, TN (Terminal\r\nSta.)', 'Chattanooga, TN (Terminal Sta.)', 'Fort Payne, AL', 'Birmingham, AL (CT)', 'Jonesboro, TN', 'Culpeper, VA', 'Wytheville, VA', 'Johnson City, TN', 'Calverton, VA', 'Athens, TN', 'Roanoke,\r\nVA', 'New York, NY (Penna. Sta.) (ET)', 'Shipman, VA', 'Salem, VA', 'Morristown, TN', 'Lenoir City, TN', 'Cleveland, TN', 'Laurel, MS', 'Wi

In [297]:
timezones = {"(ET)", "(MT)", "(CT)", "(PT)", "(AT)", "(EDT)", "(CST)", "(CDT)", "(EST)", "(PST)", "(MST)", "(PDT)", "(MST)", "(AST)"}
def clean_station_name(station):
    for t in timezones:
        station = station.replace(t, "")
    return station.replace("\r\n", " ")

In [298]:
all_station_names = set()
for _, stops in file_name_to_name_and_schedule.values():
    all_station_names.update(stops["station"])

In [299]:
clean_station_names = {clean_station_name(s) for s in all_station_names}

In [300]:
len(clean_station_names)

4679

In [301]:
cities_table = pd.read_csv("uscities.csv")

In [290]:
def get_matching_city(city, state):
    return cities_table[(cities_table["city"] == city) & (cities_table["state_id"] == state)]

unmatched_stations = []

station_name_to_coords = {}
for station in clean_station_names:
    s = station.split(",")
    if len(s) != 2:
        continue
    city = s[0]
    state = s[1][1:3]

    match = get_matching_city(city, state)
    if len(match) == 0:
        unmatched_stations.append(station)
        continue
    station_name_to_coords[station] = (match.iloc[0]["lat"], match.iloc[0]["lng"])

In [302]:
schedule = file_name_to_name_and_schedule["dump/concourse/track1/birmspecial194112.html"][1]
schedule

Unnamed: 0,train_number,station,time,miles
0,125,"New York, NY (Penna. Sta.) (ET)",12 30P,0.0
1,176(3rd),"New York, NY (Penna. Sta.) (ET)",4 45P,0.0
2,125,"Newark, NJ",12 45P,10.0
3,176(3rd),"Newark, NJ",4 29P,10.0
4,125,"Trenton, NJ",1 31P,58.1
...,...,...,...,...
97,48,"Laurel, MS",11 04A,209.1
98,47,"Hattiesburg, MS",5 43P,238.0
99,48,"Hattiesburg, MS",10 30A,238.0
100,47,"New Orleans, LA (Terminal\r\nSta.) (CT)",8 15P,354.6


# Group by departure station in this format:

```ts
type Coords = [number, number, number?] // [x, y, z?]
type Route = {
  departureTime: number // seconds since midnight (choose a timezone and stick with it)
  departureCoords: Coords
  arrivalTime: number // seconds since midnight (choose a timezone and stick with it)
  arrivalCoords: Coords

  // optional fields if we have the data for it
  trainIdentifier?: string // name or id
  departureIdentifier?: string // station name
  arrivalIdentifier?: string // station name
}
```

In [303]:
def time_to_seconds(time_str):
    """Converts a time string in the format 'HH MMX' (e.g., '3 22P') to seconds past midnight.
  
    Args:
      time_str: The time string to convert.
  
    Returns:
      The number of seconds past midnight.
    """
    s = time_str.lstrip("F").replace("\n", " ").strip().split(" ")
    if len(s) != 2:
        print("INVALID TIME PASSED IN", time_str)
        1 / 0

    hour, minute = s 
    hour = int(hour)

    period = minute[2]
    minute = int(minute[:2])

    if period == 'P' and hour != 12:
      hour += 12
    elif period == 'A' and hour == 12:
      hour = 0

    return hour * 3600 + minute * 60


def maybe_get_next_stop(group, current_stop):
    for i in range(current_stop + 1, len(group)):
        if clean_station_name(group.iloc[i]["station"]) in station_name_to_coords:
            return group.iloc[i]
    return None
    
def get_next_stop_with_time(group, current_stop):
    for i in range(current_stop + 1, len(group)):
        if group.iloc[i]["time"] != "F":
            return group.iloc[i]
    return None


def schedule_to_routes(train_name, schedule):
    departure_station_to_route = {}

    for _, group in schedule.groupby('train_number'):
        last_entry_with_time = None
        for i in range(len(group) - 1):
            current_stop, next_stop = group.iloc[i], maybe_get_next_stop(group, i)
            current_station  = clean_station_name(current_stop["station"])

            if current_station not in station_name_to_coords or next_stop is None:
                continue
            next_station = clean_station_name(next_stop["station"])



            if current_stop["time"] == "F":
                # Impute the current time based on the last and next entry with time
                next_stop_with_time = get_next_stop_with_time(group, i)
                if last_entry_with_time is None or next_stop_with_time is None:
                    print("ERROR", current_stop)
                    continue

                distance_with_time = float(next_stop_with_time["miles"]) - float(last_entry_with_time["miles"])
                miles_per_second = distance_with_time / (time_to_seconds(next_stop_with_time["time"]) - time_to_seconds(last_entry_with_time["time"]))

                distance_elapsed = float(current_stop["miles"]) - float(last_entry_with_time["miles"])
                time_elapsed = distance_elapsed / miles_per_second

                departure_time = time_elapsed + time_to_seconds(last_entry_with_time["time"])
            else:
                departure_time = time_to_seconds(current_stop["time"])
                last_entry_with_time = current_stop



            if next_stop["time"] == "F":
                # Impute the current time based on the last and next entry with time
                next_stop_with_time = get_next_stop_with_time(group, i)
                if last_entry_with_time is None or next_stop_with_time is None:
                    print("ERROR", current_stop)
                    continue

                distance_with_time = float(next_stop_with_time["miles"]) - float(last_entry_with_time["miles"])
                miles_per_second = distance_with_time / (time_to_seconds(next_stop_with_time["time"]) - time_to_seconds(last_entry_with_time["time"]))

                distance_elapsed = float(next_stop["miles"]) - float(last_entry_with_time["miles"])
                time_elapsed = distance_elapsed / miles_per_second

                arrival_time = time_elapsed + time_to_seconds(last_entry_with_time["time"])
            else:
                arrival_time = time_to_seconds(next_stop["time"])

            
            route = {
                'departureTime': departure_time,
                'departureCoords': list(station_name_to_coords[current_station]),
                'departureIdentifier': current_station,

                'arrivalTime':  arrival_time,
                'arrivalCoords': list(station_name_to_coords[next_station]),
                'arrivalIdentifier': next_station,

                'trainName': train_name,
                'trainNumber': current_stop["train_number"]
            }
            if current_station in departure_station_to_route:
                departure_station_to_route[current_station].append(route)
            else:
                departure_station_to_route[current_station] = [route]
    return departure_station_to_route

routes = schedule_to_routes("The Birmingham Special", schedule)

In [304]:
all_routes = {}

for file_name, (name, schedule) in file_name_to_name_and_schedule.items():
    print(file_name)
    routes = schedule_to_routes(name, schedule)
    all_routes[file_name] = routes

dump/concourse/track1/birmspecial194112.html
dump/concourse/track1/carolinaspecial196410.html
dump/concourse/track1/citymemphis195008.html
dump/concourse/track1/cityneworl194706.html
dump/concourse/track1/cityneworl197104.html
dump/concourse/track1/crescent195008.html
dump/concourse/track1/crescent197104.html
dump/concourse/track1/crescent197303.html
dump/concourse/track1/georgewash196706.html
dump/concourse/track1/georgian196308.html
dump/concourse/track1/gulfcoastrebel195008.html
dump/concourse/track1/humbird194706.html
dump/concourse/track1/humbird196308.html
dump/concourse/track1/pan-am192701.html


ValueError: could not convert string to float: ''