In [1]:
from bs4 import BeautifulSoup
import requests
import os
from pathlib import Path
import pandas as pd

path_prefix = "dump/"
base_url = "http://www.streamlinerschedules.com/"

In [2]:
def download_track_pages():
    track_pages = [f"concourse/track{i}" for i in range(1, 13)]
    for page in track_pages:
        Path(path_prefix + page).mkdir(exist_ok=True, parents=True)
        with open(f"{path_prefix}{page}/index.html", "wb+") as f:
            f.write(requests.get(f"{base_url}{page}/index.html").content)

check_file = Path(f"{path_prefix}/concourse/track1/index.html")

if check_file.exists() and check_file.stat().st_size > 0:
    print("Already downloaded")
else:
    # download_track_pages()
    pass

Already downloaded


In [3]:
track_pages = [f"concourse/track{i}/index.html" for i in range(1, 13)]

In [4]:
with open(path_prefix + track_pages[0], "rb") as f:
    soup = BeautifulSoup(f, "html.parser")


In [5]:
# Returns a list like "birmspecial194112.html", "carolinaspecial196410.html", etc
def get_train_links(soup):
    relative_links = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if href is not None and ".." not in href and "http://" not in href:
            relative_links.append(href)
    return relative_links

def get_link(relative_link, page):
    page = page.rstrip("index.html")
    return f"{base_url}{page}{relative_link}"


In [6]:
def read_child_pages(page):
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            with open(f"{path_prefix}{page.rstrip('index.html')}/{link}", "wb+") as f:
                f.write(requests.get(get_link(link, page)).content)

In [7]:
all_train_files = []
for page in track_pages:
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            file_location = f"{path_prefix}{page.rstrip('index.html')}{link}"
            all_train_files.append(file_location)


In [9]:
all_train_files

['dump/concourse/track1/birmspecial194112.html',
 'dump/concourse/track1/carolinaspecial196410.html',
 'dump/concourse/track1/citymemphis195008.html',
 'dump/concourse/track1/cityneworl194706.html',
 'dump/concourse/track1/cityneworl197104.html',
 'dump/concourse/track1/crescent195008.html',
 'dump/concourse/track1/crescent197104.html',
 'dump/concourse/track1/crescent197303.html',
 'dump/concourse/track1/georgewash196706.html',
 'dump/concourse/track1/georgian196308.html',
 'dump/concourse/track1/gulfcoastrebel195008.html',
 'dump/concourse/track1/humbird194706.html',
 'dump/concourse/track1/humbird196308.html',
 'dump/concourse/track1/pan-am192701.html',
 'dump/concourse/track1/pan-am195407.html',
 'dump/concourse/track1/piedmontltd197303.html',
 'dump/concourse/track1/powhatan195008.html',
 'dump/concourse/track1/rebel193809.html',
 'dump/concourse/track1/rebel195304.html',
 'dump/concourse/track1/silvercomet194706.html',
 'dump/concourse/track1/soubelle194106.html',
 'dump/concours

In [109]:
def maybe_get_schedule_from_row(stops, row, train_nums, miles_index, station_index):
    if train_nums is None:
        return False
    try:
        # Don't differentiate between departure and arrival time, since they can be on different lines.
        if row.find("td", class_="times") is None or len(row.find_all("td", class_="times")) != len(train_nums):
            return False
        
        possible_miles = row.find_all("td", class_="miles")
        if len(possible_miles) == 2:
            miles = row.find_all("td", class_="miles")[1]
        elif len(possible_miles) == 1:
            miles = row.find("td", class_="miles")
        else:
            return False
        station = miles.findNext('td')

        times = row.find_all("td", class_="times")
        for num, time in zip(train_nums, times):
            stops.loc[len(stops)] = [num, station.text.strip(), time.text.strip(), miles.text.strip()]
        return True
    except Exception as e:
        print(row, e)
        raise e

def parse_schedule(schedule_filename):
    with open(schedule_filename, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    train_name = soup.find("h1").text

    # TODO: Date
    description_element = soup.find('h3', id='rrdate')
    table = soup.find('table')


    stops = pd.DataFrame(columns=["train_number", "station", "time", "miles"])

    rows = table.find_all("tr")

    heads = table.find_all("td", class_="tthead")
    for row in heads:
        if row.text.strip() == "Miles":
            p = row.parent

            for i, td in enumerate(p.find_all("td")):
                if td.text.strip() == "Miles":
                    miles_index = i
                    station_index = i + 1
                    break

            break

    train_nums = None
    num_rows_skipped = 0
    for row in rows:
        if row.find("td", class_="trainnum") is not None:
            # TODO: Map train number since they continue off each other
            train_nums = [t.text.strip() for t in row.find_all("td", class_="trainnum")]
            continue

        added_schedule = maybe_get_schedule_from_row(stops, row, train_nums, miles_index, station_index)
        if not added_schedule:
            num_rows_skipped += 1
    print(train_name, num_rows_skipped, "rows skipped")
    
    return train_name, stops




# parse_schedule("dump/concourse/track1/birmspecial194112.html")[1]
# parse_schedule("dump/concourse/track1/carolinaspecial196410.html")
# parse_schedule("dump/concourse/track1/cityneworl194706.html")
# parse_schedule("dump/concourse/track1/cityneworl197104.html")
# parse_schedule("dump/concourse/track1/georgewash196706.html")
# parse_schedule("dump/concourse/track1/gulfcoastrebel195008.html")
# parse_schedule("dump/concourse/track3/erieltd195103.html")
# parse_schedule("dump/concourse/track11/indianpacific198805.html")[1]

In [118]:
file_name_to_name_and_schedule = {}
for file in all_train_files:
    if "track10" in file or "track11" in file:
        # This is Mexico and/or Europe
        continue
    try:
        train_name, stops = parse_schedule(file)
        file_name_to_name_and_schedule[file] = (train_name, stops)
    except Exception as e:
        print(f"Error parsing {file}: {e}")
        continue
    

The Birmingham Special 9 rows skipped
The Carolina Special 9 rows skipped
The City of Memphis 1 rows skipped
The City of New Orleans 10 rows skipped
The City of New Orleans 9 rows skipped
The Crescent 14 rows skipped
The Southern Crescent 12 rows skipped
The Southern Crescent 7 rows skipped
The George Washington
The F. F. V.
The Sportsman 27 rows skipped
The Georgian 13 rows skipped
The Gulf Coast Rebel 8 rows skipped
The Humming Bird 6 rows skipped
The Humming Bird 27 rows skipped
The Pan-American 11 rows skipped
The Pan-American 10 rows skipped
The Piedmont
Trains 7 & 8
Trains 3 & 4 6 rows skipped
The Powhatan Arrow 5 rows skipped
The Rebel 8 rows skipped
The Rebel 6 rows skipped
The Silver Comet 7 rows skipped
The Southern Belle 4 rows skipped
The Southern Belle 5 rows skipped
The Southerner 8 rows skipped
The Tennessean 16 rows skipped
The Tamiami Champion (East Coast) 10 rows skipped
The Tamiami Champion (West Coast) 10 rows skipped
The Champion 9 rows skipped
The City of Miami 2 

In [119]:
for file, (_, stops) in file_name_to_name_and_schedule.items():
    print(file, set(stops["station"]))

dump/concourse/track1/birmspecial194112.html {'Manassas, VA', 'Amherst, VA', 'Greeneville, TN', 'Attalla, AL', 'Christiansburg, VA', 'Sheffield, AL (Muscle Shoals)', 'Charlottesville, VA', 'Memphis, TN (Union Sta.) (CT)', 'Sweetbriar, VA', 'Chattanooga, TN', 'Washington, DC', 'North Philadelphia, PA', 'Hattiesburg, MS', 'New Orleans, LA (Terminal\r\nSta.) (CT)', 'Trenton, NJ', 'Philadelphia,\r\nPA (30th\r\nSt. Sta.)', 'Birmingham, AL', 'Bedford, VA', 'Lynchburg, VA (Kemper St.)', 'Pulaski, VA', 'Bristol, TN-VA (ET)', 'Sweetwater, TN', 'Loudon, TN', 'Orange, VA', 'Knoxville,\r\nTN', 'Newark, NJ', 'Monroe, VA', 'Radford, VA', 'Chattanooga, TN (Terminal\r\nSta.)', 'Chattanooga, TN (Terminal Sta.)', 'Fort Payne, AL', 'Birmingham, AL (CT)', 'Jonesboro, TN', 'Culpeper, VA', 'Wytheville, VA', 'Johnson City, TN', 'Calverton, VA', 'Athens, TN', 'Roanoke,\r\nVA', 'New York, NY (Penna. Sta.) (ET)', 'Shipman, VA', 'Salem, VA', 'Morristown, TN', 'Lenoir City, TN', 'Cleveland, TN', 'Laurel, MS', 'Wi

In [120]:
timezones = {"(ET)", "(MT)", "(CT)", "(PT)", "(AT)", "(EDT)", "(CST)", "(CDT)", "(EST)", "(PST)", "(MST)", "(PDT)", "(MST)", "(AST)"}
def clean_station_name(station):
    for t in timezones:
        station = station.replace(t, "")
    return station.replace("\r\n", " ")

In [121]:
all_station_names = set()
for _, stops in file_name_to_name_and_schedule.values():
    all_station_names.update(stops["station"])

In [122]:
all_station_names

{'',
 'Niagara Falls, NY',
 'Forrest, IL',
 'Everett, WA',
 'Chicago, IL (Dearborn Sta.)\r\n(CT)',
 'Rayne, LA',
 'Carbon Cliff, IL',
 'De Land, FL (Daytona Beach)',
 'Cascade Locks, OR',
 'New Orleans, LA (Carrollton\r\nAvenue)',
 'New\r\nYork, NY (Penn Station)',
 'McLaughlin, SD',
 'Decatur, AL',
 'Richmond, VA (Main\r\nStreet Station)',
 'Brazos, TX',
 'Gillett, FL',
 'Bay City, WI',
 'Mildred, MT',
 'Marengo, WA',
 'Normangee, TX',
 'Los Angeles, CA (S.P.\r\nStation) (PT)',
 'Westerly, RI',
 'Danby, CA',
 'Galion, OH',
 'Bradenton-Manatee, FL',
 'Dome, AZ',
 'Culpeper, VA',
 'Houston, TX (Union\r\nStation)',
 'Plymouth, MI',
 'Houston, TX (Southern Pacific\r\nSta.)',
 'Armstrong, ON (ET)',
 'Davis Junction, IL (Rockford\r\nvia bus)',
 'Victor, CA',
 'Casa Blanca, CA',
 'Worcester, MA',
 'Little Rock, AR (CT)',
 'Glacier, BC',
 'Fremont, CA',
 'Leesburg, FL',
 'Washington, DC (Union\r\nStation) (ET)',
 'Quinlan, OK',
 'Hillsdale, WY',
 'Suspension Bridge, NY (ET)',
 'East Palestine

In [123]:
clean_station_names = {clean_station_name(s) for s in all_station_names}
clean_station_names

{'',
 'Manassas, VA',
 'Boston, MA (South Station) ',
 'Dryden, WA',
 'Spearville, KS',
 'Niagara Falls, NY',
 'Yarmouth, NS ',
 'Forrest, IL',
 'Richton, MS',
 'Everett, WA',
 'Heavener, OK',
 'Louisville, KY ',
 'Sedalia, CO',
 'Chicago, IL (Central Station) ',
 'Jerita, WA',
 'Carbon Cliff, IL',
 'Rayne, LA',
 'Tama, IA',
 'North Philadelphia, PA',
 'Centerville, AL',
 'Williams Junction, AZ',
 'Sands, NM',
 'De Quincy, LA',
 'De Land, FL (Daytona Beach)',
 'Small, TX',
 'Wallis, TX',
 'Bozeman, MT',
 'Frankfort, KY',
 'Biloxi, MS',
 'Jamestown, NY',
 'Cullman, AL',
 'Frankfort, IN',
 'Humboldt, IA (Dakota City)',
 'Cascade Locks, OR',
 'Davenport, IA',
 'Gleichen, AB',
 'Silerton, TN',
 'Marianna, FL ',
 'Gastonia, NC',
 'Julesburg, CO',
 'Ono, CA',
 'Shafter, NV',
 'Wooster, OH',
 'Nassawadox, VA',
 'St. Paul, MN ',
 'Lane, ID',
 'Gordon, AL ',
 'Ferdinand, ID',
 'McLaughlin, SD',
 'Decatur, AL',
 'Lakin, KS',
 'Mayfield, KY',
 'Kentville, NS',
 'Morristown, TN',
 'Crisfield, KS',

In [124]:
len(clean_station_names)

4684