In [1]:
from bs4 import BeautifulSoup
import requests
import os
from pathlib import Path
import pandas as pd

path_prefix = "dump/"
base_url = "http://www.streamlinerschedules.com/"

In [2]:
def download_track_pages():
    track_pages = [f"concourse/track{i}" for i in range(1, 13)]
    for page in track_pages:
        Path(path_prefix + page).mkdir(exist_ok=True, parents=True)
        with open(f"{path_prefix}{page}/index.html", "wb+") as f:
            f.write(requests.get(f"{base_url}{page}/index.html").content)

check_file = Path(f"{path_prefix}/concourse/track1/index.html")

if check_file.exists() and check_file.stat().st_size > 0:
    print("Already downloaded")
else:
    # download_track_pages()
    pass

Already downloaded


In [3]:
track_pages = [f"concourse/track{i}/index.html" for i in range(1, 13)]

In [4]:
with open(path_prefix + track_pages[0], "rb") as f:
    soup = BeautifulSoup(f, "html.parser")


In [5]:
# Returns a list like "birmspecial194112.html", "carolinaspecial196410.html", etc
def get_train_links(soup):
    relative_links = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if href is not None and ".." not in href and "http://" not in href:
            relative_links.append(href)
    return relative_links

def get_link(relative_link, page):
    page = page.rstrip("index.html")
    return f"{base_url}{page}{relative_link}"


In [6]:
def read_child_pages(page):
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            with open(f"{path_prefix}{page.rstrip('index.html')}/{link}", "wb+") as f:
                f.write(requests.get(get_link(link, page)).content)

In [7]:
all_train_files = []
for page in track_pages:
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            file_location = f"{path_prefix}{page.rstrip('index.html')}{link}"
            all_train_files.append(file_location)


In [9]:
all_train_files

['dump/concourse/track1/birmspecial194112.html',
 'dump/concourse/track1/carolinaspecial196410.html',
 'dump/concourse/track1/citymemphis195008.html',
 'dump/concourse/track1/cityneworl194706.html',
 'dump/concourse/track1/cityneworl197104.html',
 'dump/concourse/track1/crescent195008.html',
 'dump/concourse/track1/crescent197104.html',
 'dump/concourse/track1/crescent197303.html',
 'dump/concourse/track1/georgewash196706.html',
 'dump/concourse/track1/georgian196308.html',
 'dump/concourse/track1/gulfcoastrebel195008.html',
 'dump/concourse/track1/humbird194706.html',
 'dump/concourse/track1/humbird196308.html',
 'dump/concourse/track1/pan-am192701.html',
 'dump/concourse/track1/pan-am195407.html',
 'dump/concourse/track1/piedmontltd197303.html',
 'dump/concourse/track1/powhatan195008.html',
 'dump/concourse/track1/rebel193809.html',
 'dump/concourse/track1/rebel195304.html',
 'dump/concourse/track1/silvercomet194706.html',
 'dump/concourse/track1/soubelle194106.html',
 'dump/concours

In [20]:
def parse_schedule(schedule_filename):
    with open(schedule_filename, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    train_name = soup.find("h1").text
    print(train_name)

    # TODO: Date
    description_element = soup.find('h3', id='rrdate')
    table = soup.find('table')


    stops = pd.DataFrame(columns=["train_number", "station", "time", "miles"])

    rows = table.find_all("tr")

    heads = table.find_all("td", class_="tthead")
    for row in heads:
        if row.text.strip() == "Miles":
            p = row.parent

            for i, td in enumerate(p.find_all("td")):
                if td.text.strip() == "Miles":
                    miles_index = i
                    station_index = i + 1
                    print(miles_index)
                    break

            break

    for row in rows:
        if row.find("td", class_="trainnum") is not None:
            # TODO: Map train number since they continue off each other
            train_nums = [t.text.strip() for t in row.find_all("td", class_="trainnum")]
            continue
        # Don't differentiate between departure and arrival time, since they can be on different lines.
        if row.find("td", class_="times") is None or len(row.find_all("td", class_="times")) != len(train_nums):
            continue
        times = row.find_all("td", class_="times")

        miles = row.find_all("td")[miles_index].text
        station = row.find_all("td")[station_index].text

        for num, time in zip(train_nums, times):
            stops = stops._append({"train_number": num, "station": station.strip(), "time": time.text.strip(), "miles": miles}, ignore_index=True)
    
    return stops




# parse_schedule("dump/concourse/track1/birmspecial194112.html")
# parse_schedule("dump/concourse/track1/carolinaspecial196410.html")
# parse_schedule("dump/concourse/track1/cityneworl194706.html")
parse_schedule("dump/concourse/track1/cityneworl197104.html")

The City of New Orleans
2


Unnamed: 0,train_number,station,time,miles
0,1,"Chicago, IL (Central\r\nStation)\r\n(CT)",8 00A,0.0
1,2,"Chicago, IL (Central\r\nStation)\r\n(CT)",12 35A,0.0
2,1,"63rd St., Woodlawn, IL",8 10A,6.6
3,2,"63rd St., Woodlawn, IL",12\r\n05A,6.6
4,1,"Homewood, IL",8 40A,22.1
...,...,...,...,...
71,2,"Hammond, LA",8 05A,867.8
72,1,"New Orleans, LA (Carrollton\r\nAvenue)",1 15A,918.7
73,2,"New Orleans, LA (Carrollton\r\nAvenue)",7 07A,918.7
74,1,"New Orleans, LA (Union\r\nPsgr.\r\nTml.) (CT)",1 30A,921.1


<td class="tthead">Miles</td>

In [21]:
stations = []
for file in all_train_files:
    try:
        stations.extend(parse_schedule(file).station)
    except Exception as e:
        print(f"Error parsing {file}: {e}")
        continue
    

The Birmingham Special
2
The Carolina Special
2
The City of Memphis
2
The City of New Orleans
2
The City of New Orleans
2
The Crescent
2
The Southern Crescent
2
The Southern Crescent
2
The George Washington
The F. F. V.
The Sportsman
3
Error parsing dump/concourse/track1/georgewash196706.html: list index out of range
The Georgian
2
The Gulf Coast Rebel
2
The Humming Bird
2
The Humming Bird
2
The Pan-American
2
The Pan-American
2
The Piedmont
Trains 7 & 8
Trains 3 & 4
3
Error parsing dump/concourse/track1/piedmontltd197303.html: list index out of range
The Powhatan Arrow
2
The Rebel
2
The Rebel
2
The Silver Comet
2
The Southern Belle
2
The Southern Belle
2
The Southerner
2
The Tennessean
3
Error parsing dump/concourse/track1/tennessean195212.html: list index out of range
The Tamiami Champion (East Coast)
2
The Tamiami Champion (West Coast)
2
The Champion
2
The City of Miami
2
The City of Miami
2
The  Del-Mar-Va Express
2
The  Dixie Flagler
2
The Florida Special
2
The  Gulf Wind
2
The Gu

In [129]:
stations

[' New York, NY (Penna. Sta.) (ET)',
 ' New York, NY (Penna. Sta.) (ET)',
 ' Newark, NJ',
 ' Newark, NJ',
 ' Trenton, NJ',
 ' Trenton, NJ',
 ' North Philadelphia, PA',
 ' North Philadelphia, PA',
 'Philadelphia,\r\nPA (30th\r\nSt. Sta.)',
 'Philadelphia,\r\nPA (30th\r\nSt. Sta.)',
 '\r\n3 12P',
 '\r\n3 12P',
 ' Wilmington, DE',
 ' Wilmington, DE',
 'Baltimore,\r\nMD',
 'Baltimore,\r\nMD',
 '\r\n1 41P',
 '\r\n1 41P',
 ' Washington, DC',
 ' Washington, DC',
 ' Washington, DC',
 ' Washington, DC',
 ' Alexandria, VA',
 ' Alexandria, VA',
 ' Manassas, VA',
 ' Manassas, VA',
 ' Calverton, VA',
 ' Calverton, VA',
 ' Culpeper, VA',
 ' Culpeper, VA',
 ' Orange, VA',
 ' Orange, VA',
 ' Charlottesville, VA',
 ' Charlottesville, VA',
 ' Shipman, VA',
 ' Shipman, VA',
 ' Amherst, VA',
 ' Amherst, VA',
 ' Sweetbriar, VA',
 ' Sweetbriar, VA',
 ' Monroe, VA',
 ' Monroe, VA',
 ' Lynchburg, VA (Kemper St.)',
 ' Lynchburg, VA (Kemper St.)',
 ' Lynchburg, VA (Kemper St.)',
 ' Lynchburg, VA (Kemper St.)',


In [92]:
with open("dump/concourse/track1/birmspecial194112.html", "rb") as f:
    soup = BeautifulSoup(f, "html.parser")

train_name = soup.find("h1", id="trainname").text
print(train_name)

description_element = soup.find('h3', id='rrdate')
table = soup.find('table')

train_numbers = table.find_all("td", class_="trainnum")
for train_number in train_numbers:
    print(train_number.text)

The Birmingham Special
 125
 176(3rd)
 17
 18
 6
 5
 35
 36
 47
 48


In [101]:
stops = pd.DataFrame(columns=["train_number", "station", "time", "miles"])

rows = table.find_all("tr")
for i, header in enumerate(rows[1].find_all("td")):
    if header.text == "Miles":
        miles_index = i
        station_index = i + 1

for row in rows:
    if row.find("td", class_="trainnum") is not None:
        train_nums = [t.text.strip() for t in row.find_all("td", class_="trainnum")]
        continue
    if row.find("td", class_="times") is None:
        continue
    times = row.find_all("td", class_="times")

    miles = row.find_all("td")[miles_index].text
    station = row.find_all("td")[station_index].text

    for num, time in zip(train_nums, times):
        stops = stops._append({"train_number": num, "station": station, "time": time.text, "miles": miles}, ignore_index=True)
    
stops

Unnamed: 0,train_number,station,time,miles
0,125,"New York, NY (Penna. Sta.) (ET)",\r\n12 30P,0.0
1,176(3rd),"New York, NY (Penna. Sta.) (ET)",\r\n4 45P,0.0
2,125,"Newark, NJ",\r\n12 45P,10.0
3,176(3rd),"Newark, NJ",\r\n4 29P,10.0
4,125,"Trenton, NJ",\r\n1 31P,58.1
...,...,...,...,...
113,48,"Laurel, MS",11 04A,209.1
114,47,"Hattiesburg, MS",\r\n5 43P,238.0
115,48,"Hattiesburg, MS",10 30A,238.0
116,47,"New Orleans, LA (Terminal\r\nSta.) (CT)",\r\n8 15P,354.6


In [103]:
for page in track_pages:
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            file_location = f"{path_prefix}{page.rstrip('index.html')}{link}"
            soup = BeautifulSoup(open(file_location, "rb"), "html.parser")
            train_name = soup.find("h1", id="trainname").text
            print(train_name)

The Birmingham Special
The Carolina Special
The City of Memphis
The City of New Orleans


AttributeError: 'NoneType' object has no attribute 'text'