In [1]:
from bs4 import BeautifulSoup
import requests
import os
from pathlib import Path
import pandas as pd

path_prefix = "dump/"
base_url = "http://www.streamlinerschedules.com/"

In [2]:
def download_track_pages():
    track_pages = [f"concourse/track{i}" for i in range(1, 13)]
    for page in track_pages:
        Path(path_prefix + page).mkdir(exist_ok=True, parents=True)
        with open(f"{path_prefix}{page}/index.html", "wb+") as f:
            f.write(requests.get(f"{base_url}{page}/index.html").content)

check_file = Path(f"{path_prefix}/concourse/track1/index.html")

if check_file.exists() and check_file.stat().st_size > 0:
    print("Already downloaded")
else:
    # download_track_pages()
    pass

Already downloaded


In [3]:
track_pages = [f"concourse/track{i}/index.html" for i in range(1, 13)]

In [4]:
with open(path_prefix + track_pages[0], "rb") as f:
    soup = BeautifulSoup(f, "html.parser")


In [5]:
# Returns a list like "birmspecial194112.html", "carolinaspecial196410.html", etc
def get_train_links(soup):
    relative_links = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if href is not None and ".." not in href and "http://" not in href:
            relative_links.append(href)
    return relative_links

def get_link(relative_link, page):
    page = page.rstrip("index.html")
    return f"{base_url}{page}{relative_link}"


In [6]:
def read_child_pages(page):
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            with open(f"{path_prefix}{page.rstrip('index.html')}/{link}", "wb+") as f:
                f.write(requests.get(get_link(link, page)).content)

In [7]:
all_train_files = []
for page in track_pages:
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            file_location = f"{path_prefix}{page.rstrip('index.html')}{link}"
            all_train_files.append(file_location)


In [9]:
all_train_files

['dump/concourse/track1/birmspecial194112.html',
 'dump/concourse/track1/carolinaspecial196410.html',
 'dump/concourse/track1/citymemphis195008.html',
 'dump/concourse/track1/cityneworl194706.html',
 'dump/concourse/track1/cityneworl197104.html',
 'dump/concourse/track1/crescent195008.html',
 'dump/concourse/track1/crescent197104.html',
 'dump/concourse/track1/crescent197303.html',
 'dump/concourse/track1/georgewash196706.html',
 'dump/concourse/track1/georgian196308.html',
 'dump/concourse/track1/gulfcoastrebel195008.html',
 'dump/concourse/track1/humbird194706.html',
 'dump/concourse/track1/humbird196308.html',
 'dump/concourse/track1/pan-am192701.html',
 'dump/concourse/track1/pan-am195407.html',
 'dump/concourse/track1/piedmontltd197303.html',
 'dump/concourse/track1/powhatan195008.html',
 'dump/concourse/track1/rebel193809.html',
 'dump/concourse/track1/rebel195304.html',
 'dump/concourse/track1/silvercomet194706.html',
 'dump/concourse/track1/soubelle194106.html',
 'dump/concours

In [63]:
def maybe_get_schedule_from_row(stops, row, train_nums, miles_index, station_index):
    if train_nums is None:
        return False
    try:
        # Don't differentiate between departure and arrival time, since they can be on different lines.
        if row.find("td", class_="times") is None or len(row.find_all("td", class_="times")) != len(train_nums):
            return False
        miles = row.find("td", class_="miles")
        if miles is None:
            return False
        station = miles.findNext('td')

        times = row.find_all("td", class_="times")
        for num, time in zip(train_nums, times):
            stops.loc[len(stops)] = [num, station.text.strip(), time.text.strip(), miles.text.strip()]
        return True
    except Exception as e:
        print(row, e)
        raise e

def parse_schedule(schedule_filename):
    with open(schedule_filename, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    train_name = soup.find("h1").text

    # TODO: Date
    description_element = soup.find('h3', id='rrdate')
    table = soup.find('table')


    stops = pd.DataFrame(columns=["train_number", "station", "time", "miles"])

    rows = table.find_all("tr")

    heads = table.find_all("td", class_="tthead")
    for row in heads:
        if row.text.strip() == "Miles":
            p = row.parent

            for i, td in enumerate(p.find_all("td")):
                if td.text.strip() == "Miles":
                    miles_index = i
                    station_index = i + 1
                    break

            break

    train_nums = None
    num_rows_skipped = 0
    for row in rows:
        if row.find("td", class_="trainnum") is not None:
            # TODO: Map train number since they continue off each other
            train_nums = [t.text.strip() for t in row.find_all("td", class_="trainnum")]
            continue

        added_schedule = maybe_get_schedule_from_row(stops, row, train_nums, miles_index, station_index)
        if not added_schedule:
            num_rows_skipped += 1
    print(train_name, num_rows_skipped, "rows skipped")
    
    return stops




parse_schedule("dump/concourse/track1/birmspecial194112.html")
# parse_schedule("dump/concourse/track1/carolinaspecial196410.html")
# parse_schedule("dump/concourse/track1/cityneworl194706.html")
# parse_schedule("dump/concourse/track1/cityneworl197104.html")
# parse_schedule("dump/concourse/track1/georgewash196706.html")
# parse_schedule("dump/concourse/track1/gulfcoastrebel195008.html")
# parse_schedule("dump/concourse/track3/erieltd195103.html")

The Birmingham Special 9 rows skipped


Unnamed: 0,train_number,station,time,miles
0,125,"New York, NY (Penna. Sta.) (ET)",12 30P,0.0
1,176(3rd),"New York, NY (Penna. Sta.) (ET)",4 45P,0.0
2,125,"Newark, NJ",12 45P,10.0
3,176(3rd),"Newark, NJ",4 29P,10.0
4,125,"Trenton, NJ",1 31P,58.1
...,...,...,...,...
105,48,"Laurel, MS",11 04A,209.1
106,47,"Hattiesburg, MS",5 43P,238.0
107,48,"Hattiesburg, MS",10 30A,238.0
108,47,"New Orleans, LA (Terminal\r\nSta.) (CT)",8 15P,354.6


<td class="tthead">Miles</td>

In [64]:
stations = []
for file in all_train_files:
    try:
        stations.extend(parse_schedule(file).station)
    except Exception as e:
        print(f"Error parsing {file}: {e}")
        continue
    

The Birmingham Special 9 rows skipped
The Carolina Special 9 rows skipped
The City of Memphis 1 rows skipped
The City of New Orleans 10 rows skipped
The City of New Orleans 9 rows skipped
The Crescent 14 rows skipped
The Southern Crescent 12 rows skipped
The Southern Crescent 7 rows skipped
The George Washington
The F. F. V.
The Sportsman 27 rows skipped
The Georgian 13 rows skipped
The Gulf Coast Rebel 8 rows skipped
The Humming Bird 6 rows skipped
The Humming Bird 27 rows skipped
The Pan-American 11 rows skipped
The Pan-American 10 rows skipped
The Piedmont
Trains 7 & 8
Trains 3 & 4 6 rows skipped
The Powhatan Arrow 5 rows skipped
The Rebel 8 rows skipped
The Rebel 6 rows skipped
The Silver Comet 7 rows skipped
The Southern Belle 4 rows skipped
The Southern Belle 5 rows skipped
The Southerner 8 rows skipped
The Tennessean 16 rows skipped
The Tamiami Champion (East Coast) 10 rows skipped
The Tamiami Champion (West Coast) 10 rows skipped
The Champion 9 rows skipped
The City of Miami 2 

In [66]:
set(stations)

{'',
 'Niagara Falls, NY',
 'Forrest, IL',
 'Everett, WA',
 'Chicago, IL (Dearborn Sta.)\r\n(CT)',
 'Rayne, LA',
 'Carbon Cliff, IL',
 'Sollefteå',
 'Potter, NE',
 'De Land, FL (Daytona Beach)',
 '1450',
 'Cascade Locks, OR',
 'Desert, CA',
 'New Orleans, LA (Carrollton\r\nAvenue)',
 'New\r\nYork, NY (Penn Station)',
 '221',
 'McLaughlin, SD',
 'Decatur, AL',
 'Big Horn, NM',
 '332',
 '219',
 'La Salle, CO (Greeley\r\nvia on-call taxi)',
 'Richmond, VA (Main\r\nStreet Station)',
 'Brazos, TX',
 'Gillett, FL',
 'Bay City, WI',
 'Mildred, MT',
 'Marengo, WA',
 'Vassijaure',
 'Normangee, TX',
 'Los Angeles, CA (S.P.\r\nStation) (PT)',
 'Westerly, RI',
 'Danby, CA',
 'Galion, OH',
 'Bradenton-Manatee, FL',
 'Dome, AZ',
 'Culpeper, VA',
 'Houston, TX (Union\r\nStation)',
 'Plymouth, MI',
 'Houston, TX (Southern Pacific\r\nSta.)',
 'Armstrong, ON (ET)',
 'Falköping',
 'Ucon, ID',
 'Davis Junction, IL (Rockford\r\nvia bus)',
 'Victor, CA',
 'Casa Blanca, CA',
 'Rigby, ID',
 'Worcester, MA',
 

In [92]:
with open("dump/concourse/track1/birmspecial194112.html", "rb") as f:
    soup = BeautifulSoup(f, "html.parser")

train_name = soup.find("h1", id="trainname").text
print(train_name)

description_element = soup.find('h3', id='rrdate')
table = soup.find('table')

train_numbers = table.find_all("td", class_="trainnum")
for train_number in train_numbers:
    print(train_number.text)

The Birmingham Special
 125
 176(3rd)
 17
 18
 6
 5
 35
 36
 47
 48


In [101]:
stops = pd.DataFrame(columns=["train_number", "station", "time", "miles"])

rows = table.find_all("tr")
for i, header in enumerate(rows[1].find_all("td")):
    if header.text == "Miles":
        miles_index = i
        station_index = i + 1

for row in rows:
    if row.find("td", class_="trainnum") is not None:
        train_nums = [t.text.strip() for t in row.find_all("td", class_="trainnum")]
        continue
    if row.find("td", class_="times") is None:
        continue
    times = row.find_all("td", class_="times")

    miles = row.find_all("td")[miles_index].text
    station = row.find_all("td")[station_index].text

    for num, time in zip(train_nums, times):
        stops = stops._append({"train_number": num, "station": station, "time": time.text, "miles": miles}, ignore_index=True)
    
stops

Unnamed: 0,train_number,station,time,miles
0,125,"New York, NY (Penna. Sta.) (ET)",\r\n12 30P,0.0
1,176(3rd),"New York, NY (Penna. Sta.) (ET)",\r\n4 45P,0.0
2,125,"Newark, NJ",\r\n12 45P,10.0
3,176(3rd),"Newark, NJ",\r\n4 29P,10.0
4,125,"Trenton, NJ",\r\n1 31P,58.1
...,...,...,...,...
113,48,"Laurel, MS",11 04A,209.1
114,47,"Hattiesburg, MS",\r\n5 43P,238.0
115,48,"Hattiesburg, MS",10 30A,238.0
116,47,"New Orleans, LA (Terminal\r\nSta.) (CT)",\r\n8 15P,354.6


In [103]:
for page in track_pages:
    with open(path_prefix + page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
        links = get_train_links(soup)
        for link in links:
            file_location = f"{path_prefix}{page.rstrip('index.html')}{link}"
            soup = BeautifulSoup(open(file_location, "rb"), "html.parser")
            train_name = soup.find("h1", id="trainname").text
            print(train_name)

The Birmingham Special
The Carolina Special
The City of Memphis
The City of New Orleans


AttributeError: 'NoneType' object has no attribute 'text'