In [3]:
from collections import Counter

import numpy as np

In [4]:
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

In [5]:
# Create the SQLAlchemy engine for the SQLite file data/aviation_accidents.sqlite
engine = create_engine('sqlite:///data/aviation_accidents.sqlite')

In [6]:
# Reflect the existing schema and automap each table to an ORM class,
# then display the names of the mapped tables found in the source db.
# NOTE(review): `reflect=True` is the legacy spelling; SQLAlchemy 1.4+ documents
# `Base.prepare(autoload_with=engine)` instead — confirm the installed version
# before changing it.
Base = automap_base()
Base.prepare(engine,reflect=True)
Base.classes.keys()

['aviation_accidents']

In [7]:
# ORM class automapped from the `aviation_accidents` table
Accident = Base.classes.aviation_accidents

In [8]:
# Session bound to the SQLite engine; used for every query below
session = Session(bind=engine)

In [9]:
# Fetch the EventId column for every row of the accidents table
# (each element of `results` is a one-column row, flattened in the next cell)
results = session.query(Accident.EventId).all()

In [10]:
# Check for duplicate EventIds: display the set of ids that occur more than once.
# (Author's note from the original run: 2 max, none have count > 2.)
# Counter tallies every id in a single pass, instead of rescanning the whole
# list with `events.count(x)` once per element.
events = list(np.ravel(results))
event_counts = Counter(events)
{event for event, count in event_counts.items() if count > 1}

In [11]:
# Derive a 2-letter code per accident: the slice [5:7] of each AccidentNumber
results = session.query(Accident.AccidentNumber).all()

accident_numbers = list(np.ravel(results))
types = [number[5:7] for number in accident_numbers]

In [12]:
# confirm 11248 codes were generated (count the codes themselves, not the raw query rows)
len(types)

11248

#### webscrape HTML reports

In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [27]:
# Start a headless Chrome session through splinter, driven by `chromedriver.exe`
executable_path = {'executable_path':'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

In [13]:
# Generate direct urls to the HTML reports, one per accident row.
# URL template:
#   <base_url>?EventID={event}&AKey={key}&RType=HTML&IType={type}
# EventID is not unique => AKey is incremented for each repeat of the same
#   (EventID, IType) pair: 1 for the first occurrence, 2 for the second, ...
# IType is pulled from the generated 2-letter codes.
base_url = 'https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx'

# events and types come from separate queries and are paired by position
assert len(events) == len(types), 'events and types must be the same length'

to_visit = []
occurrences = {}  # (event, type) -> number of urls already generated for that pair
for event, itype in zip(events, types):
    # dict lookup replaces the `url in to_visit` list scan, and the counter keeps
    # producing distinct urls if a pair ever shows up more than twice
    akey = occurrences.get((event, itype), 0) + 1
    occurrences[(event, itype)] = akey
    to_visit.append(
        f'{base_url}?EventID={event}&AKey={akey}&RType=HTML&IType={itype}'
    )

In [14]:
# confirm 11248 generated urls (one url is appended per accident row)
len(to_visit)

11248

**scrape NTSB data**

In [15]:
import re

In [28]:
%%time

# Visit each generated URL and record exactly one value per report:
#   - the text inside the first parentheses found around 'Departure Point'
#     (the airport code),
#   - 'NO CODE' when a departure point is present but nothing is in parentheses,
#   - 'Missing Report' when the page has no 'Departure Point' text at all.
departure_label = re.compile('Departure Point')
# raw string: '\(' inside a plain string literal is an invalid escape sequence
code_in_parens = re.compile(r'\(([^)]+)')

airports = []
for url in to_visit:
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')

    # search for the label once and reuse the result
    # (`string=` is the current name of bs4's older `text=` keyword)
    labels = soup(string=departure_label)
    if not labels:
        airports.append('Missing Report')
        continue

    # the text searched for a code belongs to the element four levels above
    # the first matching label
    # NOTE(review): presumably the enclosing table section — confirm against a report page
    section_text = labels[0].parent.parent.parent.parent.text
    match = code_in_parens.search(section_text)
    airports.append(match.group(1) if match else 'NO CODE')

3 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181112X55458&AKey=1&RType=HTML&IType=WA
21 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181025X34651&AKey=1&RType=HTML&IType=WA
25 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181022X64842&AKey=1&RType=HTML&IType=WA
39 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181009X50459&AKey=1&RType=HTML&IType=WA
44 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181108X50459&AKey=1&RType=HTML&IType=WA
47 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181004X75323&AKey=1&RType=HTML&IType=WA
55 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20181004X25929&AKey=1&RType=HTML&IType=WA
71 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20180917X41852&AKey=1&RType=HTML&IType=WA
81 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20180912X80231&AKey=1&RType=HTML&IT

1164 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170517X14413&AKey=1&RType=HTML&IType=WA
1167 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170524X63941&AKey=1&RType=HTML&IType=WA
1220 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170609X60849&AKey=1&RType=HTML&IType=WA
1229 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170419X40838&AKey=1&RType=HTML&IType=WA
1257 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170330X00553&AKey=1&RType=HTML&IType=WA
1278 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170316X90652&AKey=1&RType=HTML&IType=WA
1281 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170315X62526&AKey=1&RType=HTML&IType=WA
1282 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170313X61441&AKey=1&RType=HTML&IType=WA
1295 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20170308X91038&A

2327 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20151127X31751&AKey=1&RType=HTML&IType=WA
2329 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20151123X54714&AKey=1&RType=HTML&IType=WA
2344 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20151123X63526&AKey=1&RType=HTML&IType=WA
2352 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20151109X20902&AKey=1&RType=HTML&IType=WA
2421 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20151013X23913&AKey=1&RType=HTML&IType=WA
2436 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20150928X20026&AKey=1&RType=HTML&IType=WA
2437 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20150928X20026&AKey=2&RType=HTML&IType=WA
2451 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20150924X44711&AKey=1&RType=HTML&IType=WA
2486 https://app.ntsb.gov/pdfgenerator/ReportGeneratorFile.ashx?EventID=20160411X70823&A

In [29]:
# confirm 11248 airport codes (one entry is appended per visited url)
len(airports)

11248

In [31]:
# number of missing codes: a departure point was found but no text in parentheses
airports.count('NO CODE')

1018

In [32]:
# number of unfinished reports, i.e. pages with no 'Departure Point' text
airports.count('Missing Report')

209

In [33]:
# scraping is finished: shut down the headless browser
browser.quit()

#### save airport codes to .TXT file

In [37]:
# Write the scraped airport codes to data/airports.txt, one per line, in the
# same order as the generated urls. The `with` block closes the file even if a
# write fails; plain 'w' is enough because the file is never read back here.
with open('data/airports.txt', 'w') as airports_file:
    airports_file.writelines(f'{airport}\n' for airport in airports)