In [3]:
import pandas as pd

import re

import requests
import pdfplumber

from bs4 import BeautifulSoup
import os

# Checking out the website

In [176]:
# Contains the name of the reports that have been updated
def update_reports(directory='./reports'):
    """
    Lists the report files that are already stored locally.

    Args:
    directory (str) : folder that holds the downloaded DOT pdf files

    Returns:
    (list) : names of the entries found in `directory`; an empty list when
             the folder does not exist yet (e.g. on a first run)
    """
    # os.listdir raises FileNotFoundError on a missing folder; nothing has
    # been downloaded in that case, so report "no files" instead.
    if not os.path.isdir(directory):
        return []
    return os.listdir(directory)
REPORTS = update_reports()

In [228]:
def beautify_page(url="https://www.transportation.gov/individuals/aviation-consumer-protection/air-travel-consumer-reports-2020", timeout=30):
    """
    Downloads a page and returns a BeautifulSoup object that can be used to parse it.

    Args:
    url (str) : URL that contains links to DOT reports
    timeout (float) : seconds to wait for the server before giving up;
                      without it requests.get can block indefinitely

    Returns:
    soup (obj) : BeautifulSoup object, or None when the server does not
                 answer with status code 200

    Raises:
    requests.exceptions.RequestException : on connection errors or timeouts
    """
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        print('Connection Failure!')
        print(f'Status Code: {page.status_code}')
        # Explicit so callers can test the result with `is None`.
        return None

    soup = BeautifulSoup(page.content, 'html.parser')
    print('Connection Successful!')
    print(url)
    return soup

In [98]:
# Fetch the default (2020) report index page; beautify_page returns None when the status code is not 200.
soup = beautify_page()

Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/air-travel-consumer-reports-2020


In [302]:
def get_all_pdf(soup):
    """
    Extracts all the pdf links from beautified soup of DOT website

    Every link in the first 'mb-4 clearfix' section whose href contains
    'individual' is followed, and the first link inside that page's 'file'
    element is collected.

    Args:
    soup (obj) : Beautiful Soup object (None is accepted and yields no links)

    Returns:
    list_to_update (list) : list of pdf links to DOT data; pages that cannot
                            be fetched or hold no pdf link are skipped
    """
    list_to_update = []
    if soup is None:
        return list_to_update

    report = soup.find_all('div', class_='mb-4 clearfix')
    if not report:
        print('No report section found on the page.')
        return list_to_update

    for a in report[0].find_all('a', href=True):
        sub_link = a['href']
        if 'individual' not in sub_link:
            continue

        # requests needs an absolute URL that includes the scheme
        if sub_link.startswith('www'):
            sub_link = 'https://' + sub_link
        elif not sub_link.startswith('http'):
            sub_link = 'https://www.transportation.gov' + sub_link

        sub_page = beautify_page(sub_link)
        if sub_page is None:
            # beautify_page has already printed the failure details
            continue

        file_block = sub_page.find(class_='file')
        pdf_anchor = file_block.find('a') if file_block is not None else None
        if pdf_anchor is None or not pdf_anchor.get('href'):
            print(f'No pdf link found on {sub_link}')
            continue

        list_to_update.append(pdf_anchor['href'])

    return list_to_update

In [101]:
# Follow the report links on the index page and collect the pdf link found on each one.
list_to_update = get_all_pdf(soup)

Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/december-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/november-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/october-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/september-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/august-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/july-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/june-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/indi

# Downloading and Opening the file

In [229]:
# Source: https://www.youtube.com/watch?v=eTz3VZmNPSE
def download_pdf(url):
    """
    Saves pdf files to local directory

    Args:
    url (str) : string that contains PDF url information

    Returns:
    (str) : string that shows the local directory of where the pdf file is stored.
            False is returned instead when the file name is already listed in
            REPORTS or when the server does not answer with status code 200.

    Raises:
    requests.exceptions.RequestException : on connection errors or timeouts
    """
    # Extracts the last part of the URL to be used as the name of the file
    # ('%' is dropped, so 'a%20b.pdf' is stored as 'a20b.pdf')
    local_filename = url.split('/')[-1].replace('%','')

    if local_filename in REPORTS:
        print(f'Already in the database - {local_filename}')
        return False

    with requests.get(url, timeout=60) as r:
        # Never store an error page under a .pdf name
        if r.status_code != 200:
            print('Download Failure!')
            print(f'Status Code: {r.status_code}')
            return False

        # open() does not create missing folders
        os.makedirs('reports', exist_ok=True)
        with open(f'reports/{local_filename}', 'wb') as f:
            f.write(r.content)

    # REPORTS itself is not modified here; callers refresh it with update_reports()
    return f'reports/{local_filename}'

In [184]:
# Sample report used to try out the download and parsing functions below.
# Alternative sample (file name contains URL-encoded spaces):
# URL = 'https://www.transportation.gov/sites/dot.gov/files/2021-02/February_%202021%20ATCR.pdf'
URL = 'https://www.transportation.gov/sites/dot.gov/files/2020-01/january-2020-atcr-1-16-2020.pdf'

In [185]:
# Local path of the saved pdf, or False when the file name is already listed in REPORTS.
filename = download_pdf(URL)

In [186]:
# Echo the value returned by download_pdf.
filename

'reports/january-2020-atcr-1-16-2020.pdf'

# Dictionaries

In [187]:
# Month number (1-12) keyed by the lower-case full month name and by its
# three-letter abbreviation.
_full_month_names = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
months_dict = {name: number for number, name in enumerate(_full_month_names, start=1)}
# 'may' is its own abbreviation, so updating it leaves that entry unchanged.
months_dict.update({name[:3]: number for name, number in list(months_dict.items())})

In [35]:
# Quarter number (1-4) keyed by the first month of the quarter, spelled out
# or abbreviated to three letters.
quarters_dict = dict(
    january=1, april=2, july=3, october=4,
    jan=1, apr=2, jul=3, oct=4,
)

In [304]:
# Metric column names for each extracted table, in the order the values are
# returned by the table parsers.
features_dict = dict(
    mishandled_baggage=[
        'Number of Bags Enplaned',
        'Number of Bags Mishandled',
        'Number of Bags Mishandled Per 1000 Enplaned',
    ],
    mishandled_ws=[
        'Number of Wheelchairs and Scooters Enplaned',
        'Number of Wheelchairs and Scooters Mishandled',
        'Percent of wheelchairs and Scooters Mishandled',
    ],
    denied_boarding=[
        'Voluntary',
        'Involuntary',
        'Enplaned',
        'Involuntary DB Per 10000 Passengers',
    ],
)

In [305]:
# Final column order of each exported table: identifying columns first,
# followed by the metric columns.
_monthly_id_columns = ['Carrier', 'Year', 'Month']
_quarterly_id_columns = ['Carrier', 'Year', 'Quarter']

features_order = {
    'mishandled_baggage': _monthly_id_columns + [
        'Number of Bags Enplaned',
        'Number of Bags Mishandled',
        'Number of Bags Mishandled Per 1000 Enplaned',
    ],
    'mishandled_ws': _monthly_id_columns + [
        'Number of Wheelchairs and Scooters Enplaned',
        'Number of Wheelchairs and Scooters Mishandled',
        'Percent of wheelchairs and Scooters Mishandled',
    ],
    'denied_boarding': _quarterly_id_columns + [
        'Voluntary',
        'Involuntary',
        'Enplaned',
        'Involuntary DB Per 10000 Passengers',
    ],
}

# Finding the desired page number

In [230]:
def find_operating_page_numbers(filename):
    """
    Finds page numbers for operating metrics.

    The second page of the pdf is scanned line by line; for every line that
    mentions an operating/reporting carrier table, the last token of the
    line is taken as the page number.

    Args:
    filename (str) : Directory for DOT pdf file.

    Returns:
    operating_pages (list) : list of page numbers where the following are located [index]:
        [0]: Mishandled baggages
        [1]: Mishandled wheelchairs and scooters
        [2]: Denied Boarding
        The list can be shorter when fewer matching lines are found; a
        warning is printed in that case.
    """
    with pdfplumber.open(filename) as pdf:
        page = pdf.pages[1]  # second page; presumably the table of contents - TODO confirm
        # extract_text may yield None for a page without text
        text = page.extract_text() or ''

    op_re_exp = r'(Operating Carrier (\(Monthly\)|\(Quarterly\)) \s*\d{1,})|(Reporting Carrier(\s*|\s\(Quarterly\)\s*)\d{1,})'
    re_operating_pages = re.compile(op_re_exp)

    operating_pages = []  # mishandled baggage / mishandled wheelchairs / denied boarding
    for line in text.split('\n'):
        if not re_operating_pages.search(line):
            continue
        # The page number is the right-most non-empty token of the line
        tokens = [token for token in line.split(' ') if token != '']
        try:
            operating_pages.append(int(tokens[-1]))
        except ValueError:
            print(f'Could not read a page number from line: {line}')

    if len(operating_pages) < 3:
        print(f'Expected 3 operating pages but found {len(operating_pages)}!')
    return operating_pages

In [188]:
# Page numbers of the three tables, read from the second page of the sample report.
operating_pages = find_operating_page_numbers(filename)

In [189]:
# Echo the page numbers that were found.
operating_pages

[36, 39, 42]

# Mishandled Baggage

In [231]:
def get_table_values_monthly(filename, page_num):
    """
    Extracts monthly values out of pdf file.

    A table row is a line that starts with a rank number followed by an
    upper-case carrier name; its columns are separated by two spaces. Rows
    whose carrier name or values cannot be parsed are reported and skipped
    as a whole, so both returned lists always have the same length.

    Args:
    filename (str) : string that contains local directory of DOT pdf file

    page_num (int) : integer that shows the page number in PDF file with desired data
                     (1-based)

    Returns:
    carrier_names (list) : list of airline carriers in order appeared in pdf

    values (list) : list metric values extracted from pdf (same order as carrier_names);
                    each entry is [int, int, float]

    month (int) : month (M), or None when no date line was recognised

    year (int) : year (YYYY), or None when no date line was recognised
    """
    with pdfplumber.open(filename) as pdf:
        page = pdf.pages[page_num - 1]
        # extract_text may yield None for a page without text
        text = page.extract_text() or ''

    # Regex
    re_month = re.compile(r'^[A-Za-z]*.\d{2,}') # Finds the month/year
    re_new_rank = re.compile(r'^\d{1,}\s*[A-Z].*') # finds indices
    re_carrier_name = re.compile(r'[A-Z].*[A-Z]') # Carrier Name

    # instantiate lists for values
    values = []
    carrier_names = []
    month = None
    year = None  # stays None when the page has no recognisable date line
    for i, line in enumerate(text.split('\n')):
        # Extracts month information from the first line that parses as a date
        if not month and re_month.search(line):
            try:
                if '-' in line:
                    # e.g. 'Nov-19': abbreviated month, two-digit year
                    month_token, year_token = line.split('-')[:2]
                    parsed_month = months_dict[month_token.lower()]
                    parsed_year = int('20' + year_token[:2])
                else:
                    # e.g. 'November 2019'
                    tokens = line.split(' ')
                    parsed_month = months_dict[tokens[0].lower()]
                    parsed_year = int(tokens[1])
            except (KeyError, IndexError, ValueError):
                # Looked like a date but is not one; keep scanning
                pass
            else:
                month, year = parsed_month, parsed_year

        if not re_new_rank.match(line):
            continue

        carrier_match = re_carrier_name.search(line)
        if carrier_match is None:
            print(f'An error has occured while parsing through a line. [Carrier Name] Line number {i}')
            print(line)
            continue

        try:
            # Filters out all empty strings from the list; columns 2-4 hold the metrics
            fields = [x for x in line.split('  ') if x != ''][2:5]
            if len(fields) != 3:
                raise ValueError('expected 3 value columns')

            # Converts string to numerical values: counts are int, the rate is float
            vals = [int(x.replace(',', '')) for x in fields[:-1]] + [float(fields[-1])]
        except ValueError:
            print(f'An error has occured while parsing through a line. [MB or WS] Line number {i}')
            print(line)
            continue

        # Append both together so the two lists stay aligned
        carrier_names.append(carrier_match[0])
        values.append(vals)
    return carrier_names, values, month, year

In [190]:
# Parse the mishandled-baggage table (first operating page) of the sample report.
carrier_names, values, month, year = get_table_values_monthly(filename, operating_pages[0])

# Name the metric columns, then add the carrier and reporting-period columns.
df_mb = pd.DataFrame(values)
df_mb.columns = features_dict['mishandled_baggage']
df_mb = df_mb.assign(Carrier=carrier_names, Month=month, Year=year)

In [191]:
# Display the mishandled-baggage frame built from the sample report.
df_mb

Unnamed: 0,Number of Bags Enplaned,Number of Bags Mishandled,Number of Bags Mishandled Per 1000 Enplaned,Carrier,Month,Year
0,475890,728,1.53,ALLEGIANT AIR,11,2019
1,856234,2855,3.33,ENDEAVOR AIR,11,2019
2,9287654,33306,3.59,SOUTHWEST AIRLINES,11,2019
3,500011,1894,3.79,HAWAIIAN AIRLINES,11,2019
4,6102639,23162,3.8,DELTA AIR LINES,11,2019
5,832622,3382,4.06,FRONTIER AIRLINES,11,2019
6,1604337,6521,4.06,ALASKA AIRLINES,11,2019
7,991642,4323,4.36,SPIRIT AIRLINES,11,2019
8,347816,1532,4.4,EXPRESSJET AIRLINES,11,2019
9,2391568,10968,4.59,SKYWEST AIRLINES,11,2019


# Mishandled Wheelchairs and Scooters

In [192]:
# Parse the wheelchair/scooter table (second operating page) of the sample report.
carrier_names, values, month, year = get_table_values_monthly(filename, operating_pages[1])

# Name the metric columns, then add the carrier and reporting-period columns.
df_ws = pd.DataFrame(values)
df_ws.columns = features_dict['mishandled_ws']
df_ws = df_ws.assign(Carrier=carrier_names, Month=month, Year=year)

# Displayed with the carrier as index; df_ws itself keeps its default index.
df_ws.set_index('Carrier')

Unnamed: 0_level_0,Number of Wheelchairs and Scooters Enplaned,Number of Wheelchairs and Scooters Mishandled,Percent of wheelchairs and Scooters Mishandled,Month,Year
Carrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALLEGIANT AIR,1271,3,0.24,11,2019
EXPRESSJET AIRLINES,401,3,0.75,11,2019
ENDEAVOR AIR,1196,10,0.84,11,2019
DELTA AIR LINES,12228,109,0.89,11,2019
SKYWEST AIRLINES,3177,32,1.01,11,2019
JETBLUE AIRWAYS,3141,35,1.11,11,2019
ALASKA AIRLINES,2163,30,1.39,11,2019
MESA AIRLINES,789,11,1.39,11,2019
SOUTHWEST AIRLINES,14706,217,1.48,11,2019
REPUBLIC AIRWAYS,1026,16,1.56,11,2019


# Denied Boarding

In [232]:
def get_table_values_quarterly(filename, page_num): 
    """
    Extracts quarterly values out of pdf file.

    A table row is a line that starts with a rank number followed by an
    upper-case carrier name; its columns are separated by two spaces. Rows
    whose carrier name or values cannot be parsed are reported and skipped
    as a whole, so both returned lists always have the same length.

    Args:
    filename (str) : string that contains local directory of DOT pdf file

    page_num (int) : integer that shows the page number in PDF file with desired data
                     (1-based)

    Returns:
    carrier_names (list) : list of airline carriers in order appeared in pdf

    values (list) : list metric values extracted from pdf (same order as carrier_names);
                    each entry is [int, int, int, float]

    quarter (int) : quarter (1-4), or None when no period line was recognised

    year (int) : year (YYYY), or None when no period line was recognised
    """
    with pdfplumber.open(filename) as pdf:
        page = pdf.pages[page_num - 1]
        # extract_text may yield None for a page without text
        text = page.extract_text() or ''

    # Regex
    re_month = re.compile(r'^[A-Za-z]*.-.[A-Za-z]*.\d{2,}') # Finds the month/year
    re_new_rank = re.compile(r'^\d{1,}\s*[A-Z].*') # finds indices
    re_carrier_name = re.compile(r'[A-Z].*[A-Z]') # Carrier Name
    re_year = re.compile(r'\d{2,}') # first run of two or more digits

    # instantiate lists for values
    values = []
    carrier_names = []
    quarter = None
    year = None  # stays None when the page has no recognisable period line
    for i, line in enumerate(text.split('\n')):
        # Extracts time information, e.g. 'October - December 2019' or 'Oct-Dec 19'
        if not quarter and re_month.search(line):
            # The quarter is identified by the first month of the range
            first_month = line.split(' ')[0].split('-')[0].lower()
            year_match = re_year.search(line)
            if first_month in quarters_dict and year_match:
                quarter = quarters_dict[first_month]
                year_text = year_match[0]
                # two-digit years are taken to be 20YY
                year = int('20' + year_text) if len(year_text) == 2 else int(year_text)

        if not re_new_rank.match(line):
            continue

        carrier_match = re_carrier_name.search(line)
        if carrier_match is None:
            print(f'An error has occured while parsing through a line. [Carrier Name] Line number {i}')
            print(line)
            continue

        try:
            # Filters out all empty strings from the list; columns 2-5 hold the metrics
            fields = [x for x in line.split('  ') if x != ''][2:6]
            if len(fields) != 4:
                raise ValueError('expected 4 value columns')

            # Counts are int, the rate in the last column is float
            vals = [int(x.replace(',', '')) for x in fields[:-1]] + [float(fields[-1])]
        except ValueError:
            print(f'An error has occured while parsing through a line. [Denied Boarding] Line number {i}')
            print(line)
            continue

        # Append both together so the two lists stay aligned
        carrier_names.append(carrier_match[0])
        values.append(vals)
    return carrier_names, values, quarter, year

In [226]:
# Parse the denied-boarding table (third operating page) of the sample report.
carrier_names, values, quarter, year = get_table_values_quarterly(filename, operating_pages[2])

# Name the metric columns, then add the carrier and reporting-period columns.
df_db = pd.DataFrame(values)
df_db.columns = features_dict['denied_boarding']
df_db = df_db.assign(Carrier=carrier_names, Quarter=quarter, Year=year)

# Displayed with the carrier as index; df_db itself keeps its default index.
df_db.set_index('Carrier')

Unnamed: 0_level_0,Voluntary,Involuntary,Enplaned,Involuntary DB Per 10000 Passengers,Quarter,Year
Carrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENDEAVOR AIR,6772,0,3780820,0.0,4,2019
DELTA AIR LINES,26246,0,36796401,0.0,4,2019
HAWAIIAN AIRLINES,101,0,2674636,0.0,4,2019
UNITED AIRLINES,5668,13,25141731,0.01,4,2019
ALLEGIANT AIR,347,2,3613426,0.01,4,2019
EXPRESSJET AIRLINES,990,1,1467237,0.01,4,2019
JETBLUE AIRWAYS,637,16,9504092,0.02,4,2019
SOUTHWEST AIRLINES,4339,269,40839000,0.07,4,2019
SKYWEST AIRLINES,13524,116,10085483,0.12,4,2019
ALASKA AIRLINES,3147,108,8547197,0.13,4,2019


# Iterating through different files

In [307]:
REPORTS = update_reports()

# Instantiate dataframes
df_mb = pd.DataFrame() # missing baggages
df_ws = pd.DataFrame() # missing wheelchair/scooters
df_db = pd.DataFrame() # denied boarding

# Gets the DOT page with reports
soup = beautify_page()

# Gets list of URL that are available on DOT site
list_to_update = get_all_pdf(soup)

# One entry per table, in the order find_operating_page_numbers returns the pages:
# (features key, progress label, parser, name of the period column)
tables = [
    ('mishandled_baggage', 'Mishandling Baggages', get_table_values_monthly, 'Month'),
    ('mishandled_ws', 'Mishandling W/S', get_table_values_monthly, 'Month'),
    ('denied_boarding', 'Denied Boarding', get_table_values_quarterly, 'Quarter'),
]
# Per-report frames are collected here and concatenated once after the loop
new_frames = {key: [] for key, _, _, _ in tables}

# The site lists the newest report first; reverse to process the oldest first
for i, url in enumerate(list_to_update[::-1]):
    filename = download_pdf(url)
    print(f'({i+1}/{len(list_to_update)}) {filename}')
    if not filename:
        # download_pdf returned False: nothing to process for this URL
        continue

    operating_pages = find_operating_page_numbers(filename)
    print(operating_pages)
    if len(operating_pages) < len(tables):
        print(f'Skipping {filename}: could not locate all {len(tables)} table pages.')
        continue

    for (key, label, extract, period_column), page_num in zip(tables, operating_pages):
        print(f'Processing... {label}')
        carrier_names, values, period, year = extract(filename, page_num)

        # Creating DataFrame (columns are named up front so an empty table stays valid)
        df = pd.DataFrame(values, columns=features_dict[key])
        df['Carrier'] = carrier_names
        df[period_column] = period
        df['Year'] = year
        new_frames[key].append(df)

    print('Done.')

# Combine the collected frames and put the features in their final order
if new_frames['mishandled_baggage']:
    df_mb = pd.concat(new_frames['mishandled_baggage'], axis=0)[features_order['mishandled_baggage']]
if new_frames['mishandled_ws']:
    df_ws = pd.concat(new_frames['mishandled_ws'], axis=0)[features_order['mishandled_ws']]
if new_frames['denied_boarding']:
    df_db = pd.concat(new_frames['denied_boarding'], axis=0)[features_order['denied_boarding']]

print(f'All {len(list_to_update)} files processed!')

Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/air-travel-consumer-reports-2020
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/december-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/november-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/october-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/september-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/august-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/july-2020-air-travel-consumer-report
Connection Successful!
https://www.transportation.gov/individu

In [308]:
REPORTS = update_reports()

# Gets the DOT page with reports
soup = beautify_page('https://www.transportation.gov/individuals/aviation-consumer-protection/air-travel-consumer-reports-2021')

# Gets list of URL that are available on DOT site
list_to_update = get_all_pdf(soup)

# One entry per table, in the order find_operating_page_numbers returns the pages:
# (features key, progress label, parser, name of the period column)
tables = [
    ('mishandled_baggage', 'Mishandling Baggages', get_table_values_monthly, 'Month'),
    ('mishandled_ws', 'Mishandling W/S', get_table_values_monthly, 'Month'),
    ('denied_boarding', 'Denied Boarding', get_table_values_quarterly, 'Quarter'),
]
# Per-report frames are collected here and appended to the existing
# df_mb / df_ws / df_db (created by an earlier cell) once after the loop
new_frames = {key: [] for key, _, _, _ in tables}

# The site lists the newest report first; reverse to process the oldest first
for i, url in enumerate(list_to_update[::-1]):
    filename = download_pdf(url)
    print(f'({i+1}/{len(list_to_update)}) {filename}')
    if not filename:
        # download_pdf returned False: nothing to process for this URL
        continue

    operating_pages = find_operating_page_numbers(filename)
    print(operating_pages)
    if len(operating_pages) < len(tables):
        print(f'Skipping {filename}: could not locate all {len(tables)} table pages.')
        continue

    for (key, label, extract, period_column), page_num in zip(tables, operating_pages):
        print(f'Processing... {label}')
        carrier_names, values, period, year = extract(filename, page_num)

        # Creating DataFrame (columns are named up front so an empty table stays valid)
        df = pd.DataFrame(values, columns=features_dict[key])
        df['Carrier'] = carrier_names
        df[period_column] = period
        df['Year'] = year
        new_frames[key].append(df)

    print('Done.')

# Append the collected frames and put the features in their final order
if new_frames['mishandled_baggage']:
    df_mb = pd.concat([df_mb, *new_frames['mishandled_baggage']], axis=0)[features_order['mishandled_baggage']]
if new_frames['mishandled_ws']:
    df_ws = pd.concat([df_ws, *new_frames['mishandled_ws']], axis=0)[features_order['mishandled_ws']]
if new_frames['denied_boarding']:
    df_db = pd.concat([df_db, *new_frames['denied_boarding']], axis=0)[features_order['denied_boarding']]

print(f'All {len(list_to_update)} files processed!')

Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/air-travel-consumer-reports-2021
Connection Successful!
https://www.transportation.gov/individuals/aviation-consumer-protection/february-2021-air-travel-consumer-report
Connection Successful!
http://www.transportation.gov/individuals/aviation-consumer-protection/january-2021-air-travel-consumer-report
(1/2) reports/January20202120ATCR20rev202-2-2021.pdf
[36, 39, 42]
Processing... Mishandling Baggages
Processing... Mishandling W/S
Processing... Denied Boarding
Done.
(2/2) reports/February_20202120ATCR.pdf
[41, 46, 51]
Processing... Mishandling Baggages
Processing... Mishandling W/S
Processing... Denied Boarding
Done.
All 2 files processed!


# Converting DataFrame to CSV

In [309]:
import datetime

In [310]:
# Date stamp (MMDDYYYY) shared by the three exported files
date = datetime.datetime.now().strftime("%m%d%Y")

# to_csv does not create missing folders
os.makedirs('data', exist_ok=True)

# File name prefix and the frame written under it
# ('ms' is the prefix used for the mishandled-baggage frame)
for prefix, frame in (('dot_ms_report_', df_mb),
                      ('dot_ws_report_', df_ws),
                      ('dot_db_report_', df_db)):
    file = prefix + date
    frame.to_csv(f'data/{file}.csv', index = False)