In [11]:
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os


## 1. Quiz: Carrier List

In [12]:
#####################################
#                 1                 #
#####################################

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task in this exercise is to modify 'extract_carrier()` to get a list of
all airlines. Exclude all of the combination values like "All U.S. Carriers"
from the data that you return. You should return a list of codes for the
carriers.

All your changes should be in the 'extract_carrier()' function. The
'options.html' file in the tab above is a stripped down version of what is
actually on the website, but should provide an example of what you should get
from the full file.

Please note that the function 'make_request()' is provided for your reference
only. You will not be able to to actually use it from within the Udacity web UI.
"""

'\nYour task in this exercise is to modify \'extract_carrier()` to get a list of\nall airlines. Exclude all of the combination values like "All U.S. Carriers"\nfrom the data that you return. You should return a list of codes for the\ncarriers.\n\nAll your changes should be in the \'extract_carrier()\' function. The\n\'options.html\' file in the tab above is a stripped down version of what is\nactually on the website, but should provide an example of what you should get\nfrom the full file.\n\nPlease note that the function \'make_request()\' is provided for your reference\nonly. You will not be able to to actually use it from within the Udacity web UI.\n'

In [13]:
html_page = "P4L4_Resources/options.html"


In [14]:
def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")        
        carrier_list = soup.find(id="CarrierList")
        
        for option in carrier_list.find_all('option'):
            carrier_name = option['value']
            
            if "All" in carrier_name:
                pass
            else:
                data.append(carrier_name)
        
        return data


In [15]:
def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__VIEWSTATEGENERATOR",viewstategenerator),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))

    return r.text


In [16]:
def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data
    

In [17]:
if __name__ == "__main__":
    test()
    

## 2. Quiz: Airport List

In [18]:
#####################################
#                 2                 #
#####################################

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete the 'extract_airports()' function so that it returns a list of airport
codes, excluding any combinations like "All".

Refer to the 'options.html' file in the tab above for a stripped down version
of what is actually on the website. The test() assertions are based on the
given file.
"""

'\nComplete the \'extract_airports()\' function so that it returns a list of airport\ncodes, excluding any combinations like "All".\n\nRefer to the \'options.html\' file in the tab above for a stripped down version\nof what is actually on the website. The test() assertions are based on the\ngiven file.\n'

In [19]:
def extract_airports(page):
    data = []
    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")        
        airport_list = soup.find(id="AirportList")
        
        for option in airport_list.find_all('option'):
            airport_name = option['value']
            
            if "All" in airport_name:
                pass
            else:
                data.append(airport_name)
          
    return data


In [20]:
def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data


In [21]:
if __name__ == "__main__":
    test()
    

## 3. Processing All

In [297]:
#####################################
#                 3                 #
#####################################

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Let's assume that you combined the code from the previous 2 exercises with code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with flight info has a table class="dataTDRight". Your task is to
use 'process_file()' to extract the flight data from that table as a list of
dictionaries, each dictionary containing relevant data from the file and table
row. This is an example of the data structure you should return:

data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
         {"courier": "..."}
]

Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

There are couple of helper functions to deal with the data files.
Please do not change them for grading purposes.
All your changes should be in the 'process_file()' function.

The 'data/FL-ATL.html' file in the tab above is only a part of the full data,
covering data through 2003. The test() code will be run on the full table, but
the given file should provide an example of what you will get.
"""


'\nLet\'s assume that you combined the code from the previous 2 exercises with code\nfrom the lesson on how to build requests, and downloaded all the data locally.\nThe files are in a directory "data", named after the carrier and airport:\n"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".\n\nThe table with flight info has a table class="dataTDRight". Your task is to\nuse \'process_file()\' to extract the flight data from that table as a list of\ndictionaries, each dictionary containing relevant data from the file and table\nrow. This is an example of the data structure you should return:\n\ndata = [{"courier": "FL",\n         "airport": "ATL",\n         "year": 2012,\n         "month": 12,\n         "flights": {"domestic": 100,\n                     "international": 100}\n        },\n         {"courier": "..."}\n]\n\nNote - year, month, and the flight data should be integers.\nYou should skip the rows that contain the TOTAL data for a year.\n\nThere are couple of helper

In [392]:
datadir = "P4L4_Resources/data"


In [393]:
def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()
        

In [394]:
def process_all(datadir):
    files = os.listdir(datadir)
    return files


In [395]:
def process_file(f):
    """
    This function extracts data from the file given as the function argument in
    a list of dictionaries. This is example of the data structure you should
    return:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]


    Note - year, month, and the flight data should be integers.
    You should skip the rows that contain the TOTAL data for a year.
    """
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    
    # Note: create a new dictionary for each entry in the output data list.
    # If you use the info dictionary defined here each element in the list 
    # will be a reference to the same info dictionary.
    
    # https://discussions.udacity.com/t/2-3-processing-all-problem-with-for-loops-and-getting-the-data-in/14411/16
    
    with open("{}/{}".format(datadir, f), "r") as html:

        soup = BeautifulSoup(html, "lxml")
        car = info["courier"]
        airp = info["airport"]
        
        mydict = {}
        
        # find a table with class="dataTDRight"
        flight_list = soup.find("table",{"class":"dataTDRight"})
        
        rows = flight_list.find_all("tr")
        
        for row in rows:

            cells = row.find_all("td")

            year = cells[0].get_text()
            year = (year.encode('ascii'))

            month = cells[1].get_text()
            month = (month.encode('ascii'))

            domestic = cells[2].get_text()
            domestic = (domestic.encode('ascii'))

            international = cells[3].get_text()
            international = (international.encode('ascii'))

            # skip "Month" and "TOTAL" because their data type is not int 
            if month != "Month" and month != "TOTAL":
                month = int(month)
                year = int(year)
                domestic = int(domestic.replace(',',''))
                international = int(international.replace(',',''))

                mydict['courier'] = car
                mydict['airport'] = airp
                mydict['year'] = year
                mydict['month'] = month
                mydict['flights'] = (domestic/100,international/100)
                data.append(mydict.copy())

            print mydict
            
    return data



In [408]:
open_zip(datadir)
files = process_all(datadir)

f = "FL-ATL.html"
data = []
info = {}

info["courier"], info["airport"] = f[:6].split("-")

with open("{}/{}".format(datadir, f), "r") as html:
    soup = BeautifulSoup(html, "lxml")
    car = info["courier"]
    airp = info["airport"]

    mydict = {}

    # find a table with class="dataTDRight"
    flight_list = soup.find("table",{"class":"dataTDRight"})
    
    rows = flight_list.find_all("tr")
    
    
    for row in rows:

        cells = row.find_all("td")

        year = cells[0].get_text()
        year = (year.encode('ascii'))

        month = cells[1].get_text()
        month = (month.encode('ascii'))

        domestic = cells[2].get_text()
        domestic = (domestic.encode('ascii'))

        international = cells[3].get_text()
        international = (international.encode('ascii'))
        
        mydict['flights'] = {}

        # skip "Month" and "TOTAL" because their data type is not int 
        if month != "Month" and month != "TOTAL":
            month = int(month)
            year = int(year)
            domestic = int(domestic.replace(',',''))
            international = int(international.replace(',',''))

            mydict['courier'] = car
            mydict['airport'] = airp
            mydict['year'] = year
            mydict['month'] = month
            mydict['flights']['domestic'] = (domestic/100)
            mydict['flights']['international'] = (international/100)
            data.append(mydict.copy())

        # print mydict
print data


[{'flights': {'international': 925, 'domestic': 8154}, 'year': 2002, 'airport': 'ATL', 'courier': 'FL', 'month': 10}, {'flights': {'international': 913, 'domestic': 7667}, 'year': 2002, 'airport': 'ATL', 'courier': 'FL', 'month': 11}, {'flights': {'international': 968, 'domestic': 7821}, 'year': 2002, 'airport': 'ATL', 'courier': 'FL', 'month': 12}, {'flights': {'international': 980, 'domestic': 7856}, 'year': 2003, 'airport': 'ATL', 'courier': 'FL', 'month': 1}, {'flights': {'international': 859, 'domestic': 6907}, 'year': 2003, 'airport': 'ATL', 'courier': 'FL', 'month': 2}, {'flights': {'international': 979, 'domestic': 7976}, 'year': 2003, 'airport': 'ATL', 'courier': 'FL', 'month': 3}, {'flights': {'international': 893, 'domestic': 7666}, 'year': 2003, 'airport': 'ATL', 'courier': 'FL', 'month': 4}, {'flights': {'international': 876, 'domestic': 7898}, 'year': 2003, 'airport': 'ATL', 'courier': 'FL', 'month': 5}, {'flights': {'international': 954, 'domestic': 7988}, 'year': 2003, 

In [399]:
def test():
    print "Running a simple test..."
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    # Test will loop over three data files.
    for f in files:
        data += process_file(f)
        
    assert len(data) == 399  # Total number of rows
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    
    print "... success!"
    

In [329]:
open_zip(datadir)
print myzip

NameError: name 'myzip' is not defined

In [331]:
process_all(datadir)


OSError: [Errno 2] No such file or directory: 'P4L4_Resources/data'

In [321]:
if __name__ == "__main__":
    test()
    

Running a simple test...


OSError: [Errno 2] No such file or directory: 'P4L4_Resources/data'