In [1]:
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

# Name of the directory holding the HTML data files. open_zip() is called with
# this value and looks for "<datadir>.zip"; process_file() opens files inside it.
datadir = "03_data"


def open_zip(data_dir):
    """Unpack the archive ``<data_dir>.zip`` into the current working directory.

    Args:
        data_dir: Base name of the archive, i.e. the file name without the
            ``.zip`` suffix.
    """
    archive_path = '{0}.zip'.format(data_dir)
    archive = ZipFile(archive_path, 'r')
    try:
        archive.extractall()
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()


def process_all(data_dir):
    """Return the names of all entries found in ``data_dir``.

    Args:
        data_dir: Path of the directory to list.

    Returns:
        A list of entry names (not full paths), in the order reported by
        ``os.listdir``.
    """
    return os.listdir(data_dir)

In [15]:
def process_file(f):
    """Extract the monthly flight rows from one HTML data file.

    The courier and airport codes are taken from the first six characters of
    the file name, which are split on "-" (e.g. "FL-ATL"). The file is read
    from the module-level ``datadir`` directory.

    This is an example of the data structure that is returned:
    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]

    Note - year, month, and the flight data are integers. Rows that contain
    the TOTAL data for a year are skipped.

    Args:
        f: Name of the HTML file inside ``datadir``.

    Returns:
        A list with one independent dictionary per non-TOTAL table row.
    """
    courier, airport = f[:6].split("-")

    with open(os.path.join(datadir, f), "r") as html:
        # BeautifulSoup consumes the whole file here, so the handle can be
        # closed before the rows are walked.
        soup = BeautifulSoup(html, 'lxml')

    table = soup.find('table', class_='dataTDRight')
    data = []
    for tr in table.find_all('tr', class_='dataTDRight'):
        tds = tr.find_all('td')
        if tds[1].text.strip() == 'TOTAL':
            # Yearly summary row, not a month.
            continue
        # Build a fresh dictionary for every row. Appending one shared dict
        # would make every list element alias the same (last) row.
        data.append({
            "courier": courier,
            "airport": airport,
            "year": int(tds[0].text),
            "month": int(tds[1].text),
            # Counts use "," as a thousands separator.
            "flights": {"domestic": int(tds[2].text.replace(',', '')),
                        "international": int(tds[3].text.replace(',', ''))},
        })

    return data


In [10]:
def test():
    """Run a smoke test over the extracted data files.

    Unpacks the archive for ``datadir``, runs ``process_file`` on every entry
    returned by ``process_all`` and checks the combined result: total row
    count, value types of the first three entries, and selected values of the
    first and last entries.

    Raises:
        AssertionError: If any of the checks fails.
    """
    print ("Running a simple test...")
    open_zip(datadir)

    data = []
    # Test will loop over three data files.
    for name in process_all(datadir):
        data.extend(process_file(name))

    assert len(data) == 399  # Total number of rows

    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2

    first_entry, last_entry = data[0], data[-1]
    assert first_entry["courier"] == 'FL'
    assert first_entry["month"] == 10
    assert last_entry["airport"] == "ATL"
    assert last_entry["flights"] == {'international': 108289, 'domestic': 701425}

    print ("... success!")