In [1]:
# Import the modules and packages we will use
import pandas as pd
import glob
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from pathlib import Path

%load_ext blackcellmagic

In [2]:
# Notebook-wide configuration constants

# Airport under study: ORD (O'Hare International Airport)
AIRPORT_OF_INTEREST = "ORD"

#### PATHS AND DIRECTORIES
ROOT = Path(".")  # project root, resolved relative to the current working directory

# Names of the data folder and of its sub-folders
DATA_FOLDER = "data"
RAW_DATA_FOLDER = "raw"
PROCESSED_DATA_FOLDER = "processed"

# The data folder holds the inputs, so it must already exist
DATA_PATH = ROOT.joinpath(DATA_FOLDER)
if not DATA_PATH.exists():
    raise FileNotFoundError(f"The data folder does not exist: {DATA_PATH}")

# Same requirement for the raw-data folder inside it
RAW_DATA_PATH = DATA_PATH.joinpath(RAW_DATA_FOLDER)
if not RAW_DATA_PATH.exists():
    raise FileNotFoundError(f"The raw data folder does not exist: {RAW_DATA_PATH}")

# The processed-data folder is an output location: create it when missing
PROCESSED_DATA_PATH = DATA_PATH.joinpath(PROCESSED_DATA_FOLDER)
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Name of the HDF5 file containing the data
HDF5_FILE = "flight_data.h5"

##  1. Process the flight data

In [3]:
def process_flight_data(
    raw_data_path: Path = RAW_DATA_PATH,
    processed_data_path: Path = PROCESSED_DATA_PATH,
    hdf5_file: str = HDF5_FILE,
    key: str = f"ORIGIN:{AIRPORT_OF_INTEREST}",
) -> pd.DataFrame:
    """Return the flights whose ``ORIGIN`` equals ``AIRPORT_OF_INTEREST``.

    The result is cached in an HDF5 file. If ``processed_data_path / hdf5_file``
    exists and contains ``key``, the cached dataframe is returned directly.
    Otherwise every ``*.csv`` file in ``raw_data_path`` is read, the rows are
    concatenated and filtered, and the filtered dataframe is stored under ``key``
    before being returned.

    Parameters
    ----------
    raw_data_path : Path
        Folder searched (non-recursively) for ``*.csv`` files.
    processed_data_path : Path
        Folder containing (or receiving) the HDF5 cache file.
    hdf5_file : str
        File name of the HDF5 cache inside ``processed_data_path``.
    key : str
        HDF5 key under which the filtered dataframe is stored.
        NOTE: the filter always uses the module-level ``AIRPORT_OF_INTEREST``;
        ``key`` only names the cache entry and does not select the airport.

    Returns
    -------
    pd.DataFrame
        The filtered flights. When freshly built, the index holds each row's
        position in the concatenation of all CSV files (it is not reset after
        filtering).

    Raises
    ------
    ValueError
        If the cache is missing and ``raw_data_path`` contains no CSV file.
    """

    def load_csv_files(files: list) -> pd.DataFrame:
        """Read every CSV file in ``files`` and stack them into one dataframe."""
        frames = [pd.read_csv(filename, index_col=None, header=0) for filename in files]
        return pd.concat(frames, axis=0, ignore_index=True)

    hdf5_path = processed_data_path / hdf5_file

    # Fast path: the filtered dataframe has already been cached
    try:
        return pd.read_hdf(hdf5_path, key=key)
    except (FileNotFoundError, KeyError):
        pass  # no cache file, or the file does not hold this key yet: build it below

    # Sorted so that the files are always concatenated in the same order
    csv_files = sorted(raw_data_path.glob("*.csv"))
    if not csv_files:
        # pd.concat would raise an opaque "No objects to concatenate" ValueError
        raise ValueError(f"No CSV files found in: {raw_data_path}")

    print("Loading files, this may take a while...")
    df = load_csv_files(csv_files)

    # Keep only the flights departing from the airport of interest
    df = df[df["ORIGIN"] == AIRPORT_OF_INTEREST]

    # Append mode: adding this key must not wipe other keys already cached in the
    # same file (mode="w" would truncate the whole file)
    df.to_hdf(hdf5_path, key=key, mode="a")

    return df

# Load the filtered flights from the HDF5 cache, or build the cache from the raw
# CSV files when it does not exist yet (slow on the first run)
df = process_flight_data()

In [4]:
df

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
14,2009-01-01,XE,1232,ORD,EWR,905.0,900.0,-5.0,16.0,916.0,...,127.0,110.0,88.0,719.0,,,,,,
16,2009-01-01,XE,1234,ORD,EWR,1230.0,1234.0,4.0,8.0,1242.0,...,149.0,111.0,89.0,719.0,,,,,,
18,2009-01-01,XE,1236,ORD,EWR,1630.0,1619.0,-11.0,19.0,1638.0,...,152.0,142.0,88.0,719.0,,,,,,
463,2009-01-01,XE,2647,ORD,IAH,1205.0,1154.0,-11.0,15.0,1209.0,...,180.0,163.0,142.0,925.0,,,,,,
593,2009-01-01,XE,2865,ORD,IAH,1500.0,1453.0,-7.0,11.0,1504.0,...,174.0,159.0,136.0,925.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61556816,2018-12-31,AA,1668,ORD,CLT,1035.0,1033.0,-2.0,35.0,1108.0,...,116.0,145.0,97.0,599.0,0.0,0.0,27.0,0.0,0.0,
61556839,2018-12-31,AA,1691,ORD,DCA,2037.0,2100.0,23.0,17.0,2117.0,...,113.0,101.0,80.0,612.0,,,,,,
61556883,2018-12-31,AA,1734,ORD,CLT,1211.0,1221.0,10.0,18.0,1239.0,...,118.0,117.0,89.0,599.0,,,,,,
61556904,2018-12-31,AA,1765,ORD,PHX,500.0,455.0,-5.0,14.0,509.0,...,235.0,231.0,213.0,1440.0,,,,,,
