In [1]:
"""
Copy this code to a Databricks notebook to download the Parquet
file of Airlines data and preprocess it for training.
"""
import cudf
import numpy as np
import os
from urllib.request import urlretrieve
import gzip

# ACTION REQUIRED - Set the DBFS path
data_dir = "/_dbfs_path/" # DBFS path to the folder that will hold the dataset

# We use airline_small.parquet by default. For benchmarks we used a larger
# dataset available at https://rapidsai-cloud-ml-sample-data.s3-us-west-2.amazonaws.com/airline_20000000.parquet
# To use the larger dataset, change file_name to airline_20000000.parquet.


file_name = 'airline_small.parquet' # NOTE: Change to airline_20000000.parquet to use a larger dataset
# Full path of the parquet file inside data_dir.
parquet_name = os.path.join(data_dir, file_name)

def prepare_dataset():
    """Download the Airlines parquet file if needed and preprocess it in place.

    Uses the module-level ``data_dir``, ``file_name`` and ``parquet_name``.

    Steps:
      1. If ``parquet_name`` does not exist, download ``file_name`` from the
         RAPIDS sample-data bucket into ``data_dir``.
      2. Load the file with cuDF.
      3. Replace every object (string) column with its integer category codes.
      4. Cast every column to float32.
      5. Keep only the label column ``ArrDelayBinary`` followed by the input
         feature columns, in that order.
      6. Write the result back to ``parquet_name``, overwriting the file
         that was read.

    Returns:
        None. The result is the rewritten parquet file on disk.

    Raises:
        Whatever ``urlretrieve`` raises on a failed download (for example
        ``urllib.error.URLError`` or ``urllib.error.ContentTooShortError``).
        An incomplete download is removed before the exception propagates.
    """
    input_cols = ["Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", "CRSArrTime",
                  "UniqueCarrier", "FlightNum", "ActualElapsedTime", "Origin", "Dest",
                  "Distance", "Diverted"]

    # Download URL
    url = "https://rapidsai-cloud-ml-sample-data.s3-us-west-2.amazonaws.com/" + file_name

    if os.path.isfile(parquet_name):
        print(f" > File already exists. Ready to load at {parquet_name}")
    else:
        # Ensure folder exists
        os.makedirs(data_dir, exist_ok=True)

        def data_progress_hook(block_number, read_size, total_filesize):
            """Print download progress once every 1000 blocks."""
            if (block_number % 1000) != 0:
                return
            downloaded = block_number * read_size
            if total_filesize > 0:
                # The last block is usually partial, so clamp to 100%.
                percent = min(100.0, 100 * downloaded / total_filesize)
                print(f" > percent complete: {percent:.2f}\r", end="")
            else:
                # urlretrieve reports a non-positive size when the server
                # does not announce one; a percentage is meaningless then.
                print(f" > bytes downloaded: {downloaded}\r", end="")

        try:
            urlretrieve(
                url=url,
                filename=parquet_name,
                reporthook=data_progress_hook,
            )
        except BaseException:
            # A failed or interrupted download can leave a truncated file
            # behind; remove it so the next run does not mistake it for a
            # complete download via the isfile() check above.
            if os.path.isfile(parquet_name):
                os.remove(parquet_name)
            raise

        print(f" > Download complete {url}")

    dataset = cudf.read_parquet(parquet_name)

    # encode categoricals as numeric
    for col in dataset.select_dtypes(["object"]).columns:
        dataset[col] = dataset[col].astype("category").cat.codes.astype(np.int32)

    # cast all columns to float32
    for col in dataset.columns:
        dataset[col] = dataset[col].astype(np.float32)  # needed for random forest

    # put target/label column first [ classic XGBoost standard ]
    output_cols = ["ArrDelayBinary"] + input_cols

    # NOTE(review): reindex presumably yields null-filled columns for any
    # name missing from the file instead of raising - confirm for cuDF.
    dataset = dataset.reindex(columns=output_cols)

    # Overwrites the downloaded file with the preprocessed version.
    dataset.to_parquet(parquet_name)

In [None]:
# Download the dataset if it is not already present, then preprocess it.
prepare_dataset()