In [1]:
import datetime, warnings, time
from tqdm import tqdm
import numpy as np
import pandas as pd
import dask.array as da
import dask.dataframe as dd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch
from collections import OrderedDict
from matplotlib.gridspec import GridSpec
# from mpl_toolkits.basemap import Basemap
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings("ignore")

## Dividing the dataset into chunks

In [31]:
delays = {}
num_rows = 1000000  # rows per chunk
rows_loaded = 0

# Stream the CSV in a single pass with `chunksize`. Re-opening the file with a
# growing `skiprows` for every chunk forces pandas to re-scan all preceding
# lines each time and makes it parse a data row as the header of every chunk
# after the first.
reader = pd.read_csv('../data/FlightDelays.csv', chunksize=num_rows, low_memory=False)
s = time.time()
for i, chunk in enumerate(reader):
    if i == 0:
        # Header exactly as it appears in the file (before the rename below).
        col_names = chunk.columns
    # Normalise the column name and give every chunk its own 0-based index.
    chunk = chunk.rename(columns={'Route': 'ROUTE'}).reset_index(drop=True)
    delays["part_{}".format(i)] = chunk
    print("Rows {0} - {1} loaded in {2} s.".format(rows_loaded, rows_loaded + len(chunk), time.time()-s))
    print(chunk.shape)
    rows_loaded += len(chunk)
    s = time.time()  # restart the timer for the next chunk

it = len(delays) - 1          # index of the last chunk ("part_{it}")
total_rows = rows_loaded + 1  # line count of the file including the header line

Rows 0 - 1000000 loaded in 8.398997068405151 s.
(1000000, 50)
Rows 1000000 - 2000000 loaded in 9.777602910995483 s.
(1000000, 50)
Rows 2000000 - 3000000 loaded in 9.217018127441406 s.
(1000000, 50)
Rows 3000000 - 4000000 loaded in 10.753101110458374 s.
(1000000, 50)
Rows 4000000 - 5000000 loaded in 11.482004880905151 s.
(1000000, 50)
Rows 5000000 - 6000000 loaded in 19.866193056106567 s.
(1000000, 50)
Rows 6000000 - 7000000 loaded in 16.660933017730713 s.
(1000000, 50)
Rows 7000000 - 8000000 loaded in 18.927738904953003 s.
(1000000, 50)
Rows 8000000 - 9000000 loaded in 21.195560932159424 s.
(1000000, 50)
Rows 9000000 - 10000000 loaded in 19.32032799720764 s.
(1000000, 50)
Rows 10000000 - 10915496 loaded in 18.505751848220825 s.
(915495, 50)


## Pick a data chunk to design data cleaning operations

1. Change column types to categorical, numeric, and datetime
    * Change all the `int64` and `float64` columns except `NET_INCOME` and `OP_REVENUES` to `integer`
2. Perform imputation to fill missing values
    * Grouped by airport, airline, year, quarter, month, day of month, day of week

In [57]:
# Work on a copy of the first chunk so the loaded data in `delays` is not modified.
test = delays['part_0'].copy()

In [58]:
# Column dtypes and non-null counts before any conversion.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 50 columns):
YEAR                   1000000 non-null int64
QUARTER                1000000 non-null int64
MONTH                  1000000 non-null int64
DAY_OF_MONTH           1000000 non-null int64
DAY_OF_WEEK            1000000 non-null int64
FL_DATE                1000000 non-null object
CARRIER                1000000 non-null object
FL_NUM                 1000000 non-null int64
ROUTE                  1000000 non-null int64
ORIGIN                 1000000 non-null object
DEST                   1000000 non-null object
DEST_CITY              1000000 non-null object
DEST_STATE             1000000 non-null object
CRS_DEP_TIME           1000000 non-null int64
DEP_TIME               976576 non-null float64
DEP_DELAY              975611 non-null float64
DEP_DELAY_NEW          975611 non-null float64
DEP_DEL15              975611 non-null float64
DEP_DELAY_GROUP        975611 non-null float64
DEP

In [59]:
# First five rows, transposed so each column of the frame is shown as a row.
test.head().T

Unnamed: 0,0,1,2,3,4
YEAR,2018,2018,2018,2018,2018
QUARTER,1,1,1,1,1
MONTH,1,1,1,1,1
DAY_OF_MONTH,1,1,2,2,3
DAY_OF_WEEK,1,1,2,2,3
FL_DATE,2018-01-01,2018-01-01,2018-01-02,2018-01-02,2018-01-03
CARRIER,9E,9E,9E,9E,9E
FL_NUM,3331,3940,3409,3940,3353
ROUTE,42,42,42,42,42
ORIGIN,ABY,ABY,ABY,ABY,ABY


In [60]:
# Read the data-description workbook; its ATTRIBUTE and TYPE columns are used
# below (get_column_types) to classify the dataset's columns.
desc = pd.read_excel('../data/FlightDataDescription.xlsx')

In [61]:
def get_column_types(df):
    """Split the attributes listed in a description table by their declared type.

    Each row of ``df`` is inspected: its ``ATTRIBUTE`` value is collected as
    categorical when ``TYPE`` is 'Nominal', 'Ordinal' or 'Binary', as numeric
    when ``TYPE`` is 'Interval', and as a date when ``TYPE`` is 'yyyymmdd'.
    Rows with any other ``TYPE`` are ignored.

    Returns a tuple ``(categorical, numeric, dates)`` of lists, each in row order.
    """
    categorical_kinds = ('Nominal', 'Ordinal', 'Binary')
    categorical, numeric, dates = [], [], []
    for record in df.to_dict('records'):
        kind = record['TYPE']
        if kind in categorical_kinds:
            categorical.append(record['ATTRIBUTE'])
        elif kind == 'Interval':
            numeric.append(record['ATTRIBUTE'])
        elif kind == 'yyyymmdd':
            dates.append(record['ATTRIBUTE'])
    return categorical, numeric, dates

In [62]:
# Categorical, numeric and date column names, as classified by the description table.
cats, nums, dts = get_column_types(desc)

In [63]:
# Also treat the carrier and the calendar fields as categorical. Only names not
# already present are appended, so re-running this cell does not add duplicates.
cats.extend([col for col in ['CARRIER', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK'] if col not in cats])

## Convert the categorical data

In [64]:
def convert_nan(df, cols):
    """Replace missing values with the sentinel -9999 in the given columns.

    Every column named in ``cols`` is reassigned on ``df``, so the frame passed
    in is modified. The elapsed time is printed; nothing is returned.
    """
    t0 = time.time()
    sentinel = -9999
    for name in cols:
        filled = df[name].fillna(sentinel)
        df[name] = filled
    elapsed = time.time() - t0
    print("Converted all null values to -9999 in {} s.".format(elapsed))

In [65]:
def convert_to_categorical(df, cols):
    """Cast the given columns of ``df`` to the pandas ``category`` dtype.

    Each column named in ``cols`` is reassigned on ``df``, so the frame passed
    in is modified. The elapsed time is printed; nothing is returned.
    """
    t0 = time.time()
    for name in cols:
        as_category = df[name].astype('category')
        df[name] = as_category
    elapsed = time.time() - t0
    print("Converted to categorical columns in {} s.".format(elapsed))

In [66]:
def convert_to_integer(df, cols):
    """Downcast the given columns of ``df`` to the smallest fitting integer dtype.

    Each column named in ``cols`` is passed through ``pd.to_numeric`` with
    ``downcast='integer'`` and reassigned on ``df``. A column that cannot be
    parsed as numeric is left unchanged. The elapsed time is printed; nothing
    is returned.
    """
    start = time.time()
    for col in cols:
        # `errors='ignore'` is deprecated in pd.to_numeric; catching the error
        # explicitly keeps the same "leave the column as it is" behaviour.
        try:
            df[col] = pd.to_numeric(df[col], downcast='integer')
        except (ValueError, TypeError):
            continue
    print("Converted to numeric columns in {} s.".format(time.time() - start))

In [67]:
def convert_to_datetime(df, cols):
    """Parse the given columns of ``df`` with ``pd.to_datetime``.

    Each column named in ``cols`` is reassigned on ``df``, so the frame passed
    in is modified. The elapsed time is printed; nothing is returned.
    """
    t0 = time.time()
    for name in cols:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
    elapsed = time.time() - t0
    print("Converted to datetime columns in {} s.".format(elapsed))

In [68]:
# Fill missing values with the -9999 sentinel in the categorical and numeric columns of `test`.
convert_nan(test, cats + nums)

Converted all null values to -9999 in 1.139084815979004 s.


In [69]:
# Cast the categorical columns of `test` to the `category` dtype.
convert_to_categorical(test, cats)

Converted to categorical columns in 2.20084285736084 s.


In [70]:
# Downcast the numeric columns of `test` to integer dtypes where possible.
convert_to_integer(test, nums)

Converted to numeric columns in 1.9527678489685059 s.


In [71]:
# Parse the date columns of `test` as datetimes.
convert_to_datetime(test, dts)

Converted to datetime columns in 0.12616896629333496 s.


In [72]:
# Column dtypes and non-null counts after the conversions above.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 50 columns):
YEAR                   1000000 non-null category
QUARTER                1000000 non-null category
MONTH                  1000000 non-null category
DAY_OF_MONTH           1000000 non-null category
DAY_OF_WEEK            1000000 non-null category
FL_DATE                1000000 non-null datetime64[ns]
CARRIER                1000000 non-null category
FL_NUM                 1000000 non-null category
ROUTE                  1000000 non-null category
ORIGIN                 1000000 non-null category
DEST                   1000000 non-null category
DEST_CITY              1000000 non-null category
DEST_STATE             1000000 non-null category
CRS_DEP_TIME           1000000 non-null int16
DEP_TIME               1000000 non-null int16
DEP_DELAY              1000000 non-null int16
DEP_DELAY_NEW          1000000 non-null int16
DEP_DEL15              1000000 non-null category
DEP_DELAY_GR

\>70% reduction in memory!

## Operation on whole data

In [92]:
def compress_dataset(data_dict, category_cols, numeric_cols, date_cols, file_path='../data/flight_delays.pkl'):
    """Clean every chunk, concatenate them and compress the column dtypes.

    Parameters
    ----------
    data_dict : dict
        Chunks keyed ``'part_0'`` ... ``'part_{n-1}'``, each a DataFrame.
        The input frames are not modified.
    category_cols, numeric_cols, date_cols : list of str
        Column names to cast to ``category``, to downcast to integers and to
        parse as datetimes, respectively.
    file_path : str or None
        Where the result is pickled. Pass ``None`` to skip saving.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame with a fresh 0-based index.

    Per chunk: a 'Route' column is renamed to 'ROUTE', missing values in the
    categorical and numeric columns become the sentinel -9999, numeric columns
    are downcast and date columns are parsed. The categorical cast is applied
    once, after concatenation, on the combined frame.
    """
    def convert_nan(df, cols):
        for col in cols:
            # Assign the result back: an in-place fillna on a column selection
            # is not guaranteed to write through to the parent frame.
            df[col] = df[col].fillna(-9999)

    def convert_to_categorical(df, cols):
        for col in cols:
            # astype returns a new Series (it has no in-place mode), so the
            # result must be assigned back for the cast to take effect.
            df[col] = df[col].astype('category')

    def convert_to_integer(df, cols):
        for col in cols:
            # Explicit replacement for the deprecated errors='ignore':
            # a column that cannot be parsed is left as it is.
            try:
                df[col] = pd.to_numeric(df[col], downcast='integer')
            except (ValueError, TypeError):
                continue

    def convert_to_datetime(df, cols):
        for col in cols:
            df[col] = pd.to_datetime(df[col])

    n_keys = len(data_dict)
    dfs = []
    for i in tqdm(range(n_keys)):
        # rename returns a new frame, so the chunk stored in data_dict stays
        # untouched; the rename is a no-op for chunks that already use 'ROUTE'.
        df = data_dict['part_{}'.format(i)].rename(columns={'Route': 'ROUTE'})
        convert_nan(df, category_cols + numeric_cols)
        convert_to_integer(df, numeric_cols)
        convert_to_datetime(df, date_cols)
        dfs.append(df)
    print("Concatenating chunks!")
    compressed_df = pd.concat(dfs, ignore_index=True)
    convert_to_categorical(compressed_df, category_cols)
    if file_path is not None:
        print("Saving as pickle file!")
        compressed_df.to_pickle(file_path)
    return compressed_df

In [93]:
# Run the full pipeline on all chunks; the result is also pickled to the
# function's default path ('../data/flight_delays.pkl').
delays_df = compress_dataset(delays, cats, nums, dts)

100%|██████████| 11/11 [00:54<00:00,  4.93s/it]


Concatenating chunks!
Saving as pickle file!


## Imputation