In [1]:
import datetime, warnings, time
import numpy as np
import pandas as pd
import dask.array as da
import dask.dataframe as dd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch
from collections import OrderedDict
from matplotlib.gridspec import GridSpec
# from mpl_toolkits.basemap import Basemap
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings("ignore")

## Dividing the dataset into chunks

In [2]:
# Load the flight-delay CSV in fixed-size parts, stored as delays["part_<i>"].
csv_path = '../data/FlightDelays.csv'
delays = {}
total_rows = 10915496  # assumed to be the number of data rows in the file (header excluded) -- TODO confirm
num_rows = 1000000     # data rows per part
it = total_rows//num_rows  # number of full-size parts; a shorter final part follows if there is a remainder

# Read the header line once. Passing an integer `skiprows` to read_csv also
# skips the header, so every part after the first would otherwise take a data
# row as its column names (and the 'Route' -> 'ROUTE' rename would not apply).
column_names = pd.read_csv(csv_path, nrows=0).columns.tolist()

# A single loop covers the full-size parts and the final partial part alike,
# so the last part is stored and renamed under its own key.
n_parts = it + (1 if total_rows % num_rows else 0)
for i in range(n_parts):
    s = time.time()
    first_row = i*num_rows
    last_row = min(first_row + num_rows, total_rows)
    part = pd.read_csv(
        csv_path,
        skiprows=first_row + 1,      # the header line plus all rows of earlier parts
        nrows=last_row - first_row,
        header=None,                 # the header was consumed above ...
        names=column_names,          # ... so supply the column names explicitly
        low_memory=False,
    )
    delays["part_{}".format(i)] = part.rename(columns={'Route': 'ROUTE'})
    print("Rows {0} - {1} loaded in {2} s.".format(first_row, last_row, time.time()-s))

Rows 0 - 1000000 loaded in 7.965904235839844 s.
Rows 1000000 - 2000000 loaded in 7.8432488441467285 s.
Rows 2000000 - 3000000 loaded in 8.814414978027344 s.
Rows 3000000 - 4000000 loaded in 9.499364137649536 s.
Rows 4000000 - 5000000 loaded in 10.223910808563232 s.
Rows 5000000 - 6000000 loaded in 11.339410066604614 s.
Rows 6000000 - 7000000 loaded in 11.0940101146698 s.
Rows 7000000 - 8000000 loaded in 12.51195502281189 s.
Rows 8000000 - 9000000 loaded in 14.099839925765991 s.
Rows 9000000 - 10000000 loaded in 16.20912194252014 s.
Rows 10000000 - 10915496 loaded in 16.063072204589844 s.


## Pick a data chunk to design data cleaning operations

1. Change column types to categorical, numeric, and datetime
2. Perform imputation to fill missing values
    * Grouped by airport, airline, year, quarter, month, day of month, day of week
3. Change all the `float64` columns except `NET_INCOME` and `OP_REVENUES` to `integer` after imputation

In [3]:
# Use the first chunk as the working sample for designing the cleaning steps.
# NOTE: this binds a reference, not a copy -- the conversion helpers assign
# columns on the frame they receive, so they also modify delays['part_0'].
test = delays['part_0']

In [4]:
# Inspect the raw dtypes and non-null counts before any type conversion.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 50 columns):
YEAR                   1000000 non-null int64
QUARTER                1000000 non-null int64
MONTH                  1000000 non-null int64
DAY_OF_MONTH           1000000 non-null int64
DAY_OF_WEEK            1000000 non-null int64
FL_DATE                1000000 non-null object
CARRIER                1000000 non-null object
FL_NUM                 1000000 non-null int64
ROUTE                  1000000 non-null int64
ORIGIN                 1000000 non-null object
DEST                   1000000 non-null object
DEST_CITY              1000000 non-null object
DEST_STATE             1000000 non-null object
CRS_DEP_TIME           1000000 non-null int64
DEP_TIME               976576 non-null float64
DEP_DELAY              975611 non-null float64
DEP_DELAY_NEW          975611 non-null float64
DEP_DEL15              975611 non-null float64
DEP_DELAY_GROUP        975611 non-null float64
DEP

In [5]:
# Show the first five rows transposed so the columns read top to bottom.
test.head().T

Unnamed: 0,0,1,2,3,4
YEAR,2018,2018,2018,2018,2018
QUARTER,1,1,1,1,1
MONTH,1,1,1,1,1
DAY_OF_MONTH,1,1,2,2,3
DAY_OF_WEEK,1,1,2,2,3
FL_DATE,2018-01-01,2018-01-01,2018-01-02,2018-01-02,2018-01-03
CARRIER,9E,9E,9E,9E,9E
FL_NUM,3331,3940,3409,3940,3353
ROUTE,42,42,42,42,42
ORIGIN,ABY,ABY,ABY,ABY,ABY


In [6]:
# Read the description Excel file (the data dictionary).
# get_column_types reads its 'ATTRIBUTE' and 'TYPE' columns to classify the dataset columns.
desc = pd.read_excel('../data/FlightDataDescription.xlsx')

In [7]:
def get_column_types(df):
    """Sort attribute names into categorical, numeric and datetime lists.

    Walks ``df`` row by row and files each row's ``ATTRIBUTE`` value according
    to the row's ``TYPE`` value:

    * ``'Nominal'``, ``'Ordinal'`` or ``'Binary'`` -> categorical
    * ``'Interval'`` -> numeric
    * ``'yyyymmdd'`` -> datetime

    Rows with any other ``TYPE`` are ignored.

    Returns a ``(categorical, numeric, datetime)`` tuple of lists, each in
    row order.
    """
    categorical, numeric, dates = [], [], []
    for _, record in df.iterrows():
        kind = record['TYPE']
        if kind in ('Nominal', 'Ordinal', 'Binary'):
            categorical.append(record['ATTRIBUTE'])
        elif kind == 'Interval':
            numeric.append(record['ATTRIBUTE'])
        elif kind == 'yyyymmdd':
            dates.append(record['ATTRIBUTE'])
    return categorical, numeric, dates

In [8]:
# Split the attribute names listed in the description sheet into
# categorical, numeric and datetime column lists.
cats, nums, dts = get_column_types(desc)

In [9]:
# Add the calendar-part columns so they are converted to categorical as well.
# NOTE: extend() mutates `cats` in place, so re-running this cell appends the names again.
cats.extend(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK'])

## Convert the categorical data

In [10]:
def convert_to_categorical(df, cols):
    """Cast each column named in ``cols`` to the pandas ``category`` dtype.

    The columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same frame is returned for convenience. The elapsed
    wall-clock time is printed.
    """
    t0 = time.time()
    for name in cols:
        as_category = df[name].astype('category')
        df[name] = as_category
    elapsed = time.time() - t0
    print("Converted to categorical columns in {} s.".format(elapsed))
    return df

In [11]:
def convert_to_integer(df, cols):
    """Downcast each column named in ``cols`` to the smallest fitting integer dtype.

    Each column is passed through ``pd.to_numeric(..., downcast='integer')``.
    A column that cannot be parsed as numeric is left unchanged.

    The columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same frame is returned for convenience. The elapsed
    wall-clock time is printed.
    """
    start = time.time()
    for col in cols:
        try:
            df[col] = pd.to_numeric(df[col], downcast='integer')
        except (ValueError, TypeError):
            # Explicit equivalent of the deprecated errors='ignore':
            # keep the original column when it is not numeric.
            continue
    print("Converted to numeric columns in {} s.".format(time.time() - start))
    return df

In [12]:
def convert_to_datetime(df, cols):
    """Parse each column named in ``cols`` with ``pd.to_datetime``.

    The columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same frame is returned for convenience. The elapsed
    wall-clock time is printed.
    """
    t0 = time.time()
    for name in cols:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
    elapsed = time.time() - t0
    print("Converted to datetime columns in {} s.".format(elapsed))
    return df

In [13]:
# Cast every column listed in `cats` to the category dtype.
test = convert_to_categorical(test, cats)

Converted to categorical columns in 2.2579710483551025 s.


In [14]:
# Downcast the columns listed in `nums` to the smallest integer dtype where possible.
test = convert_to_integer(test, nums)

Converted to numeric columns in 0.5388948917388916 s.


In [15]:
# Parse the columns listed in `dts` into datetimes.
test= convert_to_datetime(test, dts)

Converted to datetime columns in 0.14524006843566895 s.


In [16]:
# Re-inspect the dtypes after the conversions above.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 50 columns):
YEAR                   1000000 non-null category
QUARTER                1000000 non-null category
MONTH                  1000000 non-null category
DAY_OF_MONTH           1000000 non-null category
DAY_OF_WEEK            1000000 non-null category
FL_DATE                1000000 non-null datetime64[ns]
CARRIER                1000000 non-null object
FL_NUM                 1000000 non-null category
ROUTE                  1000000 non-null category
ORIGIN                 1000000 non-null category
DEST                   1000000 non-null category
DEST_CITY              1000000 non-null category
DEST_STATE             1000000 non-null category
CRS_DEP_TIME           1000000 non-null int16
DEP_TIME               976576 non-null float64
DEP_DELAY              975611 non-null float64
DEP_DELAY_NEW          975611 non-null float64
DEP_DEL15              975611 non-null category
DEP_DELAY_GR

40% reduction in memory! It will be further reduced after imputation.

## Imputation