In [1]:
import dask.dataframe as dd #http://dask.pydata.org/en/latest/
import pandas as pd
from datetime import datetime
from bokeh.io import output_notebook
import fastparquet
# from distributed import Client, progress

# # Setup Dask Distributed
# client = Client()
# print(client)

### Other Settings
# Show more rows
pd.options.display.max_rows = 999

# Display floats with 3 digits after the decimal point.
# The fully-qualified option name is used because the bare 'precision'
# abbreviation is ambiguous/rejected by newer pandas releases.
pd.set_option('display.precision', 3)

# Assignment: Analyzing Airline Flight Delays
For a full treatment of the unit 14 case study, please review module 14.3. Some points from the video are given below.

Work with the airline data set (use R or Python to manage out-of-core).
Answer the following questions by using the split-apply-combine technique:
* Which airports are most likely to be delayed flying out of or into?
* Which flights with same origin and destination are most likely to be delayed?
* Can you regress how delayed a flight will be before it is delayed?
* What are the most important features for this regression?

Remember to properly cross-validate models.

Use meaningful evaluation criteria.

Create at least one new feature variable for the regression.

In [2]:
# Set up Bokeh (imported from bokeh.io above) so its plots display inline in this notebook.
output_notebook()

In [None]:
# Peek at the first four raw lines of the 1987 file (header plus a few records).
# readline() keeps the trailing newline, so print() leaves a blank line between rows.
with open("C:/Users/ryan.shuhart/Downloads/AirlineDelays.tar/AirlineDelays/1987.csv", "r") as csv_file:
    for _ in range(4):
        raw_line = csv_file.readline()
        print(raw_line)


In [3]:
# Explicit dtype for every column read from the airline CSVs, grouped by dtype.
# The keys of the resulting mapping are also used as the list of columns to load.
_FLOAT_COLUMNS = (
    'ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime',
    'CRSElapsedTime',  # reading this one as int caused an issue, so float is used
    'CarrierDelay', 'DepDelay', 'DepTime', 'Distance',
    'LateAircraftDelay', 'NASDelay', 'SecurityDelay',
    'TaxiIn', 'TaxiOut', 'WeatherDelay',
)
_INT_COLUMNS = (
    'CRSArrTime', 'CRSDepTime', 'Cancelled', 'DayOfWeek', 'DayofMonth',
    'Diverted',
    'FlightNum',  # still exploring whether this should be int or string
    'Month', 'Year',
)
_OBJECT_COLUMNS = (
    'CancellationCode',  # dtype taken from the lesson video
    'Dest', 'Origin', 'TailNum', 'UniqueCarrier',
)

# Merge the groups and order the entries by column name (plain string sort),
# so the mapping lists the columns in the same order as a hand-sorted literal.
dts = dict(sorted(
    [(name, 'float64') for name in _FLOAT_COLUMNS]
    + [(name, 'int64') for name in _INT_COLUMNS]
    + [(name, 'O') for name in _OBJECT_COLUMNS]
))

dts.keys()

dict_keys(['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime', 'CRSArrTime', 'CRSDepTime', 'CRSElapsedTime', 'CancellationCode', 'Cancelled', 'CarrierDelay', 'DayOfWeek', 'DayofMonth', 'DepDelay', 'DepTime', 'Dest', 'Distance', 'Diverted', 'FlightNum', 'LateAircraftDelay', 'Month', 'NASDelay', 'Origin', 'SecurityDelay', 'TailNum', 'TaxiIn', 'TaxiOut', 'UniqueCarrier', 'WeatherDelay', 'Year'])

In [4]:
# Convert csv to parquet
def csv_to_parquet(csv_glob="C:/Users/ryan.shuhart/Downloads/AirlineDelays.tar/AirlineDelays/*.csv",
                   parquet_dir="C:/Users/ryan.shuhart/Downloads/AirlineDelays.tar/AirlineDelays/parquet/",
                   dtypes=None,
                   compression='gzip'):
    """Convert the airline CSV files into a parquet dataset with dask.

    Parameters
    ----------
    csv_glob : str
        Path or glob pattern of the CSV files to read.
    parquet_dir : str
        Directory the parquet dataset is written to.
    dtypes : dict, optional
        Column name -> dtype mapping. Its keys also select which columns
        are read. Defaults to the notebook-level ``dts`` mapping.
    compression : str
        Compression passed through to ``to_parquet``.

    Returns
    -------
    None
        The function is run for its side effect of writing ``parquet_dir``.
    """
    if dtypes is None:
        # Resolved at call time so a later edit of `dts` is picked up.
        dtypes = dts

    df_csv = dd.read_csv(csv_glob,
                         usecols=list(dtypes),
                         dtype=dtypes,
                         encoding='iso-8859-1')

    df_csv.to_parquet(parquet_dir,
                      compression=compression,
                      object_encoding='utf8')

# Wall-clock timing of the full CSV -> parquet conversion.
start = datetime.now()
csv_to_parquet()
end = datetime.now()

time_to_complete = end - start  # kept as a timedelta for the next cell
print(time_to_complete)

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')
0:42:23.059454


In [5]:
# Express the conversion run time (a timedelta) in minutes.
time_to_complete.total_seconds()/60

42.38432423333334

#### Review of Dask 
* Ryan's Hardware: 
    - CPU: Intel i5-4300M @ 2.60GHz
    - Disk: Samsung SSD 850 Pro
    - RAM: 8 GB
    

* Dask using original csv:
    - no conversion
    - size on disk
        - 11.2 gb
    - benchmark of describing 'Distance':
        - Unable to complete due to memory errors. This is surprising because this is what Dask is supposed to prevent.
* Dask using uncompressed parquet: 
    - conversion to parquet
        - approx 10 minutes (9.57 minutes on Ryan's laptop with SSD)
    - size on disk:
        - 13.8 gb
    - benchmark of describing 'Distance':
        - 1 loop, best of 3: 6.2 s per loop
* Dask using gzip compressed parquet:
    - conversion to parquet
        - approx 42 minutes (42.38 minutes on Ryan's laptop with SSD)
    - size on disk:
        - 1.36 gb <- big difference
    - benchmark of describing 'Distance':
        - 1 loop, best of 3: 8.83 s per loop

# Benchmark test

In [8]:
# Open the parquet dataset written by csv_to_parquet() as a dask dataframe.
df_par = dd.read_parquet("C:/Users/ryan.shuhart/Downloads/AirlineDelays.tar/AirlineDelays/parquet/")
# Optional sanity checks on the loaded schema (left disabled):
#print(df_par.columns)
#df_par.dtypes.to_dict()

In [9]:
%%timeit
format = lambda x: "{0:.3f}".format(x) 
start = datetime.now()
print(df_par[['Distance']].dropna().describe().compute().applymap(format))
time_to_complete = datetime.now() - start
time_to_complete.total_seconds()/60

            Distance
count  123332969.000
mean         701.699
std          551.253
min            0.000
25%          372.000
50%          733.000
75%         1121.000
max         4983.000
            Distance
count  123332969.000
mean         701.699
std          551.253
min            0.000
25%          372.000
50%          733.000
75%         1121.000
max         4983.000
            Distance
count  123332969.000
mean         701.699
std          551.253
min            0.000
25%          372.000
50%          733.000
75%         1121.000
max         4983.000
            Distance
count  123332969.000
mean         701.699
std          551.253
min            0.000
25%          372.000
50%          733.000
75%         1121.000
max         4983.000
1 loop, best of 3: 8.83 s per loop


In [None]:
# Lazily open the raw CSV files with the column selection and dtypes from `dts`.
df_csv = dd.read_csv("C:/Users/ryan.shuhart/Downloads/AirlineDelays.tar/AirlineDelays/*.csv",
                     usecols=dts.keys(),
                     dtype=dts,
                     encoding='iso-8859-1')
# info() writes its report itself and returns None, so it is called directly
# rather than through print() (which would add a stray "None" line).
df_csv.info()
df_csv.dtypes.to_dict()

In [None]:
%%timeit
format = lambda x: "{0:.3f}".format(x) 
start = datetime.now()
print(df_csv[['Distance']].describe().compute().applymap(format))
time_to_complete = datetime.now() - start
time_to_complete.total_seconds()/60

## Exploration to Address Variable Issues
* CRSElapsedTime - should be an int
* FlightNum - alphabetic characters started to be used at some point

In [None]:
# Re-cast the two columns under investigation, then list the resulting dtypes.
for column, target_dtype in (('FlightNum', 'int64'),
                             ('CRSElapsedTime', 'float64')):
    df_par[column] = df_par[column].astype(target_dtype)

df_par.dtypes.to_dict()

In [None]:
# NOTE(review): `x` is not defined in any visible cell, so this raises NameError
# on a fresh kernel. Presumably a placeholder for a column being checked for
# non-numeric values (e.g. FlightNum) — TODO confirm and pass a real Series.
pd.to_numeric(x, errors='raise')

In [None]:
# Summary statistics for the parquet-backed frame; .compute() runs the dask computation.
df_par.describe().compute()

In [None]:
from dask.diagnostics import (
    CacheProfiler,
    Profiler,
    ResourceProfiler,
    visualize,
)

# Split by origin airport, then define (lazily) the mean departure delay per group.
df_par_orgin = df_par.groupby('Origin')
mean_dep_delay = df_par_orgin['DepDelay'].mean()

# Only the compute() call runs under the task, resource and cache profilers.
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
    out = mean_dep_delay.compute()

In [None]:
# Rank origin airports from highest to lowest mean departure delay.
out.sort_values(ascending=False)

In [None]:
# Plot the task, resource and cache profiles recorded while computing `out`.
# NOTE(review): `visualize` is also imported in the profiling cell; this import is redundant.
from dask.diagnostics import visualize
visualize([prof, rprof, cprof])

# Regression of Delay

In [None]:
# Keep only the regression columns and drop rows where any of them is missing.
# NOTE(review): despite the `_samp` name, no row sampling happens here.
df_par_samp = df_par[['ArrDelay','Distance', 'DepTime']].dropna()

In [None]:
from sklearn import linear_model

# Fit on the NaN-free frame (df_par_samp), not on df_par: scikit-learn's
# LinearRegression rejects inputs that contain NaN, and ArrDelay/DepTime have
# missing values in the raw data.
# The lazy dask frame is materialised once so X and y come from the same rows.
# NOTE(review): this loads every remaining row into memory; consider sampling
# a fraction first if RAM is tight — TODO confirm what the hardware can hold.
reg_frame = df_par_samp.compute()

ArrDelay_X = reg_frame[['Distance', 'DepTime']]

# Kept as a one-column frame so reg.coef_ has shape (1, n_features).
ArrDelay_y = reg_frame[['ArrDelay']]

reg = linear_model.LinearRegression()
reg.fit(ArrDelay_X, ArrDelay_y)
print('Coefficients: \n', reg.coef_)

In [None]:
# Number of rows in the regression target.
len(ArrDelay_y)
