In [1]:
import dask
from dask.distributed import Client
import dask.dataframe as dd
import tensorflow as tf
import pandas as pd

# Address of the Saturn Dask cluster scheduler; replace the empty placeholder
# with your own cluster URL before running this cell.
SATURN_DASK_CLUSTER_URL = ''

# Connect this notebook session to the scheduler at the address above.
client = Client(address=SATURN_DASK_CLUSTER_URL)

In [3]:
%%time
#ETL
cols = ['Trip_Pickup_DateTime','Trip_Dropoff_DateTime','Passenger_Count','Trip_Distance','Start_Lon','Start_Lat','End_Lon','End_Lat','Fare_Amt','Tip_Amt','Total_Amt']

df = dd.read_csv('s3://nyc-tlc/trip data/yellow_tripdata_2009-*.csv', usecols=cols, storage_options={'anon': True})
df.head()

# filter wrong columns
query_frags = [
    'Fare_Amt > 0 and Fare_Amt < 500',
    'Passenger_Count > 0 and Passenger_Count < 6',
    'Start_Lon > -75 and Start_Lon < -73',
    'End_Lon > -75 and End_Lon < -73',
    'Start_Lat	 > 40 and Start_Lat < 42',
    'End_Lat > 40 and End_Lat < 42'
]

df = df.query(' and '.join(query_frags))

df['Trip_Pickup_DateTime'] = df['Trip_Pickup_DateTime'].astype('datetime64[ns]')
df['Trip_Dropoff_DateTime'] = df['Trip_Dropoff_DateTime'].astype('datetime64[ns]')
df['Trip_Pickup_DateTime'] = df['Trip_Pickup_DateTime'].apply(lambda x: x.day, meta=('Trip_Dropoff_DateTime', 'int64'))
df['Trip_Dropoff_DateTime'] = df['Trip_Dropoff_DateTime'].apply(lambda x: x.day, meta=('Trip_Dropoff_DateTime', 'int64'))

df = df.compute()



CPU times: user 27.9 s, sys: 32.4 s, total: 1min
Wall time: 3min 50s


In [4]:
# Feature columns: every column of `df` except the regression target.
# NOTE(review): this keeps Tip_Amt and Total_Amt as inputs; Total_Amt presumably
# includes the fare, which would leak the target -- confirm this is intended.
not_fare_cols = [name for name in df.columns if name != 'Fare_Amt']

def build_model(n_features=None):
    """Build and compile the fully connected fare-regression network.

    Args:
        n_features: Width of the input layer. When omitted it defaults to
            ``len(not_fare_cols)``, i.e. one input per feature column.

    Returns:
        A compiled ``tf.keras.Sequential`` with Dense layers of 8, 16, 16
        and 8 ReLU units followed by a single linear output unit. It is
        compiled with the Adam optimizer, MSE loss, and MAE/MSE metrics.
    """
    if n_features is None:
        # Count the column names directly; indexing ``df[not_fare_cols]``
        # would build a column subset of the whole frame just to take its
        # length.
        n_features = len(not_fare_cols)

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=[n_features]),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(1),  # single linear unit: the predicted fare
    ])
    optimizer = tf.keras.optimizers.Adam()
    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model


# Instantiate the network and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 88        
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 649
Trainable params: 649
Non-trainable params: 0
_________________________________________________________________


In [5]:
%%time
train = df[not_fare_cols].to_numpy()
labels =  df['Fare_Amt'].to_numpy()

EPOCHS = 5
history = model.fit(
  train, labels, batch_size=512, validation_split=0.25, shuffle=True,
  epochs=EPOCHS)

hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

Train on 125990835 samples, validate on 41996945 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1h 21min 50s, sys: 14min 7s, total: 1h 35min 58s
Wall time: 46min 42s


Unnamed: 0,loss,mae,mse,val_loss,val_mae,val_mse,epoch
0,0.52574,0.338347,0.525741,0.813508,0.562216,0.813509,0
1,0.492372,0.325925,0.492371,0.832268,0.585209,0.832272,1
2,0.486419,0.322531,0.486414,0.775081,0.543843,0.775082,2
3,0.482775,0.320812,0.482772,0.760801,0.531075,0.760802,3
4,0.481329,0.320313,0.481334,0.745706,0.504571,0.745705,4
