In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
def streamer(stream, data):
    """Endlessly yield ``(x, y)`` NumPy batches read chunk-by-chunk from a CSV.

    The file is read again from the beginning every time it is exhausted, so
    the generator never terminates on its own; the consumer decides how many
    batches to draw.

    Parameters
    ----------
    stream : int
        Number of CSV rows per chunk (forwarded to ``pd.read_csv`` as
        ``chunksize``).
    data : str or path-like
        CSV file to read. It must provide the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``trip_distance`` and ``total_amount``.

    Yields
    ------
    x : numpy.ndarray
        One row per retained record with the columns
        ``trip_distance``, ``trip_seconds`` (drop-off minus pick-up, in
        seconds) and ``month`` (month of the drop-off timestamp).
    y : numpy.ndarray
        The matching ``total_amount`` values.

    Notes
    -----
    Rows with a missing or unparseable timestamp, or a missing
    ``trip_distance`` / ``total_amount``, are dropped, so a batch can hold
    fewer than ``stream`` rows.
    """
    required = [
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'trip_distance',
        'total_amount',
    ]
    while True:
        # The loop variable must not be called `data`: rebinding the parameter
        # would replace the file path with a DataFrame and make the next pass
        # of this `while` loop fail inside `pd.read_csv`.
        for chunk in pd.read_csv(data, chunksize=stream):
            # Parse timestamps *before* filtering: with errors='coerce' an
            # unparseable value becomes NaT, which the dropna below removes.
            chunk = chunk.assign(
                tpep_pickup_datetime=pd.to_datetime(
                    chunk['tpep_pickup_datetime'], errors='coerce'),
                tpep_dropoff_datetime=pd.to_datetime(
                    chunk['tpep_dropoff_datetime'], errors='coerce'),
            )
            chunk = chunk.dropna(subset=required)

            pickup = chunk['tpep_pickup_datetime']
            dropoff = chunk['tpep_dropoff_datetime']
            features = pd.DataFrame({
                'trip_distance': chunk['trip_distance'],
                'trip_seconds': (dropoff - pickup).dt.total_seconds(),
                'month': dropoff.dt.month,
            })

            x = features.to_numpy()
            y = chunk['total_amount'].to_numpy()
            yield x, y

In [None]:
# yellow_19 = pd.read_csv('2019_Yellow_Taxi_Trip_Data.csv')

In [None]:
# yellow_19 = yellow_19[yellow_19['tpep_pickup_datetime'].notna()]

In [None]:
# yellow_19['tpep_pickup_datetime'] = pd.to_datetime(yellow_19['tpep_pickup_datetime'], errors='coerce')
# yellow_19['tpep_dropoff_datetime'] = pd.to_datetime(yellow_19['tpep_dropoff_datetime'], errors='coerce')

In [None]:
# yellow_19['trip_seconds'] = (yellow_19['tpep_dropoff_datetime'] - yellow_19['tpep_pickup_datetime']).dt.total_seconds()
# yellow_19['month'] = yellow_19['tpep_dropoff_datetime'].dt.month

In [None]:
# yellow_19.info()

In [None]:
# Fully connected regression network trained with mean absolute error.
# Layer stack, in order:
#   Dense(16, relu)   - first layer, takes the 3 input features
#   Dense(32, relu)   - hidden
#   Dense(32, relu)   - hidden
#   Dense(1, linear)  - single-value output
NN_model_2019 = Sequential([
    Dense(16, kernel_initializer='normal', input_dim=3, activation='relu'),
    Dense(32, kernel_initializer='normal', activation='relu'),
    Dense(32, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear'),
])

# MAE is the optimised loss; MAE and MSE are additionally reported as metrics.
NN_model_2019.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error', 'mse'],
)


In [None]:
# yellow_19_train = pd.read_csv('2019_Yellow_Taxi_Trip_Data_training.csv')
# yellow_19_test = pd.read_csv('2019_Yellow_Taxi_Trip_Data_validation.csv')

In [None]:
# yellow_19_train.count()

In [None]:
# yellow_19_test.count()

In [None]:
# Row counts of the training and validation CSV files, in that order
lengths_2019 = [6675578, 1432688]
train_file = '2019_Yellow_Taxi_Trip_Data_training.csv'
validation_file = '2019_Yellow_Taxi_Trip_Data_validation.csv'
chunksize = 10000

# Number of full chunks per pass over each file (floor division, so a trailing
# partial chunk is not counted as a step).
train_rows, validation_rows = lengths_2019
train_steps = train_rows // chunksize
validation_step_count = validation_rows // chunksize

# Fit for 10 epochs, streaming both the training and the validation data from
# disk through `streamer` generators instead of loading the files into memory.
history = NN_model_2019.fit(
    streamer(chunksize, train_file),
    steps_per_epoch=train_steps,
    epochs=10,
    verbose=2,
    validation_data=streamer(chunksize, validation_file),
    validation_steps=validation_step_count,
)

In [None]:
# Training vs. validation loss per epoch, taken from the Keras History object.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('Model Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Validation'], loc='upper left')

# Save BEFORE showing: once plt.show() has rendered the figure, the pyplot
# "current figure" is no longer this one, so a later plt.savefig() would write
# an empty image. Saving through `fig` also avoids relying on pyplot state.
fig.savefig('2019_loss.png')
plt.show()

In [None]:
# Load the test set and derive the three model features
# (trip_distance, trip_seconds, month) plus the target (total_amount).
test_2019 = pd.read_csv('2019_Yellow_Taxi_Trip_Data_test.csv')

# Parse the timestamps first. With errors='coerce' an unparseable value becomes
# NaT, so filtering for nulls has to happen AFTER parsing; otherwise such rows
# would survive and produce NaN trip_seconds / month values.
test_2019 = test_2019.assign(
    tpep_pickup_datetime=pd.to_datetime(
        test_2019['tpep_pickup_datetime'], errors='coerce'),
    tpep_dropoff_datetime=pd.to_datetime(
        test_2019['tpep_dropoff_datetime'], errors='coerce'),
)

# Keep only rows where every column needed for the features/target is present.
test_2019 = test_2019.dropna(subset=[
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'trip_distance',
    'total_amount',
])

# Derived features; `assign` builds a new frame instead of writing into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
test_2019 = test_2019.assign(
    trip_seconds=lambda df: (
        df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    ).dt.total_seconds(),
    month=lambda df: df['tpep_dropoff_datetime'].dt.month,
)

# Feature matrix (DataFrame) and target (Series) used by the cells below.
x = test_2019[['trip_distance', 'trip_seconds', 'month']]
y = test_2019['total_amount']

In [None]:
# Persist the trained model to 'NN_model_2019.h5' (path relative to the
# current working directory).
# NOTE(review): the '.h5' suffix presumably selects Keras' HDF5 file format
# (h5py is the library that writes it, not the format name) - confirm against
# the installed Keras version.
NN_model_2019.save('NN_model_2019.h5')

In [None]:
# Run the trained model on the test-set feature frame `x`. The result is kept
# in `predictions` and reused by the actual-vs-predicted scatter plot.
predictions = NN_model_2019.predict(x)

In [None]:
# Evaluate the model on the test data. The targets `y` must be passed along
# with the features: without them there is nothing to compare the predictions
# against, so the loss/metrics would not reflect test-set error.
# The cell's value is the list of compiled loss and metric results.
NN_model_2019.evaluate(x, y)

In [None]:
# Plot of actual vs predicted values
# np.asarray works whether `y` is still a pandas Series or already an ndarray,
# so this cell can be re-run without failing.
y = np.asarray(y)
# Flatten the model output to 1-D so it lines up element-wise with `y`.
predicted_flat = np.asarray(predictions).ravel()

plt.figure(figsize=(10,10))
plt.scatter(y, predicted_flat, c='crimson')

# End points of the identity line (perfect prediction). Vectorised, NaN-aware
# min/max return plain scalars and avoid a Python-level loop over every row.
p1 = max(np.nanmax(predicted_flat), np.nanmax(y))
p2 = min(np.nanmin(predicted_flat), np.nanmin(y))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
# Only the 0-100 range of both axes is displayed.
plt.xlim(0,100)
plt.ylim(0,100)
plt.show()