## Imports

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import *

from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import math

from joblib import dump, load

lookup_df = pd.read_csv("zone_lookup.csv")

DATEPARSE = "%Y-%m-%d %H:%M:%S"

# Read data
### Add data into /taxi_data/
### Github stores a sample file of this. Download the rest and do the same.

In [3]:
def time_to_minutes(_today):
    return (_today.hour*60) + (_today.minute)
def add_zero_padding(number):
    if len(str(number)) == 1: return "0" + str(number)
    else: return number
def format_df(DF, pickup_date_column, dropof_date_column, month):
    #Clean some rows where the location id is invalid
    DF = DF[((DF["PULocationID"] != 264) & (DF["PULocationID"] != 265)) & ((DF["DOLocationID"] != 264) & (DF["DOLocationID"] != 265))]

    start = pd.to_datetime(DF[pickup_date_column])
    stop = pd.to_datetime(DF[dropof_date_column])
    DF['travel_time'] = (stop - start).astype('timedelta64[m]').astype('int64')

    DF = DF[(DF["travel_time"] < 5*60)]

    DF['PU_start'] = (start).dt.hour

    DF['PUlat'] = DF.PULocationID.map(lookup_df.set_index('LocationID').lat)
    DF['PUlong'] = DF.PULocationID.map(lookup_df.set_index('LocationID').long)
    DF['DOlat'] = DF.DOLocationID.map(lookup_df.set_index('LocationID').lat)
    DF['DOlong'] = DF.DOLocationID.map(lookup_df.set_index('LocationID').long)

    DF['distance'] = ((DF.PUlat - DF.DOlat) ** 2 + (DF.PUlong - DF.DOlong) ** 2) ** 0.5

    DF = DF.drop(columns=[dropof_date_column, pickup_date_column])
    return DF

In [4]:
#dataset = ("taxi_data/green_tripdata_2019", "l")
dataset = ("taxi_data/yellow_tripdata_2019", "t")

header = True
for x in range(1, 13): # this takes 5 min
    try:
        print("Reading csv: " + str(x))
        dfGreen = pd.read_csv(dataset[0] + "-" + str(add_zero_padding(x)) + ".csv", usecols=["PULocationID", "DOLocationID", dataset[1] + "pep_pickup_datetime", dataset[1] + "pep_dropoff_datetime"])
        print("Formating csv")
        dfGreen = format_df(dfGreen, dataset[1] + "pep_pickup_datetime", dataset[1] + "pep_dropoff_datetime", x)
        print("Writing csv")
        dfGreen.to_csv(dataset[0] + "_concat.csv", mode='a+', header=header, index=False)
        header=False
        print("DONE: " + str(x))
    except Exception as f:
        print(f)

Reading csv: 1
Formating csv
Writing csv
DONE: 1
Reading csv: 2
Formating csv
Writing csv
DONE: 2
Reading csv: 3
Formating csv
Writing csv
DONE: 3
Reading csv: 4
Formating csv
Writing csv
DONE: 4
Reading csv: 5
Formating csv
Writing csv
DONE: 5
Reading csv: 6
Formating csv
Writing csv
DONE: 6
Reading csv: 7
Formating csv
Writing csv
DONE: 7
Reading csv: 8
Formating csv
Writing csv
DONE: 8
Reading csv: 9
Formating csv
Writing csv
DONE: 9
Reading csv: 10
Formating csv
Writing csv
DONE: 10
Reading csv: 11
Formating csv
Writing csv
DONE: 11
Reading csv: 12
Formating csv
Writing csv
DONE: 12


# Train

In [14]:
#dfGreen = pd.read_csv("taxi_data/green_tripdata_2019_concat.csv", chunksize=50000)
dfYellow = pd.read_csv("taxi_data/yellow_tripdata_2019_concat.csv", chunksize=50000)

model = SGDRegressor()

poly = PolynomialFeatures(degree=9, include_bias=False)
min_max = MinMaxScaler()

counter = 0

for chunk in dfYellow:

    X_green = chunk[["PU_start", "PUlat", "PUlong","DOlat", "DOlong", "distance"]] # input cols
    y_green = chunk[["travel_time"]] # output cols
    
    scaled = poly.fit_transform(min_max.fit_transform(X_green.values))

    X_train, X_test, y_train, y_test = train_test_split(scaled, y_green.values.ravel(), test_size = 0.25)

    model.partial_fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    print("Mean squared error: %.2f Accuracy: %.2f" % mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))
    counter += 1
    if counter % 10 == 0:
        dump(model, "models/" + str(counter) + '.joblib')

Mean squared error: 85.17 Accuracy:  0.026234531108630166


KeyboardInterrupt: 

In [8]:
"""
lpep_pickup_datetime,PULocationID,DOLocationID,month
"""
copied_from_green = [2949,2964,166,142,13.7648363727615,1.0]

inputdata = [copied_from_green[0], copied_from_green[2], copied_from_green[3], copied_from_green[4], copied_from_green[5]]
outputdata = [copied_from_green[1]]

inputdata = [5469,41,74,1.0]
outputdata = [5475]
pred = model.predict([
    inputdata
])

print(pred)

[2985.58207469]
