# Learning from Yellow Taxi Data

In [22]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

### Load the data

In [32]:
taxidata = pd.read_csv("./data/2016-01.csv", header=0)


In [33]:
# cleandata = taxidata[(taxidata.pickup_longitude > -74.1) & (taxidata.pickup_longitude < -73.8)]
# cleandata = cleandata[(cleandata.pickup_latitude > 40.55) & (cleandata.pickup_latitude < 40.9)]

# cleandata = cleandata[(cleandata.dropoff_longitude > -74.1) & (cleandata.dropoff_longitude < -73.8)]
# cleandata = cleandata[(cleandata.dropoff_latitude > 40.55) & (cleandata.dropoff_latitude < 40.9)]

cleandata = taxidata
print type(cleandata["tpep_pickup_datetime"][0])

<class 'pandas.tslib.Timestamp'>


### Define train and test sets

In [34]:
itrain, itest = train_test_split(xrange(cleandata.shape[0]), train_size = 0.8)
mask=np.ones(cleandata.shape[0], dtype='int')
mask[itrain] = 1
mask[itest] = 0
mask = (mask == 1)

### Drop all of the columns except for the pickup and dropoff

In [53]:
X = cleandata[["tpep_pickup_datetime", "pickup_longitude", "pickup_latitude"]]
y = cleandata[["dropoff_longitude", "dropoff_latitude"]]

X["tpep_pickup_datetime"][100000].time().toordinal()
X["tpep_pickup_datetime"].apply(lambda x: datetime.timedelta(hours=x.time().hour, minutes=x.time().minutes, ))

AttributeError: 'datetime.time' object has no attribute 'toordinal'

In [36]:
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
n_samples = Xtrain.shape[0]
n_features = Xtrain.shape[1]

print Xtest
print ytest

print Xtrain.shape
max_samples = 1000000
if Xtrain.shape[0] > max_samples:
    rows = random.sample(Xtrain.index, max_samples)
    Xtrain = Xtrain.ix[rows]
    ytrain = ytrain.ix[rows]
print Xtrain.shape

         tpep_pickup_datetime  trip_distance  pickup_longitude  \
0         2016-01-01 00:00:00           1.10        -73.990372   
4         2016-01-01 00:00:00           1.76        -73.960625   
6         2016-01-01 00:00:00           7.45        -73.994057   
7         2016-01-01 00:00:01           1.20        -73.979424   
12        2016-01-01 00:00:03           0.01        -73.989021   
17        2016-01-01 00:00:06           1.70        -73.982101   
19        2016-01-01 00:00:07           4.90        -73.953033   
21        2016-01-01 00:00:08           3.09        -73.999069   
25        2016-01-01 00:00:09           1.20        -73.963913   
27        2016-01-01 00:00:10           0.87        -73.954407   
32        2016-01-01 00:00:16           2.90        -73.982155   
33        2016-01-01 00:00:17           1.20        -74.008064   
34        2016-01-01 00:00:17           1.50        -74.002678   
37        2016-01-01 00:00:18           1.17        -73.963058   
40        

### Visualizing the Data

In [37]:
# plt.rcParams['agg.path.chunksize'] = 100000
# plt.plot(Xtrain["pickup_longitude"], Xtrain["pickup_latitude"], 'ro', markersize=2, markeredgewidth=0)
# plt.grid(True)
# plt.axis([-74.4, -73.5, 40.55, 40.9])
# plt.show()

# plt.plot(ytrain["dropoff_longitude"], ytrain["dropoff_latitude"], 'ro', markersize=2, markeredgewidth=0)
# plt.grid(True)
# plt.axis([-74.4, -73.5, 40.55, 40.9])
# plt.show()

### Optimizing the parameters

In [38]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func, verbose=verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.cv_results_, gs.scorer_
    print "Best score: ", gs.best_score_
    best = gs.best_estimator_
    return best

In [39]:
clf = RandomForestRegressor(n_estimators=20, n_jobs=-1)

In [40]:
# %%time
parameters = {
    "n_estimators": [50],  
    "max_features": ["auto"],
    "max_depth": [50]
}

best = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5, score_func='neg_mean_squared_error', verbose=3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] max_features=auto, n_estimators=50, max_depth=50 ................


TypeError: float() argument must be a string or a number

### Train with the data

In [None]:
%%time
reg = best.fit(Xtrain, ytrain)
training_accuracy = reg.score(Xtrain, ytrain)
test_accuracy = reg.score(Xtest, ytest)
print "############# based on standard predict ################"
print "R^2 on training data: %0.4f" % (training_accuracy)
print "R^2 on test data:     %0.4f" % (test_accuracy)

In [25]:
best.predict([[-73.990371704101563, 40.734695434570313]])

array([[-73.96420963,  40.7740098 ]])