# Learning from Yellow Taxi Data

In [1]:
import random
import numpy as np
import scipy as sp
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

### Load the data

In [2]:
taxidata = pd.read_csv("./data/2016-01.csv", header=0)

### Define train and test sets

In [3]:
itrain, itest = train_test_split(xrange(taxidata.shape[0]), train_size = 0.8)
mask=np.ones(taxidata.shape[0], dtype='int')
mask[itrain] = 1
mask[itest] = 0
mask = (mask == 1)

### Drop all of the columns except for the pickup and dropoff

In [4]:
X = taxidata[["pickup_longitude", "pickup_latitude"]]
y = taxidata[["dropoff_longitude", "dropoff_latitude"]]

In [5]:
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
n_samples = Xtrain.shape[0]
n_features = Xtrain.shape[1]

print Xtrain.shape
max_samples = 1000000
if Xtrain.shape[0] > max_samples:
    rows = random.sample(Xtrain.index, max_samples)
    Xtrain = Xtrain.ix[rows]
    ytrain = ytrain.ix[rows]
print Xtrain.shape

(8725486, 2)
(1000000, 2)


In [6]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func, verbose=verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.cv_results_, gs.scorer_
    print "Best score: ", gs.best_score_
    best = gs.best_estimator_
    return best

In [7]:
clf = RandomForestRegressor(n_estimators=20, n_jobs=-1)

In [8]:
%%time
parameters = {
    "n_estimators": [50],  
    "max_features": ["auto"],
    "max_depth": [50]
}

best = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5, score_func='neg_mean_squared_error', verbose=3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] max_features=auto, n_estimators=50, max_depth=50 ................
[CV]  max_features=auto, n_estimators=50, max_depth=50, score=-12.617640, total= 2.1min
[CV] max_features=auto, n_estimators=50, max_depth=50 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV]  max_features=auto, n_estimators=50, max_depth=50, score=-13.343097, total= 1.9min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.6min remaining:    0.0s


[CV] max_features=auto, n_estimators=50, max_depth=50 ................
[CV]  max_features=auto, n_estimators=50, max_depth=50, score=-13.702323, total= 1.8min
[CV] max_features=auto, n_estimators=50, max_depth=50 ................
[CV]  max_features=auto, n_estimators=50, max_depth=50, score=-13.214341, total= 1.6min
[CV] max_features=auto, n_estimators=50, max_depth=50 ................
[CV]  max_features=auto, n_estimators=50, max_depth=50, score=-13.863260, total= 1.8min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 10.5min finished


BEST {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 50} -13.3481324125 [mean: -13.34813, std: 0.43413, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 50}] make_scorer(mean_squared_error, greater_is_better=False)
Best score:  -13.3481324125
CPU times: user 39min 22s, sys: 47.8 s, total: 40min 10s
Wall time: 12min 49s




In [9]:
%%time
reg = best.fit(Xtrain, ytrain)
training_accuracy = reg.score(Xtrain, ytrain)
test_accuracy = reg.score(Xtest, ytest)
print "############# based on standard predict ################"
print "R^2 on training data: %0.4f" % (training_accuracy)
print "R^2 on test data:     %0.4f" % (test_accuracy)

############# based on standard predict ################
R^2 on training data: 0.8376
R^2 on test data:     0.7437
CPU times: user 10min 9s, sys: 17.5 s, total: 10min 26s
Wall time: 3min 26s


In [25]:
best.predict([[-73.990371704101563, 40.734695434570313]])

array([[-73.96420963,  40.7740098 ]])