# Learning from Yellow Taxi Data

In [3]:
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cluster import KMeans

### Load the data

In [4]:
taxidata = pd.read_csv("./data/test.csv", header=0)

In [5]:
cleandata = taxidata[(taxidata.pickup_longitude > -74.1) & (taxidata.pickup_longitude < -73.8)]
cleandata = cleandata[(cleandata.pickup_latitude > 40.55) & (cleandata.pickup_latitude < 40.9)]

cleandata = cleandata[(cleandata.dropoff_longitude > -74.1) & (cleandata.dropoff_longitude < -73.8)]
cleandata = cleandata[(cleandata.dropoff_latitude > 40.55) & (cleandata.dropoff_latitude < 40.9)]

# cleandata = taxidata

### Define train and test sets

In [6]:
itrain, itest = train_test_split(xrange(cleandata.shape[0]), train_size = 0.8)
mask=np.ones(cleandata.shape[0], dtype='int')
mask[itrain] = 1
mask[itest] = 0
mask = (mask == 1)

### Create time features

In [7]:
start = time.time()

def getHours(col):
    return datetime.strptime(col.split(' ')[1], '%H:%M:%S').time().hour

def getMinutes(col):
    return datetime.strptime(col.split(' ')[1], '%H:%M:%S').time().minute

print "Creating hour feature..."
hourFeature = cleandata["tpep_pickup_datetime"].apply(getHours)

print "Creating minute feature..."
minuteFeature = cleandata["tpep_pickup_datetime"].apply(getMinutes)

df1 = hourFeature.to_frame(name='hour')
df2 = minuteFeature.to_frame(name='minute')
time_data = pd.concat([df1, df2], join='outer', axis=1)

print time_data

Creating hour feature...
Creating minute feature...
          hour  minute
0            0       0
1            0       0
2            0       0
3            0       0
4            0       0
5            0       0
6            0       0
7            0       0
8            0       0
9            0       0
10           0       0
11           0       0
12           0       0
13           0       0
14           0       0
15           0       0
16           0       0
17           0       0
18           0       0
19           0       0
20           0       0
21           0       0
22           0       0
23           0       0
24           0       0
25           0       0
26           0       0
27           0       0
28           0       0
29           0       0
...        ...     ...
10906828    23       6
10906829    23      34
10906830     1      45
10906831     2       1
10906832     2      31
10906833     2      57
10906834     3       6
10906835     3      24
10906836     3      52
10906

### Drop all of the columns except for the pickup, dropoff, and time

In [1]:
X = pd.concat([cleandata[["pickup_longitude", "pickup_latitude"]], time_data],  join='outer', axis=1)
y = cleandata[["dropoff_longitude", "dropoff_latitude"]]

NameError: name 'pd' is not defined

### Cluster the data

In [None]:
# xmatrix = X.as_matrix(columns=X.columns)
# kmeans_x = KMeans(n_clusters=24, n_jobs=-1, random_state=0).fit(xmatrix)
# centroids_x = kmeans_x.predict(xmatrix)
# X = np.array([kmeans_x.cluster_centers_[i] for i in centroids_x])

kmeans_y = KMeans(n_clusters=24, n_jobs=-1, random_state=0).fit(y)
centroids_y = kmeans_y.predict(y)
for i, row in y.iterrows():
    if i < len(y):
        y.set_value(i, "centroid", centroids_y[i])
y = y[["centroid"]]

In [None]:
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
n_samples = Xtrain.shape[0]
n_features = Xtrain.shape[1]

print Xtrain.shape
max_samples = 100
if Xtrain.shape[0] > max_samples:
#     rows = random.sample(range(len(Xtrain)), max_samples)
    rows = random.sample(Xtrain.index, max_samples)
    Xtrain = Xtrain.ix[rows]
    ytrain = ytrain.ix[rows]
print Xtrain.shape

### Visualizing the Data

In [None]:
# plt.rcParams['agg.path.chunksize'] = 100000
# plt.plot(Xtrain["pickup_longitude"], Xtrain["pickup_latitude"], 'ro', markersize=2, markeredgewidth=0)
# plt.grid(True)
# plt.axis([-74.4, -73.5, 40.55, 40.9])
# plt.show()

# plt.plot(ytrain["dropoff_longitude"], ytrain["dropoff_latitude"], 'ro', markersize=2, markeredgewidth=0)
# plt.grid(True)
# plt.axis([-74.4, -73.5, 40.55, 40.9])
# plt.show()

### Optimizing the parameters

In [None]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func, verbose=verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.cv_results_, gs.scorer_
    print "Best score: ", gs.best_score_
    best = gs.best_estimator_
    return best

In [None]:
# clf = RandomForestRegressor(n_estimators=20, n_jobs=-1)
clf = AdaBoostClassifier(n_estimators=20)

In [None]:
# %%time
# parameters = {
#     "n_estimators": [50],  
#     "max_features": ["auto"],
#     "max_depth": [50]
# }

parameters = {
    "n_estimators": [50],
    "learning_rate": [1]
}

best = cv_optimize(clf, parameters, Xtrain, ytrain.as_matrix().flatten(), n_folds=5, score_func='neg_mean_squared_error', verbose=3)

### Train with the data

In [None]:
%%time
reg = best.fit(Xtrain, ytrain)
training_accuracy = reg.score(Xtrain, ytrain)
test_accuracy = reg.score(Xtest, ytest)
print "############# based on standard predict ################"
print "R^2 on training data: %0.4f" % (training_accuracy)
print "R^2 on test data:     %0.4f" % (test_accuracy)

In [None]:
best.predict([[-73.990371704101563, 40.734695434570313]])