### Preamble

In [1]:
# Configure libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error

In [3]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    """Grid-search ``parameters`` for ``clf`` with cross-validation.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid, passed to ``GridSearchCV`` as ``param_grid``.
    X, y : array-like
        Features and target the grid search is fitted on.
    n_jobs : int, optional
        Forwarded to ``GridSearchCV`` as ``n_jobs``.
    n_folds : int, optional
        Forwarded to ``GridSearchCV`` as ``cv``.
    score_func : optional
        Forwarded to ``GridSearchCV`` as ``scoring`` when truthy; any falsy
        value results in ``scoring=None``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted grid search.
    """
    # A falsy score_func is normalised to None, which is GridSearchCV's own
    # default for ``scoring`` -- equivalent to not passing the argument.
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs,
                      scoring=score_func if score_func else None)
    gs.fit(X, y)
    # One pre-formatted string prints the same text whether ``print`` is the
    # Python 2 statement or the Python 3 function.
    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
    return gs.best_estimator_

###  Reading the data

In [2]:
# Layout of every row in the CSV, as described by the original author:
# ((time_cat, time_num, time_cos, time_sin, day_cat, day_num, day_cos, day_sin, weekend, geohash), number of pickups)
names = [
    "time_cat", "time_num", "time_cos", "time_sin",
    "day_cat", "day_num", "day_cos", "day_sin",
    "weekend", "geohash",
    "pickups",
]
# header=None: the first line of the file is read as data, so the column
# names are supplied explicitly.
dftaxi = pd.read_csv("./data/taxi_data.csv", names=names, header=None)
print(dftaxi.shape)

# Optional: uncomment the two lines below to work on only the first n rows
# n = 50000
# dftaxi = dftaxi.head(n)
dftaxi.head()

(312330, 11)


Unnamed: 0,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups
0,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5rst,5476
1,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5xep,3
2,02:00,0.104167,0.793353,0.608761,Wednesday,0.300595,-0.312572,0.949894,0,dr5rek,18
3,05:00,0.229167,0.130526,0.991445,Thursday,0.46131,-0.970597,0.240712,0,dr72qr,54
4,08:00,0.354167,-0.608761,0.793353,Tuesday,0.193452,0.347871,0.937542,0,dr70jn,2


### Define training and test sets

In [4]:
# Hold out 20% of the rows for testing. The split is seeded so that the
# train/test partition -- and every number computed from it further down --
# is the same after a kernel restart.
row_ids = np.arange(dftaxi.shape[0])
itrain, itest = train_test_split(row_ids, train_size=0.8, random_state=0)

# Boolean row mask: True marks a training row, False a test row.
mask = np.ones(dftaxi.shape[0], dtype=bool)
mask[itest] = False
mask[:10]

array([ True, False,  True,  True,  True, False, False,  True,  True,  True], dtype=bool)

### Final preparation for machine learning

In [5]:
# Split off the features
Xnames = ["time_cat", "time_num", "time_cos", "time_sin", "day_cat",
          "day_num", "day_cos", "day_sin", "weekend", "geohash"]
# Take an explicit copy: later cells assign new columns into X, and doing that
# on a column slice of dftaxi raises pandas' SettingWithCopyWarning.
X = dftaxi[Xnames].copy()

# Split off the target (which will be the logarithm of the number of pickups (+1))
y = np.log10(dftaxi['pickups'] + 1)

In [6]:
# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    """Return one component of the decoded geohash ``geo``.

    ``which`` indexes the result of ``geohash.decode(geo)``; the callers in
    this notebook use 0 for latitude and 1 for longitude. A ``geo`` whose
    length is not exactly 6 is not decoded and yields 0 instead.
    """
    if len(geo) != 6:
        return 0
    return geohash.decode(geo)[which]
# Decode every geohash into its coordinates (index 0 -> latitude, 1 -> longitude)
X['latitude'] = X['geohash'].apply(decodegeo, args=(0,))
X['longitude'] = X['geohash'].apply(decodegeo, args=(1,))

In [7]:
# Preview the first rows; latitude and longitude now appear as columns
X.head()

Unnamed: 0,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,latitude,longitude
0,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5rst,40.723572,-73.97644
1,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5xep,40.734558,-73.690796
2,02:00,0.104167,0.793353,0.608761,Wednesday,0.300595,-0.312572,0.949894,0,dr5rek,40.718079,-74.031372
3,05:00,0.229167,0.130526,0.991445,Thursday,0.46131,-0.970597,0.240712,0,dr72qr,40.866394,-73.899536
4,08:00,0.354167,-0.608761,0.793353,Tuesday,0.193452,0.347871,0.937542,0,dr70jn,40.816956,-74.30603


In [8]:
# One-hot encode the hour-of-day and day-of-week labels, then drop the raw
# categorical columns (including the geohash string).
# A disabled experiment also joined indicators for a geohash prefix:
#     pd.get_dummies(X['geohash'].str[:g])  with  g = 5
hour_dummies = pd.get_dummies(X['time_cat'])
day_dummies = pd.get_dummies(X['day_cat'])
X = (X.join(hour_dummies)
      .join(day_dummies)
      .drop(['time_cat', 'day_cat', 'geohash'], axis=1))

In [9]:
# NOTE: only a cell's last expression is rendered, so this head() result is
# computed and then discarded.
X.head()
# Print the column names, non-null counts and dtypes of the feature frame
X.info() # http://pandas.pydata.org/pandas-docs/stable/faq.html

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312330 entries, 0 to 312329
Data columns (total 40 columns):
time_num     312330 non-null float64
time_cos     312330 non-null float64
time_sin     312330 non-null float64
day_num      312330 non-null float64
day_cos      312330 non-null float64
day_sin      312330 non-null float64
weekend      312330 non-null int64
latitude     312330 non-null float64
longitude    312330 non-null float64
00:00        312330 non-null float64
01:00        312330 non-null float64
02:00        312330 non-null float64
03:00        312330 non-null float64
04:00        312330 non-null float64
05:00        312330 non-null float64
06:00        312330 non-null float64
07:00        312330 non-null float64
08:00        312330 non-null float64
09:00        312330 non-null float64
10:00        312330 non-null float64
11:00        312330 non-null float64
12:00        312330 non-null float64
13:00        312330 non-null float64
14:00        312330 non-null float64
15:

### Get the training and test sets

In [10]:
# Partition features and target with the boolean mask (True = training row)
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]
# Dimensions of the training design matrix
n_samples, n_features = Xtrain.shape
Xtrain.head()

Unnamed: 0,time_num,time_cos,time_sin,day_num,day_cos,day_sin,weekend,latitude,longitude,00:00,01:00,02:00,03:00,04:00,05:00,06:00,07:00,08:00,09:00,10:00,11:00,12:00,13:00,14:00,15:00,16:00,17:00,18:00,19:00,20:00,21:00,22:00,23:00,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0.9375,0.92388,-0.382683,0.133929,0.666347,0.745642,0,40.723572,-73.97644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,0.104167,0.793353,0.608761,0.300595,-0.312572,0.949894,0,40.718079,-74.031372,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0.229167,0.130526,0.991445,0.46131,-0.970597,0.240712,0,40.866394,-73.899536,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0.354167,-0.608761,0.793353,0.193452,0.347871,0.937542,0,40.816956,-74.30603,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
7,0.020833,0.991445,0.130526,0.28869,-0.240712,0.970597,0,40.619202,-73.943481,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### k-Nearest Neighbors Regression

In [11]:
# Create a k-Nearest Neighbors Regression estimator with its default settings;
# the number of neighbors is chosen later by the cross-validated grid search
knn_estimator = KNeighborsRegressor()

#### Normalize Data

In [12]:
# Standardise every feature column with the mean and standard deviation of
# the training rows; the test rows are scaled with those same training
# statistics rather than their own.
Xtrain_mean, Xtrain_std_dev = Xtrain.mean(), Xtrain.std()
Xtrain_normalized = Xtrain.sub(Xtrain_mean).div(Xtrain_std_dev)
Xtest_normalized = Xtest.sub(Xtrain_mean).div(Xtrain_std_dev)


In [13]:
%%time
# Define a grid of parameters over which to optimize the knn regressor.
# Cross-validation decides which number of neighbors is optimal.
# Larger grid, currently disabled in favour of the shorter one below:
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
# Candidates are compared with the 'mean_squared_error' scorer on the
# normalized training data; cv_optimize returns the best estimator found.
knn_best = cv_optimize(knn_estimator, knn_parameters, Xtrain_normalized, ytrain, score_func='mean_squared_error')

BEST {'n_neighbors': 2} -0.184266248657 [mean: -0.25184, std: 0.00527, params: {'n_neighbors': 1}, mean: -0.18427, std: 0.00418, params: {'n_neighbors': 2}, mean: -0.18792, std: 0.00331, params: {'n_neighbors': 5}]
CPU times: user 24min 46s, sys: 2.04 s, total: 24min 48s
Wall time: 24min 48s


### k-Nearest Neighbors Regression: Evaluation

In [14]:
# Fit the best k-Nearest Neighbors regressor and calculate R^2 values for the
# training and test sets
knn_reg = knn_best.fit(Xtrain_normalized, ytrain)
# The variables are named "accuracy", but the values are reported as R^2 below
knn_training_accuracy = knn_reg.score(Xtrain_normalized, ytrain)
knn_test_accuracy = knn_reg.score(Xtest_normalized, ytest)
print("############# based on standard predict ################")
print("R^2 on training data: %0.4f" % knn_training_accuracy)
print("R^2 on test data:     %0.4f" % knn_test_accuracy)

############# based on standard predict ################
R^2 on training data: 0.9480
R^2 on test data:     0.8542


In [15]:
# Put predicted and real pickup counts side by side (column 0 = prediction,
# column 1 = real value). Both are on the log10(pickups + 1) scale, so the
# transform is undone before rounding to whole pickups.
knn_log_pairs = np.column_stack((knn_reg.predict(Xtest_normalized), ytest))
knn_count_pairs = np.power(10, knn_log_pairs) - 1
np.round(knn_count_pairs, decimals=0).astype(int)

array([[  2,   3],
       [  1,   1],
       [ 10,  19],
       ..., 
       [  1,   1],
       [276, 870],
       [  9,  43]])

In [16]:
# Root Mean Squared Error on the test set, measured on the log10(pickups + 1)
# scale of the target. Arguments are given in (y_true, y_pred) order.
knn_test_mse = mean_squared_error(ytest, knn_reg.predict(Xtest_normalized))
np.sqrt(knn_test_mse)

0.40400787441828129