### Preamble

In [1]:
# Configure libraries

# Render matplotlib figures inline in the notebook output
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Widen the pandas display so the many-column feature frames stay readable
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Plot styling: white grid background, "poster" plotting context
sns.set_style("whitegrid")
sns.set_context("poster")
# NOTE(review): sklearn.grid_search and sklearn.cross_validation are legacy module
# paths; later scikit-learn releases expose these names from sklearn.model_selection.
# Confirm the pinned scikit-learn version before upgrading.
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
# Third-party geohash decoder, used to turn geohash strings into coordinates
import geohash
from sklearn.metrics import mean_squared_error

In [2]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    """Grid-search `parameters` for `clf` with cross-validation and return the winner.

    Parameters
    ----------
    clf : estimator handed to GridSearchCV.
    parameters : parameter grid, passed to GridSearchCV as `param_grid`.
    X, y : training features and target, passed to `GridSearchCV.fit`.
    n_jobs : number of parallel jobs, forwarded to GridSearchCV.
    n_folds : forwarded to GridSearchCV as `cv`.
    score_func : optional scoring argument; only forwarded (as `scoring`) when
        truthy, otherwise GridSearchCV is left on its default scoring.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.

    Side effects
    ------------
    Prints the best parameters, the best score and all grid scores.
    """
    # Build the keyword arguments once instead of duplicating the constructor call
    search_kwargs = dict(param_grid=parameters, cv=n_folds, n_jobs=n_jobs)
    if score_func:
        search_kwargs["scoring"] = score_func
    gs = GridSearchCV(clf, **search_kwargs)
    gs.fit(X, y)
    # A single pre-formatted string prints identically under Python 2 and Python 3
    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
    return gs.best_estimator_

###  Reading the data

In [67]:
# Each line is of the format:
# ((time_cat, time_num, time_cos, time_sin, day_cat, day_num, day_cos, day_sin, weekend, geohash), number of pickups)
names = ["time_cat", "time_num", "time_cos", "time_sin", "day_cat", "day_num", "day_cos", "day_sin", "weekend", "geohash", "pickups"]
dftaxi = pd.read_csv("./data/taxi_data.csv", header=None, names=names)
# Parenthesised single-argument form prints the same under Python 2 and Python 3
print(dftaxi.shape)

# Optional row cap for quick experiments. Keeping it as an explicit switch (rather
# than commented-out code) makes it visible which data size produced the results.
n = None  # e.g. 50000 to work on the first 50000 rows only
if n is not None:
    dftaxi = dftaxi.head(n)
dftaxi.head()

(312330, 11)


Unnamed: 0,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups
0,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5rst,5476
1,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5xep,3
2,02:00,0.104167,0.793353,0.608761,Wednesday,0.300595,-0.312572,0.949894,0,dr5rek,18
3,05:00,0.229167,0.130526,0.991445,Thursday,0.46131,-0.970597,0.240712,0,dr72qr,54
4,08:00,0.354167,-0.608761,0.793353,Tuesday,0.193452,0.347871,0.937542,0,dr70jn,2


### Define training and test sets

In [4]:
# Split the row positions 80/20 into training and test indices. The seed is fixed so
# that the split (and every number derived from it) is reproducible across runs.
n_rows = dftaxi.shape[0]
itrain, itest = train_test_split(np.arange(n_rows), train_size=0.8, random_state=0)

# Boolean mask over all rows: True marks a training row, False a test row
mask = np.zeros(n_rows, dtype=bool)
mask[itrain] = True
mask[:10]

array([ True, False,  True,  True, False,  True,  True,  True,  True, False], dtype=bool)

### Final preparation for machine learning

In [20]:
# Split off the features
Xnames = ["time_cat", "time_num", "time_cos", "time_sin", "day_cat",
          "day_num", "day_cos", "day_sin", "weekend", "geohash"]
# Take an explicit copy: new columns are assigned to X later on, and assigning into
# a slice of dftaxi would otherwise trigger pandas' SettingWithCopyWarning.
X = dftaxi[Xnames].copy()

# Split off the target (which will be the logarithm of the number of pickups (+1))
y = np.log10(dftaxi['pickups'] + 1)

In [21]:
def decodegeo(geo, which):
    """Return one component of the decoded geohash `geo`.

    `which` indexes the result of `geohash.decode(geo)`; the notebook uses 0 for
    the latitude column and 1 for the longitude column. Any geohash that is not
    exactly 6 characters long is not decoded and yields 0 instead.
    """
    if len(geo) != 6:
        return 0
    return geohash.decode(geo)[which]
# Decode every geohash into its two coordinate columns; the extra positional
# argument selects which component of the decoded pair is kept.
X['latitude'] = X['geohash'].apply(decodegeo, args=(0,))
X['longitude'] = X['geohash'].apply(decodegeo, args=(1,))

In [22]:
# Preview the first rows, now including the decoded latitude/longitude columns
X.head()

Unnamed: 0,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,latitude,longitude
0,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5rst,40.723572,-73.97644
1,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5xep,40.734558,-73.690796
2,02:00,0.104167,0.793353,0.608761,Wednesday,0.300595,-0.312572,0.949894,0,dr5rek,40.718079,-74.031372
3,05:00,0.229167,0.130526,0.991445,Thursday,0.46131,-0.970597,0.240712,0,dr72qr,40.866394,-73.899536
4,08:00,0.354167,-0.608761,0.793353,Tuesday,0.193452,0.347871,0.937542,0,dr70jn,40.816956,-74.30603


In [24]:
# One-hot encode the hour-of-day and day-of-week labels, then drop the original
# categorical columns (and the raw geohash string) so only numeric features remain.
# (Disabled experiment: also one-hot encoding the first g = 5 geohash characters.)
hour_dummies = pd.get_dummies(X['time_cat'])
weekday_dummies = pd.get_dummies(X['day_cat'])
X = (
    X.join(hour_dummies)
     .join(weekday_dummies)
     .drop(['time_cat', 'day_cat', 'geohash'], axis=1)
)

In [25]:
# Only the last expression of a cell is rendered, so the head() call below produces
# no output here; info() prints each column's non-null count and dtype.
X.head()
X.info() # http://pandas.pydata.org/pandas-docs/stable/faq.html

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 40 columns):
time_num     50000 non-null float64
time_cos     50000 non-null float64
time_sin     50000 non-null float64
day_num      50000 non-null float64
day_cos      50000 non-null float64
day_sin      50000 non-null float64
weekend      50000 non-null int64
latitude     50000 non-null float64
longitude    50000 non-null float64
00:00        50000 non-null float64
01:00        50000 non-null float64
02:00        50000 non-null float64
03:00        50000 non-null float64
04:00        50000 non-null float64
05:00        50000 non-null float64
06:00        50000 non-null float64
07:00        50000 non-null float64
08:00        50000 non-null float64
09:00        50000 non-null float64
10:00        50000 non-null float64
11:00        50000 non-null float64
12:00        50000 non-null float64
13:00        50000 non-null float64
14:00        50000 non-null float64
15:00        50000 non-null f

### Get the training and test sets

In [26]:
# The mask was built from dftaxi in an earlier cell; fail loudly if the data was
# reloaded or truncated since, instead of silently selecting the wrong rows.
assert len(mask) == len(X) == len(y), "mask, X and y must cover the same rows"

# True entries of the mask select the training rows, False entries the test rows
Xtrain, Xtest = X[mask], X[~mask]
ytrain, ytest = y[mask], y[~mask]
n_samples, n_features = Xtrain.shape
Xtrain.head()

Unnamed: 0,time_num,time_cos,time_sin,day_num,day_cos,day_sin,weekend,latitude,longitude,00:00,01:00,02:00,03:00,04:00,05:00,06:00,07:00,08:00,09:00,10:00,11:00,12:00,13:00,14:00,15:00,16:00,17:00,18:00,19:00,20:00,21:00,22:00,23:00,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0.9375,0.92388,-0.382683,0.133929,0.666347,0.745642,0,40.723572,-73.97644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,0.104167,0.793353,0.608761,0.300595,-0.312572,0.949894,0,40.718079,-74.031372,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0.229167,0.130526,0.991445,0.46131,-0.970597,0.240712,0,40.866394,-73.899536,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5,0.229167,0.130526,0.991445,0.032738,0.978918,0.204252,0,40.641174,-73.844604,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,0.9375,0.92388,-0.382683,0.276786,-0.167506,0.985871,0,40.586243,-73.965454,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


### Random Forest Regression

In [46]:
# Random forest regressor used as the base estimator: seeded for repeatable fits,
# 100 trees, and n_jobs=-1 to train on all available cores.
estimator = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
)

In [61]:
# Define a grid of parameters over which to optimize the random forest
# We will figure out which number of trees is optimal
%%time
parameters = {"n_estimators": [1,2,5,10,20,50,100], "max_features": ["auto"]} # ["auto","sqrt","log2"]
best = cv_optimize(estimator, parameters, Xtrain, ytrain, score_func='mean_squared_error')

BEST {'max_features': 'auto', 'n_estimators': 100} -0.0418280392681 [mean: -0.09901, std: 0.00618, params: {'max_features': 'auto', 'n_estimators': 1}, mean: -0.06986, std: 0.00392, params: {'max_features': 'auto', 'n_estimators': 2}, mean: -0.05285, std: 0.00255, params: {'max_features': 'auto', 'n_estimators': 5}, mean: -0.04684, std: 0.00149, params: {'max_features': 'auto', 'n_estimators': 10}, mean: -0.04411, std: 0.00086, params: {'max_features': 'auto', 'n_estimators': 20}, mean: -0.04234, std: 0.00091, params: {'max_features': 'auto', 'n_estimators': 50}, mean: -0.04183, std: 0.00070, params: {'max_features': 'auto', 'n_estimators': 100}]
CPU times: user 7min 57s, sys: 8.77 s, total: 8min 6s
Wall time: 5min 26s


### Evaluate the results

In [62]:
# Fit the best Random Forest and calculate R^2 values for training and test sets
# NOTE(review): GridSearchCV refits the best estimator by default, so this extra
# fit may be redundant -- confirm before removing it.
reg = best.fit(Xtrain, ytrain)
# For a regressor, score() is the R^2 value (the variable names say "accuracy")
training_accuracy = reg.score(Xtrain, ytrain)
test_accuracy = reg.score(Xtest, ytest)
# Single-argument print(...) calls behave the same under Python 2 and Python 3
print("############# based on standard predict ################")
print("R^2 on training data: %0.4f" % training_accuracy)
print("R^2 on test data:     %0.4f" % test_accuracy)


############# based on standard predict ################
R^2 on training data: 1.00
R^2 on test data:     0.97


In [63]:
# Predicted vs. real number of pickups on the test set, one row per sample.
# Both columns live in log10(pickups + 1) space, so undo the transform
# (10**value - 1) and round to whole pickups.
log_pairs = np.column_stack((reg.predict(Xtest), ytest))
pickup_pairs = np.power(10, log_pairs) - 1
np.round(pickup_pairs, decimals=0).astype(int)

array([[    3,     4],
       [    2,     3],
       [11088, 11952],
       ..., 
       [    2,     4],
       [    2,     2],
       [    5,     8]])

In [69]:
# Root Mean Squared Error on the test set, measured in log10(pickups + 1) units.
# Arguments are given in the conventional (y_true, y_pred) order; the squared
# error is symmetric, so the value is the same either way round.
test_predictions = reg.predict(Xtest)
np.sqrt(mean_squared_error(ytest, test_predictions))

0.19428735966633445

In [74]:
# What are the most important features?
import operator

# Map every feature column to the importance the fitted forest assigned to it
dict_feat_imp = {
    feature: importance
    for feature, importance in zip(X.columns.values, reg.feature_importances_)
}

# Rank the (feature, importance) pairs from most to least important
by_importance = operator.itemgetter(1)
sorted_features = sorted(dict_feat_imp.items(), key=by_importance, reverse=True)
sorted_features

[('longitude', 0.55876141784035194),
 ('latitude', 0.39422900460091148),
 ('time_num', 0.011215205214659719),
 ('day_sin', 0.0064374833697586594),
 ('day_num', 0.0056726277733164321),
 ('time_cos', 0.005514918508135248),
 ('time_sin', 0.0048942741195212962),
 ('day_cos', 0.0043641218411276106),
 ('03:00', 0.00058260510586353672),
 ('04:00', 0.00052352179813915122),
 ('Friday', 0.0004469999455339973),
 ('Wednesday', 0.00042054321780137497),
 ('Saturday', 0.00038336550502161165),
 ('02:00', 0.00037829987499232942),
 ('08:00', 0.00034495692799035356),
 ('Tuesday', 0.00033026301166975898),
 ('19:00', 0.00029073486233320366),
 ('05:00', 0.00027808113569394101),
 ('weekend', 0.00027744968673522617),
 ('07:00', 0.00026795391773462443),
 ('16:00', 0.00025825324065636512),
 ('15:00', 0.00025716278950360921),
 ('09:00', 0.00025666948033309226),
 ('01:00', 0.00024972760581700565),
 ('12:00', 0.00024253898691489959),
 ('10:00', 0.00023825119293826341),
 ('20:00', 0.00023343971109137166),
 ('22:00'