In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime


In [2]:
# Read the aggregated pickup/dropoff CSV into `df`; the %time line magic reports how long the read takes.
%time df=pd.read_csv("./tmplocaldata/final/groupbydestn/pickup_dropoff_aggregated_with_feat_ext.csv")

Wall time: 23.1 s


In [3]:
# Display the first rows to check which columns were loaded.
df.head()

Unnamed: 0,pickup_geohash,dropoff_geohash,time_num,day_of_week,count,time_sin,time_cos,pickup_lat,pickup_long,dropoff_lat,dropoff_long
0,dr5rtk,dr5rsj,0.416667,4,2,0.5,-0.866025,40.718079,-73.943481,40.723572,-73.998413
1,dr5x0z,dr5xcf,0.683333,0,2,-0.913545,-0.406737,40.646667,-73.789673,40.751038,-73.745728
2,dr72rd,dr782h,0.65,3,2,-0.809017,-0.587785,40.838928,-73.844604,40.849915,-73.822632
3,dr5rvn,dr5rt5,0.083333,2,11,0.5,0.866025,40.77301,-73.954468,40.712585,-73.954468
4,dr72jd,dr72m3,0.05,0,3,0.309017,0.951057,40.794983,-73.932495,40.833435,-73.943481


In [3]:
# Regression target: log10 of the trip count, shifted by one so the
# logarithm stays defined should a count ever be zero.
df['log_count'] = np.log10(df['count'].add(1))

In [5]:
# Use 80% of the rows for training and keep the rest for testing.
# A fixed random_state makes the shuffle repeatable, so re-running the
# notebook evaluates the model on the same held-out rows.
trainSet, testSet = train_test_split(df, train_size=0.8, random_state=42)
# The full frame is not referenced again after the split; drop the name.
del df

In [6]:
# Model inputs: time-of-day encodings, weekday and pickup/dropoff coordinates.
feature_columns = ['time_num', 'time_sin', 'time_cos', 'day_of_week',
                   'pickup_lat', 'pickup_long', 'dropoff_lat', 'dropoff_long']
# Target: the log10(count + 1) column; use 'count' here to model raw counts.
target_column = 'log_count'

X_train, y_train = trainSet[feature_columns], trainSet[target_column]
X_test, y_test = testSet[feature_columns], testSet[target_column]

# Only the feature/target selections are used from here on.
del trainSet, testSet

## Machine learning

In [7]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    """Grid-search `clf` over `parameters` with cross-validation.

    Parameters
    ----------
    clf : estimator handed to GridSearchCV.
    parameters : dict passed to GridSearchCV as `param_grid`.
    X, y : data passed to `GridSearchCV.fit`.
    n_jobs : forwarded to GridSearchCV.
    n_folds : forwarded to GridSearchCV as `cv`.
    score_func : forwarded as `scoring` when truthy; otherwise GridSearchCV
        is left on its default scoring.
    verbose : forwarded to GridSearchCV.

    Returns
    -------
    The `best_estimator_` of the fitted search.

    Side effects
    ------------
    Prints the best parameters, best score, per-candidate grid scores and
    the scorer object, followed by the best score on its own line.
    """
    # scoring=None is GridSearchCV's default, so a single constructor call
    # covers both the "custom scorer" and "default scorer" cases.
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs,
                      scoring=score_func if score_func else None, verbose=verbose)
    gs.fit(X, y)
    # Single pre-formatted string arguments print identically under the
    # Python 2 print statement and the Python 3 print function.
    print("BEST %s %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_, gs.scorer_))
    print("Best score:  %s" % (gs.best_score_,))
    return gs.best_estimator_

In [8]:
# Base forest handed to the grid search. n_jobs=-1 lets the forest use all
# available cores; the fixed random_state makes the fitted trees (and the
# feature importances derived from them) repeatable between runs.
# Any n_estimators value listed in the search grid replaces the 20 set here.
estimator = RandomForestRegressor(n_estimators=20, n_jobs=-1, random_state=42)

In [10]:
%%time
# Parameter grid for the random-forest search. Every entry currently holds a
# single value, so the search evaluates exactly one candidate; add more values
# to a list (e.g. several n_estimators) to actually compare settings.
parameters = {"n_estimators": [10],
              "max_features": ["auto"], # ["auto","sqrt","log2"]
              "max_depth": [10]}
# 2-fold cross-validation (presumably to keep the run short); verbose=3 asks
# GridSearchCV for per-fold progress output.
best = cv_optimize(estimator, parameters, X_train, y_train, n_folds=2, verbose=3)
# Variant that passes an explicit scoring name instead of the default scorer:
#best = cv_optimize(estimator, parameters, X_train, y_train, n_folds=2, score_func='mean_squared_error', verbose=3)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] max_features=auto, n_estimators=10, max_depth=10 ................
[CV]  max_features=auto, n_estimators=10, max_depth=10, score=0.523728 - 1.3min
[CV] max_features=auto, n_estimators=10, max_depth=10 ................
[CV]  max_features=auto, n_estimators=10, max_depth=10, score=0.525264 - 1.3min

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.6min finished



BEST {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 10} 0.524495888196 [mean: 0.52450, std: 0.00077, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 10}] <function _passthrough_scorer at 0x00000000204D27B8>
Best score:  0.524495888196
Wall time: 5min 4s


In [13]:
%%time
rmse = np.sqrt(mean_squared_error(best.predict(X_test),y_test))
print "RMSE = %0.3f (this is in log-space!)" % rmse
print "RMSE in real scale: %0.2f" % np.power(10,rmse)

RMSE = 0.388 (this is in log-space!)
RMSE in real scale: 2.45
Wall time: 1.02 s


In [14]:
import operator

# Map each feature name to its importance in the fitted forest.
dict_feat_imp = {feature: importance
                 for feature, importance in zip(list(X_train.columns.values),
                                                best.feature_importances_)}

# Rank the (feature, importance) pairs from most to least important.
by_importance = operator.itemgetter(1)
sorted_features = sorted(dict_feat_imp.items(), key=by_importance, reverse=True)
sorted_features

[('dropoff_long', 0.30408825024272568),
 ('pickup_long', 0.23172967883312015),
 ('dropoff_lat', 0.22555554859312843),
 ('pickup_lat', 0.21258320917845941),
 ('time_cos', 0.019245191836231027),
 ('time_num', 0.0046247418564218839),
 ('time_sin', 0.0014808130062024325),
 ('day_of_week', 0.00069256645371093648)]

## Appendix

In [11]:
# numbers in normal scale
%%time
print "RMSE= ",np.sqrt(mean_squared_error(best.predict(X_test),y_test))

RMSE=  66.189331534
Wall time: 6.21 s
