In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime
import geopy as gp
from geopy.distance import vincenty

In [2]:
# Load the aggregated pickup/dropoff table (with extracted features) from the local CSV into `df`; %time reports how long the read takes.
%time df=pd.read_csv("./tmplocaldata/final/groupbydestn/pickup_dropoff_aggregated_with_feat_ext.csv")

Wall time: 30.6 s


In [3]:
def getLatLongBasedOnGeocode(lat, longitude, accuracy = 6):
    """Round-trip a coordinate through a geohash and return the decoded pair.

    The point is encoded with ``geohash.encode`` (``accuracy`` is forwarded as
    its third argument) and the resulting hash is immediately decoded again, so
    every input that encodes to the same hash yields the same output.

    Returns:
        A ``(lat, long)`` tuple taken from the first two items of
        ``geohash.decode``'s result.
    """
    decoded = geohash.decode(geohash.encode(lat, longitude, accuracy))
    snapped_lat = decoded[0]
    snapped_long = decoded[1]
    return snapped_lat, snapped_long

In [4]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,pickup_geohash,dropoff_geohash,time_num,day_of_week,count,time_sin,time_cos,pickup_lat,pickup_long,dropoff_lat,dropoff_long
0,dr5rtk,dr5rsj,0.416667,4,2,0.5,-0.866025,40.718079,-73.943481,40.723572,-73.998413
1,dr5x0z,dr5xcf,0.683333,0,2,-0.913545,-0.406737,40.646667,-73.789673,40.751038,-73.745728
2,dr72rd,dr782h,0.65,3,2,-0.809017,-0.587785,40.838928,-73.844604,40.849915,-73.822632
3,dr5rvn,dr5rt5,0.083333,2,11,0.5,0.866025,40.77301,-73.954468,40.712585,-73.954468
4,dr72jd,dr72m3,0.05,0,3,0.309017,0.951057,40.794983,-73.932495,40.833435,-73.943481


In [7]:
# Decode every pickup/dropoff geohash back to a (lat, long) pair and store the
# two halves as new *_lat_from_gh / *_long_from_gh columns on df.
# Series.map calls geohash.decode once per value of the single column it needs,
# instead of df.apply(..., axis=1), which hands the lambda a full row each time.
for endpoint in ("pickup", "dropoff"):
    decoded_pairs = df[endpoint + "_geohash"].map(geohash.decode)
    df[endpoint + "_lat_from_gh"], df[endpoint + "_long_from_gh"] = zip(*decoded_pairs)

In [8]:
# Preview again to confirm the decoded *_from_gh columns are now part of df.
df.head()

Unnamed: 0,pickup_geohash,dropoff_geohash,time_num,day_of_week,count,time_sin,time_cos,pickup_lat,pickup_long,dropoff_lat,dropoff_long,pickup_lat_from_gh,pickup_long_from_gh,dropoff_lat_from_gh,dropoff_long_from_gh
0,dr5rtk,dr5rsj,0.416667,4,2,0.5,-0.866025,40.718079,-73.943481,40.723572,-73.998413,40.718079,-73.943481,40.723572,-73.998413
1,dr5x0z,dr5xcf,0.683333,0,2,-0.913545,-0.406737,40.646667,-73.789673,40.751038,-73.745728,40.646667,-73.789673,40.751038,-73.745728
2,dr72rd,dr782h,0.65,3,2,-0.809017,-0.587785,40.838928,-73.844604,40.849915,-73.822632,40.838928,-73.844604,40.849915,-73.822632
3,dr5rvn,dr5rt5,0.083333,2,11,0.5,0.866025,40.77301,-73.954468,40.712585,-73.954468,40.77301,-73.954468,40.712585,-73.954468
4,dr72jd,dr72m3,0.05,0,3,0.309017,0.951057,40.794983,-73.932495,40.833435,-73.943481,40.794983,-73.932495,40.833435,-73.943481


In [9]:
# Predictors: the time features, day of week and the geohash-decoded pickup point.
feature_columns = ['time_num', 'time_sin', 'time_cos', 'day_of_week',
                   'pickup_lat_from_gh', 'pickup_long_from_gh']
# Targets: the geohash-decoded dropoff point (two outputs per row).
target_columns = ['dropoff_lat_from_gh', 'dropoff_long_from_gh']
X = df[feature_columns]
y = df[target_columns]

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    """Grid-search ``parameters`` for ``clf`` with cross-validation and return the best estimator.

    Args:
        clf: Estimator handed to ``GridSearchCV``.
        parameters: Parameter grid, passed as ``param_grid``.
        X, y: Training inputs and targets the search is fitted on.
        n_jobs: Passed through to ``GridSearchCV``.
        n_folds: Passed as ``cv``.
        score_func: Optional scoring argument; only forwarded (as ``scoring``)
            when truthy, otherwise ``GridSearchCV`` uses its own default.
        verbose: Passed through to ``GridSearchCV``.

    Returns:
        ``best_estimator_`` of the fitted search.

    Side effects:
        Prints the best parameters, best score, per-candidate grid scores and
        the scorer, followed by the best score on its own line.
    """
    search_kwargs = dict(param_grid=parameters, cv=n_folds, n_jobs=n_jobs, verbose=verbose)
    if score_func:
        search_kwargs["scoring"] = score_func
    gs = GridSearchCV(clf, **search_kwargs)
    gs.fit(X, y)
    # Single-string print calls give the same text under Python 2 and Python 3
    # (a multi-argument print(...) would show a tuple on Python 2).
    summary = ("BEST", gs.best_params_, gs.best_score_, gs.grid_scores_, gs.scorer_)
    print(" ".join(str(item) for item in summary))
    print("%s %s" % ("Best score: ", gs.best_score_))
    return gs.best_estimator_

In [11]:
# Base forest handed to the grid search. random_state pins the forest's random
# sampling so repeated runs give the same model and scores (same seed as the
# train/test split); n_jobs=-1 lets the forest use all available cores.
estimator = RandomForestRegressor(n_estimators=20, n_jobs=-1, random_state=42)

In [12]:
%%time

# Define a grid of parameters over which to optimize the random forest
# We will figure out which number of trees is optimal
parameters = {"n_estimators": [20],
              "max_features": ["auto"], # ["auto","sqrt","log2"]
              "max_depth": [20]}
best = cv_optimize(estimator, parameters, X_train, y_train, n_folds=5, verbose=2)
#best = cv_optimize(estimator, parameters, X_train, y_train, n_folds=2, score_func='mean_squared_error', verbose=3)
print "RMSE= ",np.sqrt(mean_squared_error(best.predict(X_test),y_test))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] max_features=auto, n_estimators=20, max_depth=20 ................
[CV] ....... max_features=auto, n_estimators=20, max_depth=20 - 3.9min
[CV] max_features=auto, n_estimators=20, max_depth=20 ................
[CV] ....... max_features=auto, n_estimators=20, max_depth=20 - 3.9min
[CV] max_features=auto, n_estimators=20, max_depth=20 ................
[CV] ....... max_features=auto, n_estimators=20, max_depth=20 - 3.9min
[CV] max_features=auto, n_estimators=20, max_depth=20 ................
[CV] ....... max_features=auto, n_estimators=20, max_depth=20 - 3.9min
[CV] max_features=auto, n_estimators=20, max_depth=20 ................
[CV] ....... max_features=auto, n_estimators=20, max_depth=20 - 3.9min

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  3.9min
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 19.4min finished



BEST {'max_features': 'auto', 'n_estimators': 20, 'max_depth': 20} 0.141270390462 [mean: 0.14127, std: 0.01244, params: {'max_features': 'auto', 'n_estimators': 20, 'max_depth': 20}] <function _passthrough_scorer at 0x00000000204EC7B8>
Best score:  0.141270390462
RMSE=  0.114326475401
Wall time: 24min 13s


In [None]:
# Side-by-side table of predicted and actual dropoff coordinates for the test
# split: the two predicted columns first, then the two actual columns.
predicted_coords = best.predict(X_test)
actual_coords = y_test.values
comparison_columns = ["pred_dropoff_lat_from_gh", "pred_dropoff_long_from_gh",
                      "act_dropoff_lat_from_gh", "act_dropoff_long_from_gh"]
actualPredictedDf = pd.DataFrame(np.hstack((predicted_coords, actual_coords)),
                                 columns=comparison_columns)
actualPredictedDf.head()