### Preamble

In [1]:
# Configure libraries

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error

In [2]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    """Grid-search `parameters` for `clf` with cross-validation and return the best estimator.

    Parameters
    ----------
    clf : estimator handed to GridSearchCV.
    parameters : dict passed to GridSearchCV as `param_grid`.
    X, y : training features and target passed to `GridSearchCV.fit`.
    n_jobs : number of parallel jobs forwarded to GridSearchCV.
    n_folds : passed to GridSearchCV as `cv`.
    score_func : optional value forwarded as `scoring`; when it is falsy the
        grid search is built without a custom scorer.

    Returns
    -------
    The `best_estimator_` found by the grid search.

    Side effects
    ------------
    Prints the best parameters, the best score and the full grid scores.
    """
    # scoring=None is GridSearchCV's default, so a single call covers both
    # the "custom scorer" and the "no scorer given" cases.
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs,
                      scoring=score_func if score_func else None)
    gs.fit(X, y)
    # One pre-formatted string prints identically under Python 2 and Python 3.
    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
    return gs.best_estimator_

###  Reading the data

In [3]:
# Every row of the CSV has the layout:
# ((time_cat, time_num, time_cos, time_sin, day_cat, day_num, day_cos, day_sin, weekend, geohash), number of pickups)
names = [
    "time_cat", "time_num", "time_cos", "time_sin",
    "day_cat", "day_num", "day_cos", "day_sin",
    "weekend", "geohash", "pickups",
]
# The file has no header row, so the column names are supplied explicitly
dftaxi = pd.read_csv("./data/taxi_data.csv", header=None, names=names)
print(dftaxi.shape)

# Optional: work on a subset of the rows while experimenting
# n = 50000
# dftaxi = dftaxi.head(n)
dftaxi.head()

(312330, 11)


Unnamed: 0,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups
0,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5rst,5476
1,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5xep,3
2,02:00,0.104167,0.793353,0.608761,Wednesday,0.300595,-0.312572,0.949894,0,dr5rek,18
3,05:00,0.229167,0.130526,0.991445,Thursday,0.46131,-0.970597,0.240712,0,dr72qr,54
4,08:00,0.354167,-0.608761,0.793353,Tuesday,0.193452,0.347871,0.937542,0,dr70jn,2


### Define training and test sets

In [4]:
# Split the row positions 80/20 into training and test indices. The fixed
# random_state makes the split (and every score derived from it) repeatable
# across kernel restarts.
n_rows = dftaxi.shape[0]
itrain, itest = train_test_split(np.arange(n_rows), train_size=0.8, random_state=0)

# Boolean mask over all rows: True marks a training row, False a test row
mask = np.zeros(n_rows, dtype=bool)
mask[itrain] = True
mask[:10]

array([ True,  True, False, False, False,  True, False,  True,  True, False], dtype=bool)

### Final preparation for machine learning

In [57]:
# Split off the features
Xnames = ["time_cat", "time_num", "time_cos", "time_sin", "day_cat",
          "day_num", "day_cos", "day_sin", "weekend", "geohash"]
# Take an explicit copy: X gets new columns assigned later on, and assigning
# into a plain column selection of dftaxi triggers pandas' SettingWithCopyWarning.
X = dftaxi[Xnames].copy()

# Split off the target (which will be the logarithm of the number of pickups (+1))
y = np.log10(dftaxi['pickups']+1)

In [58]:
# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    """Return one component of the decoded geohash `geo`.

    `which` indexes the value returned by `geohash.decode`; this notebook
    calls it with 0 for the latitude and 1 for the longitude.
    Any geohash that is not exactly 6 characters long yields 0.
    """
    if len(geo) != 6:
        return 0
    return geohash.decode(geo)[which]
# Add the decoded coordinates as columns: component 0 -> latitude, component 1 -> longitude
for position, column in enumerate(['latitude', 'longitude']):
    X[column] = X['geohash'].apply(lambda geo, which=position: decodegeo(geo, which))

In [59]:
# Preview the first rows of the feature frame, including the decoded latitude/longitude columns
X.head()

Unnamed: 0,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,latitude,longitude
0,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5rst,40.723572,-73.97644
1,22:00,0.9375,0.92388,-0.382683,Monday,0.133929,0.666347,0.745642,0,dr5xep,40.734558,-73.690796
2,02:00,0.104167,0.793353,0.608761,Wednesday,0.300595,-0.312572,0.949894,0,dr5rek,40.718079,-74.031372
3,05:00,0.229167,0.130526,0.991445,Thursday,0.46131,-0.970597,0.240712,0,dr72qr,40.866394,-73.899536
4,08:00,0.354167,-0.608761,0.793353,Tuesday,0.193452,0.347871,0.937542,0,dr70jn,40.816956,-74.30603


In [60]:
# Create indicator variables for the hours and days of the week and drop the categorical values.
# (Disabled alternative: also add indicators for a geohash prefix of length g = 5,
#  i.e. pd.get_dummies(X['geohash'].str[:g]).)
hour_dummies = pd.get_dummies(X['time_cat'])
weekday_dummies = pd.get_dummies(X['day_cat'])
X = X.join(hour_dummies).join(weekday_dummies)
X = X.drop(['time_cat', 'day_cat', 'geohash'], axis=1)

In [61]:
# Summarize the columns of the final feature frame. (A bare X.head() placed before
# this call was never displayed, because only a cell's last expression is rendered.)
X.info() # http://pandas.pydata.org/pandas-docs/stable/faq.html

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312330 entries, 0 to 312329
Data columns (total 40 columns):
time_num     312330 non-null float64
time_cos     312330 non-null float64
time_sin     312330 non-null float64
day_num      312330 non-null float64
day_cos      312330 non-null float64
day_sin      312330 non-null float64
weekend      312330 non-null int64
latitude     312330 non-null float64
longitude    312330 non-null float64
00:00        312330 non-null float64
01:00        312330 non-null float64
02:00        312330 non-null float64
03:00        312330 non-null float64
04:00        312330 non-null float64
05:00        312330 non-null float64
06:00        312330 non-null float64
07:00        312330 non-null float64
08:00        312330 non-null float64
09:00        312330 non-null float64
10:00        312330 non-null float64
11:00        312330 non-null float64
12:00        312330 non-null float64
13:00        312330 non-null float64
14:00        312330 non-null float64
15:

### Get the training and test sets

In [62]:
# Use the boolean mask to separate the training rows from the test rows
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]

# Dimensions of the training feature matrix
n_samples, n_features = Xtrain.shape
Xtrain.head()

Unnamed: 0,time_num,time_cos,time_sin,day_num,day_cos,day_sin,weekend,latitude,longitude,00:00,01:00,02:00,03:00,04:00,05:00,06:00,07:00,08:00,09:00,10:00,11:00,12:00,13:00,14:00,15:00,16:00,17:00,18:00,19:00,20:00,21:00,22:00,23:00,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0.9375,0.92388,-0.382683,0.133929,0.666347,0.745642,0,40.723572,-73.97644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
1,0.9375,0.92388,-0.382683,0.133929,0.666347,0.745642,0,40.734558,-73.690796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
5,0.229167,0.130526,0.991445,0.032738,0.978918,0.204252,0,40.641174,-73.844604,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7,0.020833,0.991445,0.130526,0.28869,-0.240712,0.970597,0,40.619202,-73.943481,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8,0.770833,0.130526,-0.991445,0.252976,-0.018699,0.999825,0,40.657654,-73.723755,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


### Random Forest Regression

In [11]:
# Random Forest regressor that serves as the base estimator for the grid search below
estimator = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)

In [13]:
%%time
# Define a grid of parameters over which to optimize the random forest
# We will figure out which number of trees is optimal
parameters = {"n_estimators": [1,2,5,10,20,50,100], "max_features": ["auto"]} # ["auto","sqrt","log2"]
best = cv_optimize(estimator, parameters, Xtrain, ytrain, score_func='mean_squared_error')

BEST {'max_features': 'auto', 'n_estimators': 100} -0.0206022292971 [mean: -0.04350, std: 0.00059, params: {'max_features': 'auto', 'n_estimators': 1}, mean: -0.03192, std: 0.00044, params: {'max_features': 'auto', 'n_estimators': 2}, mean: -0.02500, std: 0.00024, params: {'max_features': 'auto', 'n_estimators': 5}, mean: -0.02263, std: 0.00026, params: {'max_features': 'auto', 'n_estimators': 10}, mean: -0.02150, std: 0.00022, params: {'max_features': 'auto', 'n_estimators': 20}, mean: -0.02083, std: 0.00018, params: {'max_features': 'auto', 'n_estimators': 50}, mean: -0.02060, std: 0.00017, params: {'max_features': 'auto', 'n_estimators': 100}]
Wall time: 26min 24s


### Evaluate the results

In [14]:
# Fit the best Random Forest and calculate R^2 values for training and test sets
reg = best.fit(Xtrain, ytrain)

# Despite their names, both variables hold the value returned by reg.score (reported as R^2)
training_accuracy = reg.score(Xtrain, ytrain)
test_accuracy = reg.score(Xtest, ytest)

# Assemble the three report lines and emit them in one go
report_lines = [
    "############# based on standard predict ################",
    "R^2 on training data: %0.4f" % (training_accuracy),
    "R^2 on test data:     %0.4f" % (test_accuracy),
]
print("\n".join(report_lines))


############# based on standard predict ################
R^2 on training data: 0.9976
R^2 on test data:     0.9829


In [15]:
# Show some of the predictions vs. the real number of pickups
# Column 0: prediction, column 1: real value, both still as log10(pickups + 1)
log_pairs = np.column_stack((reg.predict(Xtest), ytest))
# Undo the log transform and round to whole pickups
pickup_pairs = np.power(10, log_pairs) - 1
np.round(pickup_pairs, decimals=0).astype(int)

array([[14, 18],
       [82, 54],
       [ 1,  2],
       ..., 
       [69, 43],
       [ 1,  1],
       [78, 73]])

In [16]:
# Calculate the Root Mean Squared Error on the test set, in units of the
# log10(pickups + 1) target. Arguments follow the (y_true, y_pred) order of
# mean_squared_error; the value itself does not depend on the order.
np.sqrt(mean_squared_error(ytest, reg.predict(Xtest)))

0.13874775789291882

In [17]:
# What are the most important features?
import operator

# Pair every feature column with the importance the fitted forest assigned to it
feature_names = list(X.columns.values)
dict_feat_imp = dict(zip(feature_names, reg.feature_importances_))

# Rank the (name, importance) pairs from most to least important
by_importance = operator.itemgetter(1)
sorted_features = sorted(dict_feat_imp.items(), key=by_importance, reverse=True)
sorted_features

[('longitude', 0.55106921611840609),
 ('latitude', 0.40910715177375095),
 ('time_num', 0.01326481573615192),
 ('time_cos', 0.0055988625564140195),
 ('day_sin', 0.0049410718365886395),
 ('day_num', 0.0043325735479966951),
 ('time_sin', 0.0038848278821744277),
 ('day_cos', 0.0024896997185055768),
 ('03:00', 0.00032909317599588814),
 ('weekend', 0.00032382335834664917),
 ('08:00', 0.00027236043093186422),
 ('Friday', 0.00024653499530508246),
 ('04:00', 0.00023754211740766067),
 ('Wednesday', 0.00022642512634473905),
 ('07:00', 0.00020736443263761771),
 ('06:00', 0.00020342575310213214),
 ('Saturday', 0.0001954715331625305),
 ('05:00', 0.00018939232506098142),
 ('02:00', 0.00017836012836718241),
 ('Tuesday', 0.00017088124382944159),
 ('09:00', 0.00016160841671645107),
 ('16:00', 0.00015316886248659798),
 ('Sunday', 0.00015006319757993912),
 ('15:00', 0.00014802743667468798),
 ('10:00', 0.00014684024605048998),
 ('20:00', 0.00013840505018140816),
 ('21:00', 0.00013714731945380096),
 ('19:00

### Create predictions for visualization

Now we are going to generate predictions that we can visualize in Tableau.

#### First we need a dataframe with all possible combinations of time and location

In [113]:
# Build one frame holding every distinct time (time_data) and one holding
# every distinct location (loc_data)

# Columns about location
loc_cols = ['latitude', 'longitude']

# Columns about time: every feature column that is not a location column
time_cols = [col for col in X.columns.values if col not in loc_cols]

# Unique times
time_data = X.drop(loc_cols, axis=1).drop_duplicates()

# Unique locations
loc_data = X.drop(time_cols, axis=1).drop_duplicates()

In [90]:
# Cross join: give both frames the same constant key so that merging on it
# pairs every time with every location
for frame in (time_data, loc_data):
    frame['key'] = 1

# Merge, discard the helper key, and order the columns like the training features
result = pd.merge(time_data, loc_data, on='key')
result = result.drop(['key'], axis=1)[Xtrain.columns.values]

#### Then we do the prediction

In [92]:
# Get the real number of pickups and take care that we can merge it with the predictions,
# by also taking the geohash and the timestamp.
# .copy() gives an independent frame, so adding columns below does not write to a slice of dftaxi.
yy = dftaxi[['geohash','day_num','pickups']].copy()

# Decode the geohash into latitude and longitude
yy['latitude'] = yy['geohash'].apply(lambda geo: decodegeo(geo, 0))
yy['longitude'] = yy['geohash'].apply(lambda geo: decodegeo(geo, 1))

# Feed only the model's feature columns to predict(), so re-running this cell does not
# pass previously added prediction/pickup columns to the model.
feature_cols = list(Xtrain.columns.values)
predictions = result[feature_cols].copy()

# The target was log10(pickups + 1), so the inverse transform is 10**prediction - 1
predictions['pred_pickups'] = np.power(10, reg.predict(predictions[feature_cols])) - 1

# Merge the predictions and the real pickups
result = pd.merge(predictions, yy, how='left', on=['day_num', 'latitude', 'longitude']).drop(['geohash'], axis=1)
result.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312330 entries, 0 to 312329
Data columns (total 5 columns):
geohash      312330 non-null object
day_num      312330 non-null float64
pickups      312330 non-null int64
latitude     312330 non-null float64
longitude    312330 non-null float64
dtypes: float64(3), int64(1), object(1)
memory usage: 14.3+ MB


In [100]:
# Write to csv
# Saves the `result` frame (predictions merged with the real pickups) under ./data
result.to_csv('./data/taxi-data-predictions.csv')