### Preamble

Loading necessary libraries.

In [1]:
# Configure libraries

%matplotlib inline
import numpy as np
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime

In [2]:
# Funtion for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func, verbose=verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_, gs.scorer_
    print "Best score: ", gs.best_score_
    best = gs.best_estimator_
    return best

###  Reading the taxi data

Read in the training and validation data. Training data is from January 2013 till February 2015. Validation data is from March and April 2015. This data was prepared on an Amazon EC2 Spark cluster, transferred to S3 and then downloaded. It was converted from an RDD to a csv using the [ConvertRDDtoCSV](https://github.com/sdaulton/TaxiPrediction/blob/master/3.%20ConvertRDDtoCSV.ipynb) notebook.

In [3]:
# The prepared CSV files have no header row; the columns are, in order:
#   year, month, day, time_cat, time_num, time_cos, time_sin,
#   day_cat, day_num, day_cos, day_sin, weekend, geohash, pickups
# The last column (pickups) is the number of pickups for the row.
names = ["year","month","day","time_cat", "time_num", "time_cos", "time_sin", "day_cat", "day_num", "day_cos", "day_sin", "weekend", "geohash", "pickups"]
dftrain=pd.read_csv("./data/taxi_data_training.csv", header=None, names = names)
print dftrain.shape

(7238532, 14)


In [4]:
# The validation file is read with the same column names as the training file
dfvalid=pd.read_csv("./data/taxi_data_validation.csv", header=None, names = names)
print dfvalid.shape

(1569528, 14)


### Reading and preparing the weather data

Here we read the weather data pulled from NOAA for New York City Central Park. It contains, among other things, daily precipitation, snowfall, snow depth, wind speed, and minimum and maximum temperatures.

In [5]:
# Get the weather data

weather = pd.read_csv("./data/nyc-weather-data.csv")

# Values of -9999 or lower in these columns are replaced by 0.
# .loc is used for the boolean-mask assignment (.ix is deprecated).
# NOTE(review): PRCP, TMAX and TMIN are not cleaned this way - confirm that
# they never contain the -9999 marker.
for sentinel_col in ['SNWD', 'SNOW', 'AWND']:
    weather.loc[weather[sentinel_col] <= -9999, sentinel_col] = 0

# DATE is an integer of the form YYYYMMDD; split it into the merge keys.
# Floor division keeps the keys integer-typed.
weather['year'] = weather['DATE'] // 10000
weather['month'] = weather['DATE'].mod(10000) // 100
weather['day'] = weather['DATE'].mod(100)

# .copy() makes the selection an independent frame, so the column assignments
# below do not write to a slice of the original frame.
weather = weather[['year','month','day','PRCP','SNWD','SNOW','TMAX','TMIN','AWND']].copy()

# Rescale the raw values.
# NOTE(review): presumably the source stores tenths of a unit and the factor
# 3.6 converts m/s to km/h - confirm against the data documentation.
weather['PRCP'] = weather['PRCP'] / 10.
weather['TMAX'] = weather['TMAX'] / 10.
weather['TMIN'] = weather['TMIN'] / 10.
weather['AWND'] = weather['AWND'] / 10. * 3.6

# Give the columns readable names (same order as the selection above)
weather.columns = ['year','month','day','precipitation','snow_depth','snowfall','max_temp','min_temp','avg_wind']
weather.head()

Unnamed: 0,year,month,day,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind
0,2009,1,1,0,0,0,-3.3,-9.4,18.0
1,2009,1,2,0,0,0,1.1,-5.0,10.08
2,2009,1,3,0,0,0,3.3,-1.7,16.2
3,2009,1,4,0,0,0,5.6,-3.9,12.24
4,2009,1,5,0,0,0,6.1,3.3,11.16


### Final preparation for machine learning

I tried initially doing one-hot-encoding on the geohashes to generate numerical features, but I quickly realized that this was not feasible from a memory perspective. Millions of records times tens of thousands of features would not fit in memory. So I decided to go for the numerical latitude and longitude route. Using a random forest, we can easily detect higher order structures in these two variables.

First we define some functions to further prepare the data.

In [6]:
# Get the position of a date within its year as a fraction:
# January 1 gives 0.0 and December 31 gives a value just below 1.0.
def get_yearday(df):
    """Return how far through its year the date in `df` lies, as a float in [0, 1).

    df -- a row-like object (DataFrame row, dict, ...) with 'year', 'month'
          and 'day' entries. The entries are converted with int(), so
          float-typed values such as 2015.0 are accepted.

    The day of the year (0-based) is divided by the actual number of days in
    that year. For non-leap years this equals (day_of_year - 1) / 365; for leap
    years December 31 no longer maps to 1.0, which would coincide with
    January 1 once the value is turned into sine/cosine features.

    Raises whatever datetime.date raises for an invalid date.
    """
    year, month, day = int(df['year']), int(df['month']), int(df['day'])
    day_of_year = datetime.date(year, month, day).timetuple().tm_yday
    # tm_yday of December 31 is the length of the year (365 or 366)
    days_in_year = datetime.date(year, 12, 31).timetuple().tm_yday
    return (day_of_year - 1) / float(days_in_year)

# Get one coordinate of a geohash: the callers in this notebook use
# which=0 for the latitude and which=1 for the longitude.
def decodegeo(geo, which):
    """Return element `which` of the decoded geohash `geo`.

    Geohashes shorter than 6 characters are not decoded; 0 is returned for
    them instead.
    """
    if len(geo) < 6:
        return 0
    return geohash.decode(geo)[which]

# Join the weather data to the taxi data, add some more time and date features and
# get the latitude and longitude of the center of the geohash where the records were bucketed in.
def further_data_prep(df):
    """Return a new frame with weather, seasonal and location features added to `df`.

    df -- taxi data with at least the columns 'year', 'month', 'day' and 'geohash'.

    Steps:
      * left-join the module-level `weather` frame on year/month/day;
      * year_num: fraction of the year (see get_yearday), plus its sine/cosine;
      * month_num: (day - 1) / 30, plus its sine/cosine;
      * latitude / longitude decoded from the geohash (see decodegeo).

    The date fraction and the geohash decoding only depend on the date and on
    the geohash respectively, so each is evaluated once per distinct value and
    mapped back onto the rows instead of being called once per row.
    """
    df = pd.merge(df, weather, how='left', on=['year','month','day'])

    # Encode each date as the integer YYYYMMDD so it can serve as a lookup key
    date_code = (df['year'] * 10000 + df['month'] * 100 + df['day']).astype('int64')
    year_num_by_code = {}
    for code in date_code.unique():
        code = int(code)
        year_num_by_code[code] = get_yearday({'year': code // 10000,
                                              'month': code // 100 % 100,
                                              'day': code % 100})
    df['year_num'] = date_code.map(year_num_by_code)

    # Note: day 31 gives month_num = 1.0, i.e. the same phase as day 1 in the
    # sine/cosine features below.
    df['month_num'] = (df['day']-1)/30.
    df['year_sin'] = (df['year_num'] * 2 * math.pi).apply(math.sin)
    df['year_cos'] = (df['year_num'] * 2 * math.pi).apply(math.cos)
    df['month_sin'] = (df['month_num'] * 2 * math.pi).apply(math.sin)
    df['month_cos'] = (df['month_num'] * 2 * math.pi).apply(math.cos)

    # Decode every distinct geohash once, then look the coordinates up per row
    distinct_geohashes = df['geohash'].unique()
    latitude_by_geohash = dict((geo, decodegeo(geo, 0)) for geo in distinct_geohashes)
    longitude_by_geohash = dict((geo, decodegeo(geo, 1)) for geo in distinct_geohashes)
    df['latitude'] = df['geohash'].map(latitude_by_geohash)
    df['longitude'] = df['geohash'].map(longitude_by_geohash)
    return df

Prepare the data.

In [7]:
%%time
# Add the weather, seasonal and location features to both data sets.
# Each frame is replaced by its prepared version, so this cell is meant to be
# run once per fresh load: running it again would join the weather columns a
# second time.
dftrain = further_data_prep(dftrain)
dfvalid = further_data_prep(dfvalid)

Wall time: 9min 16s


In [8]:
# Check results: the validation frame should now also contain the weather,
# seasonal (year_*/month_*) and latitude/longitude columns
dfvalid.head()

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,year_num,month_num,year_sin,year_cos,month_sin,month_cos,latitude,longitude
0,2015,4,10,14:30,0.614583,-0.75184,-0.659346,Friday,0.659226,-0.539926,-0.841713,0,dr5ru68,12,4.3,0,0,13.3,3.9,7.56,0.271233,0.3,0.991114,-0.133015,0.951057,-0.309017,40.751724,-73.992233
1,2015,3,7,19:00,0.802083,0.321439,-0.94693,Saturday,0.828869,0.475515,-0.879708,1,dr5rs0z,3,0.0,460,0,3.3,-7.7,8.28,0.178082,0.2,0.899631,0.436651,0.951057,0.309017,40.698166,-73.993607
2,2015,3,27,16:00,0.677083,-0.442289,-0.896873,Friday,0.668155,-0.491881,-0.870662,0,dr5rum1,4,6.9,0,0,7.8,3.9,6.84,0.232877,0.866667,0.994218,0.107381,-0.743145,0.669131,40.765457,-73.99086
3,2015,3,23,12:30,0.53125,-0.980785,-0.19509,Monday,0.075893,0.888446,0.458982,0,dr72mdp,1,0.0,0,0,3.3,-4.9,10.8,0.221918,0.733333,0.984474,0.175531,-0.994522,-0.104528,40.836868,-73.927689
4,2015,3,10,18:00,0.760417,0.065403,-0.997859,Tuesday,0.251488,-0.00935,0.999956,0,dr5ru0g,3,11.7,280,0,11.7,3.9,5.76,0.186301,0.3,0.920971,0.38963,0.951057,-0.309017,40.742111,-73.9991


Extract the features that will actually be used in the machine learning algorithm and define the target. The target in this case will be the logarithm of the number of pickups. Why the logarithm? Because if we would use the number of pickups as is, mistakes in absolute numbers would be treated equally among high traffic locations and low traffic locations. An error of 100 pickups in a location with usually about 1000 pickups would be treated equally as an error of 100 pickups in a location with usually about 10 pickups. That is not what we want. We also want some accuracy in the low traffic locations. Taking the logarithm allows us to do that. With a logarithmic target, errors are optimized in an order of magnitude scale, rather than an absolute scale. So the result will be that predictions are correct to such and such order of magnitude.

In [9]:
# Split off the features.
# The order of this list is the column order the model is trained on.
# Available in the prepared frames but not used as features:
# 'month_num', 'weekend', 'precipitation', 'snow_depth' and 'snowfall'.
Xnames = [
    'month', 'day', 'latitude', 'longitude',
    'year_num', 'year_cos', 'year_sin',
    'month_cos', 'month_sin',
    'day_num', 'day_cos', 'day_sin',
    'time_num', 'time_cos', 'time_sin',
    'max_temp', 'min_temp', 'avg_wind',
]
Xtrain, Xvalid = dftrain[Xnames], dfvalid[Xnames]

# Split off the target: log10(pickups + 1), the +1 keeps the logarithm defined for 0 pickups
ytrain, yvalid = np.log10(dftrain['pickups'] + 1), np.log10(dfvalid['pickups'] + 1)

### Subsample the training set

In [10]:
print Xtrain.shape
max_samples = 2000000
if Xtrain.shape[0] > max_samples:
    rows = random.sample(Xtrain.index, max_samples)
    Xtrain = Xtrain.ix[rows]
    ytrain = ytrain.ix[rows]
print Xtrain.shape
Xtrain.head()

(7238532, 18)
(2000000, 18)


Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,max_temp,min_temp,avg_wind
1627377,10,30,40.751724,-73.942795,0.827397,0.467359,-0.884068,0.978148,-0.207912,0.498512,-0.999956,0.00935,0.489583,-0.997859,0.065403,15.0,8.3,5.76
3147822,5,26,40.762711,-74.000473,0.39726,-0.798779,0.601624,0.5,-0.866025,0.936012,0.920261,-0.391305,0.552083,-0.94693,-0.321439,18.9,8.9,13.68
5528461,8,14,40.736618,-73.983994,0.616438,-0.744104,-0.668064,-0.913545,0.406737,0.299107,-0.303677,0.952775,0.09375,0.83147,0.55557,23.3,16.1,10.44
951888,5,4,40.731125,-73.926315,0.336986,-0.519744,0.854322,0.809017,0.587785,0.715774,-0.213396,-0.976966,0.010417,0.997859,0.065403,20.6,8.9,9.36
6931874,6,1,40.747604,-73.973007,0.413699,-0.856551,0.516062,1.0,0.0,0.953869,0.958287,-0.285808,0.677083,-0.442289,-0.896873,25.0,12.8,5.76


### Random Forest Regression

The best algorithm that we tried on this problem was a Random Forest regressor. Using deep trees, a random forest is able to really get at the detailed structure of the longitude and latitude features to really use specific regions of NYC to make its predictions. We're evaluating the random forest on root-mean-squared-error (RMSE) of the prediction (in log-space of number of pickups).

In [11]:
# Create a Random Forest Regression estimator
# NOTE(review): in the cells shown in this notebook this 20-tree estimator is
# never fitted; `reg` is re-created by the training cell further down.
reg = RandomForestRegressor(n_estimators=20, n_jobs=-1)

When we did our first tests with this dataset, we used the `cv_optimize` function (defined near the top of this notebook) to try out several configurations of parameters using cross validation to get a feeling for the influence on accuracy (model quality) and performance (runtime). We concluded a few things from this:

1. It quickly became apparent that more than about 2 million records lead to very long runtimes.
2. Trees deeper than 20 were not adding much quality.
3. The more trees we created, the more accurate the model, but also the longer the runtime.
4. Using all features ("auto") was better than using only sqrt(n) features or log2(n) features

For this version of the notebook, we went with 50 trees of depth 20, using a sample of 2,000,000 data points.

The code below trains the model one tree at a time, up to 50 trees. Note that the cell sets `max_depth=30`, whereas the text above mentions a depth of 20. After each tree is added, R² and the RMSE on the training and validation sets are calculated to show the progress we're making in model quality.

In [12]:
reg = RandomForestRegressor(n_estimators=1, max_depth=30, n_jobs=-1, warm_start=True)
for n in range(1,51):
    reg.set_params(n_estimators=n)
    reg.fit(Xtrain,ytrain)
    training_accuracy = reg.score(Xtrain, ytrain)
    valid_accuracy = reg.score(Xvalid, yvalid)
    rmsetrain = np.sqrt(mean_squared_error(reg.predict(Xtrain),ytrain))
    rmsevalid = np.sqrt(mean_squared_error(reg.predict(Xvalid),yvalid))
    print "N = %d, R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (n,training_accuracy, valid_accuracy, rmsetrain, rmsevalid)

N = 1, R^2 (train) = 0.788, R^2 (valid) = 0.459, RMSE (train) = 0.158, RMSE (valid) = 0.243
N = 2, R^2 (train) = 0.870, R^2 (valid) = 0.597, RMSE (train) = 0.124, RMSE (valid) = 0.210
N = 3, R^2 (train) = 0.897, R^2 (valid) = 0.641, RMSE (train) = 0.110, RMSE (valid) = 0.198
N = 4, R^2 (train) = 0.911, R^2 (valid) = 0.664, RMSE (train) = 0.102, RMSE (valid) = 0.192
N = 5, R^2 (train) = 0.919, R^2 (valid) = 0.678, RMSE (train) = 0.097, RMSE (valid) = 0.188
N = 6, R^2 (train) = 0.925, R^2 (valid) = 0.687, RMSE (train) = 0.094, RMSE (valid) = 0.185
N = 7, R^2 (train) = 0.929, R^2 (valid) = 0.693, RMSE (train) = 0.092, RMSE (valid) = 0.184
N = 8, R^2 (train) = 0.931, R^2 (valid) = 0.698, RMSE (train) = 0.090, RMSE (valid) = 0.182
N = 9, R^2 (train) = 0.933, R^2 (valid) = 0.702, RMSE (train) = 0.089, RMSE (valid) = 0.181
N = 10, R^2 (train) = 0.935, R^2 (valid) = 0.705, RMSE (train) = 0.087, RMSE (valid) = 0.180
N = 11, R^2 (train) = 0.936, R^2 (valid) = 0.707, RMSE (train) = 0.087, RMSE (v

### Evaluate the results

Sanity check on some records.

In [13]:
# Show some of the predictions vs. the real number of pickups.
# Both are converted back from log10(pickups + 1) to pickup counts (10**x - 1)
# and rounded to integers. Column 0 is the prediction, column 1 the real number.
pd.DataFrame(np.round(np.power(10,np.column_stack((reg.predict(Xvalid),yvalid))) - 1,decimals=0).astype(int)).head(20)

Unnamed: 0,0,1
0,7,12
1,2,3
2,5,4
3,1,1
4,2,3
5,1,1
6,1,1
7,1,1
8,4,3
9,9,15


In [14]:
# Calculate the Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(reg.predict(Xvalid),yvalid))
print "RMSE = %0.3f (this is in log-space!)" % rmse
print "So two thirds of the records would be a factor of less than %0.2f away from the real value." % np.power(10,rmse)

RMSE = 0.173 (this is in log-space!)
So two thirds of the records would be a factor of less than 1.49 away from the real value.


In [15]:
# What are the most important features?
import operator

# Pair every feature name with the importance the fitted forest reports for it
dict_feat_imp = {
    feature: importance
    for feature, importance in zip(Xtrain.columns.values, reg.feature_importances_)
}

# List of (feature, importance) pairs, most important first
by_importance = operator.itemgetter(1)
sorted_features = sorted(dict_feat_imp.items(), key=by_importance, reverse=True)
sorted_features

[('longitude', 0.34072788876115723),
 ('latitude', 0.31287612120249852),
 ('time_num', 0.064314704030115605),
 ('time_cos', 0.026450957375376904),
 ('day_sin', 0.026326791200598051),
 ('day_num', 0.02421424852325817),
 ('avg_wind', 0.02320447792407548),
 ('year_sin', 0.022884790117684389),
 ('year_cos', 0.020631876770959508),
 ('time_sin', 0.020272077215941903),
 ('min_temp', 0.018447569373706708),
 ('max_temp', 0.017951183437801083),
 ('day_cos', 0.017214112320699134),
 ('year_num', 0.01711362151617072),
 ('day', 0.015338314285112615),
 ('month_cos', 0.014479578898743883),
 ('month_sin', 0.014306560224670604),
 ('month', 0.003245126821429412)]

Latitude and longitude are obviously the most important factors influencing the number of pickups. After that come time of the day and day of the week (in several incarnations). The average wind speed is the most important weather factor, corresponding nicely to a [2014 Taxicab Fact book](http://www.nyc.gov/html/tlc/downloads/pdf/2014_taxicab_fact_book.pdf) from TLC stating that a blizzard, a hurricane and a superstorm were the most important weather events influencing taxi rides. After wind speed comes the first seasonal variable.

### Create predictions for visualization

Now we are going to generate predictions that we can visualize in Tableau. We do this by generating all possible combinations of time and location so that we have a well filled space of predictions. Then we generate predictions for all these combinations and then export to .csv.

#### Load the test set

In [16]:
# The test file is read with the same column names as the training file
dftest=pd.read_csv("./data/taxi_data_test.csv", header=None, names = names)
print dftest.shape

(6306640, 14)


Prepare the data in the same way as the training and validation sets.

In [17]:
%%time
# Add the weather, seasonal and location features to the test set.
# dftest is replaced by its prepared version, so run this once per fresh load.
dftest = further_data_prep(dftest)

Wall time: 6min 33s


In [18]:
# Check that the test frame has the same columns as the prepared validation frame
dftest.head()

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,year_num,month_num,year_sin,year_cos,month_sin,month_cos,latitude,longitude
0,2015,6,10,07:30,0.322917,-0.442289,0.896873,Wednesday,0.331845,-0.491881,0.870662,0,dr5rtjx,1,0,0,0,27.8,18.3,5.04,0.438356,0.3,0.377708,-0.925925,0.951057,-0.309017,40.724258,-73.949661
1,2015,5,3,20:00,0.84375,0.55557,-0.83147,Sunday,0.977679,0.990181,-0.13979,1,dr5ru3e,5,0,0,0,26.7,10.6,3.96,0.334247,0.066667,0.863142,-0.504961,0.406737,0.913545,40.746231,-73.988113
2,2015,6,10,04:30,0.197917,0.321439,0.94693,Wednesday,0.313988,-0.391305,0.920261,0,dr5ru96,3,0,0,0,27.8,18.3,5.04,0.438356,0.3,0.377708,-0.925925,0.951057,-0.309017,40.744858,-73.9785
3,2015,6,12,00:00,0.010417,0.997859,0.065403,Friday,0.572917,-0.896873,-0.442289,0,dr5rsmc,8,0,0,0,31.1,22.8,7.2,0.443836,0.366667,0.345612,-0.938377,0.743145,-0.669131,40.725632,-73.99086
4,2015,5,10,14:30,0.614583,-0.75184,-0.659346,Sunday,0.94494,0.940754,-0.33909,1,dr5rsr8,7,0,0,0,28.3,16.1,5.76,0.353425,0.3,0.796183,-0.605056,0.951057,-0.309017,40.735245,-73.992233


### Create an artificial dataset
This dataset will contain all possible combinations of time and location to predict for on May 1st, 2015, the day we want to visualize.

In [19]:
# Get all the different times with its corresponding weather data on May 1st, 2015

# Get the May 1, 2015 data
time_data = dftest[(dftest['month'] == 5) & (dftest['day'] == 1)]

# Drop the features we don't need and drop duplicates to get unique values.
time_data = time_data.drop(['geohash','pickups','year','time_cat','day_cat','latitude','longitude'], axis=1).drop_duplicates()
print time_data.shape
time_data.head()

(48, 21)


Unnamed: 0,month,day,time_num,time_cos,time_sin,day_num,day_cos,day_sin,weekend,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,year_num,month_num,year_sin,year_cos,month_sin,month_cos
65,5,1,0.96875,0.980785,-0.19509,0.709821,-0.249776,-0.968304,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
86,5,1,0.40625,-0.83147,0.55557,0.629464,-0.686997,-0.72666,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
206,5,1,0.28125,-0.19509,0.980785,0.611607,-0.764037,-0.645172,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
252,5,1,0.427083,-0.896873,0.442289,0.63244,-0.673289,-0.739379,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
324,5,1,0.510417,-0.997859,-0.065403,0.644345,-0.616153,-0.787627,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1


In [20]:
# Get all the different locations

# First figure out which columns to drop
time_cols = list(Xtrain.columns.values)
time_cols.remove('latitude')
time_cols.remove('longitude')

# Then get the unique locations
loc_data = Xtrain.drop(time_cols, axis=1).drop_duplicates()

# To reduce memory consumption in Tableau, we are only predicting for
# the region closely around Manhattan and the La Guardia and JFK airports
loc_data = loc_data[(loc_data['latitude'] > 40.5) & (loc_data['latitude'] < 41.1) &
                    (loc_data['longitude'] > -74.1) & (loc_data['longitude'] < -73.6)]

print loc_data.shape
loc_data.head()

(22542, 2)


Unnamed: 0,latitude,longitude
1627377,40.751724,-73.942795
3147822,40.762711,-74.000473
5528461,40.736618,-73.983994
951888,40.731125,-73.926315
6931874,40.747604,-73.973007


In [21]:
# Dummy column to be able to join the times with the locations
time_data['key'] = 1
loc_data['key'] = 1

# Merge the time_data and location_data
result = pd.merge(time_data, loc_data, on='key').drop(['key'], axis=1)
#result = result[Xtrain.columns.values]
print result.shape[0]
result = result[Xnames]
result.head()

1082016


Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,max_temp,min_temp,avg_wind
0,5,1,40.751724,-73.942795,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28
1,5,1,40.762711,-74.000473,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28
2,5,1,40.736618,-73.983994,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28
3,5,1,40.731125,-73.926315,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28
4,5,1,40.747604,-73.973007,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28


#### Then we do the prediction

In [22]:
# Get the real number of pickups on May 1st, 2015 and take care that we can merge it with the predictions.
# The year is part of the filter so that only the day we visualize is used.
yy = dftest[(dftest['year'] == 2015) & (dftest['month'] == 5) & (dftest['day'] == 1)]
yy = yy[['month','day','day_num','latitude','longitude','pickups']]

# Do predictions and convert the logarithm to the normal numbers.
# The prediction uses exactly the model's feature columns and the result is
# rebuilt from them, so running this cell a second time does not feed the
# pred_pickups / pickups columns back into the model.
features = result[Xnames]
result = features.assign(pred_pickups=np.power(10,reg.predict(features)) - 1)

# Merge the predictions and the real pickups. Combinations without a recorded
# pickup count keep NaN in 'pickups' (left join).
# NOTE(review): day_num serves as the time-slot key here and the merge relies on
# exact equality of float columns - confirm this matches rows as intended.
result = pd.merge(result, yy, how='left', on=['month','day','day_num','latitude','longitude'])
result.head(10)

Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,max_temp,min_temp,avg_wind,pred_pickups,pickups
0,5,1,40.751724,-73.942795,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.396741,
1,5,1,40.762711,-74.000473,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.64203,8.0
2,5,1,40.736618,-73.983994,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,7.021172,8.0
3,5,1,40.731125,-73.926315,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.368385,
4,5,1,40.747604,-73.973007,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,11.825699,16.0
5,5,1,40.727005,-73.99086,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,13.534925,10.0
6,5,1,40.725632,-73.996353,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,14.008216,12.0
7,5,1,40.677567,-73.973007,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,5.666272,14.0
8,5,1,40.720139,-74.008713,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,15.291712,15.0
9,5,1,40.718765,-73.953781,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.369543,1.0


In [23]:
# The left join must not change the number of rows of the prediction grid
print result.shape[0]
result.head()

1082016


Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,max_temp,min_temp,avg_wind,pred_pickups,pickups
0,5,1,40.751724,-73.942795,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.396741,
1,5,1,40.762711,-74.000473,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.64203,8.0
2,5,1,40.736618,-73.983994,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,7.021172,8.0
3,5,1,40.731125,-73.926315,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,1.368385,
4,5,1,40.747604,-73.973007,0.328767,-0.474951,0.880012,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,17.2,9.4,8.28,11.825699,16.0


In [25]:
# Drop unnecessary columns to reduce memory consumption in Tableau.
# Location, time of day, prediction and real pickups are what remains.
unneeded_columns = [
    'month','day','time_cos','day_num','time_sin','day_cos','day_sin',
    'max_temp','min_temp','avg_wind',
    'year_num','year_sin','year_cos','month_sin','month_cos',
]
result = result.drop(unneeded_columns, axis=1)
result.head()

Unnamed: 0,latitude,longitude,time_num,pred_pickups,pickups
0,40.751724,-73.942795,0.96875,1.396741,
1,40.762711,-74.000473,0.96875,1.64203,8.0
2,40.736618,-73.983994,0.96875,7.021172,8.0
3,40.731125,-73.926315,0.96875,1.368385,
4,40.747604,-73.973007,0.96875,11.825699,16.0


In [26]:
# Keep only records with at least 1.5 predicted pickups or at least 1.5 real pickups;
# a record is excluded only when it fails both conditions.
# Rows without a real pickup count (NaN) are kept only if the prediction qualifies.
# This is to reduce the size of the resulting dataset in Tableau
result = result[(result['pred_pickups'] >= 1.5) | (result['pickups'] >= 1.5)]
print result.shape
result.head()

(135269, 5)


Unnamed: 0,latitude,longitude,time_num,pred_pickups,pickups
1,40.762711,-74.000473,0.96875,1.64203,8
2,40.736618,-73.983994,0.96875,7.021172,8
4,40.747604,-73.973007,0.96875,11.825699,16
5,40.727005,-73.99086,0.96875,13.534925,10
6,40.725632,-73.996353,0.96875,14.008216,12


In [27]:
# Write to csv (with the default settings the DataFrame index is written as the first column)
result.to_csv('./data/predictions-may-1-weather.csv')

In [28]:
# Sanity check: summary statistics of the exported columns
result.describe()

Unnamed: 0,latitude,longitude,time_num,pred_pickups,pickups
count,135269.0,135269.0,135269.0,135269.0,79610.0
mean,40.751806,-73.969821,0.506728,4.191233,6.162844
std,0.045336,0.046021,0.30419,4.872338,7.399975
min,40.500412,-74.09935,0.010417,1.0,1.0
25%,40.725632,-73.99498,0.239583,1.650444,2.0
50%,40.753098,-73.977127,0.510417,2.1771,4.0
75%,40.776443,-73.955154,0.78125,4.682568,8.0
max,41.099167,-73.620071,0.989583,122.513052,175.0


### The result

In [29]:
%%html
<script type='text/javascript' src='https://public.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 804px; height: 569px;'><noscript><a href='#'><img alt='Pickups in NYC on May 1, 2015Play around with the slider below to see pickup densities at different times of the day ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;KD&#47;KD45Y8HBN&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' width='804' height='569' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='path' value='shared&#47;KD45Y8HBN' /> <param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;KD&#47;KD45Y8HBN&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='showVizHome' value='no' /><param name='showTabs' value='y' /><param name='bootstrapWhenNotified' value='true' /></object></div>