### Preamble

In [1]:
# Configure libraries

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime

In [2]:
# Funtion for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func, verbose=verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_, gs.scorer_
    print "Best score: ", gs.best_score_
    best = gs.best_estimator_
    return best

###  Reading the data

For a Geohash precision of 6 (geohash length of 6 characters), we have roughly 300,000 records. For a precision of 7, we have about 4 million records.

In [3]:
# Each line is of the format:
# ((time_cat, time_num, time_cos, time_sin, day_cat, day_num, day_cos, day_sin, weekend, geohash), number of pickups)
names = ["year","month","day","time_cat", "time_num", "time_cos", "time_sin", "day_cat", "day_num", "day_cos", "day_sin", "weekend", "geohash", "pickups"]
dftrain=pd.read_csv("./data/taxi_data_training.csv", header=None, names = names)
print dftrain.shape

(7238532, 14)


In [4]:
dfvalid=pd.read_csv("./data/taxi_data_validation.csv", header=None, names = names)
print dfvalid.shape

(1569528, 14)


In [5]:
# Get the weather data

weather=pd.read_csv("./data/nyc-weather-data.csv")
weather.ix[weather.SNWD <= -9999, 'SNWD'] = 0
weather.ix[weather.SNOW <= -9999, 'SNOW'] = 0
weather.ix[weather.AWND <= -9999, 'AWND'] = 0
weather['year'] = (weather['DATE']/10000).apply(math.floor)
weather['month'] = ((weather['DATE'].mod(10000))/100).apply(math.floor)
weather['day'] = weather['DATE'].mod(100)
weather = weather[['year','month','day','PRCP','SNWD','SNOW','TMAX','TMIN','AWND']]
weather['PRCP'] = weather['PRCP'] / 10.
weather['TMAX'] = weather['TMAX'] / 10.
weather['TMIN'] = weather['TMIN'] / 10.
weather['AWND'] = weather['AWND'] / 10. * 3.6
weather.columns = ['year','month','day','precipitation','snow_depth','snowfall','max_temp','min_temp','avg_wind']
weather.head()

Unnamed: 0,year,month,day,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind
0,2009,1,1,0,0,0,-3.3,-9.4,18.0
1,2009,1,2,0,0,0,1.1,-5.0,10.08
2,2009,1,3,0,0,0,3.3,-1.7,16.2
3,2009,1,4,0,0,0,5.6,-3.9,12.24
4,2009,1,5,0,0,0,6.1,3.3,11.16


In [6]:
def get_yearday(df):
    date = datetime.date(df['year'],df['month'],df['day'])
    return (date.timetuple().tm_yday-1)/365.

# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    if len(geo) >= 6:
        geodecoded = geohash.decode(geo)
        return geodecoded[which]
    else:
        return 0

def further_data_prep(df):
    df = pd.merge(df, weather, how='left', on=['year','month','day'])
    df['year_num'] = df.apply(lambda x:get_yearday(x),axis=1)
    df['month_num'] = (df['day']-1)/30.
    df['year_sin'] = (df['year_num'] * 2 * math.pi).apply(math.sin)
    df['year_cos'] = (df['year_num'] * 2 * math.pi).apply(math.cos)
    df['month_sin'] = (df['month_num'] * 2 * math.pi).apply(math.sin)
    df['month_cos'] = (df['month_num'] * 2 * math.pi).apply(math.cos)
    df['latitude'] = df['geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['longitude'] = df['geohash'].apply(lambda geo: decodegeo(geo, 1))
    return df

In [7]:
%%time
dftrain = further_data_prep(dftrain)
dfvalid = further_data_prep(dfvalid)

Wall time: 7min 50s


In [8]:
dfvalid.head()

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,year_num,month_num,year_sin,year_cos,month_sin,month_cos,latitude,longitude
0,2015,4,10,14:30,0.614583,-0.75184,-0.659346,Friday,0.659226,-0.539926,-0.841713,0,dr5ru68,12,4.3,0,0,13.3,3.9,7.56,0.271233,0.3,0.991114,-0.133015,0.951057,-0.309017,40.751724,-73.992233
1,2015,3,7,19:00,0.802083,0.321439,-0.94693,Saturday,0.828869,0.475515,-0.879708,1,dr5rs0z,3,0.0,460,0,3.3,-7.7,8.28,0.178082,0.2,0.899631,0.436651,0.951057,0.309017,40.698166,-73.993607
2,2015,3,27,16:00,0.677083,-0.442289,-0.896873,Friday,0.668155,-0.491881,-0.870662,0,dr5rum1,4,6.9,0,0,7.8,3.9,6.84,0.232877,0.866667,0.994218,0.107381,-0.743145,0.669131,40.765457,-73.99086
3,2015,3,23,12:30,0.53125,-0.980785,-0.19509,Monday,0.075893,0.888446,0.458982,0,dr72mdp,1,0.0,0,0,3.3,-4.9,10.8,0.221918,0.733333,0.984474,0.175531,-0.994522,-0.104528,40.836868,-73.927689
4,2015,3,10,18:00,0.760417,0.065403,-0.997859,Tuesday,0.251488,-0.00935,0.999956,0,dr5ru0g,3,11.7,280,0,11.7,3.9,5.76,0.186301,0.3,0.920971,0.38963,0.951057,-0.309017,40.742111,-73.9991


### Final preperation for machine learning

I tried initially doing one-hot-encoding on the geohashes, but I quickly realized that this was not feasible from a memory perspective. 3 million records times 100,000 features would not fit in memory. So I decided to go for the numerical latitude and longitude route. Using a random forest, we can easily detect higher order structures in these two variables.

In [48]:
# Split off the features
# Xnames = ['month','day','latitude','longitude','year_num','year_cos','year_sin',
#           'month_num','month_cos','month_sin','day_num','day_cos','day_sin',
#           'time_num','time_cos','time_sin','weekend','precipitation',
#           'snow_depth','snowfall','max_temp','min_temp','avg_wind']
Xnames = ['month','day','latitude','longitude','year_num','year_cos','year_sin',
          'month_cos','month_sin','day_num','day_cos','day_sin',
          'time_num','time_cos','time_sin','max_temp','min_temp','avg_wind']
Xtrain = dftrain[Xnames]
Xvalid = dfvalid[Xnames]

# Split off the target (which will be the logarithm of the number of pickups (+1))
ytrain = np.log10(dftrain['pickups']+1)
yvalid = np.log10(dfvalid['pickups']+1)

In [49]:
# Create indicator variables for the hours and days of the week and drop the categorical values
# g = 5
# X = X.join(pd.get_dummies(X['time_cat']))\
#      .join(pd.get_dummies(X['day_cat']))\
#      .drop(['time_cat','day_cat','geohash'], axis=1)
#Xtrain = Xtrain.drop(['time_cat','day_cat','geohash'], axis=1)
#Xvalid = Xvalid.drop(['time_cat','day_cat','geohash'], axis=1)
#     .join(pd.get_dummies(X['geohash'].str[:g]))\

### Get the training and test sets

In [50]:
print Xtrain.shape
max_samples = 2000000
if Xtrain.shape[0] > max_samples:
    rows = random.sample(Xtrain.index, max_samples)
    Xtrain = Xtrain.ix[rows]
    ytrain = ytrain.ix[rows]
print Xtrain.shape
Xtrain.head()

(7238532, 18)
(2000000, 18)


Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,max_temp,min_temp,avg_wind
6891361,1,24,40.735245,-73.993607,0.063014,0.92264,0.385663,0.104528,-0.994522,0.635417,-0.659346,-0.75184,0.447917,-0.94693,0.321439,-6.6,-12.1,12.96
2362643,12,8,40.714645,-73.933182,0.934247,0.915864,-0.401488,0.104528,0.994522,0.873512,0.700465,-0.713687,0.114583,0.75184,0.659346,0.6,-1.6,10.08
2861163,1,22,40.780563,-73.977127,0.057534,0.935368,0.353676,-0.309017,-0.951057,0.400298,-0.810115,0.586271,0.802083,0.321439,-0.94693,-8.2,-14.9,14.04
5835395,6,13,40.743484,-73.9991,0.446575,-0.944188,0.329408,-0.809017,0.587785,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,26.1,18.3,5.76
5557607,10,23,40.709152,-74.014206,0.808219,0.357698,-0.933837,-0.104528,-0.994522,0.388393,-0.764037,0.645172,0.71875,-0.19509,-0.980785,12.8,7.2,7.2


In [None]:
# Create a Random Forest Regression estimator
reg = RandomForestRegressor(n_estimators=20, n_jobs=-1)

### Random Forest Regression

The step below takes about 25 mins on my laptop for 300,000 records and up to 100 estimators.

In [None]:
reg = RandomForestRegressor(n_estimators=1, max_depth=30, n_jobs=-1, warm_start=True)
for n in range(1,20):
    reg.set_params(n_estimators=n)
    reg.fit(Xtrain,ytrain)
    training_accuracy = reg.score(Xtrain, ytrain)
    valid_accuracy = reg.score(Xvalid, yvalid)
    rmsetrain = np.sqrt(mean_squared_error(reg.predict(Xtrain),ytrain))
    rmsevalid = np.sqrt(mean_squared_error(reg.predict(Xvalid),yvalid))
    print "N = %d, R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (n,training_accuracy, valid_accuracy, rmsetrain, rmsevalid)

In [None]:
reg = RandomForestRegressor(n_estimators=100, max_depth=50, n_jobs=-1, verbose=4)
reg.fit(Xtrain,ytrain)
training_accuracy = reg.score(Xtrain, ytrain)
valid_accuracy = reg.score(Xvalid, yvalid)
rmsetrain = np.sqrt(mean_squared_error(reg.predict(Xtrain),ytrain))
rmsevalid = np.sqrt(mean_squared_error(reg.predict(Xvalid),yvalid))
print "R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (training_accuracy, valid_accuracy, rmsetrain, rmsevalid)

In [None]:
%%time
# Define a grid of parameters over which to optimize the random forest
# We will figure out which number of trees is optimal
parameters = {"n_estimators": [100],
              "max_features": ["auto"], # ["auto","sqrt","log2"]
              "max_depth": [20]}
best = cv_optimize(reg, parameters, Xtrain, ytrain, n_folds=5, score_func='mean_squared_error', verbose=3)

### Evaluate the results

In [None]:
# Fit the best Random Forest and calculate R^2 values for training and test sets
reg=reg.fit(Xtrain, ytrain)
training_accuracy = reg.score(Xtrain, ytrain)
valid_accuracy = reg.score(Xvalid, yvalid)
print "############# based on standard predict ################"
print "R^2 on training data:   %0.4f" % (training_accuracy)
print "R^2 on validation data: %0.4f" % (valid_accuracy)

Sanity check on some records.

In [None]:
# Show some of the predictions vs. the real number of pickups
# predictions vs. real number of pickups
pd.DataFrame(np.round(np.power(10,np.column_stack((reg.predict(Xvalid),yvalid))) - 1,decimals=0).astype(int)).head(20)

In [None]:
# Calculate the Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(reg.predict(Xvalid),yvalid))
print "RMSE = %0.3f (this is in log-space!)" % rmse
print "So two thirds of the records would be a factor of less than %0.2f away from the real value." % np.power(10,rmse)

In [None]:
# What are the most important features?
import operator
dict_feat_imp = dict(zip(list(Xtrain.columns.values),reg.feature_importances_))

sorted_features = sorted(dict_feat_imp.items(), key=operator.itemgetter(1), reverse=True)
sorted_features

### Create predictions for visualization

Now we are going to generate predictions that we can visualize in Tableau. We do this by generating all possible combinations of time and location so that we have a well filled space of predictions. Then we generate predictions for all these combinations and then export to .csv.

#### First we need a dataframe with all possible combinations of time and location

In [17]:
dftest=pd.read_csv("./data/taxi_data_test.csv", header=None, names = names)
print dftest.shape

(6306640, 14)


In [18]:
%%time
dftest = further_data_prep(dftest)

Wall time: 6min 31s


In [19]:
dftest.head()

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,geohash,pickups,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,year_num,month_num,year_sin,year_cos,month_sin,month_cos,latitude,longitude
0,2015,6,10,07:30,0.322917,-0.442289,0.896873,Wednesday,0.331845,-0.491881,0.870662,0,dr5rtjx,1,0,0,0,27.8,18.3,5.04,0.438356,0.3,0.377708,-0.925925,0.951057,-0.309017,40.724258,-73.949661
1,2015,5,3,20:00,0.84375,0.55557,-0.83147,Sunday,0.977679,0.990181,-0.13979,1,dr5ru3e,5,0,0,0,26.7,10.6,3.96,0.334247,0.066667,0.863142,-0.504961,0.406737,0.913545,40.746231,-73.988113
2,2015,6,10,04:30,0.197917,0.321439,0.94693,Wednesday,0.313988,-0.391305,0.920261,0,dr5ru96,3,0,0,0,27.8,18.3,5.04,0.438356,0.3,0.377708,-0.925925,0.951057,-0.309017,40.744858,-73.9785
3,2015,6,12,00:00,0.010417,0.997859,0.065403,Friday,0.572917,-0.896873,-0.442289,0,dr5rsmc,8,0,0,0,31.1,22.8,7.2,0.443836,0.366667,0.345612,-0.938377,0.743145,-0.669131,40.725632,-73.99086
4,2015,5,10,14:30,0.614583,-0.75184,-0.659346,Sunday,0.94494,0.940754,-0.33909,1,dr5rsr8,7,0,0,0,28.3,16.1,5.76,0.353425,0.3,0.796183,-0.605056,0.951057,-0.309017,40.735245,-73.992233


In [20]:
time_data = dftest[(dftest['month'] == 5) & (dftest['day'] == 1)]
time_data = time_data.drop(['geohash','pickups','year','time_cat','day_cat','latitude','longitude'], axis=1).drop_duplicates()
print time_data.shape
time_data.head()

(48, 21)


Unnamed: 0,month,day,time_num,time_cos,time_sin,day_num,day_cos,day_sin,weekend,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,year_num,month_num,year_sin,year_cos,month_sin,month_cos
65,5,1,0.96875,0.980785,-0.19509,0.709821,-0.249776,-0.968304,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
86,5,1,0.40625,-0.83147,0.55557,0.629464,-0.686997,-0.72666,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
206,5,1,0.28125,-0.19509,0.980785,0.611607,-0.764037,-0.645172,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
252,5,1,0.427083,-0.896873,0.442289,0.63244,-0.673289,-0.739379,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1
324,5,1,0.510417,-0.997859,-0.065403,0.644345,-0.616153,-0.787627,0,0,0,0,17.2,9.4,8.28,0.328767,0,0.880012,-0.474951,0,1


In [21]:
# Construct dataframes with all possible times (time_data) and all possible locations (loc_data)

# Columns about time
time_cols = list(Xtrain.columns.values)
time_cols.remove('latitude')
time_cols.remove('longitude')
print time_cols

# Unique locations
loc_data = Xtrain.drop(time_cols, axis=1).drop_duplicates()

# To reduce memory consumption in Tableau, we are only predicting for
# the region closely around Manhattan and the La Guardia and JFK airports
loc_data = loc_data[(loc_data['latitude'] > 40.5) & (loc_data['latitude'] < 41.1) &
                    (loc_data['longitude'] > -74.1) & (loc_data['longitude'] < -73.6)]

print loc_data.shape
loc_data.head()

['month', 'day', 'year_num', 'year_cos', 'year_sin', 'month_num', 'month_cos', 'month_sin', 'day_num', 'day_cos', 'day_sin', 'time_num', 'time_cos', 'time_sin', 'weekend', 'precipitation', 'snow_depth', 'snowfall', 'max_temp', 'min_temp', 'avg_wind']
(22632, 2)


Unnamed: 0,latitude,longitude
1248191,40.761337,-73.985367
1498846,40.757217,-73.98262
2891013,40.788803,-73.9785
5516304,40.758591,-73.919449
3834075,40.725632,-74.001846


In [38]:
# Dummy column to be able to join them together
time_data['key'] = 1
loc_data['key'] = 1

# Merge the time_data and location_data
result = pd.merge(time_data, loc_data, on='key').drop(['key'], axis=1)
#result = result[Xtrain.columns.values]
print result.shape[0]
result = result[Xnames]
result.head()

1086336


Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_num,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,weekend,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind
0,5,1,40.761337,-73.985367,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28
1,5,1,40.757217,-73.98262,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28
2,5,1,40.788803,-73.9785,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28
3,5,1,40.758591,-73.919449,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28
4,5,1,40.725632,-74.001846,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28


#### Then we do the prediction

In [39]:
# Get the real number of pickups and take care that we can merge it with the predictions,
# by also taking the geohash and the timestamp
yy = dftest[['month','day','day_num','latitude','longitude','pickups']]
yy = yy[(yy['month'] == 5) & (yy['day'] == 1)]

# Do predictions and convert the logarithm to the normal numbers
result['pred_pickups'] = np.power(10,reg.predict(result)) - 1

# Merge the predictions and the real pickups
result = pd.merge(result, yy, how='left', on=['month','day','day_num','latitude','longitude'])
result.head(10)

Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_num,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,weekend,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,pred_pickups,pickups
0,5,1,40.761337,-73.985367,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,10.911229,4
1,5,1,40.757217,-73.98262,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,12.231386,22
2,5,1,40.788803,-73.9785,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,5.241564,3
3,5,1,40.758591,-73.919449,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,7.872202,9
4,5,1,40.725632,-74.001846,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,7.485188,3
5,5,1,40.709152,-74.012833,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,2.488607,3
6,5,1,40.801163,-73.967514,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,5.327569,9
7,5,1,40.706406,-74.015579,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,1.753238,2
8,5,1,40.768204,-73.993607,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,4.56922,12
9,5,1,40.681686,-73.964767,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,1.52584,1


In [40]:
result.shape[0]

1086336

In [41]:
result.head()

Unnamed: 0,month,day,latitude,longitude,year_num,year_cos,year_sin,month_num,month_cos,month_sin,day_num,day_cos,day_sin,time_num,time_cos,time_sin,weekend,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind,pred_pickups,pickups
0,5,1,40.761337,-73.985367,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,10.911229,4
1,5,1,40.757217,-73.98262,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,12.231386,22
2,5,1,40.788803,-73.9785,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,5.241564,3
3,5,1,40.758591,-73.919449,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,7.872202,9
4,5,1,40.725632,-74.001846,0.328767,-0.474951,0.880012,0,1,0,0.709821,-0.249776,-0.968304,0.96875,0.980785,-0.19509,0,0,0,0,17.2,9.4,8.28,7.485188,3


In [42]:
# Drop unnecessary columns to reduce memory consumption in Tableau
result = result.drop(['month','day','time_cos','day_num','time_sin','day_cos','day_sin','weekend'], axis=1)

In [43]:
result = result.drop(['precipitation','snow_depth','snowfall','max_temp','min_temp','avg_wind','year_num','month_num','year_sin','year_cos','month_sin','month_cos'], axis=1)

In [44]:
result.head()

Unnamed: 0,latitude,longitude,time_num,pred_pickups,pickups
0,40.761337,-73.985367,0.96875,10.911229,4
1,40.757217,-73.98262,0.96875,12.231386,22
2,40.788803,-73.9785,0.96875,5.241564,3
3,40.758591,-73.919449,0.96875,7.872202,9
4,40.725632,-74.001846,0.96875,7.485188,3


In [45]:
result = result[(result['pred_pickups'] >= 1.5) | (result['pickups'] >= 1.5)]
print result.shape
result.head()

(140359, 5)


Unnamed: 0,latitude,longitude,time_num,pred_pickups,pickups
0,40.761337,-73.985367,0.96875,10.911229,4
1,40.757217,-73.98262,0.96875,12.231386,22
2,40.788803,-73.9785,0.96875,5.241564,3
3,40.758591,-73.919449,0.96875,7.872202,9
4,40.725632,-74.001846,0.96875,7.485188,3


In [46]:
# Write to csv
result.to_csv('./data/predictions-may-1-weather.csv')

In [47]:
result.describe()

Unnamed: 0,latitude,longitude,time_num,pred_pickups,pickups
count,140359.0,140359.0,140359.0,140359.0,79825.0
mean,40.75128,-73.967294,0.509995,4.116464,6.149089
std,0.046597,0.047109,0.306355,4.921602,7.394756
min,40.512772,-74.09935,0.010417,1.0,1.0
25%,40.724258,-73.99498,0.260417,1.64497,2.0
50%,40.751724,-73.975754,0.510417,2.123946,4.0
75%,40.776443,-73.952408,0.802083,4.395694,8.0
max,41.089554,-73.603592,0.989583,122.63993,175.0
