# Predict the best block for the driver to pick up based on the time of the day and day of the week.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Drop off useless data

In [2]:
data_2012_01=pd.read_csv("datasets/yellow_tripdata_2012-01.csv")
data_2012_01=data_2012_01.drop("Unnamed: 0",1)
data_2012_01=data_2012_01.drop("passenger_count",1)
data_2012_01=data_2012_01.drop("dropoff_datetime",1)
data_2012_01=data_2012_01.drop("trip_distance",1)
data_2012_01=data_2012_01.drop("dropoff_longitude",1)
data_2012_01=data_2012_01.drop("dropoff_latitude",1)
data_2012_01=data_2012_01.drop("total_amount",1)
data_2012_01.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,tip_amount
0,2012-01-10 23:55:50,-73.994693,40.725031,1.0
1,2012-01-11 19:18:25,-73.987955,40.752947,0.0
2,2012-01-11 19:19:19,-73.783093,40.64855,10.06
3,2012-01-11 19:19:21,-73.967515,40.758454,1.0
4,2012-01-11 14:38:15,-74.011315,40.711449,0.0


In [3]:
jan_time=pd.to_datetime(pd.Series(data_2012_01["pickup_datetime"]))
jan_time_list=[]
for index in jan_time:
    jan_time_list.append(index.dayofweek)
data_2012_01["weekday"]=jan_time_list
data_2012_01.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,tip_amount,weekday
0,2012-01-10 23:55:50,-73.994693,40.725031,1.0,1
1,2012-01-11 19:18:25,-73.987955,40.752947,0.0,2
2,2012-01-11 19:19:19,-73.783093,40.64855,10.06,2
3,2012-01-11 19:19:21,-73.967515,40.758454,1.0,2
4,2012-01-11 14:38:15,-74.011315,40.711449,0.0,2


In [4]:
jan_2012_mon=pd.DataFrame(data_2012_01)
jan_2012_mon=jan_2012_mon.loc[jan_2012_mon["weekday"]==0]
jan_2012_mon=jan_2012_mon.reset_index(drop=True)
print(jan_2012_mon.shape)

(2098157, 5)


# Narrow the data to New York City only

In [5]:
jan_2012_mon=jan_2012_mon[jan_2012_mon.pickup_longitude!=0]
jan_2012_mon=jan_2012_mon[jan_2012_mon.pickup_latitude!=0]
jan_2012_mon=jan_2012_mon[jan_2012_mon.pickup_latitude>40]
jan_2012_mon=jan_2012_mon[jan_2012_mon.pickup_latitude<41]
jan_2012_mon=jan_2012_mon[jan_2012_mon.pickup_longitude>-74.5]
jan_2012_mon=jan_2012_mon[jan_2012_mon.pickup_longitude<-72.5]
jan_2012_mon.shape

(2048831, 5)

# Assume our location is at the following longitude and latitude

In [6]:
jan_2012_mon.head(1)

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,tip_amount,weekday
0,2012-01-09 19:14:00,-73.993335,40.727717,0.0,0


# Then we search within 1 mile of our location
## *1 mile change is approximately 0.018 degree change for coordinates

In [7]:
in_range=jan_2012_mon.loc[jan_2012_mon["pickup_longitude"]>=-73.993335-0.018]
in_range=in_range.loc[in_range["pickup_longitude"]<=-73.993335+0.018]
in_range=in_range.loc[in_range["pickup_latitude"]<=40.727717+0.018]
in_range=in_range.loc[in_range["pickup_latitude"]>=40.727717-0.018]
in_range.shape

(560846, 5)

In [8]:
in_range_time_list=list(in_range["pickup_datetime"])
in_range_hour=[]
for e in in_range_time_list:
    in_range_hour.append(pd.to_datetime(e).hour)
in_range["time"]=in_range_hour
in_range.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,tip_amount,weekday,time
0,2012-01-09 19:14:00,-73.993335,40.727717,0.0,0,19
6,2012-01-09 17:12:42,-74.004338,40.724819,0.0,0,17
13,2012-01-09 22:00:58,-74.00484,40.730261,0.0,0,22
14,2012-01-09 22:00:40,-73.998459,40.729385,0.0,0,22
18,2012-01-09 22:03:06,-73.998232,40.726148,0.0,0,22


# Assume the current time is 7:30 PM
## *we look for data between 7 PM and 8 PM inclusively

In [9]:
in_time=in_range.loc[in_range["time"]<21]
in_time=in_time.loc[in_range["time"]>18]
in_time.shape

(73433, 6)

In [10]:
import matplotlib.path as mplPath

def indexZones(shapeFilename):
    import rtree
    import fiona.crs
    import geopandas as gpd
    index = rtree.Rtree()
    zones = gpd.read_file(shapeFilename).to_crs(fiona.crs.from_epsg(2263))
    for idx,geometry in enumerate(zones.geometry):
        index.insert(idx, geometry.bounds)
    return (index, zones)

def findBlock(p, index, zones):
    match = index.intersection((p.x, p.y, p.x, p.y))
    for idx in match:
        z = mplPath.Path(np.array(zones.geometry[idx].exterior))
        if z.contains_point(np.array(p)):
            return zones['OBJECTID'][idx]
    return -1

def mapToZone(parts):
    import pyproj
    import shapely.geometry as geom
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)    
    index, zones = indexZones("datasets/block-groups-polygons.geojson")
    i=0
    for line in parts:
        if (line["pickup_longitude"] and line["pickup_latitude"]):
            pickup_location  = geom.Point(proj(float(line["pickup_longitude"]), float(line["pickup_latitude"])))
            try:
                block_id_list.append(findBlock(pickup_location, index, zones))
            except AttributeError:
                drop_list.append(i)
        i+=1

drop_list=[]
block_id_list=[]

In [11]:
mapToZone(in_time.T.to_dict().values())

In [12]:
print((len(drop_list)+len(block_id_list))==in_time.shape[0])

True


In [13]:
in_time=in_time.drop(in_time.index[drop_list])
in_time["block_id"]=block_id_list
in_time=in_time.reset_index(drop=True)
in_time=in_time.loc[in_time["block_id"]!=-1]
in_time.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,tip_amount,weekday,time,block_id
0,2012-01-09 19:14:00,-73.993335,40.727717,0.0,0,19,9547
1,2012-01-09 20:28:05,-73.98503,40.724199,1.98,0,20,9037
2,2012-01-09 20:56:07,-74.000189,40.737766,1.58,0,20,10157
3,2012-01-09 20:56:12,-74.007974,40.734792,1.65,0,20,9595
4,2012-01-09 20:51:13,-73.989731,40.730629,2.78,0,20,9040


In [14]:
d1=pd.read_csv("datasets/2012-1-0-id-78.csv")
d2=pd.read_csv("datasets/2012-2-0-id-78.csv")
d3=pd.read_csv("datasets/2012-3-0-id-78.csv")
d4=pd.read_csv("datasets/2012-4-0-id-78.csv")
d5=pd.read_csv("datasets/2012-5-0-id-78.csv")
d6=pd.read_csv("datasets/2012-6-0-id-78.csv")
d7=pd.read_csv("datasets/2012-7-0-id-78.csv")
d8=pd.read_csv("datasets/2012-8-0-id-78.csv")
d9=pd.read_csv("datasets/2012-9-0-id-78.csv")
d10=pd.read_csv("datasets/2012-10-0-id-78.csv")
d11=pd.read_csv("datasets/2012-11-0-id-78.csv")
d12=pd.read_csv("datasets/2012-12-0-id-78.csv")

In [15]:
combine=[d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12]
result = pd.concat(combine)

In [42]:
from sklearn.cluster import KMeans
plon=result['pickup_longitude'].values
plat=result['pickup_latitude'].values
coodinates=np.array([[plon[i],plat[i]] for i in range(len(plon))])
kmeans_n = KMeans(n_clusters=333,  n_init=1)
kmeans_n.fit(coodinates)
labels = kmeans_n.labels_
result["label"]=labels

In [43]:
ls=result.groupby('label').size()
ls=np.array([[ls[i]] for i in range(len(ls))])
lc=kmeans_n.cluster_centers_
train_s=int(len(ls)*0.8)
test_s=int(len(ls)*0.2)
train_f=ls[:train_s]
train_r=lc[:train_s]
test_f=ls[test_s:]
test_r=lc[test_s:]

In [84]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline


def fit_model(X, y):
    model = Pipeline([('poly', PolynomialFeatures(degree=4)),
                ('linear', LinearRegression(fit_intercept=False))])
    model.fit(X, y)
    return model

def score_model(model, X, y, Xv, yv):
    return tuple([model.score(X, y), model.score(Xv, yv)])

def fit_model_and_score(data, response, validation, val_response):
    model = fit_model(data, response)
    return score_model(model, data, response, validation, val_response)

print (fit_model_and_score(train_f, train_r,
                           test_f, test_r))

(0.11851685396606287, 0.091989863056934115)


In [83]:
X=ls
y=lc

model=Pipeline([('poly', PolynomialFeatures(degree=4)),
                ('linear', LinearRegression(fit_intercept=False))])
model.fit(X, y)
X_pred=(max(ls))
y_pred=model.predict(X_predict)
y_pred



array([[-73.99417684,  40.73509588]])

In [85]:
original=[[-73.993335,40.727717]]
from pygeocoder import Geocoder
predicted_location = Geocoder.reverse_geocode(original[0][1], original[0][0])
# predicted_location.street_address might not be applicable
if predicted_location.street_address:
    predicted_address = predicted_location.street_address
else:
    predicted_address = '%s %s, %s' %(predicted_location.street_number, predicted_location.route, predicted_location.city)
print (predicted_address)

388 Lafayette Street, New York


In [86]:
from pygeocoder import Geocoder
predicted_location = Geocoder.reverse_geocode(y_pred[0][1], y_pred[0][0])
# predicted_location.street_address might not be applicable
if predicted_location.street_address:
    predicted_address = predicted_location.street_address
else:
    predicted_address = '%s %s, %s' %(predicted_location.street_number, predicted_location.route, predicted_location.city)
print (predicted_address)

59 5th Avenue, New York
