In [29]:
import os
import gmaps
import gmaps.datasets
import pandas as pd
import seaborn as sns

In [106]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides useful diagnostics such as pandas'
# SettingWithCopyWarning and library deprecation notices — consider narrowing
# the filter to the specific category that is actually noisy.
warnings.filterwarnings(action='ignore')

In [2]:
# %pylab populates the interactive namespace from numpy and matplotlib and
# enables inline figures.
# NOTE(review): later cells may rely on names injected by this magic rather than
# on explicit imports — confirm before replacing it with targeted imports.
%pylab inline

# See https://jupyter-gmaps.readthedocs.io/en/latest/install.html for installation.
# Set GOOGLE_API_KEY in your environment; the lookup below raises KeyError if it is missing.
gmaps.configure(api_key=os.environ["GOOGLE_API_KEY"])

Populating the interactive namespace from numpy and matplotlib


### Loading Data

In [3]:
# Columns that hold timestamps; they are parsed into datetimes while the CSV is read.
date_cols = [
    'pickup_datetime',
    'dropoff_datetime',
]
ride_data = pd.read_csv(
    'ride_data.csv',
    parse_dates=date_cols,
)

In [4]:
# Heatmap input: (latitude, longitude) pairs for a 10% sample of all pickups.
# The fixed random_state makes the sample, and therefore the rendered heatmap,
# identical on every run of the notebook.
pickup_locations = ride_data[['pickup_latitude', 'pickup_longitude']]
locations = pickup_locations.sample(frac=0.1, random_state=42).values

In [5]:
# Render the sampled pickup locations as a heatmap layer on a Google map.
pickup_heatmap = gmaps.heatmap_layer(locations)
fig = gmaps.figure()
fig.add_layer(pickup_heatmap)
fig

Figure(layout=FigureLayout(height='420px'))

In [6]:
ride_data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [88]:
# Coordinate column pairs (longitude first, then latitude) used as clustering features.
pickup_params = ['pickup_longitude', 'pickup_latitude']
dropoff_params = ['dropoff_longitude', 'dropoff_latitude']

carpool_columns = ['pickup_datetime'] + pickup_params + dropoff_params

# Work on a 20% sample of the rides. The fixed random_state keeps the sample,
# and every count and cluster derived from it, reproducible across runs.
carpool_data = ride_data[carpool_columns].sample(frac=0.2, random_state=42)

In [89]:
def round_datetime(tm, round_minutes = 30):
    """Round ``tm`` to the nearest ``round_minutes`` slot within its hour.

    Parameters
    ----------
    tm : datetime-like
        Object exposing ``minute``, ``second`` and ``microsecond`` that supports
        adding and subtracting ``datetime.timedelta``.
    round_minutes : int, default 30
        Width of a slot in minutes.

    Returns
    -------
    datetime-like
        A new value of the same kind as ``tm``; the argument is not modified.
        A remainder of exactly half a slot is rounded up.
    """
    # Imported locally so the helper does not depend on `datetime` having been
    # injected into the notebook namespace by another cell or magic.
    from datetime import timedelta

    # Portion of the timestamp that lies past the previous slot boundary.
    discard = timedelta(minutes=tm.minute % round_minutes,
                        seconds=tm.second,
                        microseconds=tm.microsecond)
    floored = tm - discard
    if discard >= timedelta(minutes=round_minutes / 2):
        return floored + timedelta(minutes=round_minutes)
    return floored

In [90]:
def datetime_to_numeric(tm):
    """Return the time of day of ``tm`` as whole minutes elapsed since midnight."""
    minutes_from_hours = 60 * tm.hour
    return minutes_from_hours + tm.minute

def datetime_to_time(tm):
    """Format the hour and minute of ``tm`` as ``'H:M'`` without zero padding.

    For example 20:00 becomes ``'20:0'`` and 07:30 becomes ``'7:30'``. Slot
    labels elsewhere in the notebook are compared against this unpadded form.
    """
    parts = (tm.hour, tm.minute)
    return ':'.join(str(part) for part in parts)

In [91]:
# Sanity check of the two helpers on a fixed timestamp: 00:19:21 rounds up to
# the 00:30 slot, which is then formatted without zero padding.
check = round_datetime(pd.Timestamp('2016-01-01 00:19:21'))
datetime_to_time(check)

'0:30'

In [251]:
# Label every ride with its rounded pickup slot (default 30-minute slots),
# formatted by datetime_to_time, e.g. '19:0' or '7:30'.
carpool_data['pickup_time_slot'] = carpool_data['pickup_datetime'].apply(
    lambda pickup_ts: datetime_to_time(round_datetime(pickup_ts))
)

In [252]:
carpool_data.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_time_slot
707526,2016-04-22 22:19:21,-73.965027,40.75536,-73.959709,40.76252,22:30
1134827,2016-06-03 19:46:05,-73.975189,40.755211,-73.991371,40.724072,20:0
1260665,2016-04-29 07:18:36,-73.985161,40.741344,-73.96965,40.756863,7:30
888990,2016-04-08 07:34:03,-73.97113,40.755318,-73.981773,40.752205,7:30
506589,2016-02-08 12:48:23,-73.95813,40.760269,-73.954018,40.77124,13:0


In [253]:
from collections import Counter
Counter(carpool_data.pickup_time_slot).most_common(10)

[('19:0', 9320),
 ('19:30', 9142),
 ('18:30', 9112),
 ('20:0', 8721),
 ('18:0', 8598),
 ('21:0', 8517),
 ('21:30', 8476),
 ('22:0', 8329),
 ('20:30', 8290),
 ('22:30', 8051)]

In [254]:
# Keep only the rides of the '19:0' slot (the most common one in the counts above).
# The explicit .copy() yields an independent frame, so the cluster columns added
# in later cells are written to it directly instead of to a slice of
# carpool_data, which pandas flags with SettingWithCopyWarning.
subset_slot_data = carpool_data[carpool_data.pickup_time_slot == '19:0'].copy()

In [255]:
subset_slot_data.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_time_slot
1128815,2016-02-19 19:13:17,-73.985123,40.756039,-73.995125,40.749931,19:0
559038,2016-05-24 19:09:20,-73.994041,40.741379,-74.000038,40.734802,19:0
470217,2016-02-07 19:03:58,-73.959328,40.77438,-73.967018,40.766949,19:0
348960,2016-03-23 19:09:44,-73.985847,40.740917,-73.983757,40.758827,19:0
1378570,2016-04-05 18:53:12,-73.872627,40.774303,-73.982544,40.755436,19:0


In [256]:
from sklearn.cluster import DBSCAN

In [257]:
# Coordinate frames for clustering: one (longitude, latitude) row per ride.
X_pickup = subset_slot_data.loc[:, pickup_params]
X_dropoff = subset_slot_data.loc[:, dropoff_params]

In [258]:
# Distances are expressed in degrees, using roughly 111 km per degree.
# NOTE(review): that figure holds for latitude; a degree of longitude is shorter
# away from the equator, so the same eps covers less ground east-west — confirm
# the approximation is acceptable.
# NOTE(review): the value used is 0.2 km (200 m), although this cell originally
# described 300 m as the maximum walking distance — confirm which is intended.
KM_PER_DEGREE = 111
MAX_WALK_KM = 0.2

max_pickup_distance = MAX_WALK_KM * 1 / KM_PER_DEGREE
# Drop-offs may be five times farther apart than pickups.
max_dropoff_distance = 5 * max_pickup_distance

# Both estimators share min_samples=2 and the L1 metric; only eps differs.
dbscan_options = dict(min_samples=2, metric='l1')
pickup_dbscan = DBSCAN(eps=max_pickup_distance, **dbscan_options)
dropoff_dbscan = DBSCAN(eps=max_dropoff_distance, **dbscan_options)

In [259]:
# Cluster the pickup coordinates and store one label per ride
# (DBSCAN uses -1 for samples it treats as noise).
pickup_labels = pickup_dbscan.fit(X_pickup).labels_
subset_slot_data['pickup_cluster'] = pickup_labels

In [260]:
# Order the rows by pickup cluster. The later positional assignment of
# `sub_clusters.values` relies on the frame being in group order.
subset_slot_data = subset_slot_data.sort_values('pickup_cluster')

In [261]:
def get_dropoff_cluster(group):
    """Cluster the drop-off coordinates of one group of rides.

    Refits the shared ``dropoff_dbscan`` estimator on the ``dropoff_params``
    columns of ``group`` and returns its label array, one label per row of
    ``group`` in row order.
    """
    dropoff_coords = group[dropoff_params]
    fitted = dropoff_dbscan.fit(dropoff_coords)
    return fitted.labels_
    

In [294]:
# Within every pickup cluster, cluster the rides again by drop-off location.
# The result is a Series indexed by (pickup_cluster, position within the group).
sub_clusters = (
    subset_slot_data
    .groupby('pickup_cluster')
    .apply(lambda rides: pd.Series(get_dropoff_cluster(rides)))
)

In [290]:
sub_clusters.head()

pickup_cluster   
-1              0   -1
                1   -1
                2   -1
                3    0
                4    1
dtype: int64

In [295]:
sub_clusters.values[:10]

array([-1, -1, -1,  0,  1, -1,  2, -1,  3,  4])

In [288]:
# `sub_clusters` is ordered group by group, so copying its raw values is only
# correct when the frame's rows are in that same order. Check the group key of
# every value against the frame first, rather than silently attaching drop-off
# labels to the wrong rides.
group_keys = sub_clusters.index.get_level_values('pickup_cluster').values
frame_keys = subset_slot_data['pickup_cluster'].values
if len(group_keys) != len(frame_keys) or (group_keys != frame_keys).any():
    raise ValueError('sub_clusters is not aligned with subset_slot_data; '
                     'sort subset_slot_data by pickup_cluster and recompute sub_clusters')
subset_slot_data['subset_cluster'] = sub_clusters.values

In [279]:
subset_slot_data.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_time_slot,pickup_cluster,subset_cluster
6142,2016-06-17 19:01:34,-74.00071,40.661373,-74.00071,40.661373,19:0,-1,-1
1173604,2016-02-20 19:12:33,-73.979591,40.722977,-73.898109,40.862305,19:0,-1,-1
1281813,2016-05-03 19:09:30,-73.978996,40.691578,-73.968246,40.695072,19:0,-1,-1
39233,2016-01-14 19:14:53,-73.969276,40.689598,-73.954712,40.745831,19:0,-1,0
201219,2016-03-26 19:10:13,-73.973381,40.738701,-73.996765,40.731731,19:0,-1,1


In [281]:
Counter(zip(subset_slot_data.pickup_cluster, subset_slot_data.subset_cluster)).most_common(10)

[((0, 0), 7937),
 ((0, 2), 138),
 ((0, 6), 108),
 ((0, 3), 63),
 ((0, -1), 53),
 ((-1, -1), 47),
 ((1, 3), 47),
 ((0, 4), 45),
 ((1, -1), 34),
 ((2, 1), 33)]

In [250]:
subset_slot_data.shape

(9320, 8)

In [None]:
# Show the drop-off estimator's configuration. This replaces an unfinished
# `dropoff_dbscan.` attribute access, which could not be parsed.
dropoff_dbscan.get_params()

In [None]:

sub_clusters.head()

In [153]:
# 'pickup_cluster' repeats many times on both sides, so this join pairs every
# frame row with every sub_clusters row sharing its cluster id — hence the
# roughly 71 million rows shown below instead of 9320.
# NOTE(review): looks like an abandoned attempt to attach the sub-cluster
# labels — confirm it can be removed.
pd.merge(subset_slot_data, sub_clusters, on='pickup_cluster', ).shape

(71342356, 9)

In [154]:
subset_slot_data.shape

(9320, 8)

In [104]:
# NOTE(review): `cluster_fit` is not defined in any cell visible here; presumably
# left over from a deleted cell — this fails on a fresh kernel. Confirm and remove or redefine.
len(cluster_fit.labels_)

9345

In [83]:
# Ten most frequent labels of `cluster_fit`.
# NOTE(review): `cluster_fit` is not defined in any cell visible here, so this
# cell depends on stale kernel state — confirm and remove or redefine.
Counter(cluster_fit.labels_).most_common(10)

[(0, 3939),
 (-1, 440),
 (10, 307),
 (17, 280),
 (24, 236),
 (2, 144),
 (8, 125),
 (23, 107),
 (57, 107),
 (20, 102)]

In [84]:
# NOTE(review): repeats an earlier length check on `cluster_fit`, a name not
# defined in any cell visible here — candidate for removal.
len(cluster_fit.labels_)

9345