In [1]:
import os
import gmaps
import gmaps.datasets
import pandas as pd
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
%pylab inline

# See the https://jupyter-gmaps.readthedocs.io/en/latest/install.html for installation
# set GOOGLE_API_KEY in your environment
gmaps.configure(api_key=os.environ["GOOGLE_API_KEY"])

Populating the interactive namespace from numpy and matplotlib


### Loading Data

In [4]:
date_cols = ['pickup_datetime', 'dropoff_datetime',]
ride_data = pd.read_csv('ride_data.csv', parse_dates=date_cols)

In [None]:
pickup_locations = ride_data[['pickup_latitude', 'pickup_longitude']]
locations = pickup_locations.sample(frac=0.1).values

In [None]:
fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(locations))
fig

In [5]:
ride_data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [6]:
pickup_params = ['pickup_longitude', 'pickup_latitude']
dropoff_params = ['dropoff_longitude', 'dropoff_latitude']

carpool_columns = ['pickup_datetime', 'passenger_count'] + pickup_params + dropoff_params

carpool_data = ride_data[carpool_columns].sample(frac=0.2, random_state=501)

In [7]:
def round_datetime(tm, round_minutes = 30):
    discard = datetime.timedelta(minutes=tm.minute % round_minutes,
                             seconds=tm.second,
                             microseconds=tm.microsecond)
    tm -= discard
    if discard >= datetime.timedelta(minutes=round_minutes/2):
        tm += datetime.timedelta(minutes=round_minutes)
    return tm

In [8]:
def datetime_to_numeric(tm):
    return(tm.hour * 60 + tm.minute)

def datetime_to_time(tm):
    return('{}:{}'.format(tm.hour, tm.minute))

In [9]:
carpool_data['pickup_time_slot'] = carpool_data.pickup_datetime.apply(lambda x: datetime_to_time(round_datetime(x)))

In [10]:
carpool_data.head()

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_time_slot
1415246,2016-05-12 14:26:46,1,-73.96608,40.77375,-73.950027,40.775951,14:30
682465,2016-05-18 19:16:34,1,-73.977455,40.750675,-73.992477,40.737156,19:30
1058558,2016-06-07 17:53:46,3,-73.782356,40.64452,-73.907188,40.72641,18:0
675162,2016-01-05 23:40:03,1,-73.998314,40.760845,-74.003487,40.740101,23:30
1325438,2016-06-12 08:30:08,1,-73.975189,40.741604,-73.971245,40.764248,8:30


In [11]:
from collections import Counter
Counter(carpool_data.pickup_time_slot).most_common(10)

[('19:0', 9210),
 ('19:30', 9191),
 ('18:30', 9068),
 ('20:0', 8698),
 ('18:0', 8542),
 ('22:0', 8513),
 ('20:30', 8457),
 ('21:30', 8444),
 ('21:0', 8437),
 ('22:30', 7966)]

In [12]:
subset_slot_data = carpool_data[(carpool_data.pickup_time_slot == '19:0')]

In [13]:
subset_slot_data.head()

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_time_slot
1393199,2016-05-30 18:50:21,1,-73.988747,40.748661,-73.981979,40.75808,19:0
1065839,2016-02-26 19:14:06,2,-74.000885,40.731819,-73.997604,40.721779,19:0
1041011,2016-05-22 19:06:06,2,-73.969025,40.761135,-73.987656,40.744198,19:0
159100,2016-02-02 19:03:52,1,-73.989578,40.739883,-73.9617,40.760109,19:0
520226,2016-03-31 18:58:00,1,-73.979691,40.761021,-73.961678,40.77676,19:0


In [14]:
from sklearn.cluster import DBSCAN

In [15]:
X_pickup = subset_slot_data[pickup_params]
dropoff_data = subset_slot_data[dropoff_params]

In [28]:
# 1 unit distance in lat long corresponds to 111km in metric units
# We keep 300 m as the maximum walking distance

max_pickup_distance = 0.1 * 1/ 111
max_dropoff_distance = 5 * max_pickup_distance


pickup_dbscan = DBSCAN(
    eps = max_pickup_distance,
    min_samples = 2,
    metric = 'l1'
)
dropoff_dbscan = DBSCAN(
    eps = max_dropoff_distance,
    min_samples = 2,
    metric = 'l1'
)

In [29]:
subset_slot_data['pickup_cluster'] = pickup_dbscan.fit_predict(X_pickup)

In [30]:
subset_slot_data['subset_cluster'] = -2

In [31]:
for i in set(subset_slot_data.pickup_cluster):
    indices = subset_slot_data.pickup_cluster == i
    X_dropoff = dropoff_data[indices]
    subset_slot_data.loc[indices, 'subset_cluster'] = dropoff_dbscan.fit_predict(X_dropoff)

In [35]:
subset_slot_data.head().reset_index(drop=True)

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_time_slot,pickup_cluster,subset_cluster
0,2016-05-30 18:50:21,1,-73.988747,40.748661,-73.981979,40.75808,19:0,0,0
1,2016-02-26 19:14:06,2,-74.000885,40.731819,-73.997604,40.721779,19:0,1,0
2,2016-05-22 19:06:06,2,-73.969025,40.761135,-73.987656,40.744198,19:0,0,0
3,2016-02-02 19:03:52,1,-73.989578,40.739883,-73.9617,40.760109,19:0,0,0
4,2016-03-31 18:58:00,1,-73.979691,40.761021,-73.961678,40.77676,19:0,0,0


In [33]:
Counter(zip(subset_slot_data.pickup_cluster, subset_slot_data.subset_cluster)).most_common(10)

[((0, 0), 4231),
 ((15, 0), 488),
 ((-1, -1), 103),
 ((0, -1), 70),
 ((16, -1), 54),
 ((-1, 4), 50),
 ((46, -1), 49),
 ((34, 1), 48),
 ((15, -1), 45),
 ((34, 3), 42)]