In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Uber April-2014 CSV; the path is resolved relative to the working directory.
csv_path = r'../../../data/uber-raw-data-apr14.csv'
apr = pd.read_csv(csv_path)

In [3]:
# Peek at the first five raw rows to see which columns were loaded.
apr.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [4]:
# Check the inferred column types; Date/Time showing up as `object` means the
# timestamps are still plain strings at this point.
apr.dtypes

Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object

In [5]:
from datetime import datetime

In [6]:
# Look at one raw timestamp string to work out its format.
apr['Date/Time'].iat[0]

'4/1/2014 0:11:00'

In [7]:
# Parse the first raw timestamp string to confirm that the
# month/day/year 24-hour format string matches the data.
timestamp_format = "%m/%d/%Y %H:%M:%S"
first_timestamp = apr['Date/Time'].iloc[0]
datetime_obj = datetime.strptime(first_timestamp, timestamp_format)

# A datetime exposes .hour directly, so no intermediate time() object is needed.
hours = datetime_obj.hour
print(hours)


0


In [8]:
# Derive the pickup hour (0-23) for every row.
# The column is parsed in one vectorised pass instead of calling
# datetime.strptime once per row from a Python-level lambda; the explicit
# format keeps parsing strict (month/day/year, 24-hour clock).
# The int64 cast keeps the dtype the row-wise apply used to produce,
# regardless of which integer width .dt.hour returns.
apr['Hour'] = (
    pd.to_datetime(apr['Date/Time'], format="%m/%d/%Y %H:%M:%S")
    .dt.hour
    .astype('int64')
)

In [9]:
# Check the new Hour column against the last five timestamps.
apr.tail(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base,Hour
564511,4/30/2014 23:22:00,40.764,-73.9744,B02764,23
564512,4/30/2014 23:26:00,40.7629,-73.9672,B02764,23
564513,4/30/2014 23:31:00,40.7443,-73.9889,B02764,23
564514,4/30/2014 23:32:00,40.6756,-73.9405,B02764,23
564515,4/30/2014 23:48:00,40.688,-73.9608,B02764,23


In [10]:
# Check the new Hour column against the first five timestamps as well.
apr.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base,Hour
0,4/1/2014 0:11:00,40.769,-73.9549,B02512,0
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512,0
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512,0
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512,0
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512,0


In [11]:
# Remove the columns that are not used from here on.
# Both are dropped in a single call, and errors='ignore' makes the cell safe to
# re-run: the original pair of in-place drops raised KeyError on a second
# execution because the columns were already gone.
apr = apr.drop(columns=['Date/Time', 'Base'], errors='ignore')

In [12]:
# Confirm that only Lat, Lon and Hour remain.
apr.head(n=5)

Unnamed: 0,Lat,Lon,Hour
0,40.769,-73.9549,0
1,40.7267,-74.0345,0
2,40.7316,-73.9873,0
3,40.7588,-73.9776,0
4,40.7594,-73.9722,0


In [13]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

In [14]:
# model = KMeans()
# visualizer = KElbowVisualizer(estimator = model, k = (2,10))
# visualizer.fit(apr[['Lat', 'Lon']])
# visualizer.show()  # poof() is the deprecated older name for show()

In [15]:
# Cluster the pickup coordinates into 5 groups; the fixed seed keeps the
# centroid initialisation reproducible across runs.
# fit_predict fits once and returns the training-set labels (model.labels_),
# so the extra predict() pass over the same rows is unnecessary.
model = KMeans(n_clusters=5, random_state=42)
y_pred = model.fit_predict(apr[['Lat', 'Lon']])

In [16]:
# One K-Means cluster label per row of apr.
y_pred

array([3, 0, 0, ..., 3, 0, 0], dtype=int32)

In [17]:
# How many pickups fall into each K-Means cluster.
kmeans_labels, kmeans_counts = np.unique(y_pred, return_counts=True)
(kmeans_labels, kmeans_counts)

(array([0, 1, 2, 3, 4], dtype=int32),
 array([273868,  20088,  14829, 250867,   4864]))

In [18]:
# Fitted K-Means centres, one row per cluster, in the (Lat, Lon) column order
# the model was fitted on.
model.cluster_centers_

array([[ 40.71965771, -73.99233428],
       [ 40.79817756, -73.87217037],
       [ 40.65994576, -73.77673087],
       [ 40.76299415, -73.97576775],
       [ 40.70048892, -74.20152276]])

In [19]:
# Store each row's K-Means label; labels_ comes from fitting on
# apr[['Lat', 'Lon']], so it lines up with apr row for row.
apr['Kcluster'] = model.labels_

In [20]:
# Confirm the Kcluster column was added.
apr.head(n=5)

Unnamed: 0,Lat,Lon,Hour,Kcluster
0,40.769,-73.9549,0,3
1,40.7267,-74.0345,0,0
2,40.7316,-73.9873,0,0
3,40.7588,-73.9776,0,3
4,40.7594,-73.9722,0,3


In [21]:
# Take a reproducible (seeded) 2% random subset of apr for the steps that follow.
sample_fraction = 0.02
sample = apr.sample(frac=sample_fraction, random_state=42)

In [22]:
from sklearn.cluster import DBSCAN

In [51]:
# Density-based clustering of the sampled pickups.
# The features are unscaled Lat/Lon, so with DBSCAN's default Euclidean metric
# eps is a neighbourhood radius expressed in coordinate degrees.
# fit_predict already fits the estimator, so a separate fit() beforehand only
# repeats the same clustering work on the sample.
dbscan_cluster = DBSCAN(eps=0.015)
ypred = dbscan_cluster.fit_predict(sample[['Lat', 'Lon']])

In [52]:
# Attach each sampled row's DBSCAN label; ypred was predicted from
# sample[['Lat', 'Lon']], so it is positionally aligned with sample.
sample['DBcluster'] = ypred

In [53]:
# Compare the K-Means and DBSCAN labels side by side on a few sampled rows.
sample.head(n=5)

Unnamed: 0,Lat,Lon,Hour,Kcluster,DBcluster
77202,40.8021,-73.9654,10,3,0
558915,40.6462,-73.7769,4,2,1
152635,40.7747,-73.9603,9,3,0
361259,40.715,-74.0157,23,0,0
60087,40.7335,-74.008,19,0,0


In [54]:
# Size of every DBSCAN group, including the noise label -1.
dbscan_labels, dbscan_counts = np.unique(ypred, return_counts=True)
(dbscan_labels, dbscan_counts)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([  140, 10781,   227,     8,    78,     6,     7,     8,    10,
            4,     6,    12,     3]))

In [55]:
# Distinct DBSCAN labels only; -1 is the label DBSCAN assigns to noise points.
np.unique(ypred)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [56]:
# Mean longitude of the sampled points that DBSCAN labelled -1.
is_noise = sample['DBcluster'] == -1
sample.loc[is_noise, 'Lon'].mean()

-73.90173285714286

In [57]:
# Mean [Lat, Lon] of every DBSCAN cluster, ordered by cluster label.
# Label -1 is DBSCAN's noise bucket rather than a cluster, so it is excluded:
# its mean is not a meaningful centre and would otherwise be drawn on the map
# as if it were one.
# A single groupby computes all means in one pass instead of re-filtering the
# frame twice for every label.
clustered = sample.loc[sample['DBcluster'] != -1]
centroids = (
    clustered.groupby('DBcluster')[['Lat', 'Lon']]
    .mean()
    .to_numpy()
    .tolist()
)

In [58]:
import folium
from folium.plugins import HeatMap

In [59]:
# Base map centred on the mean pickup location of the sample.
# It is named uber_map rather than map so that the built-in map() is not
# shadowed for the rest of the kernel session.
uber_map = folium.Map(location=(sample['Lat'].mean(), sample['Lon'].mean()), zoom_start=11)

# Create the Heat Map
HeatMap(list(zip(sample['Lat'], sample['Lon']))).add_to(uber_map)

# Gray markers are the K-Means centres; pink markers are the DBSCAN cluster means.
for centres, colour in ((model.cluster_centers_, 'gray'), (centroids, 'pink')):
    for lat, lon in centres:
        folium.Marker(location=(lat, lon), icon=folium.Icon(icon="car", prefix='fa', color=colour)).add_to(uber_map)

# Last expression of the cell, so the notebook renders the interactive map.
uber_map