In [1]:
# Import usuals librairies
import pandas as pd 
import numpy as np 
import matplotlib as plt

In [2]:
# Import dataset
dataset = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/uber-tlc-foil-response/master/uber-trip-data/uber-raw-data-apr14.csv")

In [3]:
# Filter only on lat and lon
X = dataset.iloc[:, 1:3]
X.head()

Unnamed: 0,Lat,Lon
0,40.769,-73.9549
1,40.7267,-74.0345
2,40.7316,-73.9873
3,40.7588,-73.9776
4,40.7594,-73.9722


In [4]:
# Minibatch KMeans works as classical Kmeans but more faster to converge
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(4)
kmeans.fit(X)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=4, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [5]:
# Create a sample of data to not have too many elements on the map
X = X.sample(1000)

# Predict clusters on sample data
X.loc[:,'cluster'] = kmeans.predict(X)
X.head()

Unnamed: 0,Lat,Lon,cluster
26610,40.7647,-73.9784,0
506342,40.6951,-73.9714,2
91762,40.7455,-74.0332,2
443128,40.757,-73.9668,0
188616,40.7804,-73.9779,0


In [6]:
import plotly.express as px

fig = px.scatter_mapbox(X, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
fig.show()

In [7]:
# Create a new column to specify the weekday
dataset.iloc[:,0]= pd.to_datetime(dataset.iloc[:,0])
dataset["weekday"] = dataset.iloc[:,0].dt.dayofweek
dataset.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,weekday
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,1
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,1
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,1
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,1
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,1


In [8]:
for d in dataset["weekday"].unique():

    X = dataset.loc[dataset["weekday"]==d,["Lat","Lon"]]
    kmeans = MiniBatchKMeans(4)
    kmeans.fit(X)

    # Create a sample of data to not have too many elements on the map
    X = X.sample(1000)

    # Predict clusters on sample data
    X.loc[:,'cluster'] = kmeans.predict(X)
    X.head()

    fig = px.scatter_mapbox(X, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
    fig.show()

## DBscan

In [10]:
# Use DBSCAN to compare with KMeans
from sklearn.cluster import DBSCAN


for i in np.unique(dataset["weekday"]):

    X = dataset.loc[dataset["weekday"]==d,["Lat","Lon"]]
    
    X = X.sample(1000)

    # We use DBSCAN on a sample of data to avoid having to wait too long for the algorithm to converge.
    # We take an eps = 0.015 to have a reasonable number of clusters
    dbscan = DBSCAN(eps=0.015, metric = "manhattan")
    X.loc[:,'cluster'] = dbscan.fit_predict(X)

    fig = px.scatter_mapbox(X, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
    fig.show()
