In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "colab"

In [5]:
data = pd.read_csv("uber-raw-data-jun14.csv")
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [6]:
data_sample = data.sample(30000)

In [7]:
# Basic stats
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()


print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

Number of rows : 30000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [8]:
data_sample['Date/Time']= pd.to_datetime(data_sample['Date/Time'])
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [9]:
# split Date column into 2 parts (day and day of week) and drop Date column
data_sample['DayOfWeek'] =data_sample['Date/Time'].dt.dayofweek
data_sample['Day'] =data_sample['Date/Time'].dt.day
data_sample['Hour'] =data_sample['Date/Time'].dt.hour
data_sample.drop(['Date/Time', 'Base'], axis=1, inplace = True)

In [10]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
63922,40.7789,-73.9823,3,5,5
256117,40.72,-74.0056,5,28,1
68226,40.7604,-73.972,3,5,14
408336,40.7272,-74.0074,0,23,18
570569,40.7691,-73.9633,1,17,15


In [11]:
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()

Number of rows : 30000
Number of columns : 5



In [12]:
## map without clustering
fig = px.scatter_mapbox(
        data_sample,
        lat="Lat",
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()

In [13]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
63922,40.7789,-73.9823,3,5,5
256117,40.72,-74.0056,5,28,1
68226,40.7604,-73.972,3,5,14
408336,40.7272,-74.0074,0,23,18
570569,40.7691,-73.9633,1,17,15


In [14]:
data_sample.dtypes

Lat          float64
Lon          float64
DayOfWeek      int64
Day            int64
Hour           int64
dtype: object

In [15]:
#creation  d'enssemble de donnée pour chaque jours de la semaine
data_sample_0 = data_sample.loc[data_sample['DayOfWeek'] == 0]
data_sample_1 = data_sample.loc[data_sample['DayOfWeek'] == 1]
data_sample_2 = data_sample.loc[data_sample['DayOfWeek'] == 2]
data_sample_3 = data_sample.loc[data_sample['DayOfWeek'] == 3]
data_sample_4 = data_sample.loc[data_sample['DayOfWeek'] == 4]
data_sample_5 = data_sample.loc[data_sample['DayOfWeek'] == 5]
data_sample_6 = data_sample.loc[data_sample['DayOfWeek'] == 6]

In [16]:
data_sample_0.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
408336,40.7272,-74.0074,0,23,18
465149,40.6416,-73.7889,0,2,5
98728,40.776,-73.9614,0,9,6
275089,40.7631,-73.9799,0,30,21
105866,40.7473,-73.9934,0,9,21


In [17]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample_0.head())
X = preprocessor.fit_transform(data_sample_0) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
408336  40.7272 -74.0074          0   23    18
465149  40.6416 -73.7889          0    2     5
98728   40.7760 -73.9614          0    9     6
275089  40.7631 -73.9799          0   30    21
105866  40.7473 -73.9934          0    9    21
...Terminé.
[[-0.3527518  -0.61636586  0.          0.80527617]
 [-2.52742324  3.05555324  0.         -1.50143455]
 [ 0.88701416  0.15666974  0.         -1.32399526]
 [ 0.55928914 -0.15422501  0.          1.33759402]
 [ 0.15788951 -0.38109415  0.          1.33759402]]



### DBSCAN algorithm

In [18]:

db = DBSCAN(eps=0.3, min_samples=10, metric="manhattan")

db.fit(X)

### Find out how many clusters DBSCAN created

In [19]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [20]:
data_sample_0["cluster_0"] = db.labels_
data_sample_0.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,cluster_0
408336,40.7272,-74.0074,0,23,18,0
465149,40.6416,-73.7889,0,2,5,4
98728,40.776,-73.9614,0,9,6,0
275089,40.7631,-73.9799,0,30,21,0
105866,40.7473,-73.9934,0,9,21,0


In [21]:
data_sample_0['cluster_0'].value_counts()

 0     3165
-1      771
 3      148
 1      100
 2       48
 4       27
 5       15
 6       15
 7       15
 8       12
 9        9
 10       7
Name: cluster_0, dtype: int64

In [22]:
fig = px.scatter_mapbox(
        data_sample_0[data_sample_0.cluster_0 != -1],
        lat="Lat",
        lon="Lon",
        color="cluster_0",
        mapbox_style="carto-positron"
)

fig.show()

In [37]:

px.scatter_mapbox(
    data_sample_0.loc[data_sample_0.cluster_0 != -1],
    lat="Lat",
    lon="Lon",
    animation_frame="Hour",
    mapbox_style="carto-positron"
)

In [24]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample_1.head())
X = preprocessor.fit_transform(data_sample_1) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
570569  40.7691 -73.9633          1   17    15
284845  40.7549 -73.9750          1    3    12
45614   40.7260 -73.9471          1    3     3
327393  40.7817 -73.9833          1   10    22
108975  40.7189 -73.9580          1   10    11
...Terminé.
[[ 0.74668405  0.25187624  0.          0.13352849]
 [ 0.34496743  0.01308567  0.         -0.41974714]
 [-0.47261076  0.58250935  0.         -2.07957402]
 [ 1.10313683 -0.15631278  0.          1.42450496]
 [-0.67346907  0.36004633  0.         -0.60417235]]



In [25]:
db.fit(X)

In [26]:
data_sample_1["cluster_1"] = db.labels_
fig = px.scatter_mapbox(
        data_sample_1[data_sample_1.cluster_1 != -1],
        lat="Lat",
        lon="Lon",
        color="cluster_1",
        mapbox_style="carto-positron"
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
fig = px.scatter_mapbox(
        data_sample_1[data_sample_1.cluster_1 != -1],
        lat="Lat",
        lon="Lon",
        animation_frame="Hour",
        mapbox_style="carto-positron"
)

fig.show()

### KMEAN algorithm

In [28]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Day", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
63922   40.7789 -73.9823          3    5     5
256117  40.7200 -74.0056          5   28     1
68226   40.7604 -73.9720          3    5    14
408336  40.7272 -74.0074          0   23    18
570569  40.7691 -73.9633          1   17    15
...Terminé.
[[ 0.99429222 -0.15486836  0.04155967 -1.18239427 -1.54040522]
 [-0.51217356 -0.56404556  1.08011865  1.61573053 -2.21656277]
 [ 0.52112386  0.02601255  0.04155967 -1.18239427 -0.01905074]
 [-0.32802154 -0.59565581 -1.51627881  1.00744253  0.65710681]
 [ 0.74364087  0.17879545 -0.99699931  0.27749693  0.14998865]]



Utiliser la methode elbow,l'objectif est de trouver le point sur le graphique de l'inertie où le changement dans l'inertie (WCSS) ralentit, formant un "coude", c'est la qu'on aura le nombre optimal de cluster

In [29]:
wcss =  []
for i in range (2,11):
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

print(wcss)





















[127188.31135167554, 110295.78784894384, 96002.69624755396, 85957.07473324417, 78822.97180122329, 73013.04373262267, 68229.75039660052, 64435.68710946382, 60536.72767042367]


In [30]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [31]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11):
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)





















[0.17222248078221686, 0.18806128970894698, 0.1934178969692455, 0.190824812608652, 0.193602922913776, 0.19676519836235637, 0.19514972017971893, 0.19795723347870292, 0.20238167408939586]


In [32]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [33]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)





In [34]:
data_sample.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,Cluster_KMeans
63922,40.7789,-73.9823,3,5,5,0
256117,40.72,-74.0056,5,28,1,0
68226,40.7604,-73.972,3,5,14,1
408336,40.7272,-74.0074,0,23,18,3
570569,40.7691,-73.9633,1,17,15,3


In [35]:
fig = px.scatter_mapbox(
        data_sample,
        lat="Lat",
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron"
)

fig.show()

In [36]:
fig = px.scatter_mapbox(
        data_sample,
        lat="Lat",
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()