In [215]:
from velibconnector import VelibConnector

cmd = """
SELECT station, lat, lon,
    delta, bikes, capacity, name,
        dt from velib_all
        where dt >= poll_dt::date and dt>'2024-12-05'
"""
df = VelibConnector(cmd).to_pandas()
df_saved = df.copy()
print(f'{len(df)} rows chargés pour la période de {df.dt.min().date()} à {df.dt.max().date()}')

PostgreSQL connection closed.
2120515 rows chargés pour la période de 2024-12-05 à 2025-02-06


In [216]:
df = df_saved.copy()

In [217]:
# On a les données horodataires
df['datehour'] = df.dt.dt.floor('h')

In [218]:
print('Drop NaN values:', df.isna().sum().sum())
df.dropna()
full_duplicates = df.duplicated(['dt', 'bikes', 'capacity', 'station']).sum()
print(f"Doublons des valeurs realtime: {full_duplicates}, {round(full_duplicates/len(df)*100, 2)}%")
df.drop_duplicates(['dt', 'bikes', 'capacity', 'station'], inplace=True)
calc_duplicated = df.duplicated(['datehour', 'station']).sum()
print(f"Enregistrements répétées par heure/station: {calc_duplicated}, {round(calc_duplicated/len(df)*100, 2)}%")

Drop NaN values: 0
Doublons des valeurs realtime: 3542, 0.17%
Enregistrements répétées par heure/station: 23801, 1.12%


In [219]:
# Analyse et nettoyage doublons
# display(df[df.duplicated(['datehour', 'station'], keep=False)].sort_values('dt').head(20))
# Il s'agit des données consecutives. On peut les résumer
def first(x):
    return x.iloc[0]
def last(x):
    return x.iloc[-1]
duplicates = df[df.duplicated(['datehour', 'station'], keep=False)]
duplicates = duplicates.sort_values('dt').groupby(['datehour', 'station']).agg({
    'lat' : first,
    'lon' : first,
    'delta' : 'sum',
    'bikes' : last,
    'capacity' : last,
})
df = df.set_index(['datehour', 'station'])
df.update(duplicates)
df = df.reset_index()
df.drop_duplicates(['datehour', 'station'], inplace=True)

In [220]:
import pandas as pd
import plotly.express as px
import numpy as np
import math
px.box(df.station.value_counts(), 'count', labels={'count' : 'Stations'}, title='Distribution de nombre de stations et présence des outliers').show()
datehour_df = df.groupby('datehour').station.count().reset_index()
px.line(x=datehour_df.datehour, y=datehour_df['station'], labels={'datehour' : 'Date-heure', 'station' : 'Nb de stations par heure'}, title='Nombre de stations connues par heure').show()
px.box(datehour_df, 'station', labels={'station' : 'Nb de station par heure'}, title='Distribution de nombre de stations par heure').show()

In [221]:
from scipy.stats import zscore

clean_df = df.copy()


def detect_outliers(selection: pd.Series, k = 1.5, method='IQR'):
    """
    Returns a blacklist of indexes considered as outliers.
    Args:
        - method = 'IQR' (default) ou 'zscore'
        - k - multiply coef for IQR or zscore limit
    """
    if method == 'IQR':
        q = selection.quantile([0.25, 0.75]).to_list()
        IQR = (q[1] - q[0]) * k
        low_border = q[0] - IQR
        high_border = q[1] + IQR
    elif method=='zscale':
        selection_zscore = selection.copy()
        selection_zscore = zscore(selection)
        low_border = selection[selection_zscore < -k].max()
        high_border = selection[selection_zscore > k].min()
    elif method=='hard':
        low_border = k
        high_border = selection.max()
    else:
        raise SyntaxError('Wrong method.')
    print('Valeur min-max:', selection.min(), '-', selection.max())
    print('Seuils de outliers:', low_border, '-', high_border)
    high_outliers = selection[selection>high_border].index.to_list()
    low_outliers = selection[selection<low_border].index.to_list()
    print('Nombre de valeurs total:', len(selection))
    print(f'Grands outliers: {len(high_outliers)} ou {round(len(high_outliers)/len(selection)*100, 2)}%')
    print(f'Petits outliers: {len(low_outliers)} ou {round(len(low_outliers)/len(selection)*100, 2)}%')
    return high_outliers + low_outliers

print('Stations')
blacklist_stations = [] + detect_outliers(clean_df.station.value_counts(), k=3)
print('Hours')
blacklist_hours = [] + detect_outliers(clean_df.datehour.value_counts(), k=3)

# blacklist_hours = detect_outliers(clean_df.groupby('datehour').station.count(), k=1350, method='hard')


total_rows = len(clean_df)
drop_mask = clean_df.station.isin(blacklist_stations) | clean_df.datehour.isin(blacklist_hours)
dropped_rows = drop_mask.sum()
print(f'Need to drop {dropped_rows} rows or {round(dropped_rows/total_rows*100, 2)}% of total {total_rows} rows.')
# clean_df = clean_df[~drop_mask]

print("Comme c'est un peu trop faison la clusterisation pour niveler le probleme de petites stations.")


Stations
Valeur min-max: 19 - 1465
Seuils de outliers: 1417.0 - 1480.0
Nombre de valeurs total: 1460
Grands outliers: 0 ou 0.0%
Petits outliers: 93 ou 6.37%
Hours
Valeur min-max: 273 - 1453
Seuils de outliers: 1414.0 - 1470.0
Nombre de valeurs total: 1470
Grands outliers: 0 ou 0.0%
Petits outliers: 204 ou 13.88%
Need to drop 365131 rows or 17.44% of total 2093172 rows.
Comme c'est un peu trop faison la clusterisation pour niveler le probleme de petites stations.


In [222]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

stations = df[['station', 'lat', 'lon', 'name']].drop_duplicates()
stations['lat'] = stations.lat.apply(float)
stations['lon'] = stations.lon.apply(float)

min_lon = stations.lon.min()
min_lat = stations.lat.min()
def convert_lon(lon):
    lon -= min_lon
    return lon * 72987
def convert_lat(lat):
    lat -= min_lat
    return lat * 111000
stations['convlon'] = stations.lon.apply(convert_lon)
stations['convlat'] = stations.lat.apply(convert_lat)
# 73 le meilleur, plus grand = 142
kmeans = KMeans(n_clusters=142, random_state=0).fit(stations[['convlat', 'convlon']])
stations['labels'] = kmeans.labels_


fig = px.scatter_map(stations, 'lat', 'lon', color='labels', size_max=20, color_continuous_scale='Viridis', zoom=10)
fig.update_xaxes(scaleanchor='y', scaleratio=1)
fig.update_yaxes(scaleanchor='x', scaleratio=1)
fig.update_layout(
    width=800,
    height=600
)
fig.show()

In [223]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Calculer l'indice de silhouette pour différents nombres de clusters
silhouette_scores = []
for k in range(50, 251):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(stations[['convlat', 'convlon']])
    silhouette_scores.append(silhouette_score(stations[['convlat', 'convlon']], kmeans.labels_))

# Tracer la courbe de l'indice de silhouette
px.line(x=range(50, 251), y=silhouette_scores, labels={'x' : 'Nombre de clusters', 'y' : 'Indice de Silhouette'}, title='Indice Silhouette pour determiner le meilleur nombre de clusters pour KMeans').show()


In [224]:
## Calculer la taille max des clusters en metres

from scipy.spatial import distance_matrix

def max_distance_in_cluster(data):
    cluster_points = data[['convlat', 'convlon']]
    if len(cluster_points) > 1:
        dist_matrix = distance_matrix(cluster_points, cluster_points)
        return np.max(dist_matrix)
    else:
        return -1

# Calculate the maximum distance for each cluster
max_distances = []
for l in sorted(stations['labels'].unique()):
    max_distances.append(int(max_distance_in_cluster(stations[stations.labels == l])))

print(f'Distance mediane des clusters: {np.median(max_distances)} m.')
 

Distance mediane: 1434.5 m.


In [225]:
## Creation de zone de clusters pour Viz
from scipy.spatial import ConvexHull

def points_to_geo_json(data : list):
    result = {
        'type' : 'FeatureCollection',
        'features' : []
    }
    for d in data:
        label = d[0][3]
        feat = {
            'type' : 'Feature',
            'properties' : {
            },
            'id' : int(label),
            'geometry': {
                'type': 'Polygon',
                'coordinates': [
                    [[r[0], r[1]] for r in d] + [[d[0][0], d[0][1]]]
                                 ]
                },
        }
        result['features'].append(feat)
    return result

def get_border(df, label):
    if len(df) < 3:
        return [[df['lon'].iloc[0], df['lat'].iloc[0], df['station'].iloc[0], label]]
    points = df[['lon', 'lat']].to_numpy()
    hull = ConvexHull(points)
    return [[df['lon'].iloc[p], df['lat'].iloc[p], df['station'].iloc[p], label] for p in hull.vertices]


borders = []
for l in sorted(stations.labels.unique()):
    borders.append(get_border(stations[stations.labels==l], l))
geo = points_to_geo_json(borders)

In [226]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Choroplethmap(geojson=geo, locations=stations.labels, z=stations.labels, 
                            # center={'lat': 48.865983, 'lon': 2.275725}, 
                            marker_opacity=0.5
                            ))
fig.add_trace(
    go.Scattermap(
        lon=stations['lon'],
        lat=stations['lat'],
        mode='markers',
        marker=dict(size=10, color=stations['labels']),
        hovertext=stations.labels,
        customdata=stations[['labels', 'station']].to_numpy(),
        hovertemplate="<b>Cluster %{customdata[0]:}</b><br>" +
                  "Station: %{customdata[1]}<br>" +
                  "Latitude: %{lat:.2f}<br>" +
                  "Longitude: %{lon:.2f}<extra></extra>"
    )
)

fig.update_layout(
    width = 800,
    height=600,
    map = {
        'center' : {'lat': 48.85, 'lon': 2.35}, 
        'zoom' : 10}
    )
fig.show()

In [227]:
clean_df = pd.merge(clean_df, stations[['station', 'labels']], on='station', how='left')


In [228]:
px.box(clean_df.labels.value_counts(), 'count', labels={'count' : 'Clusters'}, title='Distribution de nombre de clusters et présence des outliers').show()
datehour_df = clean_df.groupby('datehour').labels.nunique().reset_index()
px.line(x=datehour_df.datehour, y=datehour_df['labels'], labels={'x' : 'Date-heure', 'y' : 'Nb de clusters par heure'}, title='Nombre de labels connues par heure').show()
px.box(datehour_df, 'labels', labels={'labels' : 'Nb de clusters par heure'}, title='Distribution de nombre de clusters par heure').show()

In [236]:
# Nettoyage des heures outliers
datehour_df = clean_df.groupby('datehour').labels.nunique()
q = datehour_df.quantile([0.25, 0.75]).to_list()
# outliers comme q1 - delta(q3,q1)*1.5 et q3 + delta(q3,q1)*1.5
q_margin = (q[1] - q[0]) * 1.5
seuil_bas = q[0] - q_margin
seuil_haut = q[1] + q_margin
print('Seuils de outliers:', seuil_bas, '-', seuil_haut)
print("Nombre de clusters:", clean_df.labels.nunique())
print('Max stations par heure:', datehour_df.max())
print('Min stations par heure:', datehour_df.min())
top_outliers = datehour_df[datehour_df>seuil_haut]
top_outliers_count = top_outliers.count()
bottom_outliers = datehour_df[datehour_df<seuil_bas]
bottom_outliers_count = bottom_outliers.count()
total_hours = clean_df.datehour.nunique()
print("Nombre d'heures:", total_hours)
print('Nombre de top outliers:', top_outliers_count, f"{round(top_outliers_count/total_hours*100, 2)}%")
print('Nombre de bottom outliers:', bottom_outliers_count, f"{round(bottom_outliers_count/total_hours*100, 2)}%")
blacklist = bottom_outliers.index.to_list() + top_outliers.index.to_list()
total_rows = len(clean_df)
dropped_rows = len(clean_df[clean_df.datehour.isin(blacklist)])
print(f'A enlever {len(blacklist)} outliers, {dropped_rows} lignes soit {round(dropped_rows/total_rows*100, 2)}% de {total_rows} lignes en totale.')
blacklist = datehour_df[datehour_df<141].index.to_list()
dropped_rows = len(clean_df[clean_df.datehour.isin(blacklist)])
print(f'Si on coupe manuellement les heures avec moins de 141 clusters on supprime {len(blacklist)} outliers et {dropped_rows} lignes soit {round(dropped_rows/total_rows*100, 2)}% de {total_rows} ligne en totale.')

clean_df = clean_df[~clean_df.datehour.isin(blacklist)]

Seuils de outliers: 142.0 - 142.0
Nombre de clusters: 142
Max stations par heure: 142
Min stations par heure: 103
Nombre d'heures: 1470
Nombre de top outliers: 0 0.0%
Nombre de bottom outliers: 263 17.89%
A enlever 263 outliers, 365927 lignes soit 17.48% de 2093172 lignes en totale.
Si on coupe manuellement les heures avec moins de 141 clusters on supprime 12 outliers et 10197 lignes soit 0.49% de 2093172 ligne en totale.


In [255]:
datehour_df = clean_df.groupby('datehour').labels.nunique().reset_index()
px.line(x=datehour_df.datehour, y=datehour_df['labels'], labels={'x' : 'Date-heure', 'y' : 'Nb de clusters par heure'}, title='Nombre de labels connues par heure').show()
px.box(datehour_df, 'labels', labels={'labels' : 'Nb de clusters par heure'}, title='Distribution de nombre de clusters par heure').show()
datehour_df_stations = clean_df.groupby('datehour').station.nunique().reset_index()
px.box(datehour_df_stations, 'station', labels={'station' : 'Nb de station par heure'}, title='Distribution de nombre de stations par heure').show()
px.line(datehour_df_stations, 'datehour', 'station', labels={'datehour' : 'Date-heure', 'station' : '# de stations'}, title='Nombre de stations connues par heure').show()




In [244]:
print(f"On a obtenu {len(clean_df):,} ligne soit {round(len(clean_df)/len(df_saved), 2)}% du dataset original.")

On a obtenu 2,082,975 ligne soit 0.98% du dataset original.


In [256]:

src_df = clean_df.groupby('datehour')[['delta', 'bikes', 'capacity']].sum().reset_index()
datehour_df = pd.concat([src_df[['datehour', 'delta']], src_df[['datehour', 'bikes']], src_df[['datehour', 'capacity']]])
datehour_df['value'] = datehour_df.bfill(axis=1)['delta']

datehour_df['line'] = datehour_df.apply(lambda row: 'Delta' if not math.isnan(row.delta) else 'Vélos' if not math.isnan(row.bikes) else 'Capacity', axis=1 )
px.line(datehour_df, x='datehour', y='value', color='line', 
        height=800, title="Dynamique d'utilisation des stations",
        labels={'line': 'Data', 'datehour' : 'Date-Heure', 'value' : 'Sommes'}).show()



In [325]:
clean_df

Unnamed: 0,datehour,station,lat,lon,delta,bikes,capacity,name,dt,labels,year,month,week,weekday
0,2024-12-14 00:00:00,213688169,48.865983,2.275725,1,5,35,Benjamin Godard - Victor Hugo,2024-12-14 00:28:18,57,2024,12,50,6
1,2024-12-14 00:00:00,17278902806,48.778192750803,2.3963020229163,1,19,20,Rouget de L'isle - Watteau,2024-12-14 00:30:24,44,2024,12,50,6
2,2024-12-14 00:00:00,36255,48.87929591733507,2.3373600840568547,3,10,21,Toudouze - Clauzel,2024-12-14 00:28:19,21,2024,12,50,6
3,2024-12-14 00:00:00,251039991,48.837525839067,2.3360354080796,-1,8,25,Cassini - Denfert-Rochereau,2024-12-14 00:30:46,75,2024,12,50,6
4,2024-12-14 00:00:00,85002689,48.819428333369,2.3433353751898,1,6,60,Jourdan - Stade Charléty,2024-12-14 00:27:46,132,2024,12,50,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093167,2024-12-13 22:00:00,516395829,48.8512971,2.3624535,0,13,14,Quai des Célestins - Henri IV,2024-12-13 22:31:06,124,2024,12,50,5
2093168,2024-12-13 22:00:00,368766689,48.819116181578764,2.3966637253761296,2,17,25,Westermeyer - Paul Vaillant-Couturier,2024-12-13 22:31:05,129,2024,12,50,5
2093169,2024-12-13 22:00:00,129026597,48.874422773426545,2.3284685611724854,2,4,22,Caumartin - Provence,2024-12-13 22:26:51,137,2024,12,50,5
2093170,2024-12-13 22:00:00,315022587,48.870406028483,2.323243509808,3,27,67,Malesherbes - Place de la Madeleine,2024-12-13 22:32:32,99,2024,12,50,5


In [399]:
clean_df['year'] = clean_df.dt.dt.year
clean_df['month'] = clean_df.dt.dt.month
clean_df['week'] = clean_df.dt.dt.isocalendar().week
clean_df['weekday'] = clean_df.dt.dt.weekday + 1
clean_df['date'] = clean_df.dt.dt.date

usage_weekly = clean_df.groupby(['weekday', 'datehour']).delta.sum().reset_index().groupby(['weekday']).aggregate(
        delta_min=pd.NamedAgg(column="delta", aggfunc="min"),
        delta_max=pd.NamedAgg(column="delta", aggfunc="max"),
        delta_mean=pd.NamedAgg(column="delta", aggfunc="mean"),
).reset_index()
fig = go.Figure()
fig.add_traces([
    go.Scatter(
        x=usage_weekly.weekday,
        y=usage_weekly.delta_min,
        fill='tonexty',
        fillcolor='rgba(100, 0, 80, 0.2)',
        mode='lines+markers',
        line={'color':'red'},
        name='Min'
    )
])
fig.add_traces([
    go.Scatter(
        x=usage_weekly.weekday,
        y=usage_weekly.delta_mean,
        mode='lines+markers',
        line={'color':'black'},
        name='Moyenne',
    )
])
fig.add_traces([
    go.Scatter(
        x=usage_weekly.weekday,
        y=usage_weekly.delta_max,
        fill='tonexty',
        fillcolor='rgba(100, 0, 80, 0.2)',
        mode='lines+markers',
        line={'color':'red'},
        name='Max'
    )
])

fig.update_layout(
        title = 'Utilisation des vélos horodatée par jour de semaine',
        legend_title_text = 'Vélos pris/retournés',
        xaxis_title = 'Jour de semaine'
)
fig.show()



In [398]:
clean_df['hour'] = clean_df.dt.dt.hour

usage_hourly = clean_df.groupby(['datehour', 'hour']).delta.sum().reset_index().groupby(['hour']).aggregate(
        delta_min=pd.NamedAgg(column="delta", aggfunc="min"),
        delta_max=pd.NamedAgg(column="delta", aggfunc="max"),
        delta_mean=pd.NamedAgg(column="delta", aggfunc="mean"),
).reset_index()
fig = go.Figure()
fig.add_traces([
    go.Scatter(
        x=usage_hourly.hour,
        y=usage_hourly.delta_min,
        fill='tonexty',
        fillcolor='rgba(0, 80, 100, 0.2)',
        mode='lines+markers',
        line={'color':'blue'},
        name='Min'
    )
])
fig.add_traces([
    go.Scatter(
        x=usage_hourly.hour,
        y=usage_hourly.delta_mean,
        mode='lines+markers',
        line={'color':'red'},
        name='Moyenne',
    )
])
fig.add_traces([
    go.Scatter(
        x=usage_hourly.hour,
        y=usage_hourly.delta_max,
        fill='tonexty',
        fillcolor='rgba(100, 80, 0, 0.2)',
        mode='lines+markers',
        line={'color':'orange'},
        name='Max'
    )
])

fig.update_layout(
        title = 'Utilisation des vélos par heure',
        legend_title_text = 'Vélos pris/retournés',
        xaxis_title = 'Heure',
        xaxis = {'dtick' : 1, 'range' : [0, 23]}
)
fig.show()

In [420]:
clean_df['empty'] = (clean_df.bikes == 0)

empty_hourly = clean_df.groupby(['datehour', 'hour']).empty.sum().reset_index().groupby(['hour']).aggregate(
        empty_min=pd.NamedAgg(column="empty", aggfunc="min"),
        empty_max=pd.NamedAgg(column="empty", aggfunc="max"),
        empty_mean=pd.NamedAgg(column="empty", aggfunc="mean"),
).reset_index()
fig = go.Figure()
fig.add_traces([
    go.Scatter(
        x=empty_hourly.hour,
        y=empty_hourly.empty_mean,
        mode='lines+markers',
        line={'color':'black'},
        name='Moyenne',
    )
])
fig.add_traces([
    go.Scatter(
        x=empty_hourly.hour,
        y=empty_hourly.empty_min,
        fill='tonexty',
        fillcolor='rgba(100, 0, 80, 0.2)',
        mode='lines+markers',
        line={'color':'red'},
        name='Min',
    )
])

fig.add_traces([
    go.Scatter(
        x=empty_hourly.hour,
        y=empty_hourly.empty_max,
        fill='tonexty',
        fillcolor='rgba(100, 0, 80, 0.2)',
        mode='lines+markers',
        line={'color':'red'},
        name='Max',
    )
])
fig.update_layout(
        title = 'Nombre de stations vides par heure',
        legend_title_text = 'Stations vides',
        xaxis_title = 'Heure',
        xaxis = {'dtick' : 1, 'range' : [0, 23]},
        yaxis = {'dtick' : 20} 
)
fig.show()