In [2]:
import datetime, pandas as pd, numpy as np
import os
from pathlib import Path
from velibds import VelibData

CACHED = False
# CACHED = True

dataset_file = r'local_data/dataset.csv'
def prepare_dataset():
    os.environ['LOKY_MAX_CPU_COUNT'] = '7'
    df = VelibData(update_cache=True).extract().transform().data
    # df = VelibData(cache=True).extract().transform().data
    df.to_csv(dataset_file, index=False)
    return df

if CACHED and Path(dataset_file).is_file():
    df = pd.read_csv(dataset_file, parse_dates=['datehour'])
    print(f'Chargé {len(df)} lignes pour la période de {df.datehour.min()} à {df.datehour.max()}')
else:
    df = prepare_dataset()


Openning PostgreSQL connection.
Fetching PostgreSQL data.
PostgreSQL connection closed.
3344820 lignes chargées sur velib_data pour la période de 2024-12-05 à 2025-03-10
Chargement de données MétéoFrance pour la période: 2024-12-05T00:00:00Z - 2025-03-11T00:00:00Z
2305 lignes chargées sur meteo_data pour la période de 2024120500 à 2025031100
Transformation: velib_data
Suppression de 0 valeurs manquantes.
Suppression de 25866 (0.77%) doublons d'origine, c'est-à-dire des lignes identiques par les valeurs clé de données de temps réel Vélib: 'dt', 'bikes', 'capacity' et 'station'.
Il y a 32888 (0.99%) lignes qui réprésentent les mêmes stations pour les même dates et heures. On enlève les valeurs intrahoraires et recalcule les deltas.
Valeur min-max: 20 - 2293
Seuils de outliers: 2241.0 - 2311.0
Nombre de valeurs total: 1460
Grands outliers: 0 ou 0.0%
Petits outliers: 115 ou 7.88%
On enlève 115 stations peu représentées dans le dataset.
On a supprimé 224203 ligne, 0.07%.
Calcule de 57 clust

In [3]:
VelibData.help()

## VelibData functions and arguments

### VelibData():

{'from_dt': <Parameter "from_dt: datetime.datetime = datetime.date(2024, 12, 5)">,
 'to_dt': <Parameter "to_dt: datetime.datetime = None">,
 'debug': <Parameter "debug=True">,
 'params': <Parameter "params: dict = None">,
 'cache': <Parameter "cache=False">,
 'update_cache': <Parameter "update_cache=False">}

### extract():

{}

### transform():

{'params': <Parameter "params: dict = None">}

### params dict:

{'clusters': True,
 'drop_outliers': True,
 'reconstruct_velib': True,
 'velib': True,
 'meteo': True}

## Examples

### Extract, transform and create united dataset for the period from 2024-12-05 to today:

```df = VelibData().extract().transform().data```

### Choose only February:

```df = VelibData(from_dt = datetime.date(2025, 2, 1), to_dt = datetime.date(2025, 3, 1)).extract().transform().data```

### Create united dataset but don't restore continuous time series and don't impute missing data:

```df = VelibData(params={'reconstruct_velib' = False}).extract().transform().data```

### Create united dataset without dropping stations/clusters outliers:

```df = VelibData(params={'drop_outliers' = False}).extract().transform().data```

### No clusters:

```df = VelibData(params={'clusters' = False}).extract().transform().data```

### Don't add meteo data:

```df = VelibData(params={'meteo': False}).extract().transform().data```

### Use cache in local_data folder to save and load data (minimize online transactions):

```df = VelibData(cache=True).extract().transform().data```

### Recreate cache files with up to date data:

```df = VelibData(update_cache=True).extract().transform().data```

In [None]:
# chatelet = '82328045'
# df[(df.station.astype(str) == chatelet) & (df.datehour.dt.date == datetime.date(2025,2,24)) & (df.hour.isin([20,21,22,23]))]
# # df[(df.datehour.dt.date == datetime.date(2025,2,24)) & (df.hour==22) & (df.delta < -10)].sort_values('delta', ascending=True)

## Modèles "Classiques"

In [5]:
split_date = datetime.date(2025, 2, 15)
df_train = df[df.datehour.dt.date < split_date]
df_test = df[df.datehour.dt.date >= split_date]


## Linear Regression

In [6]:
features = ['capacity', 'temp', 'precip', 'gel', 'vent', 'lat', 'lon', 'hour', 'weekday', 'weekend', 'holiday', 'preholiday']
# features = ['capacity', 'temp', 'precip', 'gel', 'vent', 'cluster', 'hour', 'weekday', 'weekend', 'holiday', 'preholiday']


In [80]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error as RMSE, mean_absolute_error as MAE
from xgboost import XGBRegressor
import time

X_train = df_train[features]
y_train = df_train['delta']
X_test = df_test[features]
y_test = df_test['delta']
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# X_train.loc[:, features_trans] = scaler.fit_transform(X_train[features_trans])
# X_test.loc[:, features_trans] = scaler.transform(X_test[features_trans])

def modelize(model):
    if hasattr(model, '__getitem__'):
        model_tag = model[1]
        model = model[0]
    else:
        model_tag = model.__class__.__name__
    start_time = time.time()
    # weights = np.where((y_train > y_train.quantile(0.75)) | (y_train < y_train.quantile(0.25)), 100, 1)
    # model.fit(X_train, y_train, sample_weight=weights)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    return pd.DataFrame({
        'rmse' : [RMSE(y_train, y_train_pred), RMSE(y_test, y_pred)],
        'mae' : [MAE(y_train, y_train_pred), MAE(y_test, y_pred)],
        'model' : model_tag,
        'seconds' : str(round(time.time() - start_time, 2))
        }, index=['train', 'test']), y_pred
xgb1 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.6,
    max_depth=10,
    subsample=0.5,
    reg_lambda=1,
    scale_pos_weight=10
)
xgb2 = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.6, reg_lambda=0.1)
xgb3 = XGBRegressor()
models = [LinearRegression(), BayesianRidge(), (xgb1, 'XGBRegressor_1'), (xgb2, 'XGBRegressor_2'), (xgb3, 'XGBRegressor_3')]
# models = [LinearRegression(), BayesianRidge(), (xgb1, 'XGBRegressor_1')]
results = [modelize(m) for m in models]
results_df = pd.concat([r[0] for r in results])
display(results_df)

Unnamed: 0,rmse,mae,model,seconds
train,2.887227,1.737588,LinearRegression,0.67
test,3.162254,1.918894,LinearRegression,0.67
train,2.887231,1.737443,BayesianRidge,1.07
test,3.162237,1.918527,BayesianRidge,1.07
train,2.924069,1.89219,XGBRegressor_1,12.96
test,3.203432,2.062989,XGBRegressor_1,12.96
train,2.863273,1.749693,XGBRegressor_2,24.45
test,3.142058,1.937137,XGBRegressor_2,24.45
train,2.863474,1.749126,XGBRegressor_3,8.1
test,3.138723,1.931658,XGBRegressor_3,8.1


### Visualisation pour une station

In [81]:
from velibds import VelibDataViz as viz

chatelet = '82328045'

df_show = df_test.copy()
df_show['orig_delta'] = df_show.delta.where(~df_show.reconstructed, np.nan)
tasks = [
    {
        'x': 'datehour',
        'y': 'delta',
        'name': 'Reconstructed Delta',
        'color' : 'black'
    },
    {
        'x': 'datehour',
        'y': 'orig_delta',
        'name': 'Orig Delta',
        'color' : 'gray'
    }
    ]
# tasks = []
for i, (dat, y_pred) in enumerate(results):
    tag = f'delta_{i}'
    model_tag = dat.loc['train', 'model']
    tasks.append({
        'x': 'datehour',
        'y': tag,
        'name': model_tag,
        # 'color' : 'red'
    })
    df_show[tag] = y_pred
mask = df_show.station.astype(str) == chatelet
df_show = df_show[mask]
viz.line(df_show, tasks)

## Station séparée

In [None]:
features = ['temp', 'precip', 'gel', 'vent', 'hour', 'weekday', 'weekend', 'holiday', 'preholiday']

chatelet = '82328045'
# y_shift = abs(df['delta'].min()) + 1
X_train = df_train[df_train.station.astype(str) == chatelet][features]
y_train = df_train[df_train.station.astype(str) == chatelet]['delta']
# y_train = np.log1p(df_train[df_train.station.astype(str) == chatelet]['delta'] + y_shift)
X_test = df_test[df_test.station.astype(str) == chatelet][features]
y_test = df_test[df_test.station.astype(str) == chatelet]['delta']
# y_test = np.log(df_test[df_test.station.astype(str) == chatelet]['delta'] + y_shift)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def modelize(model):
    if hasattr(model, '__getitem__'):
        model_tag = model[1]
        model = model[0]
    else:
        model_tag = model.__class__.__name__
    start_time = time.time()
    # weights = np.where((y_train > y_train.quantile(0.75)) | (y_train < y_train.quantile(0.25)), 100, 1)
    # model.fit(X_train, y_train, sample_weight=weights)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    return pd.DataFrame({
        'rmse' : [RMSE(y_train, y_train_pred), RMSE(y_test, y_pred)],
        'mae' : [MAE(y_train, y_train_pred), MAE(y_test, y_pred)],
        'model' : model_tag,
        'seconds' : str(round(time.time() - start_time, 2))
        }, index=['train', 'test']), y_pred

xgb1 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=10,
    min_child_weight=1,
    # subsample=0.8,
    # colsample_bytree=0.8,
    # gamma=0,
    # reg_lambda=1,
    scale_pos_weight=10
)
xgb2 = XGBRegressor(n_estimators=10, max_depth=5, subsample=0.3, learning_rate=0.6, reg_lambda=0.1)
xgb3 = XGBRegressor()
models = [LinearRegression(), BayesianRidge(), (xgb1, 'XGBRegressor_1'), (xgb2, 'XGBRegressor_2'), (xgb3, 'XGBRegressor_3')]
results = [modelize(m) for m in models]
results_df = pd.concat([r[0] for r in results])
display(results_df)


Unnamed: 0,rmse,mae,model,seconds
train,2.856976,2.017113,LinearRegression,0.01
test,3.256149,2.240626,LinearRegression,0.01
train,2.857762,2.007301,BayesianRidge,0.01
test,3.260222,2.235126,BayesianRidge,0.01
train,0.115454,0.067004,XGBRegressor_1,1.1
test,3.2997,2.386334,XGBRegressor_1,1.1
train,2.573746,1.901823,XGBRegressor_2,0.02
test,3.956853,2.924629,XGBRegressor_2,0.02
train,0.933789,0.629633,XGBRegressor_3,0.07
test,3.410715,2.468865,XGBRegressor_3,0.07


In [62]:
df_show = df_test[df_test.station.astype(str) == chatelet].copy()
df_show['orig_delta'] = df_show.delta.where(~df_show.reconstructed, np.nan)
tasks = [
    {
        'x': 'datehour',
        'y': 'delta',
        'name': 'Reconstructed Delta',
        'color' : 'black'
    },
    {
        'x': 'datehour',
        'y': 'orig_delta',
        'name': 'Orig Delta',
        'color' : 'gray'
    }
    ]
for i, (dat, y_pred) in enumerate(results):
    tag = f'delta_{i}'
    model_tag = dat.loc['train', 'model']
    tasks.append({
        'x': 'datehour',
        'y': tag,
        'name': model_tag,
        # 'color' : 'red'
    })
    df_show[tag] = y_pred
    # df_show[tag] = np.expm1(y_pred) - y_shift
viz.line(df_show, tasks)

## Approche trihoraire

In [83]:
from velibds import load_holidays
def add_datetime_details(df : pd.DataFrame, use_holidays = True, datehour : str = 'datehour'):
    """Ajout de colonnes:
        'hour' : 0..23, 
        'weekday': 0..6, 
        'weekend' : 0..1,
        'holiday' : 0..1,
        'preholiday' : 0..1
    Args:
        df (pd.DataFrame): dataset avec la colonne 'datehour'
        use_holidays (bool, optional): S'il faut charger les jours fériés et ajouter la col 'holiday'. Defaults to True.

    Returns:
        df mis à jour
    """
    df['hour'] = df[datehour].dt.hour
    df['weekday'] = df[datehour].dt.weekday
    df['weekend'] = df.weekday.isin([5, 6]).astype(int)
    if use_holidays:
        holidays = load_holidays()
        df['holiday'] = (df[datehour].dt.strftime(r'%Y-%m-%d').isin(holidays)).astype(int)
        df['preholiday'] = ((df[datehour] + pd.DateOffset(days=1)).dt.strftime(r'%Y-%m-%d').isin(holidays)).astype(int)
    return df


In [84]:
df3h = df.copy()
df3h['trih'] = df3h.datehour.dt.floor('3h')
df3h = df3h.groupby(['trih', 'station', 'lat', 'lon', 'cluster', 'capacity']).aggregate({
    'bikes' : 'mean',
    'temp' : 'min',
    'vent' : 'max',
    'precip' : 'sum',
    'gel' : 'sum'
}).reset_index()
df3h = add_datetime_details(df3h, datehour='trih')
df3h['delta'] = df3h.groupby('station')['bikes'].diff()
df3h = df3h.dropna()
split_date = datetime.date(2025, 2, 15)
df3h_train = df3h[df3h.trih.dt.date < split_date]
df3h_test = df3h[df3h.trih.dt.date >= split_date]



In [85]:
features = ['capacity', 'temp', 'precip', 'gel', 'vent', 'lat', 'lon', 'hour', 'weekday', 'weekend', 'holiday', 'preholiday']

In [86]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error as RMSE, mean_absolute_error as MAE
from xgboost import XGBRegressor
import time

X_train = df3h_train[features]
y_train = df3h_train['delta']
X_test = df3h_test[features]
y_test = df3h_test['delta']
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# X_train.loc[:, features_trans] = scaler.fit_transform(X_train[features_trans])
# X_test.loc[:, features_trans] = scaler.transform(X_test[features_trans])

def modelize(model):
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    return pd.DataFrame({
        'rmse' : [RMSE(y_train, y_train_pred), RMSE(y_test, y_pred)],
        'mae' : [MAE(y_train, y_train_pred), MAE(y_test, y_pred)],
        'model' : model.__class__.__name__,
        'seconds' : str(round(time.time() - start_time, 2))
        }, index=['train', 'test']), y_pred

models = [LinearRegression(), BayesianRidge(), XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.4, reg_lambda=0.1)]
results = [modelize(m) for m in models]
results_df = pd.concat([r[0] for r in results])
display(results_df)

Unnamed: 0,rmse,mae,model,seconds
train,4.531834,2.878616,LinearRegression,0.27
test,5.026982,3.229848,LinearRegression,0.27
train,4.531834,2.878506,BayesianRidge,0.37
test,5.026937,3.229739,BayesianRidge,0.37
train,3.326533,2.299509,XGBRegressor,9.83
test,3.799828,2.588349,XGBRegressor,9.83


In [87]:
from velibds import VelibDataViz as viz

chatelet = '82328045'

df_show = df3h_test.copy()
tasks = [
    {
        'x': 'trih',
        'y': 'delta',
        'name': 'Orig Delta',
        'color' : 'gray'
    }
    ]
# tasks = []
for i, (dat, y_pred) in enumerate(results):
    tag = f'delta_{i}'
    model_tag = dat.loc['train', 'model']
    tasks.append({
        'x': 'trih',
        'y': tag,
        'name': model_tag,
        # 'color' : 'red'
    })
    df_show[tag] = y_pred
mask = df_show.station.astype(str) == chatelet
df_show = df_show[mask]
viz.line(df_show, tasks)

### Une station, 3 heures

In [89]:
features = ['temp', 'precip', 'gel', 'vent', 'hour', 'weekday', 'weekend', 'holiday', 'preholiday']

chatelet = '82328045'
# y_shift = abs(df['delta'].min()) + 1
X_train = df3h_train[df3h_train.station.astype(str) == chatelet][features]
y_train = df3h_train[df3h_train.station.astype(str) == chatelet]['delta']
# y_train = np.log1p(df_train[df_train.station.astype(str) == chatelet]['delta'] + y_shift)
X_test = df3h_test[df3h_test.station.astype(str) == chatelet][features]
y_test = df3h_test[df3h_test.station.astype(str) == chatelet]['delta']
# y_test = np.log(df_test[df_test.station.astype(str) == chatelet]['delta'] + y_shift)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def modelize(model):
    if hasattr(model, '__getitem__'):
        model_tag = model[1]
        model = model[0]
    else:
        model_tag = model.__class__.__name__
    start_time = time.time()
    # weights = np.where((y_train > y_train.quantile(0.75)) | (y_train < y_train.quantile(0.25)), 100, 1)
    # model.fit(X_train, y_train, sample_weight=weights)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    return pd.DataFrame({
        'rmse' : [RMSE(y_train, y_train_pred), RMSE(y_test, y_pred)],
        'mae' : [MAE(y_train, y_train_pred), MAE(y_test, y_pred)],
        'model' : model_tag,
        'seconds' : str(round(time.time() - start_time, 2))
        }, index=['train', 'test']), y_pred

xgb1 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=10,
    min_child_weight=1,
    # subsample=0.8,
    # colsample_bytree=0.8,
    # gamma=0,
    # reg_lambda=1,
    scale_pos_weight=10
)
xgb2 = XGBRegressor(n_estimators=10, max_depth=5, subsample=0.3, learning_rate=0.6, reg_lambda=0.1)
xgb3 = XGBRegressor()
models = [LinearRegression(), BayesianRidge(), (xgb1, 'XGBRegressor_1'), (xgb2, 'XGBRegressor_2'), (xgb3, 'XGBRegressor_3')]
results = [modelize(m) for m in models]
results_df = pd.concat([r[0] for r in results])
display(results_df)


Unnamed: 0,rmse,mae,model,seconds
train,4.578423,3.426737,LinearRegression,0.0
test,5.190206,3.928245,LinearRegression,0.0
train,4.586286,3.404248,BayesianRidge,0.0
test,5.194773,3.930962,BayesianRidge,0.0
train,0.016322,0.009214,XGBRegressor_1,0.68
test,3.652839,2.876971,XGBRegressor_1,0.68
train,2.855413,2.203165,XGBRegressor_2,0.02
test,4.244204,3.496774,XGBRegressor_2,0.02
train,0.285698,0.189881,XGBRegressor_3,0.09
test,3.508987,2.72293,XGBRegressor_3,0.09


In [91]:
df_show = df3h_test[df3h_test.station.astype(str) == chatelet].copy()
tasks = [
    {
        'x': 'trih',
        'y': 'delta',
        'name': 'Delta',
        'color' : 'gray'
    }
    ]
for i, (dat, y_pred) in enumerate(results):
    tag = f'delta_{i}'
    model_tag = dat.loc['train', 'model']
    tasks.append({
        'x': 'trih',
        'y': tag,
        'name': model_tag,
        # 'color' : 'red'
    })
    df_show[tag] = y_pred
    # df_show[tag] = np.expm1(y_pred) - y_shift
viz.line(df_show, tasks)

# Prophet

## MLP