In [1]:
from velibds import VelibData, VelibDataViz as viz
import plotly.express as px
from velibds.modelisation import MLP, show_prediction_report, station_graph, BaseRegressor
from sklearn.metrics import root_mean_squared_error as RMSE, mean_absolute_error as MAE
from tensorflow.keras.optimizers import Optimizer, Adam, AdamW, Nadam
from pathlib import Path
import datetime, os, pandas as pd, numpy as np
now = datetime.datetime.now
# import plotly.io as pio
# pio.renderers.default = 'notebook'

# Laisse BLIND = True, cela va pas generer les graphiques pendant le run donc pas de VSCODE planté...
BLIND = True
# BLIND = False

# Est-ce qu'on va charger et remplir un df_results déjà existant?
LOADED = True

# Date marge de train et test
SPLIT_DATE = datetime.date(2025, 2, 15)
# Date minimale pour train
LOW_LIMIT = datetime.date(2024, 12, 1)

# On va charger le dataset du csv?
CACHED = True
# CACHED = False
dataset_file = r'local_data/dataset.csv'
chatelet = '82328045'

def prepare_dataset():
    os.environ['LOKY_MAX_CPU_COUNT'] = '7'
    df = VelibData(update_cache=True).extract().transform().data
    # df = VelibData(cache=True).extract().transform().data
    df.to_csv(dataset_file, index=False)
    return df

if CACHED and Path(dataset_file).is_file():
    df = pd.read_csv(dataset_file, parse_dates=['datehour'])
    print(f'Chargé {len(df)} lignes pour la période de {df.datehour.min()} à {df.datehour.max()}')
else:
    df = prepare_dataset()
df_orig = df.copy()




Chargé 3161088 lignes pour la période de 2024-12-05 00:00:00 à 2025-03-12 23:00:00


In [2]:
import json
if LOADED:
    df_results = pd.read_csv(r'df_results.csv', index_col=0)
    with open(r'history.json', 'r') as f:
        history = json.load(f)
else:
    history = {}
    df_results = df.copy()


In [3]:
def common_output(wrapper : MLP, df_results, y_pred, y_col = 'delta'):
    name = wrapper.NAME
    pred_col = 'pred__' + name
    df_results[pred_col] = y_pred
    history[name] = wrapper.history
    if not BLIND:
        show_prediction_report(df_results, df_results[y_col], y_pred, enable_4h=True, graph=True)
        f_name = "".join([x if x.isalnum() else "_" for x in name]) + '.html'
        # station_graph(df_test, y_pred, chatelet, f'Chatelet for {name}', enable_4h=True, filename = f_name)
        station_graph(df_results, y_pred, chatelet, f'Chatelet for {name}', enable_4h=True, y_col=y_col)


## Avec reconstruction

In [4]:
print(now(), 'Cutting dataset')
df_train = df_orig[(df.datehour.dt.date < SPLIT_DATE) & (df.datehour.dt.date >= LOW_LIMIT)].copy()
df_test = df_orig[(df.datehour.dt.date >= SPLIT_DATE)].copy()
if not LOADED:
    df_results = df_test.copy()

2025-03-14 23:54:21.106815 Cutting dataset


In [None]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as MAE, root_mean_squared_error as RMSE


xgb1 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.6,
    max_depth=10,
    subsample=0.5,
    reg_lambda=1,
    scale_pos_weight=10
)
xgb2 = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.6, reg_lambda=0.1)
xgb3 = XGBRegressor()
models = [LinearRegression(), BayesianRidge(), (xgb1, 'XGBRegressor_max'), (xgb2, 'XGBRegressor_med'), (xgb3, 'XGBRegressor_default')]
# models = [LinearRegression(), BayesianRidge()] #, (xgb1, 'XGBRegressor_max')]
settings_basic = [
    {
    'NAME' : 'base',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : False,
    'WEIGHTS' : False
    },
    {
    'NAME' : 'w',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True
    },
    {
    'NAME' : 'ow',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True
    },
    {
    'NAME' : 'uw',
    'UNDERSAMPLE' : 2,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True
    },
    {
    'NAME' : 'ow7',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True,
    'TOP_WEIGHT' : 7,
    'BOTTOM_WEIGHT' : 7,
    },
    {
    'NAME' : 'uw7',
    'UNDERSAMPLE' : 2,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True,
    'TOP_WEIGHT' : 7,
    'BOTTOM_WEIGHT' : 7,
    },
]

prefix = r'full_'
for s in settings_basic:
    base_name = s['NAME']
    for model in models:
        if isinstance(model, (tuple, list)):
            model, name = model
        else:
            name = model.__class__.__name__
        name = prefix + name + '_' + base_name
        print(now(), f'Fitting {name}')
        s['NAME'] = name
        wrapper = BaseRegressor(model, name).update_settings(s).fit(df_train, df_train.delta)
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        print(now(), f'MAE {name}:', MAE(df_test.delta, y_pred))
        print(now(), f'RMSE {name}:', RMSE(df_test.delta, y_pred))
        common_output(wrapper, df_results, y_pred)
        s['NAME'] = base_name



2025-03-14 23:54:24.027991 Fitting full_LinearRegression_base
Init full_LinearRegression_base
2025-03-14 23:54:24.027991 Starting with X_train size: (2322432, 13)
No rush TOP_MARGE = 4.0 
No rush BOTTOM_MARGE = -4.0
Rush hours distribution:
rush
0    0.9117
1    0.0883
Name: proportion, dtype: float64
2025-03-14 23:54:25.276125 Fitting...
2025-03-14 23:54:26.280328 Fitted!
2025-03-14 23:54:26.319545 Prediction full_LinearRegression_base
2025-03-14 23:54:26.624849 MAE full_LinearRegression_base: 1.935146477819552
2025-03-14 23:54:26.640880 RMSE full_LinearRegression_base: 3.199422633460554
2025-03-14 23:54:26.659725 Fitting full_BayesianRidge_base
Init full_BayesianRidge_base
2025-03-14 23:54:26.659725 Starting with X_train size: (2322432, 13)
No rush TOP_MARGE = 4.0 
No rush BOTTOM_MARGE = -4.0
Rush hours distribution:
rush
0    0.9117
1    0.0883
Name: proportion, dtype: float64
2025-03-14 23:54:28.142794 Fitting...
2025-03-14 23:54:29.801492 Fitted!
2025-03-14 23:54:29.868268 Predict

## Sans reconstruction

In [6]:
# print(now(), 'Cutting dataset')
# df = df[~df.reconstructed]
# df_train = df[(df.datehour.dt.date < SPLIT_DATE) & (df.datehour.dt.date >= LOW_LIMIT)].copy()
# df_test = df[(df.datehour.dt.date >= SPLIT_DATE)].copy()
df_train = df_train[~df_train.reconstructed].copy()


In [7]:
prefix = r''
for s in settings_basic:
    base_name = s['NAME']
    for model in models:
        if isinstance(model, (tuple, list)):
            model, name = model
        else:
            name = model.__class__.__name__
        name = prefix + name + '_' + base_name
        print(now(), f'Fitting {name}')
        s['NAME'] = name
        wrapper = BaseRegressor(model, name).update_settings(s).fit(df_train, df_train.delta)
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        print(now(), f'MAE {name}:', MAE(df_test.delta, y_pred))
        print(now(), f'RMSE {name}:', RMSE(df_test.delta, y_pred))
        common_output(wrapper, df_results, y_pred)
        s['NAME'] = base_name


2025-03-15 00:00:51.254173 Fitting LinearRegression_base
Init LinearRegression_base
2025-03-15 00:00:51.254173 Starting with X_train size: (2270372, 13)
No rush TOP_MARGE = 4.0 
No rush BOTTOM_MARGE = -4.0
Rush hours distribution:
rush
0    0.912252
1    0.087748
Name: proportion, dtype: float64
2025-03-15 00:00:52.287002 Fitting...
2025-03-15 00:00:53.232438 Fitted!
2025-03-15 00:00:53.267232 Prediction LinearRegression_base
2025-03-15 00:00:53.545404 MAE LinearRegression_base: 1.9354015554587813
2025-03-15 00:00:53.568028 RMSE LinearRegression_base: 3.1994325506037598
2025-03-15 00:00:53.586555 Fitting BayesianRidge_base
Init BayesianRidge_base
2025-03-15 00:00:53.586555 Starting with X_train size: (2270372, 13)
No rush TOP_MARGE = 4.0 
No rush BOTTOM_MARGE = -4.0
Rush hours distribution:
rush
0    0.912252
1    0.087748
Name: proportion, dtype: float64
2025-03-15 00:00:54.750596 Fitting...
2025-03-15 00:00:56.216342 Fitted!
2025-03-15 00:00:56.259804 Prediction BayesianRidge_base
20

### Mass count mlps

In [8]:
from tensorflow.keras.optimizers import SGD
prefix = r''
mlp_settings = [
    {
    'NAME' : 'base',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : False,
    'WEIGHTS' : False
    },
    {
    'NAME' : 'w',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True
    },
    {
    'NAME' : 'ow',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True
    },
    {
    'NAME' : 'uw',
    'UNDERSAMPLE' : 2,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True
    },
    {
    'NAME' : 'ow',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True,
    'optimizer' : SGD(learning_rate=0.01, nesterov=True),
    # 'optimizer' : SGD(learning_rate=0.003, nesterov=True)
    },
    {
    'NAME' : 'uw',
    'UNDERSAMPLE' : 2,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True,
    'optimizer' : SGD(learning_rate=0.01, nesterov=True),
    # 'optimizer' : SGD(learning_rate=0.003, nesterov=True)
    },
    {
    'NAME' : 'ow',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True,
    # 'optimizer' : SGD(learning_rate=0.01, nesterov=True),
    'optimizer' : SGD(learning_rate=0.003, nesterov=True)
    },
    {
    'NAME' : 'uw',
    'UNDERSAMPLE' : 2,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True,
    # 'optimizer' : SGD(learning_rate=0.01, nesterov=True),
    'optimizer' : SGD(learning_rate=0.003, nesterov=True)
    },
    {
    'NAME' : 'ow7',
    'UNDERSAMPLE' : False,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True,
    'TOP_WEIGHT' : 7,
    'BOTTOM_WEIGHT' : 7,
    # 'optimizer' : SGD(learning_rate=0.01, nesterov=True),
    'optimizer' : SGD(learning_rate=0.003, nesterov=True)
    },
    {
    'NAME' : 'uw7',
    'UNDERSAMPLE' : 2,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True,
    'TOP_WEIGHT' : 7,
    'BOTTOM_WEIGHT' : 7,
    # 'optimizer' : SGD(learning_rate=0.01, nesterov=True),
    'optimizer' : SGD(learning_rate=0.003, nesterov=True)
    },
]
fit_settings = [
    # {'epochs' : 10},
    # {'epochs' : 30},
    # {'epochs' : 50},
    # {'epochs' : 150},
    {'epochs' : 300}
    ]

for s in mlp_settings:
    base_name = s['NAME']
    for i, fs in enumerate(fit_settings):
        name = 'mlp' + '_' + base_name + '_' + str(i)
        print(now(), f'Fitting {name}')
        s['NAME'] = name
        wrapper = MLP(name).update_settings(s).add_fit_params(fit_settings[i]).fit(df_train, df_train.delta).save()
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        print(now(), f'MAE {name}:', MAE(df_test.delta, y_pred))
        print(now(), f'RMSE {name}:', RMSE(df_test.delta, y_pred))
        common_output(wrapper, df_results, y_pred)
        s['NAME'] = base_name


2025-03-15 00:06:18.467926 Fitting mlp_base_0
Init mlp_base_0
2025-03-15 00:06:18.467926 Starting with X_train size: (2270372, 13)
No rush TOP_MARGE = 4.0 
No rush BOTTOM_MARGE = -4.0
Rush hours distribution:
rush
0    0.912252
1    0.087748
Name: proportion, dtype: float64
2025-03-15 00:06:19.519463 Creating model
2025-03-15 00:06:19.668140 Fitting...
Epoch 1/300
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 1.7077 - mae: 1.7077 - val_loss: 1.8270 - val_mae: 1.8270 - learning_rate: 0.0010
Epoch 2/300
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 1.7021 - mae: 1.7021 - val_loss: 1.8272 - val_mae: 1.8272 - learning_rate: 0.0010
Epoch 3/300
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 1.7030 - mae: 1.7030 - val_loss: 1.8273 - val_mae: 1.8273 - learning_rate: 0.0010
Epoch 4/300
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 1.7046 - ma

In [32]:
df_results.columns

Index(['datehour', 'station', 'lat', 'lon', 'name', 'cluster', 'reconstructed',
       'disabled', 'capacity', 'bikes',
       ...
       'pred__e10_adam_lr3', 'pred__e10_adam_lr3_u3', 'pred__e10_adam_lr3_u1',
       'pred__e10_adam_lr3_o3', 'pred__e10_adam_lr3_o6', 'pred__e10_adam_lr6',
       'pred__e10_adam_lr6_u3', 'pred__e10_adam_lr6_u1',
       'pred__e10_adam_lr6_o3', 'pred__e10_adam_lr6_o6'],
      dtype='object', length=102)

In [33]:
df_results.to_csv('df_results2.csv')


In [34]:
import json
with open('history.json', 'w') as f:
    json.dump(history, f)

In [12]:
raise Exception("It'over! Congrats!").with_traceback(None)


Exception: It'over! Congrats!

### Expérimentation weights

In [None]:
from keras.optimizers import Adam, AdamW, SGD
prefix = r''
top_bottom_weights = [
    (7, 7),
    (10, 10),
    # (10, 15),
    # (20, 30),
    (50, 70) 
    ]
sampling = [None, ('u', 3), ('u', 1), ('o', 3), ('o', 6)]
lrs = [0.001, 0.003, 0.006]
optimizers = [
    {'class' : Adam,
     'tag' : 'adam',
     'args' : {}
    },
    {'class' : AdamW,
     'tag' : 'adam',
     'args' : {}
    },
    {'class' : SGD,
     'tag' : 'adam',
     'args' : {'nesterov' : True}
    },
]
activators = [
    'relu',
    'tanh'
]
epochs = [
    10,
    # 50,
    # 150,
    # 300
    ]

mlp_settings = []
fit_settings = []
for e in epochs:
    for opt in optimizers:
        for a in activators:
            for rate in lrs:
                for s in sampling:
                    for w in top_bottom_weights:
                        tags = []
                        current_fit = {'epochs' : e}
                        tags.append(f'e{e}')
                        current_settings = {}
                        current_settings['optimizer'] = opt['class'](learning_rate=rate, **opt.get('args', {}))
                        tags.append(opt.get('tag', ''))
                        tags.append(f'lr{int(rate*1000)}')
                        if s is not None:
                            if s[0] == 'o':
                                current_settings['OVERSAMPLE'] = s[1]
                                tags.append(f'o{s[1]}')
                            elif s[0] == 'u':
                                current_settings['UNDERSAMPLE'] = s[1]
                                tags.append(f'u{s[1]}')
                        current_settings['WEIGHTS'] = True
                        current_settings['TOP_WEIGHT'] = w[0]
                        current_settings['BOTTOM_WEIGHT'] = w[1]
                        current_settings['NAME'] = '_'.join(tags)
                        current_settings['ACTIVATION'] = a
                        tags.append(a)
                        mlp_settings.append(current_settings)
                        fit_settings.append(current_fit)


for i, s in enumerate(mlp_settings):
    name = s['NAME']
    print(f'FITTING {name}')
    wrapper = MLP(name).update_settings(s).add_fit_params(fit_settings[i]).fit(df_train, df_train.delta).save()
    print(f'PREDICTING {name}')
    y_pred = wrapper.predict(df_test)
    # print(now(), f'MAE {name}:', MAE(df_test.delta, y_pred))
    # print(now(), f'RMSE {name}:', RMSE(df_test.delta, y_pred))
    common_output(wrapper, df_results, y_pred)


FITTING e10_adam_lr1
Init e10_adam_lr1
2025-03-15 11:45:35.170573 Starting with X_train size: (2270372, 13)
No rush TOP_MARGE = 4.0 
No rush BOTTOM_MARGE = -4.0
Rush hours distribution:
rush
0    0.912252
1    0.087748
Name: proportion, dtype: float64
2025-03-15 11:45:36.517911 Calculating weights
2025-03-15 11:45:36.610820 Creating model
2025-03-15 11:45:36.683127 Fitting...
Epoch 1/10
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - loss: 5.5450 - mae: 1.7086 - weighted_mae: 3.6504 - val_loss: 6.1749 - val_mae: 1.8255 - val_weighted_mae: 3.8766 - learning_rate: 0.0010
Epoch 2/10
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - loss: 5.4388 - mae: 1.7201 - weighted_mae: 3.5844 - val_loss: 6.0805 - val_mae: 1.8325 - val_weighted_mae: 3.8174 - learning_rate: 0.0010
Epoch 3/10
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - loss: 5.4143 - mae: 1.7197 - weighted_mae: 3.5619 - val_loss: 6.0583 - val_

KeyboardInterrupt: 

In [28]:
len(mlp_settings)

270

### Obsolete part

In [None]:
raise Exception("It'over! Congrats!").with_traceback(None)


Exception: It'over! Congrats!

Sans oversampling et weights

In [None]:
df_results2 = pd.read_csv('df_results.csv', index_col=0)
df_results2['pred__mlp_ow_0'] = df_results['pred__mlp_ow_0']
df_results2['pred__mlp_uw_0'] = df_results['pred__mlp_uw_0']


In [None]:
df_results2.to_csv('df_results.csv')


In [None]:
name = 'base'
mlp = MLP(name, oversample=False, weights=False, long=True).fit(df_train, df_train['delta'])#.save()
# mlp = MLP().load(r'local_data/model_reg_base', False)
y_pred = mlp.predict(df_test)

common_output(name)

In [None]:
name = 'w'
settings_basic = {
    'NAME' : name,
    'OVERSAMPLE' : False,
    'WEIGHTS' : True
}
mlp.update_settings(settings_basic).add_fit_params({'epochs' : 50}).reset_tags().fit(df_train, df_train['delta']).save()
# mlp = MLP().load(r'local_data/model_reg_w_weighted', False)

common_output(name)

In [None]:
name = 'ow'
settings_basic = {
    'NAME' : name,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True
}
mlp.update_settings(settings_basic).reset_tags().fit(df_train, df_train['delta']).save()
# mlp = MLP().load(r'local_data/model_reg_ow_oversampled_weighted', False)
y_pred = mlp.predict(df_test).astype(int)

common_output(name)

In [None]:
from tensorflow.keras.optimizers import SGD

name = 'sgd_001'
settings_basic = {
    'NAME' : name,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True
}
mlp.update_settings(settings_basic).reset_tags()
mlp.update_optimizer(SGD(learning_rate=0.01, nesterov=True))
mlp.fit(df_train, df_train['delta']).save()
# mlp = MLP().load(r'local_data/model_reg_sgd_oversampled_weighted', False)
y_pred = mlp.predict(df_test).astype(int)

common_output(name)

In [None]:
from tensorflow.keras.optimizers import SGD

name = 'sgd_0005'
settings_basic = {
    'NAME' : name,
    'OVERSAMPLE' : 3,
    'WEIGHTS' : True
}

mlp.update_settings(settings_basic).reset_tags()
mlp.update_optimizer(SGD(learning_rate=0.005, nesterov=True))
mlp.fit(df_train, df_train['delta']).save()
# mlp = MLP().load(r'local_data\model_reg_sgd_lr_06_oversampled_weighted', False)
y_pred = mlp.predict(df_test).astype(int)

common_output(name)

In [None]:
from tensorflow.keras.optimizers import SGD

name = 'sgd_u_0003'
settings_basic = {
    'NAME' : name,
    'OVERSAMPLE' : False,
    'UNDERSAMPLE' : 3,
    'WEIGHTS' : True
}

# mlp = MLP().load(r'local_data\model_reg_sgd_lr_06_oversampled_weighted', False)
mlp.update_settings(settings_basic).reset_tags()
mlp.update_optimizer(SGD(learning_rate=0.003, nesterov=True))
mlp.fit(df_train, df_train['delta']).save()
y_pred = mlp.predict(df_test).astype(int)

common_output(name)

In [None]:
df_pred.columns

In [None]:
from velibds.modelisation import show_prediction_report, station_graph

def show_all(y_pred, y_rush, name):
    df_test = pd.DataFrame({'pred' : y_pred, 'rush' : y_rush})
    df_test['delta'] = df_orig['delta']
    df_test['datehour'] = df_orig['datehour']
    df_test['station'] = df_orig['station']
    show_prediction_report(df_test, df_test.delta, y_pred, enable_4h=True, graph=True)
    station_graph(df_test, y_pred, chatelet, f'Chatelet for {name}', enable_4h=True)

for n in df_pred.columns:
    show_all(df_pred[n], df_rush[n], n)


In [5]:
import h5py

h5_data = h5py.File('df_results.h5', 'r')
len(h5_data['df']['axis1'])


838656