# ML Grid testing

## Gestion Dataset

In [1]:
from velibds import VelibData, VelibDataViz as viz
import plotly.express as px
from velibds.modelisation import MLP, show_prediction_report, station_graph, BaseRegressor
from sklearn.metrics import root_mean_squared_error as RMSE, mean_absolute_error as MAE
from keras.optimizers import Optimizer, Adam, AdamW, Nadam, SGD
from pathlib import Path
import datetime, os, pandas as pd, numpy as np
now = datetime.datetime.now


# Laisse BLIND = True, cela va pas generer les graphiques pendant le run donc pas de VSCODE planté...
BLIND = True
# BLIND = False

# Est-ce qu'on va charger et remplir un df_results déjà existant?
LOADED = True

# Faut-il ajouter des lags et si oui, combien?
# LAGS = 3
LAGS = 0

# Date marge de train et test
SPLIT_DATE = datetime.date(2025, 2, 15)
# Date minimale pour train
LOW_LIMIT = datetime.date(2024, 12, 1)

# On va charger le dataset du csv?
CACHED = True
# CACHED = False

# Path dataset
dataset_file = r'local_data/dataset.csv'

# ID de la station Chatelet
chatelet = '82328045'

# Features à customizer:
FEATURES = MLP.FEATURES

def prepare_dataset():
    os.environ['LOKY_MAX_CPU_COUNT'] = '7'
    df = VelibData(update_cache=True).extract().transform().data
    # df = VelibData(cache=True).extract().transform().data
    df.to_csv(dataset_file, index=False)
    return df


if CACHED and Path(dataset_file).is_file():
    df = pd.read_csv(dataset_file, parse_dates=['datehour'])
    print(f'Chargé {len(df)} lignes pour la période de {df.datehour.min()} à {df.datehour.max()}')
else:
    df = prepare_dataset()

### Ajoute des lags:
if LAGS:
    for lag in range(1, LAGS + 1):
        col = f'lag_{lag}'
        df[col] = df.groupby('station')['delta'].shift(lag)
        FEATURES.append(col)
    df = df.dropna()

df_orig = df.copy()

# Train test split par date
df_train = df_orig[(df.datehour.dt.date < SPLIT_DATE) & (df.datehour.dt.date >= LOW_LIMIT)].copy()
df_test = df_orig[(df.datehour.dt.date >= SPLIT_DATE)].copy()


2025-03-23 20:20:29.732280: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-23 20:20:29.816541: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742757629.951184 1123066 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742757629.974211 1123066 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742757630.047407 1123066 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Chargé 3161088 lignes pour la période de 2024-12-05 00:00:00 à 2025-03-12 23:00:00


## Init

In [2]:
classic_tasks = []
mlp_tasks = []
history = {}
df_results = df_test

### Functions

In [3]:
import json
import warnings
import joblib

settings_file = r'grid_settings.json'
df_results_file = r'df_results.h5'
history_file = r'history.json'

known_optimizers = [Adam, AdamW, Nadam, SGD,]
known_optimizers_str = [c.__name__ for c in known_optimizers]

def load_all():
    global df_results, history, classic_tasks, mlp_tasks
    print(now(), 'Loading all data...')
    if Path(settings_file).is_file():
        with open(settings_file, 'r') as f:
            settings = json.load(f)
        if settings.get('SPLIT_DATE') != SPLIT_DATE.strftime('%Y-%m-%y'):
            warnings.warn('SPLIT_DATE is different in settings! Skip restoring.')
            return
        classic_tasks = settings.get('classic_tasks', []) or classic_tasks
        for ct in classic_tasks:
            ct['__MODEL'] = deserialize_regressor(ct['__NAME'], ct['__MODEL'])
        mlp_tasks = settings.get('mlp_tasks', []) or mlp_tasks
        for mt in mlp_tasks:
            mt['optimizer'] = deserialize_optimizer(mt['optimizer'], mt['__optimizer_settings'])
    if Path(df_results_file).is_file():
        # df_results = pd.read_csv(df_results_file, index_col=0)        
        df_results = pd.read_hdf(df_results_file, key='df')        
    if Path(history_file).is_file():
        with open(history_file, 'r') as f:
            history = json.load(f)
    print(now(), 'All data loaded...')
    
def serialize_regressor(name, model):
    if isinstance(model, (tuple, list)):
        model, model_name = model
    else:
        model_name = model.__class__.__name__
    joblib.dump(model, 'local_data/' + name + '@' + model_name + '.joblib')
    return model_name

def deserialize_regressor(name, model_name : str):
    model = joblib.load('local_data/' + name + '@' + model_name + '.joblib')
    return model

def serialize_optimizer(opt):
    return opt.__class__.__name__

def deserialize_optimizer(opt_name : str, opt_settings : dict):
    opt_class = known_optimizers[known_optimizers_str.index(opt_name)]
    return opt_class(**opt_settings)

def save_all():
    print(now(), 'Saving all data...')
    fixed_classic_tasks = classic_tasks.copy()
    for i, ct in enumerate(fixed_classic_tasks):
        ct = ct.copy()
        fixed_classic_tasks[i] = ct
        ct['__MODEL'] = serialize_regressor(ct['__NAME'], ct['__MODEL'])
    fixed_mlp_tasks = mlp_tasks.copy()
    for i, mt in enumerate(fixed_mlp_tasks):
        mt = mt.copy()
        fixed_mlp_tasks[i] = mt
        mt['optimizer'] = serialize_optimizer(mt['optimizer'])
    settings = {
        'SPLIT_DATE' : SPLIT_DATE.strftime('%Y-%m-%y'),
        'classic_tasks' : fixed_classic_tasks,
        'mlp_tasks' : fixed_mlp_tasks
    }
    with open(settings_file, 'w') as f:
        json.dump(settings, f)
    with open(history_file, 'w') as f:
        json.dump(history, f)
    print(now(), 'Saving results dataset...')
    # df_results.to_csv(df_results_file)
    df_results.to_hdf(df_results_file, mode='w', key='df')
    print(now(), 'All data saved...')


def common_output(wrapper : MLP, df_results, y_pred, y_col = 'delta'):
    name = wrapper.NAME
    pred_col = 'pred__' + name
    df_results[pred_col] = y_pred.round()
    df_results[pred_col] = df_results[pred_col].astype('int8')
    history[name] = wrapper.history

def optimize_df_results(df_results : pd.DataFrame):
    cols = df_results.columns[df_results.columns.str.startswith('pred__')].to_list()
    for c in cols:
        df_results[c] = df_results[c].round().astype('int8')
    df_results['bikes'] = df_results['bikes'].astype('int8')
    df_results['capacity'] = df_results['capacity'].astype('int8')
    df_results['delta'] = df_results['delta'].astype('int8')
    return df_results
    

In [5]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as MAE, root_mean_squared_error as RMSE



xgb1 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.6,
    max_depth=10,
    subsample=0.5,
    reg_lambda=1,
    scale_pos_weight=10
)
xgb2 = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.6, reg_lambda=0.1)
xgb3 = XGBRegressor()
models = [(LinearRegression(), 'LinReg'), (BayesianRidge(), 'Baye'), (xgb1, 'XGBRegressor_max'), (xgb2, 'XGBRegressor_med'), (xgb3, 'XGBRegressor_default')]

top_bottom_weights = [
    (7, 7),
    (10, 10),
    # (10, 15),
    # (20, 30),
    (50, 70) 
    ]
sampling = [None, ('u', 3), ('u', 1), ('o', 3), ('o', 6)]
lrs = [0.001, 0.003, 0.006]
optimizers = [
    {'class' : Adam,
     'tag' : 'adam',
     'args' : {}
    },
    {'class' : AdamW,
     'tag' : 'adamw',
     'args' : {}
    },
    {'class' : SGD,
     'tag' : 'sgd',
     'args' : {'nesterov' : True}
    },
]
activators = [
    'relu',
    'tanh'
]
epochs = [
    # 10,
    # 50,
    # 150,
    300
    ]

def generate_classical_settings():
    classical_settings = []
    for m in models:
        for s in sampling:
            for w in top_bottom_weights:
                current_settings = {}
                if not isinstance(m, (tuple, list)):
                    m = (m, m.__class__.__name__)
                tags = ['reg', m[1]]
                current_settings['__MODEL'] = m[0]
                if s is not None:
                    if s[0] == 'o':
                        current_settings['OVERSAMPLE'] = s[1]
                        tags.append(f'o{s[1]}')
                    elif s[0] == 'u':
                        current_settings['UNDERSAMPLE'] = s[1]
                        tags.append(f'u{s[1]}')
                current_settings['WEIGHTS'] = True
                current_settings['TOP_WEIGHT'] = w[0]
                current_settings['BOTTOM_WEIGHT'] = w[1]
                tags.append(f'w{w[0]}w{w[1]}')
                current_settings['__NAME'] = '_'.join(tags)
                classical_settings.append(current_settings)
    return classical_settings

def generate_mlp_settings():
    mlp_settings = []
    for e in epochs:
        for opt in optimizers:
            for a in activators:
                for rate in lrs:
                    for s in sampling:
                        for w in top_bottom_weights:
                            current_settings = {}
                            tags = ['mlp']
                            current_settings['__FIT'] = {'epochs' : e}
                            tags.append(f'e{e}')
                            current_settings['__optimizer_settings'] = (opt.get('args', {}) | {'learning_rate' : rate})
                            current_settings['optimizer'] = opt['class'](learning_rate=rate, **opt.get('args', {}))
                            tags.append(opt.get('tag', ''))
                            tags.append(f'lr{int(rate*1000)}')
                            if s is not None:
                                if s[0] == 'o':
                                    current_settings['OVERSAMPLE'] = s[1]
                                    tags.append(f'o{s[1]}')
                                elif s[0] == 'u':
                                    current_settings['UNDERSAMPLE'] = s[1]
                                    tags.append(f'u{s[1]}')
                            current_settings['WEIGHTS'] = True
                            current_settings['TOP_WEIGHT'] = w[0]
                            current_settings['BOTTOM_WEIGHT'] = w[1]
                            tags.append(f'w{w[0]}w{w[1]}')
                            current_settings['ACTIVATION'] = a
                            tags.append(a)
                            current_settings['__NAME'] = '_'.join(tags)
                            mlp_settings.append(current_settings)
    return  mlp_settings


### Action

In [6]:
if LOADED:
    load_all()
settings_basic = generate_classical_settings()
settings_mlp = generate_mlp_settings()
known_names = [v.get('__NAME') for v in classic_tasks + mlp_tasks]
new_classic_tasks_count = len(classic_tasks)
classic_tasks = classic_tasks + [v for v in settings_basic if v.get('__NAME') not in known_names]
new_classic_tasks_count = len(classic_tasks) - new_classic_tasks_count 
new_mlp_tasks_count = len(mlp_tasks)
mlp_tasks = mlp_tasks + [v for v in settings_mlp if v.get('__NAME') not in known_names]
new_mlp_tasks_count = len(mlp_tasks) - new_mlp_tasks_count
done_tasks = df_results.columns[df_results.columns.str.startswith('pred__')].str.replace('pred__', '').to_list()
if new_mlp_tasks_count or new_classic_tasks_count:
    save_all()

2025-03-23 20:21:02.107870 Loading all data...


I0000 00:00:1742757664.763475 1123066 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3586 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


2025-03-23 20:21:11.259278 All data loaded...


## Worker

In [None]:
# Primo: les classiques avec un dataset reconstruit
data_changed = False
prefix = 'full_'
try:
    for t in classic_tasks:
        model = t['__MODEL']
        name = prefix + t['__NAME']
        if name in done_tasks:
            print(now(), f'Task is done: {name}')
            continue
        print(now(), f'Fitting {name}')
        wrapper = BaseRegressor(model, name).update_settings(t).fit(df_train, df_train.delta)
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        common_output(wrapper, df_results, y_pred)
        done_tasks.append(name)
        data_changed = True
finally:
    if data_changed:
        save_all()
    
# Duo: maintenant avec un dataset sans reconstructions...
df_train = df_train[~df_train.reconstructed].copy()
data_changed = False
prefix = ''
try:
    for t in classic_tasks:
        model = t['__MODEL']
        name = prefix + t['__NAME']
        if name in done_tasks:
            print(now(), f'Task is done: {name}')
            continue
        print(now(), f'Fitting {name}')
        wrapper = BaseRegressor(model, name).update_settings(t).fit(df_train, df_train.delta)
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        common_output(wrapper, df_results, y_pred)
        done_tasks.append(name)
        data_changed = True
finally:
    if data_changed:
        save_all()


### MLP

In [7]:
data_runs = 0
for t in mlp_tasks:
    name = t['__NAME']
    if name in done_tasks:
        print(now(), f'Task is done: {name}')
        continue
    try:
        fit_settings = t['__FIT']
        print(now(), f'Fitting {name}')
        wrapper = MLP(name).update_settings(t).add_fit_params(fit_settings).add_fit_params({'batch_size' : 32768}).fit(df_train, df_train.delta).save()
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        common_output(wrapper, df_results, y_pred)
        done_tasks.append(name)
        data_runs += 1
    except Exception as e:
        data_runs = 100
        raise e
    finally:
        if data_runs >= 3:
            save_all()


2025-03-23 20:21:19.095308 Task is done: mlp_e300_adam_lr1_w7w7_relu
2025-03-23 20:21:19.095573 Task is done: mlp_e300_adam_lr1_w10w10_relu
2025-03-23 20:21:19.095610 Task is done: mlp_e300_adam_lr1_w50w70_relu
2025-03-23 20:21:19.095633 Task is done: mlp_e300_adam_lr1_u3_w7w7_relu
2025-03-23 20:21:19.095653 Task is done: mlp_e300_adam_lr1_u3_w10w10_relu
2025-03-23 20:21:19.095694 Task is done: mlp_e300_adam_lr1_u3_w50w70_relu
2025-03-23 20:21:19.095726 Task is done: mlp_e300_adam_lr1_u1_w7w7_relu
2025-03-23 20:21:19.095751 Task is done: mlp_e300_adam_lr1_u1_w10w10_relu
2025-03-23 20:21:19.095774 Task is done: mlp_e300_adam_lr1_u1_w50w70_relu
2025-03-23 20:21:19.095800 Task is done: mlp_e300_adam_lr1_o3_w7w7_relu
2025-03-23 20:21:19.095824 Task is done: mlp_e300_adam_lr1_o3_w10w10_relu
2025-03-23 20:21:19.095845 Task is done: mlp_e300_adam_lr1_o3_w50w70_relu
2025-03-23 20:21:19.095866 Task is done: mlp_e300_adam_lr1_o6_w7w7_relu
2025-03-23 20:21:19.095887 Task is done: mlp_e300_adam_lr

In [None]:
raise Exception('Break!').with_traceback(None)

## Cut down by metrics

In [1]:
top_settings = {'lr': {'1', '6'},
 'weight': {'w10w10', 'w7w7'},
 'opt': {'adam', 'adamw', 'sgd'},
 'resample': {'None', 'o3'}}

from sklearn.linear_model import LinearRegression, BayesianRidge
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as MAE, root_mean_squared_error as RMSE



xgb1 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.6,
    max_depth=10,
    subsample=0.5,
    reg_lambda=1,
    scale_pos_weight=10
)
xgb2 = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.6, reg_lambda=0.1)
xgb3 = XGBRegressor()
models = [(LinearRegression(), 'LinReg'), (BayesianRidge(), 'Baye'), (xgb1, 'XGBRegressor_max'), (xgb2, 'XGBRegressor_med'), (xgb3, 'XGBRegressor_default')]

top_bottom_weights = [(int(a) for a in w.split('w')[1:]) for w in top_settings['weight']]
sampling = [(s[0], s[1]) if s != 'None' else None for s in top_settings['resample']]
lrs = [int(lr) for lr in top_settings['lr'] if lr != 'None']
optimizers = [
    {'class' : Adam,
     'tag' : 'adam',
     'args' : {}
    },
    {'class' : AdamW,
     'tag' : 'adamw',
     'args' : {}
    },
    {'class' : SGD,
     'tag' : 'sgd',
     'args' : {'nesterov' : True}
    },
]
optimizers = [o for o in optimizers if o['tag'] in top_settings['opt']]
activators = [
    'relu',
    'tanh'
]
epochs = [
    # 10,
    # 50,
    # 150,
    300
    ]

def generate_classical_settings():
    classical_settings = []
    for m in models:
        for s in sampling:
            for w in top_bottom_weights:
                current_settings = {}
                if not isinstance(m, (tuple, list)):
                    m = (m, m.__class__.__name__)
                tags = ['reg', m[1]]
                current_settings['__MODEL'] = m[0]
                if s is not None:
                    if s[0] == 'o':
                        current_settings['OVERSAMPLE'] = s[1]
                        tags.append(f'o{s[1]}')
                    elif s[0] == 'u':
                        current_settings['UNDERSAMPLE'] = s[1]
                        tags.append(f'u{s[1]}')
                current_settings['WEIGHTS'] = True
                current_settings['TOP_WEIGHT'] = w[0]
                current_settings['BOTTOM_WEIGHT'] = w[1]
                tags.append(f'w{w[0]}w{w[1]}')
                current_settings['__NAME'] = '_'.join(tags)
                classical_settings.append(current_settings)
    return classical_settings

def generate_mlp_settings():
    mlp_settings = []
    for e in epochs:
        for opt in optimizers:
            for a in activators:
                for rate in lrs:
                    for s in sampling:
                        for w in top_bottom_weights:
                            current_settings = {}
                            tags = ['mlp']
                            current_settings['__FIT'] = {'epochs' : e}
                            tags.append(f'e{e}')
                            current_settings['__optimizer_settings'] = (opt.get('args', {}) | {'learning_rate' : rate})
                            current_settings['optimizer'] = opt['class'](learning_rate=rate, **opt.get('args', {}))
                            tags.append(opt.get('tag', ''))
                            tags.append(f'lr{int(rate*1000)}')
                            if s is not None:
                                if s[0] == 'o':
                                    current_settings['OVERSAMPLE'] = s[1]
                                    tags.append(f'o{s[1]}')
                                elif s[0] == 'u':
                                    current_settings['UNDERSAMPLE'] = s[1]
                                    tags.append(f'u{s[1]}')
                            current_settings['WEIGHTS'] = True
                            current_settings['TOP_WEIGHT'] = w[0]
                            current_settings['BOTTOM_WEIGHT'] = w[1]
                            tags.append(f'w{w[0]}w{w[1]}')
                            current_settings['ACTIVATION'] = a
                            tags.append(a)
                            current_settings['__NAME'] = '_'.join(tags)
                            mlp_settings.append(current_settings)
    return  mlp_settings

NameError: name 'Adam' is not defined

## Lagged

In [None]:
# Duo: maintenant avec un dataset sans reconstructions...
df_train = df_train[~df_train.reconstructed].copy()
data_changed = False
prefix = 'lag'
try:
    for t in classic_tasks:
        model = t['__MODEL']
        name = prefix + t['__NAME']
        if name in done_tasks:
            print(now(), f'Task is done: {name}')
            continue
        print(now(), f'Fitting {name}')
        wrapper = BaseRegressor(model, name).update_features(FEATURES).update_settings(t).fit(df_train, df_train.delta)
        print(now(), f'Prediction {name}')
        y_pred = wrapper.predict(df_test)
        common_output(wrapper, df_results, y_pred)
        done_tasks.append(name)
        data_changed = True
finally:
    if data_changed:
        save_all()