In [1]:
from typing import Union
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


class Transform:
    default_params = {
        'encode_time' : [
            ('hour', 24),
            ('weekday', 7),
            ], # liste de tuples/listes ('colname', periods)
        'encode_lat' : ['lat'], # liste de cols latitude à encoder
        'encode_lon' : ['lon'], # liste de cols longiture à encoder
        'hotencode' : ['cluster'],
        'lags' : 0, # int: nombre de cols lag à ajouter
        'lags_cols' : [], # les colonnes lags
        'scale' : ['capacity', 'temp', 'precip', 'gel', 'vent', 'lat', 'lon'], # cols à scaler
        'nonscale' : ['weekend', 'holiday', 'preholiday'], # cols sans scaling et encoding
        'target' : 'delta',
    }
    
    
    def __init__(self, params):
        self.params = self.default_params | params
        # sauvegardons la liste de features actuelles pour info
        self.features = [et[0] for et in self.params['encode_time']] + \
            self.params['scale'] + self.params['nonscale'] + \
                self.params['encode_lat'] + self.params['encode_lon'] + \
                    self.params['hotencode'] + self.params['lags_cols'] + [self.params['target']]
    
    
    def transform(self, df : pd.DataFrame):
        df = df.copy() # on ne veut pas toucher le dataset d'origine
        df = self.encode(df)
        if self.params['lags_cols']:
            self.params['scale'] += [c for c in self.params['lags_cols'] if c not in self.params['scale']]
        # df = self.add_lags(df)
        df = self.scale(df)
        return df[self.features]


    def encode(self, df : pd.DataFrame):
        if self.params['encode_time']:
            cols, periods = zip(*self.params['encode_time'])
            df = Transform.encode_time(df, list(cols), list(periods))
            self.replace_features(cols, 'sin_')
            self.replace_features(cols, 'cos_')
        if self.params['encode_lat']:
            df = Transform.encode_lat(df, self.params['encode_lat'])
        if self.params['encode_lon']:
            df = Transform.encode_lon(df, self.params['encode_lon'])
        if self.params['hotencode']:
            df = self.hotencode(df, self.params['hotencode'])
        return df
    

    def scale(self, df: pd.DataFrame):
        if not self.params['scale']:
            # rien à faire!
            return df
        # Il nous faudra assurer le type float pour les colonnes en scale
        safe_types = dict(zip(self.params['scale'], [float] * len(self.params['scale'])))
        df = df.astype(safe_types)
        # Si on a déjà transformé avec ce transformer, donc utilisons le scaler existant.
        if hasattr(self, 'scaler') and self.scaler is not None:
            df.loc[:, self.params['scale']] = self.scaler.transform(df[self.params['scale']])
        else:
            self.scaler = StandardScaler()
            df.loc[:, self.params['scale']] = self.scaler.fit_transform(df[self.params['scale']])
        return df


    def drop_features(self, cols: list[str]):
        self.features = [f for f in self.features if f not in cols]

    def add_features(self, cols: list[str]):
        self.features += [c for c in cols if c not in self.features]

    def replace_features(self, cols : list[str], prefix : str):
        # remove old features from list
        self.drop_features(cols)
        # add new features
        self.add_features([prefix + f for f in cols])


    def add_lags(self, df: pd.DataFrame):
        if not self.params['lags']:
            return df
        raise NotImplementedError()            

    def hotencode(self, df: pd.DataFrame, cols: Union[str, list[str]]):
        if isinstance(cols, str):
            cols = [cols]
        orig_features = df.columns.to_list()
        df = pd.get_dummies(df, prefix=cols, columns=cols, dtype=int)
        if hasattr(self, 'new_features'):
            # remplissons les cols inexistantes avec les 0
            for col in self.new_features:
                if col not in df.columns:
                    df[col] = 0
        else:                
            # Mise à jour de la liste de features
            self.new_features = [f for f in df.columns.to_list() if f not in orig_features]
        self.drop_features(self.params['hotencode'])
        self.add_features(self.new_features)
        return df


    @staticmethod
    def encode_time(df: pd.DataFrame, cols: Union[str, list[str]], periods = Union[int, list[int], None]):
        '''
        Encoder les heures, jours, mois etc en représentation sin(x) - cos(x) pour la continuité de valeurs
        
        df: Pandas dataframe
        
        cols: Nom d'une ou des colonnes (str ou list de str)
        
        periods: Combien de périodes dans une saisonalité (eg 24 pour heures). Optionnel. Si pas précisé, sera calculé pas nombre de valeurs uniques.

        Resultat: colonnes sin_COLNAME, cos_COLNAME ajouté en df d'origine qui est retourné.
        '''
        if isinstance(cols, str):
            cols = [cols]
        if isinstance(periods, int) or periods is None:
            periods = [periods]
        for i, c in enumerate(cols):
            # Si on a la périodicité pour cette colonne, ok, sinon c'est None
            if i < len(periods):
                p = periods[i]
            else:
                p = None
            # Si la périodicité et None, calculons à la base de nombre de valeurs uniques
            if p is None:
                p = df[c].nunique()
            df['sin_' + c] = np.sin(df[c] / p * 2 * np.pi)
            df['cos_' + c] = np.cos(df[c] / p * 2 * np.pi)
        return df


    @staticmethod
    def encode_lat(df: pd.DataFrame, cols: Union[str, list[str]]):
        if isinstance(cols, str):
            cols = [cols]
        for c in cols:
            min_val = df[c].min()
            df[c] = (df[c] - min_val) * 111000
        return df


    @staticmethod
    def encode_lon(df: pd.DataFrame, cols: Union[str, list[str]]):
        if isinstance(cols, str):
            cols = [cols]
        for c in cols:
            min_val = df[c].min()
            df[c] = (df[c] - min_val) * 72987
        return df



In [2]:
import datetime

df_file = r"local_data/dataset.csv"
df = pd.read_csv(df_file, parse_dates=['datehour'])
# Date marge de train et test
SPLIT_DATE = datetime.date(2025, 2, 15)
# Date minimale pour train
LOW_LIMIT = datetime.date(2024, 12, 1)
# ID de la station Chatelet
chatelet = '82328045'
# Ajout de lags:
lags = 2
lags_cols = []
for lag in range(1, lags + 1):
    col = f'lag_{lag}'
    df[col] = df.groupby('station')['delta'].shift(lag)
    lags_cols.append(col)
df = df.dropna()
# Train test split par date
df_train = df[(df.datehour.dt.date < SPLIT_DATE) & (df.datehour.dt.date >= LOW_LIMIT)].copy()
df_test = df[(df.datehour.dt.date >= SPLIT_DATE)].copy()


In [10]:
from xgboost import XGBRegressor


# Marquer les heures de pointes
Q1 = df_train['delta'].quantile(0.25)
Q2 = df_train['delta'].quantile(0.75)
IQR = Q2 - Q1
TOP_MARGE = Q2 + 1.5 * IQR
BOTTOM_MARGE = Q1 - 1.5 * IQR
df_train['rush'] = ((df_train['delta'] > TOP_MARGE) | (df_train['delta'] < BOTTOM_MARGE)).astype(int)
train_0 = df_train[df_train.rush == 0]
train_1 = df_train[df_train.rush == 1]
df_train = pd.concat([train_0] + [train_1] * 6).sample(frac=1)

weights = (df_train['rush'] * 20).where(df_train['delta'] > TOP_MARGE, np.nan)
weights = weights.fillna((df_train['rush'] * 20).where(df_train['delta'] < BOTTOM_MARGE, 1))

transformer = Transform(params={'lags_cols':lags_cols})
df_train_transformed = transformer.transform(df_train)

# xgb1 = XGBRegressor(
#     objective='reg:squarederror',
#     n_estimators=100,
#     learning_rate=0.6,
#     max_depth=10,
#     subsample=0.5,
#     reg_lambda=1,
#     scale_pos_weight=10,
# )

xgb1 = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.6, reg_lambda=0.1)
# xgb3 = XGBRegressor()

xgb1.fit(df_train_transformed.drop(columns=['delta']).to_numpy(), df_train_transformed['delta'], sample_weight = weights)
# xgb2.fit(df_train_transformed.drop(columns=['delta']).to_numpy(), df_train_transformed['delta'])
# xgb3.fit(df_train_transformed.drop(columns=['delta']).to_numpy(), df_train_transformed['delta'])

In [11]:
df_test_transformed = transformer.transform(df_test)

In [12]:
df_test['rush'] = ((df_test['delta'] > TOP_MARGE) | (df_test['delta'] < BOTTOM_MARGE)).astype(int)


In [13]:
xgb1.score(df_test_transformed[df_test.rush == 1].drop(columns=['delta']).to_numpy(), df_test_transformed[df_test.rush == 1]['delta'])

0.653072714805603

In [65]:
deltas_hist = df_train['delta'].to_list()

for i in range(1):
    prediction_df = df_test.iloc[i:i+1]
    prediction_df.loc[:,'lag_1'] = [deltas_hist[-1]]
    prediction_df.loc[:, 'lag_2'] = [deltas_hist[-2]]
    prediction_df = transformer.transform(prediction_df) 

display(prediction_df)

Unnamed: 0,capacity,temp,precip,gel,vent,lat,lon,lag_1,lag_2,weekend,...,cluster_41,cluster_42,cluster_43,cluster_44,cluster_45,cluster_46,cluster_47,cluster_48,cluster_49,cluster_50
2322432,-0.160341,-1.11443,-0.201617,-0.344707,-0.56685,-3.529719,-3.422296,-1.279653,2.661415,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
features = df_test.columns.to_list()
lag_1_index = features.index('lag_1')
lag_2_index = features.index('lag_2')

deltas_hist = df_train[df_train.station.astype(str) == chatelet]['delta'].to_list()
y_pred = []
y_test = []
for i in range(24):
    prediction_df = df_test[df_test.station.astype(str) == chatelet].iloc[i:i+1]
    prediction_df.loc[:,'lag_1'] = [deltas_hist[-1]]
    prediction_df.loc[:, 'lag_2'] = [deltas_hist[-2]]
    prediction_df = transformer.transform(prediction_df) 
    prediction_X = prediction_df.drop(columns=['delta']).to_numpy()
    pred = int(xgb1.predict(prediction_X)[0])
    y_test.append(prediction_df.iloc[0]['delta'])
    y_pred.append(pred)
    deltas_hist.append(pred)


In [67]:
deltas_hist = df_train[df_train.station.astype(str) == chatelet]['delta'].to_list()

for i in range(24):
    prediction_df = df_test[df_test.station.astype(str) == chatelet].iloc[i:i+1]
    prediction_df.loc[:,'lag_1'] = [deltas_hist[-1]]
    prediction_df.loc[:, 'lag_2'] = [deltas_hist[-2]]
    prediction_df = transformer.transform(prediction_df) 
    prediction_X = prediction_df.drop(columns=['delta']).to_numpy()
    pred = int(xgb2.predict(prediction_X)[0])
    y_test.append(prediction_df.iloc[0]['delta'])
    y_pred.append(pred)
    deltas_hist.append(pred)

deltas_hist = df_train[df_train.station.astype(str) == chatelet]['delta'].to_list()

for i in range(24):
    prediction_df = df_test[df_test.station.astype(str) == chatelet].iloc[i:i+1]
    prediction_df.loc[:,'lag_1'] = [deltas_hist[-1]]
    prediction_df.loc[:, 'lag_2'] = [deltas_hist[-2]]
    prediction_df = transformer.transform(prediction_df) 
    prediction_X = prediction_df.drop(columns=['delta']).to_numpy()
    pred = int(xgb3.predict(prediction_X)[0])
    y_test.append(prediction_df.iloc[0]['delta'])
    y_pred.append(pred)
    deltas_hist.append(pred)


In [15]:
from sklearn.metrics import mean_absolute_error as MAE
import plotly.graph_objects as go
dh = df_test[df_test.station.astype(str) == chatelet]['datehour'].iloc[:24].to_list()
# MAE(y_test, y_pred)
data = pd.DataFrame({'dh' : dh, 'test' : y_test, 'pred' : y_pred})
fig = go.Figure()
fig.add_scatter(x=data['dh'], y=data['test'], name='test')
fig.add_scatter(x=data['dh'], y=data['pred'], name='pred')
fig.show()

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

model = Sequential([
    Input((len(transformer.features)-1,)),
    Dense(256, activation='relu'),
    Dropout(0.1),
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(16, activation='relu'),
    Dropout(0.1),
    Dense(8),
    Dense(1)
])
weighted_metrics = ['mae']
model.compile(optimizer='adam', loss='mse', weighted_metrics=weighted_metrics, metrics=['mae'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.01)
history = model.fit(df_train_transformed.drop(columns=['delta']), df_train_transformed['delta'], callbacks=[early_stopping, reduce_lr],
                    epochs = 300, sample_weight = weights, batch_size=32000, validation_split=0.1)


Epoch 1/300
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 107ms/step - loss: 763.0591 - mae: 8.0312 - weighted_mae: 14.2071 - val_loss: 263.1448 - val_mae: 7.3523 - val_weighted_mae: 7.3618 - learning_rate: 0.0010
Epoch 2/300
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 107ms/step - loss: 267.8569 - mae: 7.6786 - weighted_mae: 7.6635 - val_loss: 144.4460 - val_mae: 6.7773 - val_weighted_mae: 5.2657 - learning_rate: 0.0010
Epoch 3/300
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 108ms/step - loss: 178.3886 - mae: 7.0804 - weighted_mae: 6.2329 - val_loss: 101.5441 - val_mae: 6.0639 - val_weighted_mae: 4.4021 - learning_rate: 0.0010
Epoch 4/300
[1m 59/145[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m9s[0m 106ms/step - loss: 144.7892 - mae: 6.5435 - weighted_mae: 5.5949

KeyboardInterrupt: 

In [1]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

2025-04-07 18:56:21.299255: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-07 18:56:21.309750: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-07 18:56:21.322753: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-07 18:56:21.326418: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-07 18:56:21.336363: I tensorflow/core/platform/cpu_feature_guar

[]

In [24]:
df_test_transformed.drop(columns=['delta']).to_numpy()[:1]

array([[ 2.90183779e-01, -1.10291556e+00, -2.27647927e-01,
        -3.50228204e-01, -5.58690456e-01,  1.93078975e-01,
        -1.12358838e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.93078975e-01, -1.12358838e+00,
         2.40977188e-04, -6.92110199e-01,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0