Hyperparamètres clés en Deep Learning

In [None]:
# Examples of critical hyperparameters, each mapped to the candidate values
# one would typically explore during tuning.

hyperparameters = dict(
    learning_rate=[0.001, 0.01, 0.1],
    batch_size=[16, 32, 64, 128],
    epochs=[50, 100, 200],
    optimizer=["adam", "sgd", "rmsprop"],
    hidden_layers=[1, 2, 3, 4],
    neurons_per_layer=[64, 128, 256, 512],
)

Grid Search - Recherche exhaustive - Test de TOUTES les combinaisons possibles

In [None]:
import time

from keras import Sequential
from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV


def create_model(learning_rate=0.001, hidden_units=128, dropout_rate=0.0):
    """Build and compile a single-hidden-layer classifier.

    The network takes 784 input features, has one ReLU hidden layer and ends
    in a 10-way softmax output. It is compiled with the Adam optimizer, the
    categorical cross-entropy loss and accuracy as the reported metric.

    Args:
        learning_rate: Step size handed to the Adam optimizer.
        hidden_units: Width of the hidden Dense layer.
        dropout_rate: Fraction of hidden activations dropped during training.
            The default of 0.0 inserts no Dropout layer, so existing callers
            get the same two-layer architecture as before.

    Returns:
        The compiled Sequential model.
    """
    layers = [Dense(hidden_units, activation="relu", input_shape=(784,))]
    if dropout_rate > 0:
        # Only regularise when asked to; a zero rate would be a no-op layer.
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(10, activation="softmax"))

    model = Sequential(layers)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# Grid Search configuration.
# scikeras forwards a searched value to create_model only when its key carries
# the "model__" routing prefix (or the argument is declared on the wrapper);
# bare "learning_rate" / "hidden_units" keys are rejected by set_params.
# batch_size and epochs are native KerasClassifier parameters and need no
# prefix. `model=` replaces the deprecated `build_fn=` spelling.
model = KerasClassifier(model=create_model, verbose=0)
param_grid = {
    "model__learning_rate": [0.001, 0.01, 0.1],
    "model__hidden_units": [64, 128, 256],
    "batch_size": [32, 64],
    "epochs": [50, 100],
}
# 3 x 3 x 2 x 2 = 36 combinations, each cross-validated on 3 folds.
# NOTE(review): n_jobs=-1 trains the folds in parallel worker processes; on a
# single GPU or a memory-limited machine this may need to be lowered - confirm
# for the target environment.
grid = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=3, scoring="accuracy", n_jobs=-1
)
grid_result = grid.fit(X_train, y_train)

Random Search - Recherche aléatoire - Souvent 90% des performances du Grid Search en 10% du temps

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Hyperparameter distributions.
# Keys prefixed with "model__" are routed by scikeras to the model-building
# function's argument of the same name; un-prefixed keys such as batch_size
# and epochs are native parameters of the KerasClassifier wrapper. Bare
# "learning_rate" / "hidden_units" / "dropout_rate" keys would be rejected
# by set_params.
param_distributions = {
    # scipy's uniform(loc, scale) spans [loc, loc + scale]: 0.0001 to 0.1001
    "model__learning_rate": uniform(0.0001, 0.1),
    # Integer between 50 and 499 (upper bound is exclusive)
    "model__hidden_units": randint(50, 500),
    # Discrete choice
    "batch_size": [16, 32, 64, 128],
    # Between 0.1 and 0.5; the model-building function must accept a
    # `dropout_rate` argument for this entry to be usable.
    "model__dropout_rate": uniform(0.1, 0.4),
    # Integer between 50 and 199 (upper bound is exclusive)
    "epochs": randint(50, 200),
}

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=50,  # 50 randomly sampled combinations
    cv=3,
    scoring="accuracy",
    random_state=42,  # fixes the sampled combinations across runs
)
random_result = random_search.fit(X_train, y_train)
print(f"Meilleur score: {random_result.best_score_}")
print(f"Meilleurs paramètres: {random_result.best_params_}")

Bayesian Optimization - Recherche intelligente - Converge plus rapidement vers l'optimum global

In [None]:
from keras import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import optuna


def objective(trial):
    """Optuna objective: train one sampled network and return its score.

    Four hyperparameters are drawn from the trial (learning rate on a log
    scale, number of hidden layers, units per hidden layer, dropout rate).
    A fully connected network for 784 input features and 10 softmax outputs
    is built from them and trained on the notebook-level X_train / y_train
    with a 20% validation split and early stopping.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The highest validation accuracy recorded during training.
    """
    # Search space
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    depth = trial.suggest_int("n_layers", 1, 5)
    width = trial.suggest_int("n_units", 32, 512, step=32)
    drop = trial.suggest_float("dropout_rate", 0.1, 0.5)

    # Layer stack: the first hidden layer, then a (Dropout, Dense) pair for
    # every additional hidden layer, then the softmax output.
    stack = [Dense(width, activation="relu", input_shape=(784,))]
    for _ in range(1, depth):
        stack.append(Dropout(drop))
        stack.append(Dense(width, activation="relu"))
    stack.append(Dense(10, activation="softmax"))

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(
        optimizer=Adam(learning_rate=lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # Training, stopped early once the monitored quantity stalls for 10 epochs
    stopper = EarlyStopping(patience=10)
    run = network.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=100,
        batch_size=32,
        verbose=0,
        callbacks=[stopper],
    )

    # Metric handed back to the study
    val_curve = run.history["val_accuracy"]
    return max(val_curve)


# Launch the optimisation.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable, in line with the fixed random_state used for the random search.
# Model training itself is not seeded here, so scores can still vary between
# runs.
study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=100)

print(f"Meilleure valeur: {study.best_value}")
print(f"Meilleurs paramètres: {study.best_params}")

Learning Rate Scheduling - Décroissance exponentielle

In [None]:
# Diminution exponentielle du learning rate
from keras.optimizers.schedules import ExponentialDecay


# Learning rate in effect before any decay has been applied.
initial_learning_rate = 0.1
lr_schedule = ExponentialDecay(
    initial_learning_rate, decay_steps=1000, decay_rate=0.96, staircase=True
)

# Formula applied by the schedule:
#   lr = initial_lr * decay_rate^(step / decay_steps)
# Because staircase=True, step / decay_steps is an integer division, so the
# rate is multiplied by 0.96 once every 1000 steps rather than continuously.
optimizer = Adam(learning_rate=lr_schedule)

Learning Rate Scheduling - Cyclic Learning Rates

In [None]:
# Oscillation entre lr_min et lr_max
from keras.callbacks import LearningRateScheduler
import numpy as np


def cyclic_lr_schedule(epoch, lr):
    """Return the triangular cyclic learning rate for a given epoch.

    The rate climbs linearly from 0.001 to 0.01 over 20 epochs, falls back
    to 0.001 over the next 20, and then repeats.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate. It is ignored; the returned value depends
            on the epoch only.

    Returns:
        The learning rate to use for this epoch.
    """
    half_period = 20
    floor_lr, peak_lr = 0.001, 0.01

    # 1-based index of the full (rise + fall) cycle that contains this epoch.
    cycle_index = np.floor(1 + epoch / (2 * half_period))
    # Normalised distance from the peak of that cycle: 0 at the peak, 1 at
    # either end.
    offset = np.abs(epoch / half_period - 2 * cycle_index + 1)
    amplitude = peak_lr - floor_lr
    return floor_lr + amplitude * np.maximum(0, 1 - offset)


# Wrap the schedule function in a Keras callback and train with it for 100
# epochs.
# NOTE(review): `model` is not defined in this cell; it is whatever object
# that name was last bound to earlier in the notebook - confirm it is the
# intended model before running.
lr_scheduler = LearningRateScheduler(cyclic_lr_schedule)
model.fit(X_train, y_train, callbacks=[lr_scheduler], epochs=100)

Architecture Tuning - Détermination du nombre optimal de couches

In [None]:
def test_layer_depth():
    results = {}
    for n_layers in range(1, 8):  # Test de 1 à 7 couches
        model = Sequential()
        model.add(Dense(128, activation="relu", input_shape=(784,)))

        # Ajout des couches cachées
        for i in range(n_layers - 1):
            model.add(Dense(128, activation="relu"))

        model.add(Dense(10, activation="softmax"))

        model.compile(
            optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
        )

        history = model.fit(
            X_train, y_train, validation_split=0.2, epochs=50, verbose=0
        )

        results[n_layers] = {
            "train_acc": max(history.history["accuracy"]),
            "val_acc": max(history.history["val_accuracy"]),
            "params": model.count_params(),
        }
    return results

Architecture Tuning - Optimisation des tailles de filtres CNN

In [None]:
def optimize_cnn_architecture():
    """Build several candidate CNNs for 28x28x1 inputs and print their sizes.

    Each candidate is a stack of (Conv2D, MaxPooling2D) blocks described by a
    list of filter counts and a matching list of kernel sizes, followed by
    Flatten, a 128-unit ReLU Dense layer and a 10-way softmax output. The
    models are only built, not trained; the parameter count of each one is
    printed.

    The convolutions use padding="same" so that only the pooling layers
    shrink the feature map (28 -> 14 -> 7 -> 3 -> 1). With the default
    "valid" padding the fourth candidate (kernels 7, 5, 3, 3) is already down
    to a 1x1 map after its third convolution, which leaves no valid output
    for the pooling and convolution that follow.
    """
    architectures = [
        {"filters": [32, 64], "kernel_sizes": [3, 3]},
        {"filters": [32, 64, 128], "kernel_sizes": [3, 3, 3]},
        {"filters": [64, 128, 256], "kernel_sizes": [5, 3, 3]},
        {"filters": [16, 32, 64, 128], "kernel_sizes": [7, 5, 3, 3]},
    ]
    for i, arch in enumerate(architectures):
        model = Sequential()

        blocks = zip(arch["filters"], arch["kernel_sizes"])
        for j, (n_filters, kernel_size) in enumerate(blocks):
            conv_options = {
                "kernel_size": kernel_size,
                "activation": "relu",
                "padding": "same",
            }
            if j == 0:
                # Only the first layer declares the input shape.
                conv_options["input_shape"] = (28, 28, 1)
            model.add(Conv2D(n_filters, **conv_options))
            model.add(MaxPooling2D(2, 2))

        model.add(Flatten())
        model.add(Dense(128, activation="relu"))
        model.add(Dense(10, activation="softmax"))

        # Report the size of this candidate
        print(f"Architecture {i+1}: {model.count_params()} paramètres")

Batch Size et Memory Management - Impact sur la convergence

In [None]:
# Compare several batch sizes on validation accuracy, wall-clock training
# time, memory usage and number of optimisation steps per epoch.
batch_sizes = [8, 16, 32, 64, 128, 256]
convergence_results = {}

# Settings that do not change between runs are set once, outside the loop.
base_lr = 0.001
base_batch_size = 32
validation_split = 0.2
# The validation fraction is held out before batching, so only the remaining
# samples contribute to the steps of an epoch.
n_train_samples = int(len(X_train) * (1 - validation_split))

for batch_size in batch_sizes:
    print(f"Test avec batch_size = {batch_size}")
    # NOTE(review): create_base_model is not defined in this section; it is
    # assumed to return a fresh, uncompiled model - confirm.
    model = create_base_model()
    # Learning rate adjusted with the square-root rule of thumb:
    # lr = base_lr * sqrt(batch_size / base_batch_size)
    adjusted_lr = base_lr * np.sqrt(batch_size / base_batch_size)
    model.compile(
        optimizer=Adam(learning_rate=adjusted_lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start_time = time.perf_counter()

    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=50,
        validation_split=validation_split,
        verbose=0,
    )

    training_time = time.perf_counter() - start_time

    convergence_results[batch_size] = {
        "final_accuracy": history.history["val_accuracy"][-1],
        "training_time": training_time,
        # Custom helper, not defined in this section.
        "memory_usage": get_memory_usage(),
        # The last, possibly partial, batch still counts as a step.
        "steps_per_epoch": int(np.ceil(n_train_samples / batch_size)),
    }