In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.multioutput import RegressorChain

In [2]:
from preprocessing import load_data, creer_feature_ligne, creer_features_date

# Chargement des données

In [3]:
data = load_data()
creer_features_date(data)
creer_feature_ligne(data)

# AdaBoost pour prédire seulement le retard

In [11]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    
    # Éviter de diviser par zéro
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

In [5]:
X = data[['service', 'gare_depart', 'gare_arrivee', 'duree_moyenne', 'nb_train_prevu', 'ligne', 'mois']]
y = data['retard_moyen_arrivee']

X = pd.get_dummies(X, columns=['gare_depart', 'gare_arrivee', 'service', 'ligne'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialisation de l'estimateur AdaBoost
ada = AdaBoostRegressor()

# Grille d'hyperparamètres pour AdaBoost
param_grid_ada = {
    'n_estimators': [10, 50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0],
    'loss': ['linear', 'square', 'exponential']
}

# Utilisation de RandomizedSearchCV pour optimiser les hyperparamètres
ada_search = RandomizedSearchCV(ada, param_distributions=param_grid_ada, n_iter=100, cv=5, verbose=0, n_jobs=-1, random_state=42)
ada_search.fit(X_train, y_train)

# Meilleurs hyperparamètres pour AdaBoost
print(ada_search.best_params_)

# Prédiction avec le modèle AdaBoost optimisé
y_pred_ada = ada_search.best_estimator_.predict(X_test)

# Métriques d'évaluation pour AdaBoost
mae_ada = mean_absolute_error(y_test, y_pred_ada)
mse_ada = mean_squared_error(y_test, y_pred_ada)
mape_ada = mean_absolute_percentage_error(y_test, y_pred_ada)
r2_ada = r2_score(y_test, y_pred_ada)

print(f"Mean Squared Error with AdaBoost: {mse_ada}")
print(f"Mean Absolute Error with AdaBoost: {mae_ada}")
print(f"Mean Absolute Percentage Error with AdaBoost: {mape_ada}")
print(f"R2 Score with AdaBoost: {r2_ada}")



ValueError: 
All the 300 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
260 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 171, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 1168, in _boost
    estimator.fit(X_, y_)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 1320, in fit
    super()._fit(
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 242, in _fit
    X, y = self._validate_data(
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 617, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'National'

--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 171, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 1168, in _boost
    estimator.fit(X_, y_)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 1320, in fit
    super()._fit(
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 242, in _fit
    X, y = self._validate_data(
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 617, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/yanivbenchetrit/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'International'


# Random Forest avec hyperparamètres optimisés par Randomized Search

In [4]:
cibles = ['retard_moyen_arrivee','prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic',
       'prct_cause_materiel_roulant', 'prct_cause_gestion_gare',
       'prct_cause_prise_en_charge_voyageurs']

X = data[['service', 'gare_depart', 'gare_arrivee', 'duree_moyenne', 'nb_train_prevu', 'ligne', 'mois']]
y = data[cibles]

# One-Hot Encoding pour les colonnes catégorielles
X = pd.get_dummies(X, columns=['gare_depart', 'gare_arrivee', 'service', 'ligne'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
param_grid_randomized = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 5, 10, 20, 50, 75],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True]
}


rf = RandomForestRegressor()


# Recherche aléatoire avec validation croisée
rf_search = RandomizedSearchCV(rf, param_distributions=param_grid_randomized, n_iter=100, cv=5, verbose=0, n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

# Meilleurs hyperparamètres
print("Meilleurs paramètres : \n",rf_search.best_params_)

#y_pred_grid = grid_search.best_estimator_.predict(X_test)
y_pred_randomized = rf_search.best_estimator_.predict(X_test)

# Séparation des prédictions de retard_moyen_arrivee
y_pred_retard = y_pred_randomized[:, 0]

# Prédictions pour les autres colonnes
y_pred_cause = y_pred_randomized[:, 1:]

# Normalisation de ces prédictions pour qu'elles somment à 100
sums = y_pred_cause.sum(axis=1)[:, np.newaxis]
y_pred_normalized = 100 * y_pred_cause / sums

# Ajout des prédictions de retard_moyen_arrivee aux prédictions normalisées
y_pred_final = np.hstack([y_pred_retard[:, np.newaxis], y_pred_normalized])

for i, cible in enumerate(cibles):

    mask1 = ~np.isnan(y_test[cible])
    mask2 = ~np.isnan(y_pred_final[:, i])

    mask = mask1 & mask2
    # Appliquez le masque aux prédictions et aux vraies valeurs
    y_test_non_nan = y_test[cible][mask]
    y_pred_non_nan = y_pred_final[:, i][mask]
    
    mae = mean_absolute_error(y_test_non_nan, y_pred_non_nan)
    mse_randomized = mean_squared_error(y_test_non_nan, y_pred_non_nan)
    
    r2 = r2_score(y_test_non_nan, y_pred_non_nan)
    
    print(f"--- {cible} ---")
    print(f"RMSE with RandomizedSearch: {mse_randomized**0.5}")
    print(f"Mean Absolute Error with RandomizedSearch: {mae}")
    print(f"R2 Score with RandomizedSearch: {r2}")

    try:
        mape = mean_absolute_percentage_error(y_test_non_nan, y_pred_non_nan)
        print(f"Mean Absolute Percentage Error with RandomizedSearch: {mape}")
    except:
        pass

  warn(


Meilleurs paramètres : 
 {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': True}
--- retard_moyen_arrivee ---
Mean Squared Error with RandomizedSearch: 190.137032287581
Mean Absolute Error with RandomizedSearch: 8.528909205936626
R2 Score with RandomizedSearch: 0.2971565156153778
Mean Absolute Percentage Error with RandomizedSearch: 25.527466405610806
--- prct_cause_externe ---
Mean Squared Error with RandomizedSearch: 266.3162264626453
Mean Absolute Error with RandomizedSearch: 12.055815350692363
R2 Score with RandomizedSearch: 0.0661996488484623
Mean Absolute Percentage Error with RandomizedSearch: 58.897889571496144
--- prct_cause_infra ---
Mean Squared Error with RandomizedSearch: 215.21500414969813
Mean Absolute Error with RandomizedSearch: 10.859101292788978
R2 Score with RandomizedSearch: 0.15498599642350386
Mean Absolute Percentage Error with RandomizedSearch: 54.149124143019634
--- prct_cause_gestion_tra

  y_pred_normalized = 100 * y_pred_cause / sums


In [9]:
best_params =  {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': True}

# Regressor chain

In [16]:
best_rf = rf_search.best_estimator_ 

# Utilisation du modèle optimisé dans RegressorChain
chain_rf = RegressorChain(best_rf)
chain_rf.fit(X_train, y_train)
y_pred_chain = chain_rf.predict(X_test)

# Séparation des prédictions de retard_moyen_arrivee
y_pred_retard = y_pred_chain[:, 0]

# Prédictions pour les autres colonnes
y_pred_cause = y_pred_chain[:, 1:]

# Normalisation de ces prédictions pour qu'elles somment à 100
sums = y_pred_cause.sum(axis=1)[:, np.newaxis]
y_pred_normalized = 100 * y_pred_cause / sums

# Ajout des prédictions de retard_moyen_arrivee aux prédictions normalisées
y_pred_final = np.hstack([y_pred_retard[:, np.newaxis], y_pred_normalized])

for i, cible in enumerate(cibles):

    mask1 = ~np.isnan(y_test[cible])
    mask2 = ~np.isnan(y_pred_final[:, i])

    mask = mask1 & mask2
    # Appliquez le masque aux prédictions et aux vraies valeurs
    y_test_non_nan = y_test[cible][mask]
    y_pred_non_nan = y_pred_final[:, i][mask]
    
    mae = mean_absolute_error(y_test_non_nan, y_pred_non_nan)
    mse_randomized = mean_squared_error(y_test_non_nan, y_pred_non_nan)
    
    r2 = r2_score(y_test_non_nan, y_pred_non_nan)
    
    print(f"--- {cible} ---")
    print(f"RMSE with RandomizedSearch: {mse_randomized**0.5}")
    print(f"Mean Absolute Error with RandomizedSearch: {mae}")
    print(f"R2 Score with RandomizedSearch: {r2}")

    try:
        mape = mean_absolute_percentage_error(y_test_non_nan, y_pred_non_nan)
        print(f"Mean Absolute Percentage Error with RandomizedSearch: {mape}")
    except:
        pass


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


--- retard_moyen_arrivee ---
Mean Squared Error with RandomizedSearch: 191.96895812609822
Mean Absolute Error with RandomizedSearch: 8.512087665825414
R2 Score with RandomizedSearch: 0.2903847830181727
Mean Absolute Percentage Error with RandomizedSearch: 25.29525419961754
--- prct_cause_externe ---
Mean Squared Error with RandomizedSearch: 263.4552565548811
Mean Absolute Error with RandomizedSearch: 11.945304050631787
R2 Score with RandomizedSearch: 0.08180655812255744
Mean Absolute Percentage Error with RandomizedSearch: 60.69934231543236
--- prct_cause_infra ---
Mean Squared Error with RandomizedSearch: 223.71905422212123
Mean Absolute Error with RandomizedSearch: 10.985230068221963
R2 Score with RandomizedSearch: 0.1269851175023624
Mean Absolute Percentage Error with RandomizedSearch: 54.366974229414545
--- prct_cause_gestion_trafic ---
Mean Squared Error with RandomizedSearch: 197.7841740564105
Mean Absolute Error with RandomizedSearch: 10.360090091613515
R2 Score with RandomizedS

  y_pred_normalized = 100 * y_pred_cause / sums
