In [1]:
from preprocessing import load_data, creer_feature_ligne, preprocess_data

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostRegressor
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain

In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent) of ``y_pred``.

    Entries whose true value is exactly zero are excluded, since their
    relative error is undefined. For 2-D inputs the remaining entries are
    pooled across all columns before averaging.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the non-zero
        entries of ``y_true``; ``nan`` when no such entry exists.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Avoid dividing by zero: keep only entries with a non-zero true value.
    mask = y_true != 0
    if not mask.any():
        # Nothing to average; return nan explicitly instead of letting
        # np.mean emit an "empty slice" RuntimeWarning.
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

In [10]:
# Load the dataset, then run the project's preprocessing and line-feature
# helpers on it.
# NOTE(review): the return values of both helpers are discarded, so they
# presumably modify `data` in place — confirm in preprocessing.py.
data = load_data()
preprocess_data(data)
creer_feature_ligne(data)

# Plusieurs clusterings + RF

In [None]:
# Regression targets. The first entry is handled separately from the
# `prct_cause_*` columns downstream, where the latter are rescaled to sum to 100.
cibles = [
    'retard_moyen_arrivee',
    'prct_cause_externe',
    'prct_cause_infra',
    'prct_cause_gestion_trafic',
    'prct_cause_materiel_roulant',
    'prct_cause_gestion_gare',
    'prct_cause_prise_en_charge_voyageurs',
]
y = data[cibles]

# Predictors, with the categorical columns one-hot encoded.
X = pd.get_dummies(
    data[['service', 'gare_depart', 'gare_arrivee', 'duree_moyenne', 'nb_train_prevu', 'ligne', 'mois']],
    columns=['gare_depart', 'gare_arrivee', 'service', 'ligne'],
)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:


# Add a KMeans cluster-membership feature, then tune a multi-output random
# forest with a randomized search and report per-target test metrics.

# k was chosen with the elbow method (per the original author's note).
k_optimal = 3
# n_init is pinned so the result does not depend on the scikit-learn version
# (the default changed from 10 to 'auto') and no FutureWarning is emitted.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)

# Cluster on the original features only. Dropping any previous `cluster`
# column makes this cell safe to re-run: otherwise the old labels would be
# fed back into KMeans. `assign` builds new frames instead of writing into
# the slices returned by train_test_split.
train_features = X_train.drop(columns=['cluster'], errors='ignore')
test_features = X_test.drop(columns=['cluster'], errors='ignore')
X_train = train_features.assign(cluster=kmeans.fit_predict(train_features))
X_test = test_features.assign(cluster=kmeans.predict(test_features))

param_grid = {
    'n_estimators': [50, 100, 200],
    # 1.0 (all features) is what 'auto' meant for RandomForestRegressor;
    # 'auto' itself is deprecated/removed in recent scikit-learn releases.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 5, 10, 20, 50, 75],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True]
}

# Search space for the MultiOutputRegressor wrapper (not used in this cell).
param_grid_multi_rf = {
    'estimator__n_estimators': [10, 50, 100, 200],
    'estimator__max_depth': [None, 10, 20, 30]
}

# Seeded so that the search and the reported metrics are reproducible.
rf = RandomForestRegressor(random_state=42)
multi_rf = MultiOutputRegressor(rf)

# Randomized search with 5-fold cross-validation.
rf_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=100, cv=5, verbose=0, n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

# Best hyperparameters found.
print("Meilleurs paramètres : \n",rf_search.best_params_)

y_pred_randomized = rf_search.best_estimator_.predict(X_test)

# Column 0 is retard_moyen_arrivee; the remaining columns are the cause shares.
y_pred_retard = y_pred_randomized[:, 0]
y_pred_cause = y_pred_randomized[:, 1:]

# Rescale the cause shares so that each row sums to 100. Rows whose predicted
# shares sum to zero are left at zero instead of producing nan/inf.
sums = y_pred_cause.sum(axis=1)[:, np.newaxis]
y_pred_normalized = np.divide(
    100 * y_pred_cause,
    sums,
    out=np.zeros_like(y_pred_cause, dtype=float),
    where=sums != 0,
)

# Put the delay prediction back in front of the normalised shares.
y_pred_final = np.hstack([y_pred_retard[:, np.newaxis], y_pred_normalized])

for i, cible in enumerate(cibles):
    mae = mean_absolute_error(y_test[cible], y_pred_final[:, i])
    mse_randomized = mean_squared_error(y_test[cible], y_pred_final[:, i])
    mape = mean_absolute_percentage_error(y_test[cible], y_pred_final[:, i])
    r2 = r2_score(y_test[cible], y_pred_final[:, i])

    print(f"--- {cible} ---")
    print(f"Mean Squared Error with RandomizedSearch: {mse_randomized}")
    print(f"Mean Absolute Error with RandomizedSearch: {mae}")
    print(f"Mean Absolute Percentage Error with RandomizedSearch: {mape}")
    print(f"R2 Score with RandomizedSearch: {r2}")

# RF optimisée

In [32]:
# Random forest with hard-coded hyperparameters, fitted on all targets at once.
# random_state is fixed so the metrics below are reproducible across runs,
# consistent with the seed used for the split, KMeans and the searches.
rf = RandomForestRegressor(
    n_estimators=50,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=None,
    bootstrap=True,
    random_state=42,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# With a multi-column y_test, scikit-learn averages both metrics uniformly
# over the target columns.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error with RandomizedSearch: {mae}")
print(f"R2 Score with RandomizedSearch: {r2}")

Mean Absolute Error with RandomizedSearch: 8.6689234939845
R2 Score with RandomizedSearch: 0.2804701547444919


# Différence ? 

In [None]:
# Re-run of the clustering + randomized-search pipeline (reuses `param_grid`
# from the earlier cell) with per-target test metrics.
k_optimal = 3
# n_init is pinned so the result does not depend on the scikit-learn version
# (the default changed from 10 to 'auto') and no FutureWarning is emitted.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)

# X_train / X_test may already carry a `cluster` column from a previous run;
# drop it so KMeans is fitted on the original features rather than on its own
# earlier labels. `assign` returns new frames instead of mutating slices.
train_features = X_train.drop(columns=['cluster'], errors='ignore')
test_features = X_test.drop(columns=['cluster'], errors='ignore')
X_train = train_features.assign(cluster=kmeans.fit_predict(train_features))
X_test = test_features.assign(cluster=kmeans.predict(test_features))

# Seeded so that the search and the reported metrics are reproducible.
rf = RandomForestRegressor(random_state=42)

# Randomized search with 5-fold cross-validation.
rf_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=100, cv=5, verbose=0, n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

# Best hyperparameters found.
print("Meilleurs paramètres : \n",rf_search.best_params_)

y_pred_randomized = rf_search.best_estimator_.predict(X_test)

# Column 0 is retard_moyen_arrivee; the remaining columns are the cause shares.
y_pred_retard = y_pred_randomized[:, 0]
y_pred_cause = y_pred_randomized[:, 1:]

# Rescale the cause shares so that each row sums to 100. Rows whose predicted
# shares sum to zero are left at zero instead of producing nan/inf.
sums = y_pred_cause.sum(axis=1)[:, np.newaxis]
y_pred_normalized = np.divide(
    100 * y_pred_cause,
    sums,
    out=np.zeros_like(y_pred_cause, dtype=float),
    where=sums != 0,
)

# Put the delay prediction back in front of the normalised shares.
y_pred_final = np.hstack([y_pred_retard[:, np.newaxis], y_pred_normalized])

for i, cible in enumerate(cibles):
    mae = mean_absolute_error(y_test[cible], y_pred_final[:, i])
    mse_randomized = mean_squared_error(y_test[cible], y_pred_final[:, i])
    mape = mean_absolute_percentage_error(y_test[cible], y_pred_final[:, i])
    r2 = r2_score(y_test[cible], y_pred_final[:, i])

    print(f"--- {cible} ---")
    print(f"Mean Squared Error with RandomizedSearch: {mse_randomized}")
    print(f"Mean Absolute Error with RandomizedSearch: {mae}")
    print(f"Mean Absolute Percentage Error with RandomizedSearch: {mape}")
    print(f"R2 Score with RandomizedSearch: {r2}")

In [33]:
# Chain the tuned forest: RegressorChain fits one model per target, each one
# also receiving the previous targets in the chain as extra inputs.
best_rf = rf_search.best_estimator_

chain_rf = RegressorChain(best_rf)
chain_rf.fit(X_train, y_train)
y_pred_chain = chain_rf.predict(X_test)

# Column 0 is retard_moyen_arrivee; the remaining columns are the cause shares.
y_pred_retard = y_pred_chain[:, 0]
y_pred_cause = y_pred_chain[:, 1:]

# Rescale the cause shares so that each row sums to 100. Rows whose predicted
# shares sum to zero are left at zero instead of producing nan/inf.
sums = y_pred_cause.sum(axis=1)[:, np.newaxis]
y_pred_normalized = np.divide(
    100 * y_pred_cause,
    sums,
    out=np.zeros_like(y_pred_cause, dtype=float),
    where=sums != 0,
)

# Put the delay prediction back in front of the normalised shares.
y_pred_final = np.hstack([y_pred_retard[:, np.newaxis], y_pred_normalized])

# Labels name the model actually evaluated here (the chain), not the search.
for i, cible in enumerate(cibles):
    mae = mean_absolute_error(y_test[cible], y_pred_final[:, i])
    mse_chain = mean_squared_error(y_test[cible], y_pred_final[:, i])
    mape = mean_absolute_percentage_error(y_test[cible], y_pred_final[:, i])
    r2 = r2_score(y_test[cible], y_pred_final[:, i])

    print(f"--- {cible} ---")
    print(f"Mean Squared Error with RegressorChain: {mse_chain}")
    print(f"Mean Absolute Error with RegressorChain: {mae}")
    print(f"Mean Absolute Percentage Error with RegressorChain: {mape}")
    print(f"R2 Score with RegressorChain: {r2}")

--- retard_moyen_arrivee ---
Mean Squared Error with RegressorChain: 197.5415713502332
Mean Absolute Error with RegressorChain: 8.668389372662315
Mean Absolute Percentage Error with RegressorChain: 25.596084513701605
R2 Score with RegressorChain: 0.26978556124397923
--- prct_cause_externe ---
Mean Squared Error with RegressorChain: 245.6393074708426
Mean Absolute Error with RegressorChain: 11.732289238593191
Mean Absolute Percentage Error with RegressorChain: 60.17115549653846
R2 Score with RegressorChain: 0.14526801054073202
--- prct_cause_infra ---
Mean Squared Error with RegressorChain: 216.94093418809442
Mean Absolute Error with RegressorChain: 10.879777551502835
Mean Absolute Percentage Error with RegressorChain: 55.010910665651465
R2 Score with RegressorChain: 0.1548119343672627
--- prct_cause_gestion_trafic ---
Mean Squared Error with RegressorChain: 182.444769503909
Mean Absolute Error with RegressorChain: 10.078631272222765
Mean Absolute Percentage Error with RegressorChain: 5

In [None]:
# AdaBoost regressor tuned with a randomized search.
# AdaBoostRegressor only accepts a single (1-D) target, whereas y_train holds
# one column per entry of `cibles`. MultiOutputRegressor fits one independent
# AdaBoost model per target column. AdaBoostRegressor and MultiOutputRegressor
# are already imported in the notebook's import cell.
ada = MultiOutputRegressor(AdaBoostRegressor(random_state=42))

# Hyperparameter grid; the `estimator__` prefix routes each parameter to the
# wrapped AdaBoostRegressor.
param_grid_ada = {
    'estimator__n_estimators': [10, 50, 100, 200],
    'estimator__learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0],
    'estimator__loss': ['linear', 'square', 'exponential']
}

# The grid is smaller than 100 combinations; cap n_iter at the grid size so
# the search does not warn about an exhausted parameter space.
n_candidates = int(np.prod([len(values) for values in param_grid_ada.values()]))
ada_search = RandomizedSearchCV(
    ada,
    param_distributions=param_grid_ada,
    n_iter=min(100, n_candidates),
    cv=5,
    verbose=0,
    n_jobs=-1,
    random_state=42,
)
ada_search.fit(X_train, y_train)

# Best hyperparameters for AdaBoost.
print(ada_search.best_params_)

# Predict with the tuned model.
y_pred_ada = ada_search.best_estimator_.predict(X_test)

# Evaluation metrics; the scikit-learn metrics are averaged uniformly over
# the target columns, and the MAPE pools all non-zero entries.
mae_ada = mean_absolute_error(y_test, y_pred_ada)
mse_ada = mean_squared_error(y_test, y_pred_ada)
mape_ada = mean_absolute_percentage_error(y_test, y_pred_ada)
r2_ada = r2_score(y_test, y_pred_ada)

print(f"Mean Squared Error with AdaBoost: {mse_ada}")
print(f"Mean Absolute Error with AdaBoost: {mae_ada}")
print(f"Mean Absolute Percentage Error with AdaBoost: {mape_ada}")
print(f"R2 Score with AdaBoost: {r2_ada}")


In [3]:
# Cluster every row with KMeans, then give each `ligne` the cluster label that
# occurs most often among its rows, and expose that label as a feature of X.

# `df` is not defined anywhere else in this notebook, so the cell failed on a
# fresh kernel. Work on a copy of the loaded dataset instead.
# NOTE(review): assumes `df` was an earlier name for `data` — confirm.
df = data.copy()

k_optimal = 3
# n_init is pinned so the result does not depend on the scikit-learn version
# (the default changed from 10 to 'auto') and no FutureWarning is emitted.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)

# Exclude a `majority_class` column left by a previous run of this cell so
# that re-running it does not cluster on its own output.
cluster_features = X.drop(columns=['majority_class'], errors='ignore')
df['cluster'] = kmeans.fit_predict(cluster_features)

print(df[['ligne', 'cluster']])

# Most frequent cluster label per line. Ties go to the label seen first,
# as with Counter.most_common.
majority_class = {
    ligne: Counter(labels.tolist()).most_common(1)[0][0]
    for ligne, labels in df.groupby('ligne', sort=False)['cluster']
}

df['majority_class'] = df['ligne'].map(majority_class)

X['majority_class'] = df['majority_class']

print(X['majority_class'])

  super()._check_params_vs_input(X, default_n_init=10)


                                       ligne  cluster
0        BORDEAUX ST JEAN-PARIS MONTPARNASSE        1
1       LA ROCHELLE VILLE-PARIS MONTPARNASSE        2
2                 PARIS MONTPARNASSE-QUIMPER        2
3                 PARIS MONTPARNASSE-ST MALO        0
4     PARIS MONTPARNASSE-ST PIERRE DES CORPS        2
...                                      ...      ...
8149                    STRASBOURG-PARIS EST        1
8150    TOULOUSE MATABIAU-PARIS MONTPARNASSE        0
8151                TOURS-PARIS MONTPARNASSE        2
8152           VALENCE ALIXAN TGV-PARIS LYON        1
8153               VANNES-PARIS MONTPARNASSE        2

[8154 rows x 2 columns]
0       1
1       2
2       2
3       0
4       1
       ..
8149    1
8150    0
8151    2
8152    2
8153    2
Name: majority_class, Length: 8154, dtype: int64
