# 🚄 TARDIS - Prédiction des Retards / Lateness Prediction SNCF

Ce fichier sert à entraîner les 3 IA nécessaires au bon fonctionnement du dashboard en ligne.

This file is used to train the 3 AIs required for the online dashboard to function properly.

## 💽 Code

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

## Load the cleaned dataset; each trainer below works on its own copy of it
dataset_path = "../cleaned_dataset.csv"
df = pd.read_csv(dataset_path)


class Parameters:
    """
    ## Parameters
    This class is used to define the **common parameters for Model Generation**.

    ## Attributes
        * common_features (list[str]): Columns shared by every model's training matrix.
        * common_clean_features (tuple[list[str]]): One-element tuple wrapping the columns
        in which a missing value causes the row to be dropped. The trainers read it as
        ``Parameters.common_clean_features[0]``, so the tuple wrapper must stay.
        * random_state (int): Seed passed to the train/test split and to the random forests.
        * test_size (float): Value passed as ``test_size`` to the train/test split.
        * n_estimators (int): Value passed as ``n_estimators`` to the random forests.
    """

    common_features: list[str] = [
        "Average journey time",
        "Number of scheduled trains",
        "Number of cancelled trains",
        "Number of trains delayed > 15min",
        "Number of trains delayed > 30min",
        "Number of trains delayed > 60min",
        "Mois",
        "Jour de la semaine",
    ]
    # Historically written with a trailing comma, which made this a tuple holding
    # one list rather than a list. Callers index it with ``[0]``, so the shape is
    # preserved and only the annotation is corrected. The inner list is a copy so
    # that mutating one attribute never affects the other.
    common_clean_features: tuple[list[str]] = (list(common_features),)
    random_state: int = 42
    test_size: float = 0.1
    n_estimators: int = 100


class TrainGeneric:
    """
    ## TrainGeneric
    This class is used to train the TARDIS AI Model.
    This model can be used for general predictions (Both Departures and Arrivals)

    Current revision: **Revision 9**
    """

    def __init__(self, df: pd.DataFrame):
        """Initialization function for TrainGeneric object

        Args:
            * df (pd.DataFrame): The dataframe of the dataset to be trained on (**must be from Pandas**).

        ## Self
            * df (pd.DataFrame): The dataset; replaced by its cleaned version once ``train`` has run.
            * X (pd.DataFrame): The feature columns selected from the cleaned dataset.
            * y (pd.Series): The target column ("Average delay of all trains at departure").
            * X_train (pd.DataFrame): The share of ``X`` fitted into the model.
            * X_test (pd.DataFrame): The share of ``X`` held out to test the model.
            * y_train (pd.Series): The target values matching ``X_train``.
            * y_test (pd.Series): The target values matching ``X_test``.
            * y_pred: The model's predictions for ``X_test``.
            * mse, rmse, r2: Scores of ``y_pred`` against ``y_test``.
            * model (RandomForestRegressor): The fitted model (``None`` until ``train`` succeeds).
            * export (str): The path to the exported model.
            * extra_params (list[str]): Extra columns to be added to the training dataframe
            * extra_clean_params (list[str]): Extra columns to be added to the training dataframe cleaning
            process
        """
        self.df: pd.DataFrame = df
        self.X: pd.DataFrame = None
        self.y: pd.Series = None
        self.X_train: pd.DataFrame = None
        self.X_test: pd.DataFrame = None
        self.y_train: pd.Series = None
        self.y_test: pd.Series = None
        # Results of ``train``; declared here so they exist before training.
        self.y_pred = None
        self.mse = None
        self.rmse = None
        self.r2 = None
        self.model = None
        self.export: str = "tardis_rf_model.joblib"
        self.extra_params: list[str] = [
            "Number of trains delayed at departure",
            "Number of trains delayed at arrival",
            "Average delay of late trains at departure",
            "Average delay of all trains at departure",
            "Average delay of late trains at arrival",
            "Average delay of all trains at arrival",
        ]
        # Same columns as ``extra_params``, kept as an independent list.
        self.extra_clean_params: list[str] = list(self.extra_params)

    def _require_trained(self):
        """Raise ``RuntimeError`` when no model has been fitted yet."""
        if self.model is None:
            raise RuntimeError(
                "TrainGeneric has not been trained yet; call train() first"
            )

    def train(self):
        """
        Trains the model using the inputed data in the object's attributes

        Encodes the weekday column, drops incomplete rows, splits the data,
        fits a random forest and stores the predictions and scores on ``self``.
        The dataframe passed to ``__init__`` is left untouched; ``self.df`` is
        rebound to the cleaned copy.
        """
        weekday_column = "Jour de la semaine"
        weekday = self.df[weekday_column]
        # ``cat.codes`` turns missing values into -1, which ``dropna`` cannot see.
        # Masking them back to NaN lets the clean step drop those rows, while the
        # codes themselves still come from the categories of the full column.
        weekday_codes = weekday.astype("category").cat.codes.where(weekday.notna())

        # Work on a copy so the caller's dataframe is not modified.
        prepared = self.df.copy()
        prepared[weekday_column] = weekday_codes
        self.df = prepared.dropna(
            subset=Parameters.common_clean_features[0] + self.extra_clean_params
        )

        # NOTE(review): the target column is also listed in ``extra_params``, so
        # the forest is given the value it has to predict and the reported R2 is
        # inflated by that. Removing it changes the exported model's input width,
        # so it has to be coordinated with whatever consumes the model file.
        self.X = self.df[Parameters.common_features + self.extra_params]
        self.y = self.df["Average delay of all trains at departure"]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=Parameters.test_size,
            random_state=Parameters.random_state,
        )

        forest = RandomForestRegressor(
            n_estimators=Parameters.n_estimators, random_state=Parameters.random_state
        )
        # Fitted on bare arrays, so the model carries no feature names.
        forest.fit(self.X_train.values, self.y_train)
        # Only publish the model once fitting has succeeded.
        self.model = forest

        self.y_pred = self.model.predict(self.X_test.values)

        self.mse = mean_squared_error(self.y_test, self.y_pred)
        self.rmse = np.sqrt(self.mse)
        self.r2 = r2_score(self.y_test, self.y_pred)

    def dump_results(self, extra: bool = False):
        """
        Dumps the results of the training process

        Args:
            extra (bool, optional): Show graphics and extra information about the training process. Defaults to False.

        Raises:
            RuntimeError: If ``train`` has not been run yet.
        """
        self._require_trained()
        print("=== START OF MODEL REPORT ===")
        print("Generic AI Model\nModel performance:")
        print("➔ RMSE :", self.rmse)
        print("➔ R2 :", self.r2)

        if extra:
            ## Correlation matrix between the feature columns
            print("Value importance")
            correlation_matrix = self.X.corr()
            print(correlation_matrix)

            # Visualize correlations
            plt.figure(figsize=(15, 8))
            sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
            plt.title("Feature Correlations")
        print("=== END OF MODEL REPORT ===")

    def save_model(self):
        """
        Saves the model to ``self.export`` file

        The file is then loaded back and used to predict the first test row as
        a quick check that the export is usable.

        Raises:
            RuntimeError: If ``train`` has not been run yet.
        """
        # Without this guard an untrained object would write ``None`` to disk.
        self._require_trained()
        joblib.dump(self.model, self.export)
        print(f"Model saved as '{self.export}'")

        print("Now doing model AutoTest...")
        # Reload from the exact path that was written (also valid for absolute paths).
        loaded_model = joblib.load(self.export)
        sample_pred = loaded_model.predict([self.X_test.iloc[0].values])
        print("Prediction on a sample:", sample_pred[0])
        print("Model AutoTest done!")


class TrainDepartures:
    """
    ## TrainDepartures
    This class is used to train the TARDIS AI Model.
    This model can be used for departures-only predictions

    Current revision: **Revision 9**
    """

    def __init__(self, df: pd.DataFrame):
        """Initialization function for TrainDepartures object

        Args:
            * df (pd.DataFrame): The dataframe of the dataset to be trained on (**must be from Pandas**).

        ## Self
            * df (pd.DataFrame): The dataset; replaced by its cleaned version once ``train`` has run.
            * X (pd.DataFrame): The feature columns selected from the cleaned dataset.
            * y (pd.Series): The target column ("Average delay of all trains at departure").
            * X_train (pd.DataFrame): The share of ``X`` fitted into the model.
            * X_test (pd.DataFrame): The share of ``X`` held out to test the model.
            * y_train (pd.Series): The target values matching ``X_train``.
            * y_test (pd.Series): The target values matching ``X_test``.
            * y_pred: The model's predictions for ``X_test``.
            * mse, rmse, r2: Scores of ``y_pred`` against ``y_test``.
            * model (RandomForestRegressor): The fitted model (``None`` until ``train`` succeeds).
            * export (str): The path to the exported model.
            * extra_params (list[str]): Extra columns to be added to the training dataframe
            * extra_clean_params (list[str]): Extra columns to be added to the training dataframe cleaning
            process
        """
        self.df: pd.DataFrame = df
        self.X: pd.DataFrame = None
        self.y: pd.Series = None
        self.X_train: pd.DataFrame = None
        self.X_test: pd.DataFrame = None
        self.y_train: pd.Series = None
        self.y_test: pd.Series = None
        # Results of ``train``; declared here so they exist before training.
        self.y_pred = None
        self.mse = None
        self.rmse = None
        self.r2 = None
        self.model = None
        self.export: str = "tardis_rf_model_departures.joblib"
        self.extra_params: list[str] = [
            "Number of trains delayed at departure",
            "Average delay of late trains at departure",
            "Average delay of all trains at departure",
        ]
        # Same columns as ``extra_params``, kept as an independent list.
        self.extra_clean_params: list[str] = list(self.extra_params)

    def _require_trained(self):
        """Raise ``RuntimeError`` when no model has been fitted yet."""
        if self.model is None:
            raise RuntimeError(
                "TrainDepartures has not been trained yet; call train() first"
            )

    def train(self):
        """
        Trains the model using the inputed data in the object's attributes

        Encodes the weekday column, drops incomplete rows, splits the data,
        fits a random forest and stores the predictions and scores on ``self``.
        The dataframe passed to ``__init__`` is left untouched; ``self.df`` is
        rebound to the cleaned copy.
        """
        weekday_column = "Jour de la semaine"
        weekday = self.df[weekday_column]
        # ``cat.codes`` turns missing values into -1, which ``dropna`` cannot see.
        # Masking them back to NaN lets the clean step drop those rows, while the
        # codes themselves still come from the categories of the full column.
        weekday_codes = weekday.astype("category").cat.codes.where(weekday.notna())

        # Work on a copy so the caller's dataframe is not modified.
        prepared = self.df.copy()
        prepared[weekday_column] = weekday_codes
        self.df = prepared.dropna(
            subset=Parameters.common_clean_features[0] + self.extra_clean_params
        )

        # NOTE(review): the target column is also listed in ``extra_params``, so
        # the forest is given the value it has to predict and the reported R2 is
        # inflated by that. Removing it changes the exported model's input width,
        # so it has to be coordinated with whatever consumes the model file.
        self.X = self.df[Parameters.common_features + self.extra_params]
        self.y = self.df["Average delay of all trains at departure"]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=Parameters.test_size,
            random_state=Parameters.random_state,
        )

        forest = RandomForestRegressor(
            n_estimators=Parameters.n_estimators, random_state=Parameters.random_state
        )
        # Fitted on bare arrays, so the model carries no feature names.
        forest.fit(self.X_train.values, self.y_train)
        # Only publish the model once fitting has succeeded.
        self.model = forest

        self.y_pred = self.model.predict(self.X_test.values)

        self.mse = mean_squared_error(self.y_test, self.y_pred)
        self.rmse = np.sqrt(self.mse)
        self.r2 = r2_score(self.y_test, self.y_pred)

    def dump_results(self, extra: bool = False):
        """
        Dumps the results of the training process

        Args:
            extra (bool, optional): Show graphics and extra information about the training process. Defaults to False.

        Raises:
            RuntimeError: If ``train`` has not been run yet.
        """
        self._require_trained()
        print("=== START OF MODEL REPORT ===")
        print("Departures AI Model\nModel performance:")
        print("➔ RMSE :", self.rmse)
        print("➔ R2 :", self.r2)

        if extra:
            ## Correlation matrix between the feature columns
            print("Value importance")
            correlation_matrix = self.X.corr()
            print(correlation_matrix)

            # Visualize correlations
            plt.figure(figsize=(15, 8))
            sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
            plt.title("Feature Correlations")
        print("=== END OF MODEL REPORT ===")

    def save_model(self):
        """
        Saves the model to ``self.export`` file

        The file is then loaded back and used to predict the first test row as
        a quick check that the export is usable.

        Raises:
            RuntimeError: If ``train`` has not been run yet.
        """
        # Without this guard an untrained object would write ``None`` to disk.
        self._require_trained()
        joblib.dump(self.model, self.export)
        print(f"Model saved as '{self.export}'")

        print("Now doing model AutoTest...")
        # Reload from the exact path that was written (also valid for absolute paths).
        loaded_model = joblib.load(self.export)
        sample_pred = loaded_model.predict([self.X_test.iloc[0].values])
        print("Prediction on a sample:", sample_pred[0])
        print("Model AutoTest done!")


class TrainArrivals:
    """
    ## TrainArrivals
    This class is used to train the TARDIS AI Model.
    This model can be used for arrivals-only predictions

    Current revision: **Revision 9**
    """

    def __init__(self, df: pd.DataFrame):
        """
        Initialization function for TrainArrivals object

        Args:
            * df (pd.DataFrame): The dataframe of the dataset to be trained on (**must be from Pandas**).

        ## Self
            * df (pd.DataFrame): The dataset; replaced by its cleaned version once ``train`` has run.
            * X (pd.DataFrame): The feature columns selected from the cleaned dataset.
            * y (pd.Series): The target column ("Average delay of all trains at arrival").
            * X_train (pd.DataFrame): The share of ``X`` fitted into the model.
            * X_test (pd.DataFrame): The share of ``X`` held out to test the model.
            * y_train (pd.Series): The target values matching ``X_train``.
            * y_test (pd.Series): The target values matching ``X_test``.
            * y_pred: The model's predictions for ``X_test``.
            * mse, rmse, r2: Scores of ``y_pred`` against ``y_test``.
            * model (RandomForestRegressor): The fitted model (``None`` until ``train`` succeeds).
            * export (str): The path to the exported model.
            * extra_params (list[str]): Extra columns to be added to the training dataframe
            * extra_clean_params (list[str]): Extra columns to be added to the training dataframe cleaning
            process
        """
        self.df: pd.DataFrame = df
        self.X: pd.DataFrame = None
        self.y: pd.Series = None
        self.X_train: pd.DataFrame = None
        self.X_test: pd.DataFrame = None
        self.y_train: pd.Series = None
        self.y_test: pd.Series = None
        # Results of ``train``; declared here so they exist before training.
        self.y_pred = None
        self.mse = None
        self.rmse = None
        self.r2 = None
        self.model = None
        self.export: str = "tardis_rf_model_arrivals.joblib"
        self.extra_params: list[str] = [
            "Number of trains delayed at arrival",
            "Average delay of late trains at arrival",
            "Average delay of all trains at arrival",
        ]
        # Same columns as ``extra_params``, kept as an independent list.
        self.extra_clean_params: list[str] = list(self.extra_params)

    def _require_trained(self):
        """Raise ``RuntimeError`` when no model has been fitted yet."""
        if self.model is None:
            raise RuntimeError(
                "TrainArrivals has not been trained yet; call train() first"
            )

    def train(self):
        """
        Trains the model using the inputed data in the object's attributes

        Encodes the weekday column, drops incomplete rows, splits the data,
        fits a random forest and stores the predictions and scores on ``self``.
        The dataframe passed to ``__init__`` is left untouched; ``self.df`` is
        rebound to the cleaned copy.
        """
        weekday_column = "Jour de la semaine"
        weekday = self.df[weekday_column]
        # ``cat.codes`` turns missing values into -1, which ``dropna`` cannot see.
        # Masking them back to NaN lets the clean step drop those rows, while the
        # codes themselves still come from the categories of the full column.
        weekday_codes = weekday.astype("category").cat.codes.where(weekday.notna())

        # Work on a copy so the caller's dataframe is not modified.
        prepared = self.df.copy()
        prepared[weekday_column] = weekday_codes
        self.df = prepared.dropna(
            subset=Parameters.common_clean_features[0] + self.extra_clean_params
        )

        # NOTE(review): the target column is also listed in ``extra_params``, so
        # the forest is given the value it has to predict and the reported R2 is
        # inflated by that. Removing it changes the exported model's input width,
        # so it has to be coordinated with whatever consumes the model file.
        self.X = self.df[Parameters.common_features + self.extra_params]
        self.y = self.df["Average delay of all trains at arrival"]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=Parameters.test_size,
            random_state=Parameters.random_state,
        )

        forest = RandomForestRegressor(
            n_estimators=Parameters.n_estimators, random_state=Parameters.random_state
        )
        # Fitted on bare arrays, so the model carries no feature names.
        forest.fit(self.X_train.values, self.y_train)
        # Only publish the model once fitting has succeeded.
        self.model = forest

        self.y_pred = self.model.predict(self.X_test.values)

        self.mse = mean_squared_error(self.y_test, self.y_pred)
        self.rmse = np.sqrt(self.mse)
        self.r2 = r2_score(self.y_test, self.y_pred)

    def dump_results(self, extra: bool = False):
        """
        Dumps the results of the training process

        Args:
            extra (bool, optional): Show graphics and extra information about the training process. Defaults to False.

        Raises:
            RuntimeError: If ``train`` has not been run yet.
        """
        self._require_trained()
        print("=== START OF MODEL REPORT ===")
        print("Arrival AI Model\nModel performance:")
        print("➔ RMSE :", self.rmse)
        print("➔ R2 :", self.r2)

        if extra:
            ## Correlation matrix between the feature columns
            print("Value importance")
            correlation_matrix = self.X.corr()
            print(correlation_matrix)

            # Visualize correlations
            plt.figure(figsize=(15, 8))
            sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
            plt.title("Feature Correlations")
        print("=== END OF MODEL REPORT ===")

    def save_model(self):
        """
        Saves the model to ``self.export`` file

        The file is then loaded back and used to predict the first test row as
        a quick check that the export is usable.

        Raises:
            RuntimeError: If ``train`` has not been run yet.
        """
        # Without this guard an untrained object would write ``None`` to disk.
        self._require_trained()
        joblib.dump(self.model, self.export)
        print(f"Model saved as '{self.export}'")

        print("Now doing model AutoTest...")
        # Reload from the exact path that was written (also valid for absolute paths).
        loaded_model = joblib.load(self.export)
        sample_pred = loaded_model.predict([self.X_test.iloc[0].values])
        print("Prediction on a sample:", sample_pred[0])
        print("Model AutoTest done!")


# Each trainer receives its own copy of the dataset.
gen = TrainGeneric(df.copy())
deps = TrainDepartures(df.copy())
arrs = TrainArrivals(df.copy())

# Train, report and export one model at a time: generic, departures, arrivals.
for trainer in (gen, deps, arrs):
    trainer.train()
    trainer.dump_results()
    trainer.save_model()

=== START OF MODEL REPORT ===
Generic AI Model
Model performance:
➔ RMSE : 0.6999256318797411
➔ R2 : 0.9930432179106718
=== END OF MODEL REPORT ===
Model saved as 'tardis_rf_model.joblib'
Now doing model AutoTest...
Prediction on a sample: 1.17541692210832
Model AutoTest done!
=== START OF MODEL REPORT ===
Departures AI Model
Model performance:
➔ RMSE : 0.0640252424099352
➔ R2 : 0.9999137433972736
=== END OF MODEL REPORT ===
Model saved as 'tardis_rf_model_departures.joblib'
Now doing model AutoTest...
Prediction on a sample: 0.6573267471899998
Model AutoTest done!
=== START OF MODEL REPORT ===
Arrival AI Model
Model performance:
➔ RMSE : 0.013522339872709281
➔ R2 : 0.9999981229319895
=== END OF MODEL REPORT ===
Model saved as 'tardis_rf_model_arrivals.joblib'
Now doing model AutoTest...
Prediction on a sample: 12.141306234903443
Model AutoTest done!


## Log des révisions / Revisions log

### Révision / Revisions

#### Rev. 1
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Average delay of all trains at departure',
    'Number of trains delayed at departure',
    'Number of scheduled trains',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of trains delayed at departure',
    'Number of scheduled trains',
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.2
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~7.58
  * R2 : ~-0.024
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 180/180
  * Arrival: 327/327

#### Rev. 2
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Average delay of all trains at departure',
    'Number of trains delayed at departure',
    'Number of scheduled trains',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of cancelled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Pct delay due to external causes',
    'Pct delay due to infrastructure',
    'Pct delay due to traffic management',
    'Pct delay due to rolling stock',
    'Pct delay due to station management and equipment reuse',
    "Pct delay due to passenger handling (crowding, disabled persons, connections)",
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.2
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.35 🡻 (✓)
  * R2 : ~1 (~0.997) 🢁 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 40/180
  * Arrival: 1/327

Notes:
* Model is way too exclusive. Only a few matches can be made on a global scale. **Unfit for production.**
* Le modèle est trop exclusif. Seules quelques correspondances peuvent être faites à l'échelle globale. **Pas intégrable.**

#### Rev. 3
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Average delay of all trains at departure',
    'Number of trains delayed at departure',
    'Number of scheduled trains',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of cancelled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Pct delay due to external causes',
    'Pct delay due to infrastructure',
    'Pct delay due to traffic management',
    'Pct delay due to rolling stock',
    'Pct delay due to station management and equipment reuse',
    "Pct delay due to passenger handling (crowding, disabled persons, connections)",
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.6
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.55 🢁 (⚠)
  * R2 : ~0.99 (~0.994) 🡻 (⚠)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 40/180
  * Arrival: 1/327

Notes:
* Model is degraded and is still too exclusive. **Unfit for production.**
* Le modèle est dégradé et est toujours exclusif. **Pas intégrable.**

#### Rev. 4
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Average delay of all trains at departure',
    'Number of trains delayed at departure',
    'Number of scheduled trains',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of cancelled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.2
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.33 🡻 (✓)
  * R2 : ~1 (~0.998) 🢁 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 40/180
  * Arrival: 1/327

Notes:
* Model has increased in accuracy, but only a few matches can be made on a global scale. **Unfit for production.**
* Le modèle a augmenté en précision mais ne peut faire que quelques correspondances à l'échelle globale. **Pas intégrable.**

#### Rev. 5
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of cancelled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of cancelled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.2
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.16 🡻 (✓)
  * R2 : ~1 (~0.999) 🢁 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 40/180
  * Arrival: 1/327

Notes:
* Model has increased in accuracy, but only a few matches can be made on a global scale. **Unfit for production.**
* Le modèle a augmenté en précision mais ne peut faire que quelques correspondances à l'échelle globale. **Pas intégrable.**

#### Rev. 6
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Average delay of trains > 15min (if competing with flights)',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.2
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.51 🢁 (✓)
  * R2 : ~1 (~0.994) 🡻 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 42/180
  * Arrival: 1/327

Notes:
* Model has significantly decreased in accuracy. It only manages to get two more matches for departures compared to
  the other models. **Unfit for production.**
* Le modèle a régressé en précision et n'arrive qu'à faire deux prédictions en plus par rapport aux autres modèles
  sur les départs. **Pas intégrable.**

#### Rev. 7
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.6
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.39 🢁 (✓)
  * R2 : ~1 (~0.997) 🡻 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 46/180
  * Arrival: 1/327

Notes:
* Model has slightly decreased in accuracy. It manages to get four more matches for departures. **Unfit for production.**
* Le modèle a légèrement régressé en précision et arrive à faire quatre prédictions en plus par rapport aux autres modèles
  sur les départs. **Pas intégrable.**

#### Rev. 8
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]
```

* Paramètres /  Parameters
  * test_size: 0.6
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE : ~0.09 🡻 (✓)
  * R2 : ~1 (~0.9999) 🢁 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 55/180
  * Arrival: 2/327

Notes:
* Model has slightly increased in accuracy. It manages to get nine more matches for departures compared to *Rev. 7*. **Unfit for production.**
* Le modèle a légèrement augmenté en précision et arrive à faire neuf prédictions en plus par rapport au modèle *Rev. 7*
  sur les départs. **Pas intégrable.**

#### Rev. 9 (POST AI Model v2)
```python
## Remove missing data in these columns
clean_features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Number of trains delayed at arrival',
    'Number of trains delayed > 15min',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]

## Train model using these columns
features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
    'Number of trains delayed > 15min',
    'Number of trains delayed > 30min',
    'Number of trains delayed > 60min',
    'Mois',
    'Jour de la semaine'
]

## Auxiliary columns used by specialized models
sp_features = [
    'Number of trains delayed at departure',
    'Average delay of late trains at departure',
    'Average delay of all trains at departure',
    'Number of trains delayed at arrival',
    'Average delay of late trains at arrival',
    'Average delay of all trains at arrival',
]
```

* Paramètres /  Parameters
  * test_size: 0.1
  * random_state: 42
  * n_estimators: 100
* Sortie /  Output
  * RMSE (GLOBAL) : ~0.7 🢁 (☓)
  * RMSE (Departures) : ~0.064 🡻 (✓)
  * RMSE (Arrivals) : ~0.013 🡻 (✓)
  * R2 (GLOBAL) : ~1 (~0.993) 🡻 (☓)
  * R2 (Departures) : ~1 (~0.9999) 🢁 (✓)
  * R2 (Arrivals) : ~1 (~0.99999) 🢁 (✓)
* Taux de Prédictions Réussies /  Pred Success Rate
  * Departures: 180/180
  * Arrival: 327/327

Notes:
* The general model has lost quality (likely due to test_size being set to 0.1 instead of 0.2). The specialized models have gained a lot in quality and merge well
  with the dataset. Noise on departures is minimal but noise on arrivals is a bit higher, which is likely due to garbage data inside the dataset.
  For now, **this model is recommended for deployment**.
* Le modèle général a perdu en qualité (probablement parce que test_size vaut 0,1 au lieu de 0,2). Les modèles spécialisés ont gagné en
  qualité et fusionnent bien avec le jeu de données. Le bruit sur les départs est minime mais le bruit sur les arrivées est un peu plus élevé,
  ce qui est probablement dû à des données inutiles dans le jeu de données. Pour le moment, **le modèle est intégrable pour le déploiement.**

### Conclusions

L'IA finale utilisée sur le projet sera la **révision 9**.

The final AI model used on the project will be the **9th revision**.

# Hyperparameter Tuning with GridSearchCV
This cell demonstrates how to use GridSearchCV to find the best hyperparameters for the RandomForestRegressor, which can help improve model precision.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values explored for every model
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}


def _run_grid_search(label, features, target):
    """Run a 3-fold grid search over ``param_grid`` and print the best result.

    Args:
        label: Prefix used in the printed report lines.
        features: Training features given to ``GridSearchCV.fit``.
        target: Training target given to ``GridSearchCV.fit``.

    Returns:
        The base ``RandomForestRegressor`` and the fitted ``GridSearchCV``.
    """
    forest = RandomForestRegressor(random_state=Parameters.random_state)
    search = GridSearchCV(
        estimator=forest,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        scoring="neg_mean_squared_error",
    )
    search.fit(features, target)

    print(f"{label}: Best parameters found:", search.best_params_)
    # The score is a negated MSE, so flip the sign before taking the root.
    print(f"{label}: Best RMSE:", np.sqrt(-search.best_score_))
    print("==================")
    return forest, search


# The searches run on the training split of each trainer
X_gen, y_gen = gen.X_train, gen.y_train
X_deps, y_deps = deps.X_train, deps.y_train
X_arrs, y_arrs = arrs.X_train, arrs.y_train

# General AI Scan
gen_rf, gen_grid_search = _run_grid_search("General AI", X_gen, y_gen)

# Departures AI Scan
deps_rf, deps_grid_search = _run_grid_search("Departures AI", X_deps, y_deps)

# Arrivals AI Scan
arrs_rf, arrs_grid_search = _run_grid_search("Arrivals AI", X_arrs, y_arrs)

General AI: Best parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
General AI: Best RMSE: 0.3177911825333763
Departures AI: Best parameters found: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Departures AI: Best RMSE: 0.2605424195556421
Arrivals AI: Best parameters found: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Arrivals AI: Best RMSE: 1.822022499985863
