In [None]:
# --- Tarek Djaker notebook profile ---
# Optional notebook setup (initialisation + banner) provided by the external
# `tarek_profile` helper module. The helper is not part of this notebook, so its
# absence must not prevent the analysis below from running.
import os
import sys

# Location of the helper library. The TAREK_PROFILE_LIB environment variable
# overrides the author's local default so the notebook is not tied to one machine.
_PROFILE_LIB_DIR = os.environ.get(
    'TAREK_PROFILE_LIB',
    r'C:\Users\pigio\OneDrive\Documents\OneDrive\Desktop\projets\data_science_practice_2025\Tarek Djaker\lib',
)

# Guard against growing sys.path every time the cell is re-run.
if _PROFILE_LIB_DIR not in sys.path:
    sys.path.append(_PROFILE_LIB_DIR)

try:
    from tarek_profile import nb_init, profile_banner
except ImportError:
    print("tarek_profile not found; skipping notebook profile setup.")
else:
    nb_init()
    profile_banner(title=None)
# -------------------------------------

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

### Data Collection

In [None]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download `url` and write the response body to `file_name`.

    Args:
        url: Address fetched with an HTTP GET request.
        file_name: Destination path, opened in binary write mode (an existing
            file is overwritten).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by `raise_for_status`).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch the train and test CSV files into the current working directory.
# Each call overwrites the local file if it already exists.
download_file(train_data_url, 'module5_exercise_train.csv')
download_file(test_data_url, 'module5_exercise_test.csv')

In [None]:
# Load the two downloaded CSV files into DataFrames (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("module5_exercise_train.csv", "module5_exercise_test.csv")
)

### Data analysis

In [None]:
#### Make a complete analysis on data preprocessing
# Inconsistencies
# Duplicates (data.duplicated().sum())
# Missing values (data.isnull().sum())
# Categorical
# Outliers
# Feature Engineering
# Feature Selection and/or Dimensionality Reduction

In [None]:
# Stack the train and test rows into a single frame for exploratory analysis.
# ignore_index=True gives every row a unique label; without it the test rows
# reuse the training labels and any index-aligned operation (joins on the
# index, label-based assignment) would match rows more than once.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Draw one column of `df` against its 'date' column over a date window.

    Args:
        df: DataFrame holding a 'date' column and the column to plot.
        feature: Name of the column shown on the y-axis.
        date_id_start: Lower bound of the window (inclusive).
        date_id_end: Upper bound of the window (inclusive).

    Returns:
        None. A message is printed instead of a plot when `feature` is not a
        column of the frame.
    """
    # Keep only the rows whose date falls inside the requested window.
    inside_window = (df['date'] >= date_id_start) & (df['date'] <= date_id_end)
    window = df[inside_window]

    if feature not in window.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    plt.figure(figsize=(10, 6))
    plt.plot(window['date'], window[feature], label=feature, linestyle='-')

    # Titles and axis decoration.
    plt.title(f'{feature} from {date_id_start} to {date_id_end}')
    plt.xlabel('Date')
    plt.ylabel(feature)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()

    plt.tight_layout()
    plt.show()



In [None]:
# Parse the 'date' column into pandas datetimes (overwrites the column in `data`).
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Display the combined frame after date parsing.
data

In [None]:
import pandas as pd
import numpy as np # Ajout de numpy pour la gestion des NaN
from typing import Union, Dict, Any

def ordinal_encode_column(
    column: pd.Series,
    mapping: Dict[Any, int] = None,
    return_mapping: bool = False
) -> Union[pd.Series, tuple[pd.Series, Dict[Any, int]]]:
    """
    Replace each distinct value of a column by an integer code.

    When no mapping is given, one is built from the sorted distinct non-NaN
    values (codes start at 0) and printed. When a mapping is given, it is
    applied as-is and a warning is printed if the column contains non-NaN
    values that the mapping does not cover. Values without a code (NaN or
    unknown) come out as NaN.

    Args:
        column (pd.Series): The column to encode.
        mapping (Dict[Any, int], optional): Existing {value: code} dictionary.
                                            Defaults to None (build a new one).
        return_mapping (bool): If True, also return the mapping that was used.

    Returns:
        Union[pd.Series, tuple[pd.Series, Dict[Any, int]]]:
            - (encoded_column, mapping) when 'return_mapping' is True.
            - encoded_column alone otherwise.
    """
    mapping_supplied = mapping is not None

    if not mapping_supplied:
        # Build the codes from the sorted distinct values; NaN is left out so
        # that missing entries stay NaN after encoding.
        distinct_values = list(column.dropna().unique())
        distinct_values.sort()
        mapping = dict(zip(distinct_values, range(len(distinct_values))))

        print("Mapping créé (valeurs uniques et leurs entiers associés) :")
        print(mapping)

    encoded_column = column.map(mapping)

    # Only a caller-supplied mapping can miss values present in the column.
    if mapping_supplied and encoded_column.isnull().any():
        # Values that became NaN although they were not NaN to begin with.
        unmapped_values = column[encoded_column.isnull() & column.notna()].unique()
        if len(unmapped_values) > 0:
            print(f"ATTENTION : {len(unmapped_values)} nouvelles valeurs non mappées trouvées et encodées en NaN. Mapping nécessaire : {unmapped_values[:5]}...")

    return (encoded_column, mapping) if return_mapping else encoded_column

In [None]:
# Replace the two categorical columns by integer codes. A fresh mapping is
# built (and printed) for each column from its sorted distinct values.
data['weather_condition'] = ordinal_encode_column(data['weather_condition'])
data['oil_brent_price_indicator'] = ordinal_encode_column(data['oil_brent_price_indicator'])

In [None]:
import pandas as pd


# 1 m/s = 3.6 km/h
CONVERSION_FACTOR_MS_TO_KMH = 3.6

def convertir_vitesse(vitesse_col: pd.Series) -> pd.Series:
    """
    Convert a speed column mixing 'km/h' and 'm/s' strings into floats in km/h.

    Args:
        vitesse_col (pd.Series): Raw speed column (e.g. '27.74 km/h' or '6.83 m/s').
            Values without a unit are read as plain numbers and kept unchanged.

    Returns:
        pd.Series: New float Series in km/h. Missing inputs and unit-less values
        that cannot be parsed as a number become NaN. The input is not modified.

    Raises:
        ValueError: If a value carries a recognised unit but its numeric part
            cannot be parsed.
    """
    missing = float('nan')

    def process_value(val):
        if pd.isna(val):
            return missing  # keep missing values missing

        val_str = str(val).strip()

        if 'km/h' in val_str:
            # Already in the target unit: strip the suffix and parse.
            return float(val_str.replace('km/h', '').strip())

        if 'm/s' in val_str:
            # Parse the number, then convert metres/second to km/h.
            return float(val_str.replace('m/s', '').strip()) * CONVERSION_FACTOR_MS_TO_KMH

        # No unit found: accept values that are already numeric.
        try:
            return float(val_str)
        except ValueError:
            # A float NaN (rather than pd.NA) keeps the result a float column;
            # pd.NA would silently turn the whole Series into object dtype.
            return missing

    # astype(float) guarantees the documented dtype, including for empty input.
    return vitesse_col.apply(process_value).astype(float)


In [None]:
# Normalise the wind speed column to numeric km/h values (overwrites the column).
data['wind_speed'] = convertir_vitesse(data['wind_speed'])

In [None]:
# Inspect the converted wind speed column.
data['wind_speed']

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

# Note: Ces bibliothèques doivent être installées:
# pip install pandas numpy xgboost

def mice_xgboost_imputer_manual(df: pd.DataFrame, max_iter: int = 10, random_state: int = 42, estimator=None) -> pd.DataFrame:
    """
    Impute missing numeric values with a hand-written MICE loop
    (Multiple Imputation by Chained Equations).

    Every numeric column containing NaN is first filled with its mean. Then, for
    `max_iter` rounds, each such column is predicted in turn from all the other
    numeric columns: the regressor is fitted on the rows where the column was
    originally observed and its predictions replace the originally missing
    entries. Non-numeric columns are passed through untouched.

    Args:
        df (pd.DataFrame): Input frame, possibly containing NaN. Not modified.
        max_iter (int): Number of imputation rounds.
        random_state (int): Seed for the column-order shuffling and for the
            default regressor.
        estimator: Optional regressor exposing `fit(X, y)` and `predict(X)`.
            Defaults to an `XGBRegressor` (100 trees, 'hist' tree method).

    Returns:
        pd.DataFrame: A frame with the same rows, index and column order as
        `df`, with the numeric NaN values imputed. When there is nothing to
        impute, `df` itself is returned.

    Raises:
        TypeError: If `df` is not a pandas DataFrame.
    """

    # 1. Input checks
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"L'entrée 'df' doit être un pandas DataFrame, mais est de type {type(df)}.")

    numeric_cols = df.select_dtypes(include=np.number).columns
    df_numeric = df[numeric_cols]

    if df_numeric.empty:
        print("\nAvertissement: Aucune colonne numérique trouvée pour l'imputation. Retour du DataFrame original.")
        return df

    cols_with_nan = df_numeric.columns[df_numeric.isnull().any()].tolist()
    if not cols_with_nan:
        print("\nAvertissement: Aucune valeur manquante dans les colonnes numériques. Retour du DataFrame original.")
        return df

    print(f"\n--- Démarrage de l'imputation MICE manuelle ---")
    print(f"Colonnes à imputer: {cols_with_nan}")

    # A column without a single observed value offers nothing to fit on
    # (fitting on zero rows would fail), so it is left as it is.
    empty_cols = [col for col in cols_with_nan if not df_numeric[col].notna().any()]
    if empty_cols:
        print(f"Avertissement: colonnes entièrement vides ignorées (aucune valeur observée): {empty_cols}")
    cols_to_impute = [col for col in cols_with_nan if col not in empty_cols]

    # 2. Warm-up: start from mean-filled values (fillna returns a new frame).
    df_imputed = df_numeric.fillna(df_numeric.mean())

    if estimator is None:
        estimator = XGBRegressor(
            n_estimators=100,
            random_state=random_state,
            n_jobs=-1,
            tree_method='hist',
            verbosity=0
        )

    # Private generator: reproducible shuffling without reseeding NumPy's
    # global random state for the rest of the notebook.
    rng = np.random.RandomState(random_state)

    # Positional (NumPy) masks of the originally missing entries. Being
    # positional, they stay correct even when index labels repeat.
    missing_masks = {col: df_numeric[col].isna().to_numpy() for col in cols_to_impute}

    # 3. MICE rounds
    for iter_count in range(max_iter):

        # Visit the columns in a different order each round to avoid order bias.
        rng.shuffle(cols_to_impute)

        for col_to_impute in cols_to_impute:
            missing = missing_masks[col_to_impute]

            # Predictors: every other numeric column in its current imputed state.
            predictors = df_imputed.drop(columns=[col_to_impute])

            # Fit on the rows where the target was really observed, using the
            # original (non-imputed) target values.
            estimator.fit(predictors.loc[~missing], df_numeric.loc[~missing, col_to_impute])

            # Overwrite only the originally missing entries.
            df_imputed.loc[missing, col_to_impute] = estimator.predict(predictors.loc[missing])

        print(f"Itération {iter_count + 1}/{max_iter} complétée.")

    # 4. Write the imputed columns back positionally into a copy of the input.
    # This keeps the original index, column order and non-numeric columns, and
    # cannot duplicate rows the way an index-based merge does when labels repeat.
    result = df.copy()
    for col in cols_to_impute:
        result[col] = df_imputed[col].to_numpy()

    print("Imputation MICE (XGBoost) terminée. Vérification des NaN dans les colonnes numériques: ", result[numeric_cols].isnull().any().any())
    print("-----------------------------------")

    return result

In [None]:
# Fill the missing numeric values of the combined frame (10 rounds, fixed seed).
# NOTE(review): `data` stacks df_train and df_test. If the test file has no
# 'electricity_demand' column, the target is NaN on those rows and is imputed
# here as well — confirm this is intended.
data = mice_xgboost_imputer_manual(data, max_iter=10, random_state=42)

In [None]:
def iqr_outlier_remover(column: pd.Series) -> pd.Series:
    """
    Replace IQR outliers of a numeric column by the column median.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR. A summary line is printed in every case.

    Args:
        column (pd.Series): Numeric column to process.

    Returns:
        pd.Series: A copy of the column with the outliers replaced. A
        non-numeric column is returned as-is (same object) after a warning.

    Raises:
        TypeError: If `column` is not a pandas Series.
    """
    if not isinstance(column, pd.Series):
        raise TypeError("L'entrée doit être une série Pandas.")

    if not pd.api.types.is_numeric_dtype(column):
        print(f"Avertissement: La colonne '{column.name}' n'est pas numérique. Retour de la colonne originale.")
        return column

    # Fences at the conventional 1.5 * IQR beyond the quartiles.
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width

    # Replacement value, computed on the untouched column.
    median_value = column.median()

    outlier_mask = (column < lower_bound) | (column > upper_bound)
    num_outliers = outlier_mask.sum()

    # Work on a copy so the caller's Series is left untouched.
    column_processed = column.copy()

    if num_outliers == 0:
        print(f"Traitement de la colonne '{column.name}': Aucun outlier détecté par la méthode IQR.")
        return column_processed

    column_processed[outlier_mask] = median_value
    print(f"Traitement de la colonne '{column.name}': {num_outliers} outliers remplacés par la médiane ({median_value:.2f}).")
    return column_processed

In [None]:
# Replace IQR outliers of the electricity demand by the column median.
data['electricity_demand'] = iqr_outlier_remover(data['electricity_demand'])

In [None]:
# Visual check of the cleaned demand between 2017-01-01 and 2019-09-07.
plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

In [None]:
# Replace IQR outliers of the humidity by the column median.
data['humidity'] = iqr_outlier_remover(data['humidity'])

In [None]:
# Visual check of the cleaned humidity between 2016-06-01 and 2016-12-01.
plot_feature_over_time(data, 'humidity', '2016-06-01', '2016-12-01')

### Data Preprocessing Evaluation Strategy

In [None]:
# Provide the complete set of data preprocessing transformations

In [None]:
# 1. Handle Inconsistencies
def handle_inconsistencies(X_train, y_train, X_val=None):
    """Pass-through step for inconsistency handling.

    No value is changed: the feature frames are copied and `y_train` is
    returned as received.

    Returns:
        (X_train copy, y_train) when `X_val` is None, otherwise
        (X_train copy, y_train, X_val copy).
    """
    train_copy = X_train.copy()
    if X_val is None:
        return train_copy, y_train
    return train_copy, y_train, X_val.copy()

# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    """Drop exact duplicate rows from the training features and their targets.

    Only the first occurrence of each repeated row of `X_train` is kept, and
    `y_train` is filtered with the same positional mask so features and target
    stay aligned. The validation frame is never de-duplicated, only copied.

    Args:
        X_train: Training features (DataFrame).
        y_train: Training target (Series with one entry per row of `X_train`).
        X_val: Optional validation/test features.

    Returns:
        (X_train, y_train) when `X_val` is None, otherwise
        (X_train, y_train, X_val copy).
    """
    # Positional mask: independent of index labels, so it also works when
    # labels are not unique.
    keep = ~X_train.duplicated().to_numpy()
    X_train_no_duplicates = X_train.loc[keep].copy()
    y_train_no_duplicates = y_train.loc[keep]

    # The previous version dereferenced X_val in the branch where it is None.
    if X_val is None:
        return X_train_no_duplicates, y_train_no_duplicates
    return X_train_no_duplicates, y_train_no_duplicates, X_val.copy()

# 3. Handling Missing Values
def handle_missing_values(X_train, y_train, X_val=None):
    """Replace every missing value in the feature frames by the sentinel -1.

    `y_train` is accepted for signature uniformity and is not used.

    Returns:
        The filled training frame when `X_val` is None, otherwise the pair
        (filled X_train, filled X_val).
    """
    filled_train = X_train.fillna(-1)
    if X_val is None:
        return filled_train
    filled_val = X_val.fillna(-1)
    return filled_train.copy(), filled_val.copy()

# 4. Handling Categorical Values
def handle_categorical(X_train, y_train, X_val=None):
    """Pass-through step for categorical handling.

    The feature frames are copied unchanged; `y_train` is not used.

    Returns:
        A copy of `X_train` when `X_val` is None, otherwise the pair
        (X_train copy, X_val copy).
    """
    train_copy = X_train.copy()
    return train_copy if X_val is None else (train_copy, X_val.copy())

# 5. Handling Outliers
def handle_outliers(X_train, y_train, X_val=None):
    """Pass-through step for outlier handling.

    Nothing is altered: feature frames are copied, `y_train` is handed back
    untouched.

    Returns:
        (X_train copy, y_train) when `X_val` is None, otherwise
        (X_train copy, y_train, X_val copy).
    """
    outputs = [X_train.copy(), y_train]
    if X_val is not None:
        outputs.append(X_val.copy())
    return tuple(outputs)

# 6. Feature Engineering
def feature_engineering(X_train, y_train, X_val=None):
    """Pass-through step for feature engineering.

    No feature is created: the feature frames are copied and `y_train` is
    returned as received.

    Returns:
        (X_train copy, y_train) when `X_val` is None, otherwise
        (X_train copy, y_train, X_val copy).
    """
    engineered_train = X_train.copy()
    if X_val is None:
        return engineered_train, y_train
    engineered_val = X_val.copy()
    return engineered_train, y_train, engineered_val

# 7. Feature Selection and Dimensionality Reduction
def feature_selection(X_train, y_train, X_val=None):
    """Keep only the humidity column and the ten temperature station columns.

    `y_train` is not used.

    Returns:
        The reduced training frame when `X_val` is None, otherwise the pair
        (reduced X_train, reduced X_val).

    Raises:
        KeyError: If one of the selected columns is missing from a frame.
    """
    # 'humidity' followed by temperature_station1 ... temperature_station10.
    selected_columns = ['humidity'] + [
        f'temperature_station{station}' for station in range(1, 11)
    ]
    train_subset = X_train[selected_columns]
    if X_val is None:
        return train_subset
    return train_subset, X_val[selected_columns]

In [None]:
def evaluate_pipeline(X, y, n_splits=5):
    """Score the preprocessing pipeline with a time-ordered cross-validation.

    Steps that learn nothing from the data run once on the whole set; the
    remaining steps run inside every fold of a `TimeSeriesSplit`. A
    `LinearRegression` is fitted per fold and the train/validation mean squared
    errors are printed for each fold and summarised at the end.

    Args:
        X: Feature frame, rows in chronological order.
        y: Target series aligned with `X`.
        n_splits: Number of time-series folds.

    Returns:
        The mean validation MSE over the folds.
    """
    # --- Steps applied once, before splitting (nothing is learned from the data).
    # Duplicate handling, categorical handling and feature engineering are not
    # applied at this stage.
    X, y = handle_inconsistencies(X, y)
    X = handle_missing_values(X, y)
    X, y = handle_outliers(X, y)
    X = feature_selection(X, y)

    model = LinearRegression()
    tscv = TimeSeriesSplit(n_splits=n_splits)

    train_scores, val_scores = [], []

    for fold, (train_index, val_index) in enumerate(tscv.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")

        # Positional split into the training part and the later validation part.
        X_train = X.iloc[train_index].copy()
        y_train = y.iloc[train_index].copy()
        X_val = X.iloc[val_index].copy()
        y_val = y.iloc[val_index].copy()

        # --- Steps applied per fold (they may learn from the training part).
        X_train, y_train, X_val = handle_duplicates(X_train, y_train, X_val)
        X_train, X_val = handle_categorical(X_train, y_train, X_val)
        X_train, y_train, X_val = feature_engineering(X_train, y_train, X_val)

        model.fit(X_train, y_train)

        # Error on the data the model was fitted on.
        train_mse = mean_squared_error(y_train, model.predict(X_train))
        train_scores.append(train_mse)

        # Error on the held-out, later data.
        val_mse = mean_squared_error(y_val, model.predict(X_val))
        val_scores.append(val_mse)

        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")

    # Summary statistics over the folds.
    mean_train_mse, max_train_mse, min_train_mse = (
        np.mean(train_scores), np.max(train_scores), np.min(train_scores)
    )
    mean_val_mse, max_val_mse, min_val_mse = (
        np.mean(val_scores), np.max(val_scores), np.min(val_scores)
    )

    print("\nTrain MSE:")
    print(f"Mean: {mean_train_mse:.4f}, Max: {max_train_mse:.4f}, Min: {min_train_mse:.4f}")

    print("\nValidation MSE:")
    print(f"Mean: {mean_val_mse:.4f}, Max: {max_val_mse:.4f}, Min: {min_val_mse:.4f}")

    return mean_val_mse  # overall score of the pipeline

In [None]:
# Separate the target from the features; df_train itself is left untouched.
y = df_train['electricity_demand'].copy()
X = df_train.drop(columns=['electricity_demand'])

# Cross-validate the pipeline (the mean validation MSE is the cell output).
evaluate_pipeline(X, y)

### Generating Submission File

In [None]:
# Train and submit your results

In [None]:
# Reload the raw CSV files so the submission starts from unmodified data.
df_train = pd.read_csv("module5_exercise_train.csv", sep=",")
X_test = pd.read_csv("module5_exercise_test.csv", sep=",")

# Target and features of the training set.
y_train = df_train['electricity_demand']
X_train = df_train.drop(columns=['electricity_demand'])

In [None]:
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Run the full preprocessing chain, fit on all training data, predict the test set.

    The seven preprocessing steps are applied in their numbered order to the
    training and test features together, then a `LinearRegression` is fitted on
    the processed training data.

    Args:
        X_train: Raw training features.
        y_train: Training target.
        X_test: Raw test features.

    Returns:
        The model's predictions for the processed `X_test`.
    """
    # Steps 1-2: return (X_train, y_train, X_test).
    X_train, y_train, X_test = handle_inconsistencies(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_duplicates(X_train, y_train, X_test)

    # Steps 3-4: return (X_train, X_test) only.
    X_train, X_test = handle_missing_values(X_train, y_train, X_test)
    X_train, X_test = handle_categorical(X_train, y_train, X_test)

    # Steps 5-6: return (X_train, y_train, X_test).
    X_train, y_train, X_test = handle_outliers(X_train, y_train, X_test)
    X_train, y_train, X_test = feature_engineering(X_train, y_train, X_test)

    # Step 7: returns (X_train, X_test) only.
    X_train, X_test = feature_selection(X_train, y_train, X_test)

    regressor = LinearRegression()

    print(f"Training model on entire dataset of shape: {X_train.shape}")
    regressor.fit(X_train, y_train)

    print(f"Predicting on test dataset of shape: {X_test.shape}")
    return regressor.predict(X_test)

In [None]:
# Call train_and_predict_to_submit to fit on the full training set and predict the test set
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Build the submission table: one predicted demand per test date.
submission = X_test[['date']].assign(electricity_demand=y_test_pred)

# Write it next to the notebook, without the index column.
submission.to_csv('submission.csv', index=False, sep=',')
print("Submission file saved as 'submission.csv'.")