In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.dates import date2num
import joblib  # Pour sauvegarder le modèle
import time  # Pour mesurer le temps d'entraînement

In [2]:
# Charger les données
df = pd.read_pickle('chemin/cleaned_data_cleaned_10min.pkl')

# Sélectionner les colonnes pertinentes
cols = ['H_Power', 'H_Power_Factor', 'H_Voltage', 'F_Rotation_Speed', 'H_Frequency', 'B_Flow', 'A_Flow', 'E_Pressure_Drops_1']
data = df[cols].dropna()  # Assurez-vous de ne pas avoir de valeurs manquantes

In [30]:
data

Unnamed: 0_level_0,H_Power,H_Power_Factor,H_Voltage,F_Rotation_Speed,H_Frequency,B_Flow,A_Flow,E_Pressure_Drops_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-01 00:00,34.113321,0.823854,10.055195,3001.267456,50.021124,246.806160,263.985489,173.610855
2023-01-01 00:10,34.060003,0.825730,10.056350,3000.067139,50.001119,247.323608,261.998383,174.025238
2023-01-01 00:20,34.043475,0.828994,10.056422,3000.064877,50.001081,247.323608,261.857555,173.953800
2023-01-01 00:30,34.026946,0.832258,10.056493,3000.062616,50.001044,247.323608,261.716728,173.882361
2023-01-01 00:40,34.010417,0.835521,10.056565,3000.060354,50.001006,247.323608,261.575900,173.810923
...,...,...,...,...,...,...,...,...
2023-10-31 23:10,33.584782,0.865749,10.059822,3003.404004,50.056733,246.402817,262.878845,174.679092
2023-10-31 23:20,33.539358,0.867168,10.060196,3003.386987,50.056450,246.633015,263.075386,174.603237
2023-10-31 23:30,33.493935,0.868588,10.060571,3003.369971,50.056166,246.863213,263.271927,174.527382
2023-10-31 23:40,33.448511,0.870008,10.060945,3003.352954,50.055883,247.093410,263.468468,174.451527


In [3]:
# Définir X et y
X = data.drop('H_Power', axis=1)
y = data['H_Power']

In [4]:
# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Définition de la grille d'hyperparamètres
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [6]:
# Création du modèle XGBoost
model = XGBRegressor(objective='reg:squarederror')


In [7]:
# Configuration de la recherche en grille
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)


In [8]:
# Mesurer le temps d'entraînement
start_time = time.time()

In [9]:
# Entraînement de la recherche en grille
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [10]:
# Calculer le temps d'entraînement
training_time_seconds = time.time() - start_time
training_time_minutes = training_time_seconds / 60

print(f"Training Time: {training_time_minutes:.2f} minutes")

# Meilleurs hyperparamètres trouvés
print("Best Hyperparameters:", grid_search.best_params_)

Training Time: 19.51 minutes
Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 300, 'subsample': 1.0}


In [11]:
# Meilleur modèle
best_model = grid_search.best_estimator_

In [12]:
# Faire des prédictions avec le meilleur modèle
y_pred = best_model.predict(X_test)

In [13]:
# Évaluation du modèle
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error with Optimized Model: {mse:.2f}')
print(f'R^2 Score with Optimized Model: {r2:.2f}')

Mean Squared Error with Optimized Model: 0.77
R^2 Score with Optimized Model: 0.99


In [15]:
# Fonction pour faire des prédictions pour un jour ou une semaine
def predict_for_date(start_date, duration='day'):
    if duration == 'day':
        date_range = pd.date_range(start=start_date, periods=144, freq='10T')  # 144 intervalles pour un jour
    elif duration == 'week':
        date_range = pd.date_range(start=start_date, periods=7 * 144, freq='10T')  # 7 jours avec 144 intervalles (10 min)
    else:
        raise ValueError("Duration must be 'day' or 'week'")

    # Créer un DataFrame pour les caractéristiques à prédire
    X_pred = pd.DataFrame(index=date_range)

    # Remplir X_pred avec les dernières valeurs connues
    last_known_values = X.iloc[-1].values
    
    for i, col in enumerate(X.columns):
        X_pred[col] = np.tile(last_known_values[i], len(X_pred))  # Remplir avec des valeurs constantes

    # Faire des prédictions
    y_pred = best_model.predict(X_pred)

    return pd.DataFrame(y_pred, index=date_range, columns=['Predicted_H_Power'])

# Exemple d'utilisation
predicted_day = predict_for_date(start_date='2023-10-31', duration='day')
predicted_week = predict_for_date(start_date='2023-10-31', duration='week')


  date_range = pd.date_range(start=start_date, periods=144, freq='10T')  # 144 intervalles pour un jour
  date_range = pd.date_range(start=start_date, periods=7 * 144, freq='10T')  # 7 jours avec 144 intervalles (10 min)


In [16]:
# Visualiser les résultats réels et prédits pour l'ensemble de test
predict_period_dates = df.index[-len(y_test):]
df_results = pd.DataFrame({
    'Real': y_test[:1008],
    'Predicted': y_pred[:1008],
    'Error': y_test[:1008] - y_pred[:1008]
})

In [17]:
df_results

Unnamed: 0_level_0,Real,Predicted,Error
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-03-13 10:50,32.376110,32.194077,0.182033
2023-05-09 10:30,32.523581,35.900089,-3.376509
2023-01-14 09:40,32.186863,31.565310,0.621553
2023-05-22 01:50,37.123076,37.132351,-0.009275
2023-04-17 00:50,38.386051,38.489796,-0.103745
...,...,...,...
2023-07-09 23:50,30.538004,29.807373,0.730631
2023-08-29 01:10,29.693855,29.419981,0.273874
2023-03-21 20:10,34.679277,34.612133,0.067144
2023-10-05 23:40,36.616744,36.545933,0.070811


In [18]:
# Créer des graphiques interactifs avec Plotly
def create_interactive_plot(dates, real, predicted, title, critical_value=33):
    fig = go.Figure()
    
    # Ajouter les données réelles
    fig.add_trace(go.Scatter(x=dates, y=real, mode='lines', name='Données Réelles'))
    
    # Ajouter les données prédites
    fig.add_trace(go.Scatter(x=dates, y=predicted, mode='lines', name='Données Prédites'))
    
    # Ajouter une ligne pour la valeur critique
    fig.add_trace(go.Scatter(x=[dates.min(), dates.max()], y=[critical_value, critical_value], mode='lines', line=dict(color='red', dash='dash'), name=f'Valeur Critique ({critical_value} MW)'))
    
    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title='Puissance Électrique (MW)',
        legend_title='Légende',
        xaxis=dict(tickformat='%Y-%m-%d %H:%M', tickangle=-45)
    )
    
    fig.show()


In [19]:
# Créer un DataFrame pour faciliter le découpage
df_advanced = pd.DataFrame({
    'date': pd.to_datetime(predict_period_dates),
    'real': y_test,
    'predicted': y_pred,
    'error': y_test - y_pred
})

In [20]:
# Calcul du nombre de points pour chaque période
points_per_day = 24 * 6  # 144 points par jour (intervalles de 10 minutes)
points_per_week = 7 * points_per_day
points_per_month = 28 * points_per_day  # Considérant un mois de 30 jours

In [21]:
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [22]:
# 1 jour (144 points pour des intervalles de 10 minutes)
df_day = df_advanced.head(points_per_day)
create_interactive_plot(df_day['date'], df_day['real'], df_day['predicted'], 'Réel vs Prédit (1 jour)')
df_day_table = df_day[['date', 'real', 'predicted', 'error']]


In [23]:

# 1 semaine (1008 points)
df_week = df_advanced.head(points_per_week)
create_interactive_plot(df_week['date'], df_week['real'], df_week['predicted'], 'Réel vs Prédit (1 semaine)')
df_week_table = df_week[['date', 'real', 'predicted', 'error']]



In [26]:

# 1 mois (4320 points pour un mois de 30 jours)
df_month = df_advanced.head(points_per_month)
create_interactive_plot(df_month['date'], df_month['real'], df_month['predicted'], 'Réel vs Prédit (1 mois de 30 jours)')
df_month_table = df_month[['date', 'real', 'predicted', 'error']]



In [29]:
import xgboost as xgb
# Sauvegarde du modèle entraîné
best_model.save_model('XGBOOST_h_power_model_predicted.json')

In [33]:
df_results_pred_Xgboost = df_results

In [34]:
# Enregistre le DataFrame dans un fichier .pkl
df_results_pred_Xgboost.to_pickle('df_results_pred_Xgboost.pkl')