Enhancements Made:
Model Checkpointing: Added ModelCheckpoint to save the best model during training.
More Informative Logging: Kept verbose=2 so that training prints one summary line per epoch.
Better Data Handling: Coerced the selected columns to numeric (non-numeric entries become NaN) and dropped every row containing a NaN value.
Hyperparameter Tuning: Used multiple callbacks (EarlyStopping, ReduceLROnPlateau, ModelCheckpoint) for better control of the training process.
Advanced Plotting: Improved plotting for better visualization of training and validation losses, and anomaly detection.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import matplotlib.pyplot as plt
import tensorflow as tf

# ---------------------------------------------------------------------------
# Data loading and scaling
# ---------------------------------------------------------------------------

# Main dataset (adjust the path if necessary).
data_path = 'data2.csv'
data = pd.read_csv(data_path)

# CSV whose 'Feature' column lists the columns to keep (adjust the path if
# necessary).
selected_columns_path = 'AirEau_features_lag15.csv'
selected_columns_df = pd.read_csv(selected_columns_path)
selected_columns = selected_columns_df['Feature'].tolist()

# The prediction target must be part of the selection; it is appended last
# when the feature list does not already contain it.
if 'Error Code' not in selected_columns:
    selected_columns += ['Error Code']

# Coerce every selected column to numeric (unparseable entries become NaN)
# and keep only the rows that are complete afterwards.
data_selected = (
    data[selected_columns]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Min-max scale all selected columns together.
# NOTE(review): both scalers are fit on the full dataset, i.e. before the
# train/test split further down - confirm this is acceptable for evaluation.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data_selected)

# Fit a dedicated scaler on the target column and write its output back into
# the scaled matrix; this scaler is what later maps predictions back to the
# original 'Error Code' scale.
target_scaler = MinMaxScaler()
error_code_values = data_selected['Error Code'].values.reshape(-1, 1)
scaled_error_code = target_scaler.fit_transform(error_code_values)
scaled_data[:, selected_columns.index('Error Code')] = scaled_error_code.ravel()

# Windowing parameters.
lookback = 60  # number of past time steps fed to the model
LABELS = 6  # number of future time steps to predict

# Create sequences of data
def create_sequences(data, lookback, labels, target_idx):
    """Slice a 2-D series into overlapping input windows and future targets.

    Window ``i`` pairs the ``lookback`` rows starting at row ``i`` with the
    ``labels`` values of column ``target_idx`` that immediately follow them.

    Args:
        data: Array-like of shape (n_rows, n_features), ordered in time.
        lookback: Number of consecutive rows in each input window.
        labels: Number of future target values attached to each window.
        target_idx: Column index of the value to predict.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        (n_windows, lookback, n_features), ``y`` has shape
        (n_windows, labels) and
        ``n_windows = max(0, n_rows - lookback - labels + 1)``.

    Raises:
        ValueError: If ``lookback`` or ``labels`` is negative.
    """
    if lookback < 0 or labels < 0:
        raise ValueError("lookback and labels must be non-negative")

    data = np.asarray(data)
    n_windows = len(data) - lookback - labels + 1

    if n_windows <= 0:
        # Too few rows for even one window: return empty arrays that still
        # carry the window dimensions, so callers reading X.shape[1] or
        # X.shape[2] keep working instead of receiving shape-(0,) arrays.
        empty_X = np.empty((0, lookback) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0, labels), dtype=data.dtype)
        return empty_X, empty_y

    X = np.array([data[i:i + lookback] for i in range(n_windows)])
    y = np.array([
        data[i + lookback:i + lookback + labels, target_idx]
        for i in range(n_windows)
    ])
    return X, y

# Column position of the prediction target inside the scaled matrix.
target_idx = selected_columns.index('Error Code')

# X holds the lookback windows, y the LABELS future target values per window.
# y is already 2-D (one row per window), so no reshape is needed before it is
# compared with the model's Dense(LABELS) output.
X, y = create_sequences(scaled_data, lookback, LABELS, target_idx)

# Chronological split: the first 90% of the windows train the model and the
# last 10% are held out. Consecutive windows overlap almost entirely, so a
# shuffled split would place near-duplicates of each test window in the
# training set (leakage) and would scramble the time order that the
# sample-axis plots later in the script rely on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, shuffle=False)

# Print the shapes of the datasets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Create a Sequential model
model = Sequential()

# Two stacked LSTM layers, each followed by Dropout for regularization.
# The first layer returns the full sequence so the second LSTM can consume
# it; the second returns only its final state.
model.add(LSTM(100, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))

# One output unit per future time step to predict.
model.add(Dense(LABELS))

# Compile the model ('adam' given by name uses the optimizer's default
# learning rate, which Keras documents as 0.001).
model.compile(loss='mean_squared_error', optimizer='adam')

# Display the model summary
model.summary()

# Callbacks for early stopping, learning rate reduction, and model checkpointing.
# min_lr must sit below the optimizer's starting learning rate: with the
# previous floor of 0.001 (equal to Adam's default) ReduceLROnPlateau could
# never actually lower the rate.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# Train the model with batch size of 64; shuffle=False keeps the batches in
# the order of X_train.
# NOTE(review): the held-out test set doubles as validation data here, so
# early stopping and checkpoint selection are tuned on the same samples that
# are evaluated afterwards - consider a separate validation split.
history = model.fit(X_train, y_train, epochs=500, batch_size=64, validation_data=(X_test, y_test),
                    verbose=2, shuffle=False, callbacks=[early_stopping, reduce_lr, model_checkpoint])

# Draw the per-epoch training and validation loss curves recorded by
# model.fit on one figure.
plt.figure(figsize=(8, 6))
for _history_key, _curve_label in (('loss', 'Training Loss'),
                                   ('val_loss', 'Validation Loss')):
    plt.plot(history.history[_history_key], label=_curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Model Training and Validation Loss')
plt.legend()
plt.show()

def _to_original_scale(scaled_values):
    """Flatten to a single column and undo the target's min-max scaling."""
    return target_scaler.inverse_transform(scaled_values.reshape(-1, 1))


# Restore the weights written by the ModelCheckpoint callback.
model.load_weights('best_model.h5')

# Predict the held-out windows.
predictions = model.predict(X_test)

# Bring both the ground truth and the predictions back to the original
# 'Error Code' scale; every (window, step) pair becomes one row.
y_test_inverse = _to_original_scale(y_test)
predictions_inverse = _to_original_scale(predictions)

# Mean squared error in original units.
mse = mean_squared_error(y_test_inverse, predictions_inverse)
print(f'Mean Squared Error: {mse}')

# Overlay the true and predicted series.
plt.figure(figsize=(12, 6))
for _series, _series_label in ((y_test_inverse, 'True Values'),
                               (predictions_inverse, 'Predicted Values')):
    plt.plot(_series, label=_series_label)
plt.xlabel('Samples')
plt.ylabel('Value')
plt.title('True vs Predicted Values')
plt.legend()
plt.show()

# Absolute error above which a sample is flagged as an anomaly. It is
# compared against errors on the inverse-transformed values, so adjust it to
# suit your data.
threshold = 0.05  # Example threshold

# Per-sample absolute error between truth and prediction, and the boolean
# mask of the samples exceeding the threshold.
differences = np.abs(y_test_inverse - predictions_inverse)
anomalies = differences > threshold

# Sample positions and true values of the flagged points.
anomaly_positions = np.where(anomalies)[0]
anomaly_values = y_test_inverse[anomalies]

# Plot both series and mark the anomalies in red on the true-value curve.
plt.figure(figsize=(12, 6))
plt.plot(y_test_inverse, label='True Values')
plt.plot(predictions_inverse, label='Predicted Values')
plt.scatter(anomaly_positions, anomaly_values, color='red', label='Anomalies')
plt.xlabel('Samples')
plt.ylabel('Value')
plt.title('Anomalies Detection')
plt.legend()
plt.show()
