In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

!pip install plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots


import numpy as np
from scipy.stats import shapiro, normaltest, probplot

from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
%run "variables.py"

In [None]:
%run "functions.ipynb" 

In [None]:
%run "02-feature-engineering.ipynb"

# Data loading

Load the CSV file `pred_maint_year_outliers_nulls.csv` into a pandas DataFrame. (An alternative source file, `pred_maint_timeseries_with_failures.csv`, was used previously.)

In [None]:
# Source CSV for this notebook. DATA_DIR is not defined in any visible cell;
# presumably it is provided by the `%run "variables.py"` cell above.
# An alternative file, 'pred_maint_timeseries_with_failures.csv' (same directory),
# was loaded here previously.
dataset_path = f'{DATA_DIR}/pred_maint_year_outliers_nulls.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows to confirm the load worked.
display(df.head())

# Model selection

Discuss potential modeling approaches suitable for time series anomaly detection or predictive maintenance (e.g., ARIMA, Prophet, LSTM, Isolation Forest, One-Class SVM). Explain the pros and cons of each approach in the context of this problem.


# Prepare data for modeling

Prepare data for modeling by handling missing values resulting from rolling calculations and splitting the data into training and testing sets.


In [None]:
# Discard every row that has a missing value in at least one column.
# The frame is modified in place, so `df` keeps referring to the same object;
# the original row labels are kept (the index is not reset).
df.dropna(axis=0, how='any', inplace=True)

# Preview the cleaned frame.
display(df.head())

**Reasoning**:
Split the data into training and testing sets (80/20 split) and separate features from target variables for both sets.


In [None]:
# Targets: one failure-flag column per gas-temperature sensor (1 through 6).
temperature_failure_flags = [f'enginegastemperature{i}_failure_flag' for i in range(1, 7)]

# Inputs: every remaining column except the timestamp.
features = [
    col for col in df.columns
    if col not in (*temperature_failure_flags, 'datetimestamp')
]

X, y = df[features], df[temperature_failure_flags]

# Positional 80/20 split without shuffling: the first 80% of the rows form the
# training set and the remaining rows form the test set.
train_size = int(len(df) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Report the resulting shapes as a quick sanity check.
print(
    "Training set shapes:",
    f"Features (X_train): {X_train.shape}",
    f"Targets (y_train): {y_train.shape}",
    sep="\n",
)
print(
    "\nTesting set shapes:",
    f"Features (X_test): {X_test.shape}",
    f"Targets (y_test): {y_test.shape}",
    sep="\n",
)

In [None]:
# prompt: concept simulation using isolation forest
#
# Unsupervised anomaly detection on the raw sensor readings with Isolation Forest.
# Adds a 0/1 column `isolation_forest_anomaly` to `df` (1 = flagged as anomaly).

# pandas (pd) and matplotlib (plt) are already imported in the notebook's first
# cell; only the scikit-learn estimator still needs to be imported here.
from sklearn.ensemble import IsolationForest

# Model inputs for this concept simulation: the six raw gas-temperature readings
# plus pressure. Engineered features (deviations, ratios, ...) could be used instead.
features_for_anomaly = [f'enginegastemperature{i}' for i in range(1, 7)] + ['pressure']

# Work on a copy so the model input is decoupled from later changes to `df`.
X_anomaly = df[features_for_anomaly].copy()

# This cell assumes NaN-free inputs. The earlier `df.dropna(...)` cell ensures that
# when the notebook is run top to bottom; check explicitly so that out-of-order
# execution fails here with a clear message instead of further downstream.
nan_counts = X_anomaly.isna().sum()
assert not nan_counts.any(), (
    f"NaN values found in anomaly features: {nan_counts[nan_counts > 0].to_dict()}"
)

# Model configuration:
#   n_estimators  - number of trees in the ensemble.
#   contamination - 'auto' lets scikit-learn determine the decision threshold as in
#                   the original Isolation Forest paper; a float would instead fix
#                   the expected proportion of outliers. With few rule-defined
#                   failures a small float may be worth trying later.
#   random_state  - fixed seed for reproducible results.
isolation_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)

# Fit on every row of `df`: the model is used here to flag outliers within the
# full dataset, not to generalise to a held-out set.
isolation_forest.fit(X_anomaly)

# predict() labels outliers as -1 and inliers as 1; recode to 1 = anomaly, 0 = normal.
raw_labels = isolation_forest.predict(X_anomaly)
df['isolation_forest_anomaly'] = pd.Series(raw_labels, index=df.index).map({-1: 1, 1: 0})

# Report how many rows were flagged.
print("Number of anomalies detected by Isolation Forest:", df['isolation_forest_anomaly'].sum())

# Cross-check the unsupervised anomaly labels against the rule-based failure flags.
print("\nComparison of Isolation Forest anomalies and defined failure flags:")

# Collapse the per-sensor flags into one column using the row-wise maximum
# (assuming 0/1 flags, this is 1 whenever at least one sensor flag is set).
df['any_temp_failure'] = df.loc[:, temperature_failure_flags].max(axis=1)

# Contingency table: rows = Isolation Forest label, columns = combined failure flag.
comparison_table = pd.crosstab(
    index=df['isolation_forest_anomaly'],
    columns=df['any_temp_failure'],
    rownames=['Isolation Forest Anomaly'],
    colnames=['Defined Failure Flag'],
)
print(comparison_table)

# Caveat: this is only a concept simulation. Isolation Forest is unsupervised and
# flags statistical outliers in the feature distribution, whereas the failure flags
# come from a domain-knowledge rule, so the two are not expected to match exactly.
# The table counts how often the two labellings agree and disagree.

# Visualise one sensor, highlighting the rows flagged by Isolation Forest.
viz_features = ['enginegastemperature1', 'enginegastemperature2']  # candidate sensors
viz_feature = viz_features[0]  # only this sensor is drawn; pick another entry to change it

# Use real timestamps on the x-axis when the column is available. Otherwise fall
# back to the row index and label the axis accordingly, so the plot never claims
# to show time when it actually shows row positions.
if 'datetimestamp' in df.columns:
    x_values = pd.to_datetime(df['datetimestamp'])
    x_label = 'Time'
else:
    x_values = pd.Series(df.index, index=df.index)
    x_label = 'Row index'

normal_mask = df['isolation_forest_anomaly'] == 0
anomaly_mask = df['isolation_forest_anomaly'] == 1

fig, ax = plt.subplots(figsize=(15, 7))
# Normal points first, so the anomalies are drawn on top of them.
ax.scatter(x_values.loc[normal_mask], df.loc[normal_mask, viz_feature],
           c='blue', s=10, label='Normal')
ax.scatter(x_values.loc[anomaly_mask], df.loc[anomaly_mask, viz_feature],
           c='red', s=20, label='Anomaly (Isolation Forest)')
ax.set_title(f'Isolation Forest Anomaly Detection on {viz_feature} over Time')
ax.set_xlabel(x_label)
ax.set_ylabel(viz_feature)
ax.legend()
plt.show()

# The same plot can be repeated for other sensors or feature combinations.