In [5]:
import pandas as pd
from sqlalchemy import create_engine

# Create a connection to the database
engine = create_engine('mssql+pyodbc://MSI/predictive_maintenance_db?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes')

# Load the processed data from the 'robs_processed_PdM_data' table into a Pandas DataFrame
merged_df_final = pd.read_sql('SELECT * FROM robs_processed_PdM_data', con=engine)

# Inspect the loaded data
print(merged_df_final.head())


         volt    pressure  vibration      rotate  vibration_t-1  pressure_t-1  \
0  160.885186  109.714695  33.203553  436.054810      32.535653     83.877577   
1  188.572644   82.834470  43.547499  322.956840      33.203553    109.714695   
2  184.634151   92.390897  34.471015  498.404580      43.547499     82.834470   
3  158.910760  105.526910  39.632091  354.412791      34.471015     92.390897   
4  170.827583   86.865686  32.497976  435.652685      39.632091    105.526910   

     volt_t-1  rotate_t-1  total_failure_flag  total_maint_flag  ...  \
0  167.517952  492.004989                   0                 0  ...   
1  160.885186  436.054810                   0                 0  ...   
2  188.572644  322.956840                   0                 0  ...   
3  184.634151  498.404580                   0                 0  ...   
4  158.910760  354.412791                   0                 0  ...   

   error_flag  anomaly_flag  maint_flag  failure_flag  time_since_last_maint  \


In [30]:
merged_df_final.columns

Index(['volt', 'pressure', 'vibration', 'rotate', 'vibration_t-1',
       'pressure_t-1', 'volt_t-1', 'rotate_t-1', 'total_failure_flag',
       'total_maint_flag', 'avg_rolling_mean_volt',
       'avg_rolling_mean_pressure', 'avg_rolling_mean_rotate',
       'avg_rolling_mean_vibration', 'avg_time_since_last_failure',
       'error_flag', 'anomaly_flag', 'maint_flag', 'failure_flag',
       'time_since_last_maint', 'pressure_x_volt', 'rotate_x_vibration',
       'overall_maintenance_history', 'model', 'age'],
      dtype='object')

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'


In [11]:
engine = create_engine('mssql+pyodbc://MSI/predictive_maintenance_db?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes')
errors_df = pd.read_sql('SELECT * FROM PdM_errors', con=engine)
failures_df = pd.read_sql('SELECT * FROM PdM_failures', con=engine)
machines_df = pd.read_sql('SELECT * FROM PdM_machines', con=engine)
maint_df = pd.read_sql('SELECT * FROM PdM_maint', con=engine)
telemetry_df = pd.read_sql('SELECT * FROM PdM_telemetry', con=engine)

In [13]:
model_mapping = {value: i for i, value in enumerate(machines_df['model'].unique(), start=1)}
machines_df['model'] = machines_df['model'].map(model_mapping)

In [14]:
failures_df = pd.get_dummies(failures_df, columns=['failure'], prefix='fail', dtype=int)
maint_df = pd.get_dummies(maint_df, columns=['comp'], prefix='maint', dtype=int)
errors_df = pd.get_dummies(errors_df, columns=['errorID'], prefix='', prefix_sep='', dtype=int)

In [16]:
# Convert date columns to datetime format
errors_df['datetime'] = pd.to_datetime(errors_df['datetime'])
failures_df['datetime'] = pd.to_datetime(failures_df['datetime'])
maint_df['datetime'] = pd.to_datetime(maint_df['datetime'])
telemetry_df['datetime'] = pd.to_datetime(telemetry_df['datetime'])

In [17]:
errors_df = errors_df.rename(columns={'datetime': 'datetime_error'})
failures_df = failures_df.rename(columns={'datetime': 'datetime_failure'})
maint_df = maint_df.rename(columns={'datetime': 'datetime_maint'})
telemetry_df = telemetry_df.rename(columns={'datetime': 'datetime_telemetry'})

In [19]:
failures_df['failure_flag'] = 1
failure_count = failures_df['failure_flag'].sum()

errors_df['error_flag'] = 1
error_count = errors_df['error_flag'].sum()

maint_df['maint_flag'] = 1
maintenance_count = maint_df['maint_flag'].sum()

In [20]:
telemetry_df = telemetry_df.sort_values('datetime_telemetry')
failures_df = failures_df.sort_values('datetime_failure')
errors_df = errors_df.sort_values('datetime_error')
maintenance_df = maint_df.sort_values('datetime_maint')

merged_df = pd.merge(
    telemetry_df,
    machines_df,
    on='machineID',
    how='left'
)
merged_df = pd.merge_asof(
    merged_df,
    failures_df[['machineID', 'datetime_failure', 'fail_comp1', 'fail_comp2', 'fail_comp3', 'fail_comp4', 'failure_flag']],
    by='machineID',
    left_on='datetime_telemetry',
    right_on='datetime_failure',
    direction='backward',
    tolerance=pd.Timedelta('1d')  
)
merged_df = pd.merge_asof(
    merged_df,
    errors_df[['machineID', 'datetime_error', 'error1', 'error2', 'error3', 'error4', 'error5', 'error_flag']],
    by='machineID',
    left_on='datetime_telemetry',
    right_on='datetime_error',
    direction='backward',
    tolerance=pd.Timedelta('1d')  
)
merged_df = pd.merge_asof(
    merged_df,
    maintenance_df[['machineID', 'datetime_maint', 'maint_comp1', 'maint_comp2', 'maint_comp3', 'maint_comp4', 'maint_flag']],
    by='machineID',
    left_on='datetime_telemetry',
    right_on='datetime_maint',
    direction='backward',
    tolerance=pd.Timedelta('7d') 
)
merged_df.fillna(0, inplace=True)

indicator_columns = ['failure_flag', 'error_flag', 'maint_flag',
                     'fail_comp1', 'fail_comp2', 'fail_comp3', 'fail_comp4',
                     'error1', 'error2', 'error3', 'error4', 'error5',
                     'maint_comp1', 'maint_comp2', 'maint_comp3', 'maint_comp4']

for col in indicator_columns:
    if col in merged_df.columns: 
        merged_df[col] = merged_df[col].astype(int)

display(merged_df)

Unnamed: 0,datetime_telemetry,machineID,volt,rotate,pressure,vibration,date,model,age,datetime_failure,...,error3,error4,error5,error_flag,datetime_maint,maint_comp1,maint_comp2,maint_comp3,maint_comp4,maint_flag
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,2015-01-01,1,18,0,...,0,0,0,0,0,0,0,0,0,0
1,2015-01-01 06:00:00,50,174.107006,380.449983,100.749836,43.185498,2015-01-01,2,4,0,...,0,0,0,0,0,0,0,0,0,0
2,2015-01-01 06:00:00,60,172.394188,394.464191,103.233454,39.144937,2015-01-01,2,3,0,...,0,0,0,0,0,0,0,0,0,0
3,2015-01-01 06:00:00,31,174.475535,483.608665,107.143516,31.630132,2015-01-01,4,11,0,...,0,0,0,0,0,0,0,0,0,0
4,2015-01-01 06:00:00,78,168.664547,504.012527,94.782861,42.962420,2015-01-01,2,19,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876095,2016-01-01 06:00:00,1,191.873171,382.736626,100.893691,37.940220,2016-01-01,1,18,0,...,0,0,0,0,2015-12-31 06:00:00,1,0,0,0,1
876096,2016-01-01 06:00:00,22,169.744691,459.621105,100.110835,43.578150,2016-01-01,4,14,0,...,0,0,0,0,0,0,0,0,0,0
876097,2016-01-01 06:00:00,24,187.324623,372.785606,109.024934,37.397309,2016-01-01,4,20,0,...,0,0,0,0,2015-12-28 06:00:00,0,0,1,0,1
876098,2016-01-01 06:00:00,9,176.007480,414.778970,93.243580,37.594398,2016-01-01,2,7,0,...,0,0,0,0,2015-12-29 06:00:00,1,0,0,0,1


In [22]:
# Time since last maintenance for each component
for comp in ['comp1', 'comp2', 'comp3', 'comp4']:
    merged_df[f'time_since_last_maint_{comp}'] = (
        merged_df.groupby('machineID')['datetime_telemetry']
        .transform(lambda x: x - x.where(merged_df[f'maint_{comp}'] == 1).ffill())
    ).dt.total_seconds() / 3600  # Convert to hours
    merged_df[f'time_since_last_maint_{comp}'].fillna(99999, inplace=True)  # Handle NaN values


In [27]:
# Cumulative errors in the last 24 hours for each component
for comp in ['comp1', 'comp2', 'comp3', 'comp4']:
    merged_df[f'cumulative_errors_last_24h_{comp}'] = (
        merged_df.groupby('machineID')['error_flag']
        .rolling(window=24).sum().reset_index(level=0, drop=True)
    )


In [24]:
# Extract cyclic/seasonal features from the datetime column
merged_df['hour_of_day'] = merged_df['datetime_telemetry'].dt.hour
merged_df['day_of_week'] = merged_df['datetime_telemetry'].dt.dayofweek


In [26]:
# Rolling standard deviations for each component
for comp in ['comp1', 'comp2', 'comp3', 'comp4']:
    merged_df[f'std_rolling_mean_volt_{comp}'] = (
        merged_df.groupby('machineID')['volt']
        .rolling(window=24).std().reset_index(level=0, drop=True)  # Use an integer for window size
    )


In [31]:
# Time since last maintenance for machines (in hours) - this focuses on general "whole machine" maintenance
merged_df['time_since_last_maint'] = (
    merged_df.groupby('machineID')['datetime_telemetry']
    .transform(lambda x: x - x.where(merged_df['maint_flag'] == 1).ffill())
).dt.total_seconds() / 3600
merged_df['time_since_last_maint'] = merged_df['time_since_last_maint'].fillna(99999)  

In [32]:
# Time since last failure for components
# Define the components
components = ['fail_comp1', 'fail_comp2', 'fail_comp3', 'fail_comp4']
for component in components:
    merged_df[f'time_since_last_failure_{component}'] = (
        merged_df.groupby('machineID')['datetime_telemetry']
        .transform(lambda x: x - x.where(merged_df[component] == 1).ffill())
    ).dt.total_seconds() / 3600  # Convert to hours
    merged_df[f'time_since_last_failure_{component}'].fillna(0, inplace=True)  # Fill NaN values with 0

In [33]:
# Overall maintenance history
merged_df['overall_maintenance_history'] = (
    merged_df.groupby('machineID')['maint_flag']
    .transform('sum')
)

In [34]:
# Rolling averages per component 
merged_df['datetime_telemetry'] = pd.to_datetime(merged_df['datetime_telemetry'])
merged_df = merged_df.sort_values(['machineID', 'datetime_telemetry'])
merged_df.set_index('datetime_telemetry', inplace=True)

component_metrics = {
    'fail_comp1': ['volt', 'rotate', 'pressure', 'vibration'],
    'fail_comp2': ['volt', 'rotate', 'pressure', 'vibration'],
    'fail_comp3': ['volt', 'rotate', 'pressure', 'vibration'],
    'fail_comp4': ['volt', 'rotate', 'pressure', 'vibration']
}

for component, metrics in component_metrics.items():
    for metric in metrics:
        merged_df[f'rolling_mean_{metric}_{component}'] = (
            merged_df.groupby('machineID')[metric]
            .rolling('24h').mean() 
        ).reset_index(level=0, drop=True)
merged_df.reset_index(inplace=True)

In [36]:
# Interaction features
merged_df['pressure_x_volt'] = merged_df['pressure'] * merged_df['volt']
merged_df['pressure_x_vibration'] = merged_df['pressure'] * merged_df['vibration']
merged_df['rotate_x_vibration'] = merged_df['rotate'] * merged_df['vibration']
merged_df['rotate_x_volt'] = merged_df['rotate'] * merged_df['volt']

# Lag features
merged_df['vibration_t-1'] = merged_df['vibration'].shift(1)
merged_df['pressure_t-1'] = merged_df['pressure'].shift(1)
merged_df['volt_t-1'] = merged_df['volt'].shift(1)
merged_df['rotate_t-1'] = merged_df['rotate'].shift(1)

# Filling NaN values with zeros
merged_df.fillna(0, inplace=True)


In [38]:
merged_df['total_failure_flag'] = merged_df[['fail_comp1', 'fail_comp2', 'fail_comp3', 'fail_comp4']].max(axis=1)
merged_df['total_maint_flag'] = merged_df[['maint_comp1', 'maint_comp2', 'maint_comp3', 'maint_comp4']].max(axis=1)
merged_df['total_error_flag'] = merged_df[['error1', 'error2', 'error3', 'error4', 'error5']].max(axis=1)

merged_df['avg_rolling_mean_volt'] = merged_df[['rolling_mean_volt_fail_comp1', 'rolling_mean_volt_fail_comp2', 'rolling_mean_volt_fail_comp3', 'rolling_mean_volt_fail_comp4']].mean(axis=1)
merged_df['avg_rolling_mean_pressure'] = merged_df[['rolling_mean_pressure_fail_comp1', 'rolling_mean_pressure_fail_comp2', 'rolling_mean_pressure_fail_comp3', 'rolling_mean_pressure_fail_comp4']].mean(axis=1)
merged_df['avg_rolling_mean_rotate'] = merged_df[['rolling_mean_rotate_fail_comp1', 'rolling_mean_rotate_fail_comp2', 'rolling_mean_rotate_fail_comp3', 'rolling_mean_rotate_fail_comp4']].mean(axis=1)
merged_df['avg_rolling_mean_vibration'] = merged_df[['rolling_mean_vibration_fail_comp1', 'rolling_mean_vibration_fail_comp2', 'rolling_mean_vibration_fail_comp3', 'rolling_mean_vibration_fail_comp4']].mean(axis=1)

merged_df['avg_time_since_last_failure'] = merged_df[['time_since_last_failure_fail_comp1', 'time_since_last_failure_fail_comp2', 'time_since_last_failure_fail_comp3', 'time_since_last_failure_fail_comp4']].mean(axis=1)

In [40]:
# Exclude columns you don't want as features (e.g., datetime, machineID, target columns)
columns_to_exclude = ['datetime_telemetry', 'machineID', 
                      'RUL_fail_comp1', 'RUL_fail_comp2', 'RUL_fail_comp3', 'RUL_fail_comp4',
                      'failure_flag', 'maint_flag', 'error_flag']  # Add any other non-feature columns

# Create a list of feature columns by excluding non-feature columns
feature_columns = [col for col in merged_df.columns if col not in columns_to_exclude]

# Output the list of feature columns
print(feature_columns)
len(feature_columns)

['volt', 'rotate', 'pressure', 'vibration', 'date', 'model', 'age', 'datetime_failure', 'fail_comp1', 'fail_comp2', 'fail_comp3', 'fail_comp4', 'datetime_error', 'error1', 'error2', 'error3', 'error4', 'error5', 'datetime_maint', 'maint_comp1', 'maint_comp2', 'maint_comp3', 'maint_comp4', 'time_since_last_maint_comp1', 'time_since_last_maint_comp2', 'time_since_last_maint_comp3', 'time_since_last_maint_comp4', 'hour_of_day', 'day_of_week', 'std_rolling_mean_volt_comp1', 'std_rolling_mean_volt_comp2', 'std_rolling_mean_volt_comp3', 'std_rolling_mean_volt_comp4', 'cumulative_errors_last_24h_comp1', 'cumulative_errors_last_24h_comp2', 'cumulative_errors_last_24h_comp3', 'cumulative_errors_last_24h_comp4', 'time_since_last_maint', 'time_since_last_failure_fail_comp1', 'time_since_last_failure_fail_comp2', 'time_since_last_failure_fail_comp3', 'time_since_last_failure_fail_comp4', 'overall_maintenance_history', 'rolling_mean_volt_fail_comp1', 'rolling_mean_rotate_fail_comp1', 'rolling_mean_

75

In [43]:
from sklearn.model_selection import train_test_split

# Select features (you can customize this based on your analysis)
features = ['volt', 'rotate', 'pressure', 'vibration', 'date', 'model', 'age', 
            'datetime_failure', 'fail_comp1', 'fail_comp2', 'fail_comp3', 'fail_comp4', 
            'datetime_error', 'error1', 'error2', 'error3', 'error4', 'error5', 'datetime_maint', 
            'maint_comp1', 'maint_comp2', 'maint_comp3', 'maint_comp4', 'time_since_last_maint_comp1', 
            'time_since_last_maint_comp2', 'time_since_last_maint_comp3', 'time_since_last_maint_comp4', 
            'hour_of_day', 'day_of_week', 'std_rolling_mean_volt_comp1', 'std_rolling_mean_volt_comp2', 
            'std_rolling_mean_volt_comp3', 'std_rolling_mean_volt_comp4', 'cumulative_errors_last_24h_comp1', 
            'cumulative_errors_last_24h_comp2', 'cumulative_errors_last_24h_comp3', 'cumulative_errors_last_24h_comp4', 
            'time_since_last_maint', 'time_since_last_failure_fail_comp1', 'time_since_last_failure_fail_comp2', 
            'time_since_last_failure_fail_comp3', 'time_since_last_failure_fail_comp4', 'overall_maintenance_history', 
            'rolling_mean_volt_fail_comp1', 'rolling_mean_rotate_fail_comp1', 'rolling_mean_pressure_fail_comp1', 
            'rolling_mean_vibration_fail_comp1', 'rolling_mean_volt_fail_comp2', 'rolling_mean_rotate_fail_comp2', 
            'rolling_mean_pressure_fail_comp2', 'rolling_mean_vibration_fail_comp2', 'rolling_mean_volt_fail_comp3', 
            'rolling_mean_rotate_fail_comp3', 'rolling_mean_pressure_fail_comp3', 'rolling_mean_vibration_fail_comp3', 
            'rolling_mean_volt_fail_comp4', 'rolling_mean_rotate_fail_comp4', 'rolling_mean_pressure_fail_comp4', 
            'rolling_mean_vibration_fail_comp4', 'pressure_x_volt', 'pressure_x_vibration', 'rotate_x_vibration', 
            'rotate_x_volt', 'vibration_t-1', 'pressure_t-1', 'volt_t-1', 'rotate_t-1', 'total_failure_flag', 
            'total_maint_flag', 'total_error_flag', 'avg_rolling_mean_volt', 'avg_rolling_mean_pressure', 
            'avg_rolling_mean_rotate', 'avg_rolling_mean_vibration', 'avg_time_since_last_failure']
X = merged_df[features]
X_numeric = merged_df[features].select_dtypes(include=[np.number])
y = merged_df['failure_flag']  # Binary: 1 for failure, 0 for no failure

X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

# Check the shape of the numeric datasets
print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Training data shape: (700880, 71)
Testing data shape: (175220, 71)


In [46]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a basic Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=40, n_jobs=-1)

# Fit the model on the training data
rf_clf.fit(X_train, y_train)


In [47]:
# Extract feature importance scores
importances = rf_clf.feature_importances_

# Create a DataFrame to view feature names and their corresponding importance
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort the DataFrame by importance scores
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance_df.head(20))


                              Feature  Importance
63                 total_failure_flag    0.405140
7                          fail_comp2    0.113093
6                          fail_comp1    0.095713
8                          fail_comp3    0.069943
9                          fail_comp4    0.060212
30   cumulative_errors_last_24h_comp2    0.029702
29   cumulative_errors_last_24h_comp1    0.021233
32   cumulative_errors_last_24h_comp4    0.016903
64                   total_maint_flag    0.015542
33              time_since_last_maint    0.015219
22        time_since_last_maint_comp4    0.011278
31   cumulative_errors_last_24h_comp3    0.010130
65                   total_error_flag    0.009127
44     rolling_mean_rotate_fail_comp2    0.008438
52     rolling_mean_rotate_fail_comp4    0.007001
66              avg_rolling_mean_volt    0.006661
48     rolling_mean_rotate_fail_comp3    0.005915
54  rolling_mean_vibration_fail_comp4    0.005316
47       rolling_mean_volt_fail_comp3    0.005298


In [48]:

# Select the top 20 most important features
top_20_features = feature_importance_df['Feature'].head(20).tolist()

# Subset X_train and X_test to use only these top 20 features
X_train_top20 = X_train[top_20_features]
X_test_top20 = X_test[top_20_features]

In [49]:
# Initialize the Random Forest classifier again
rf_clf_top20 = RandomForestClassifier(random_state=42)

# Train the classifier with the top 20 features
rf_clf_top20.fit(X_train_top20, y_train)

# Test the classifier on the test data
y_pred = rf_clf_top20.predict(X_test_top20)

# Evaluate the performance (e.g., accuracy, confusion matrix, etc.)
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')


Accuracy: 1.0
Confusion Matrix:
[[171644      0]
 [     0   3576]]


In [50]:
# Get the predicted probabilities for the test set
y_proba = rf_clf_top20.predict_proba(X_test_top20)[:, 1]  # Probability of failure (class 1)

# Check the predicted probabilities for the first five samples
print("Predicted failure probabilities for the first five samples:")
print(y_proba[:5])



Predicted failure probabilities for the first five samples:
[0. 0. 0. 0. 0.]


In [51]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    171644
           1       1.00      1.00      1.00      3576

    accuracy                           1.00    175220
   macro avg       1.00      1.00      1.00    175220
weighted avg       1.00      1.00      1.00    175220



In [54]:
# Drop or transform non-numeric columns before using SMOTE
X_numeric = X.drop(columns=['datetime_failure', 'datetime_error', 'datetime_maint', 'date'])

# Ensure no non-numeric columns remain
print(X_numeric.dtypes)



volt                           float64
rotate                         float64
pressure                       float64
vibration                      float64
model                            int64
                                ...   
avg_rolling_mean_volt          float64
avg_rolling_mean_pressure      float64
avg_rolling_mean_rotate        float64
avg_rolling_mean_vibration     float64
avg_time_since_last_failure    float64
Length: 71, dtype: object


In [59]:
X = X.drop(columns=['date', 'datetime_failure', 'datetime_error', 'datetime_maint'], errors='ignore')


In [64]:
# Import required libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Split the data into training, validation, and test sets based on datetime_telemetry
train_data = merged_df[(merged_df['datetime_telemetry'] >= '2014-06-01') &
                       (merged_df['datetime_telemetry'] <= '2015-08-31')]
val_data = merged_df[(merged_df['datetime_telemetry'] >= '2015-09-01') &
                     (merged_df['datetime_telemetry'] <= '2015-10-31')]
test_data = merged_df[(merged_df['datetime_telemetry'] >= '2015-11-01') &
                      (merged_df['datetime_telemetry'] <= '2015-12-31')]

# Reset the index for each DataFrame
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Define the target and features for train, validation, and test sets
top_20_features = ['volt', 'rotate', 'pressure', 'vibration', 'model', 'age', 
                   'time_since_last_maint_comp1', 'time_since_last_maint_comp2', 
                   'time_since_last_maint_comp3', 'time_since_last_maint_comp4', 
                   'hour_of_day', 'day_of_week', 'std_rolling_mean_volt_comp1', 
                   'std_rolling_mean_volt_comp2', 'std_rolling_mean_volt_comp3', 
                   'std_rolling_mean_volt_comp4', 'cumulative_errors_last_24h_comp1', 
                   'cumulative_errors_last_24h_comp2', 'cumulative_errors_last_24h_comp3', 
                   'cumulative_errors_last_24h_comp4']

X_train = train_data[top_20_features]
X_val = val_data[top_20_features]
X_test = test_data[top_20_features]
y_train = train_data['failure_flag']
y_val = val_data['failure_flag']
y_test = test_data['failure_flag']

# Scale the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Check the new class distribution after SMOTE
print(f'Original dataset shape: {y_train.value_counts()}')
print(f'Resampled dataset shape: {y_train_res.value_counts()}')

# Train the Random Forest classifier on the balanced dataset
classifier = RandomForestClassifier(random_state=42, n_estimators=40, n_jobs=-1)
classifier.fit(X_train_res, y_train_res)

# Make predictions on the validation set
y_val_pred = classifier.predict(X_val_scaled)

# Evaluate the model
print(f'Validation Accuracy: {classifier.score(X_val_scaled, y_val)}')
print('Validation Confusion Matrix:')
print(confusion_matrix(y_val, y_val_pred))
print('Validation Classification Report:')
print(classification_report(y_val, y_val_pred))

# Get predicted probabilities for the validation set
y_proba = classifier.predict_proba(X_val_scaled)[:, 1]  # Probability of failure (class 1)
print('Predicted failure probabilities for the first five validation samples:')
print(y_proba[:5])


Original dataset shape: failure_flag
0    568090
1     12210
Name: count, dtype: int64
Resampled dataset shape: failure_flag
0    568090
1    568090
Name: count, dtype: int64
Validation Accuracy: 0.9773698820263705
Validation Confusion Matrix:
[[138784   2576]
 [   685   2055]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    141360
           1       0.44      0.75      0.56      2740

    accuracy                           0.98    144100
   macro avg       0.72      0.87      0.77    144100
weighted avg       0.98      0.98      0.98    144100

Predicted failure probabilities for the first five validation samples:
[0.15 0.   0.   0.   0.  ]


In [74]:
X_test_top20.head()

Unnamed: 0,total_failure_flag,fail_comp2,fail_comp1,fail_comp3,fail_comp4,cumulative_errors_last_24h_comp2,cumulative_errors_last_24h_comp1,cumulative_errors_last_24h_comp4,total_maint_flag,time_since_last_maint,time_since_last_maint_comp4,cumulative_errors_last_24h_comp3,total_error_flag,rolling_mean_rotate_fail_comp2,rolling_mean_rotate_fail_comp4,avg_rolling_mean_volt,rolling_mean_rotate_fail_comp3,rolling_mean_vibration_fail_comp4,rolling_mean_volt_fail_comp3,rolling_mean_volt_fail_comp2
339238,0,0,0,0,0,0.0,0.0,0.0,0,32.0,32.0,0.0,0,454.805893,454.805893,196.895908,454.805893,39.731436,196.895908,196.895908
105521,0,0,0,0,0,0.0,0.0,0.0,0,317.0,99999.0,0.0,0,440.755019,440.755019,165.824411,440.755019,39.41588,165.824411,165.824411
534002,0,0,0,0,0,0.0,0.0,0.0,1,0.0,0.0,0.0,0,448.318643,448.318643,196.998391,448.318643,40.742011,196.998391,196.998391
213313,0,0,0,0,0,0.0,0.0,0.0,1,0.0,0.0,0.0,0,445.782696,445.782696,173.137395,445.782696,40.155988,173.137395,173.137395
560297,0,0,0,0,0,5.0,5.0,5.0,0,170.0,3770.0,5.0,1,437.734005,437.734005,173.001859,437.734005,39.706019,173.001859,173.001859


In [72]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming X_train is your DataFrame containing the training features
# and you are using only numeric features for VIF calculation

# Calculate VIF for each feature
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    return vif_data

# Apply VIF on the top 20 features or any DataFrame with selected numeric features
vif_df = calculate_vif(X_train_top20)

# Display the VIF values
print(vif_df)


  vif = 1. / (1. - r_squared_i)


                              Feature         VIF
0                  total_failure_flag         inf
1                          fail_comp2         inf
2                          fail_comp1         inf
3                          fail_comp3         inf
4                          fail_comp4         inf
5    cumulative_errors_last_24h_comp2         inf
6    cumulative_errors_last_24h_comp1         inf
7    cumulative_errors_last_24h_comp4         inf
8                    total_maint_flag    1.807310
9               time_since_last_maint    1.288002
10        time_since_last_maint_comp4    1.500187
11   cumulative_errors_last_24h_comp3         inf
12                   total_error_flag    2.011093
13     rolling_mean_rotate_fail_comp2         inf
14     rolling_mean_rotate_fail_comp4         inf
15              avg_rolling_mean_volt         inf
16     rolling_mean_rotate_fail_comp3         inf
17  rolling_mean_vibration_fail_comp4  328.599221
18       rolling_mean_volt_fail_comp3         inf


In [73]:
# Drop highly correlated or redundant features
X_train_top20_reduced = X_train_top20.drop(['fail_comp1', 'fail_comp2', 'rolling_mean_volt_fail_comp2'], axis=1)

# Recalculate VIF
vif_df_reduced = calculate_vif(X_train_top20_reduced)
print(vif_df_reduced)


  vif = 1. / (1. - r_squared_i)


                              Feature         VIF
0                  total_failure_flag    1.891046
1                          fail_comp3    1.290776
2                          fail_comp4    1.423159
3    cumulative_errors_last_24h_comp2         inf
4    cumulative_errors_last_24h_comp1         inf
5    cumulative_errors_last_24h_comp4         inf
6                    total_maint_flag    1.807251
7               time_since_last_maint    1.287960
8         time_since_last_maint_comp4    1.499903
9    cumulative_errors_last_24h_comp3         inf
10                   total_error_flag    2.010963
11     rolling_mean_rotate_fail_comp2         inf
12     rolling_mean_rotate_fail_comp4         inf
13              avg_rolling_mean_volt         inf
14     rolling_mean_rotate_fail_comp3         inf
15  rolling_mean_vibration_fail_comp4  326.868611
16       rolling_mean_volt_fail_comp3         inf
