## Prepare data

### Outlier removal only

Read all the data. Apply simple outlier removal by validity range.

In [9]:
from utility import read_all_test_data_from_path
import numpy as np
import pandas as pd


def remove_outliers(df: pd.DataFrame):
    ''' # Description
    Clean the `temperature`, `voltage` and `position` columns of `df` in place.

    Every column has an inclusive validity range. A reading outside its range
    is first replaced by NaN and then overwritten with the closest earlier
    value of the same column (forward fill). Readings at the very start of a
    column that have no earlier valid value therefore stay NaN.

    # Parameters
    - df: dataframe holding the columns `temperature`, `voltage` and `position`.
      The columns are reassigned on `df` itself; nothing is returned.
    '''
    # (column name, lowest accepted value, highest accepted value)
    valid_ranges = (
        ('temperature', 0, 100),
        ('voltage', 6000, 9000),
        ('position', 0, 1000),
    )

    for column, lower, upper in valid_ranges:
        # NaN compares as "not in range", so already-missing values stay NaN
        # until the forward fill below.
        in_range = df[column].between(lower, upper)
        df[column] = df[column].where(in_range, np.nan).ffill()


# Relative path of the training-data folder. NOTE: despite the name
# "base_dictionary", the value is a directory path, not a dict.
base_dictionary = '../../projects/maintenance_industry_4_2024/dataset/training_data/'
# `remove_outliers` is passed to the reader as an argument; is_plot=False
# presumably turns plotting off -- confirm against `utility`.
df_data = read_all_test_data_from_path(base_dictionary, remove_outliers, is_plot=False)

## Define the supporting function for the experiments

### Training the model for all motors, using all the features.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from utility import run_cross_val

def run_all_motors(df_data, mdl, window_size=0, single_run_result=True, mdl_type='clf'):
    '''Run `run_cross_val` once per motor and collect the results.

    For each motor 1..6 the response variable is the column
    `data_motor_<i>_label`; the features are all columns of `df_data` except
    the six motor label columns. The mean of each result is printed as soon
    as it is available.

    # Parameters
    - df_data: dataframe containing the feature columns and the six label columns.
    - mdl: model object handed to `run_cross_val` unchanged.
    - window_size, single_run_result, mdl_type: forwarded to `run_cross_val`
      as keyword arguments.

    # Returns
    A list with the six `run_cross_val` results, ordered from motor 1 to motor 6.
    '''
    motor_ids = range(1, 7)
    label_columns = [f'data_motor_{motor}_label' for motor in motor_ids]

    performances = []
    for motor, target_name in zip(motor_ids, label_columns):
        # Separate the features from the response variable: no motor label
        # may remain among the features. The frame is rebuilt on every pass
        # so each call to run_cross_val receives its own copy.
        features = df_data.drop(columns=label_columns)
        target = df_data.loc[:, target_name]

        print(f'Model for predicting the label of motor {motor}:')
        scores = run_cross_val(
            mdl,
            features,
            target,
            window_size=window_size,
            single_run_result=single_run_result,
            mdl_type=mdl_type,
        )
        # Report the mean performance for this motor.
        print(scores.mean())
        print('\n')

        performances.append(scores)

    return performances

## Logistic regression

In [11]:
# Keep only the rows of five selected tests (the ones picked for having label 1).
selected_tests = [
    '20240325_155003',
    '20240425_093699',
    '20240425_094425',
    '20240426_140055',
    '20240426_141190',
]
is_selected = df_data['test_condition'].isin(selected_tests)
df_data = df_data[is_selected]

### Window size = 0

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import warnings
%matplotlib inline

# Ignore every warning from this point on.
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV

# Two-stage pipeline: standardize the features, then fit a logistic
# regression classifier with balanced class weights.
steps = [
    ('standardizer', StandardScaler()),
    ('mdl', LogisticRegression(class_weight='balanced')),
]
pipeline = Pipeline(steps=steps)

# Candidate values for C (inverse of regularization strength) of the 'mdl' step.
param_grid = dict(mdl__C=[0.001, 0.01, 0.1, 1, 10, 100])

# Grid search over C, scored with F1 and 5-fold cross validation.
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=5)

all_results = run_all_motors(
    df_data,
    grid_search,
    window_size=0,
    single_run_result=False,
    mdl_type='clf',
)

Model for predicting the label of motor 1:
Accuracy     0.884783
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 2:
Accuracy     0.661677
Precision    0.600000
Recall       0.405562
F1 score     0.410823
dtype: float64


Model for predicting the label of motor 3:
Accuracy     0.734773
Precision    0.419116
Recall       0.800000
F1 score     0.435787
dtype: float64


Model for predicting the label of motor 4:
Accuracy     0.657775
Precision    0.600000
Recall       0.403638
F1 score     0.407146
dtype: float64


Model for predicting the label of motor 5:
Accuracy     0.843996
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 6:
Accuracy     0.804987
Precision    0.470996
Recall       0.479070
F1 score     0.418126
dtype: float64




### Window size = 5

In [15]:
# Reuses the notebook-level `df_data` and `grid_search`; only window_size
# differs (5 here). Rebinds `all_results`.
all_results = run_all_motors(df_data, grid_search, window_size=5, single_run_result=False, mdl_type='clf')

Model for predicting the label of motor 1:
Accuracy     0.885232
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 2:
Accuracy     0.660943
Precision    0.600000
Recall       0.405416
F1 score     0.410546
dtype: float64


Model for predicting the label of motor 3:
Accuracy     0.760123
Precision    0.420621
Recall       0.800000
F1 score     0.438667
dtype: float64


Model for predicting the label of motor 4:
Accuracy     0.656448
Precision    0.600000
Recall       0.402588
F1 score     0.405109
dtype: float64


Model for predicting the label of motor 5:
Accuracy     0.821477
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 6:
Accuracy     0.806199
Precision    0.472566
Recall       0.479070
F1 score     0.419827
dtype: float64




### Window size = 10

In [16]:
# Reuses the notebook-level `df_data` and `grid_search`; only window_size
# differs (10 here). Rebinds `all_results`.
all_results = run_all_motors(df_data, grid_search, window_size=10, single_run_result=False, mdl_type='clf')

Model for predicting the label of motor 1:


Accuracy     0.884591
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 2:
Accuracy     0.660596
Precision    0.600000
Recall       0.405661
F1 score     0.411010
dtype: float64


Model for predicting the label of motor 3:
Accuracy     0.761332
Precision    0.420796
Recall       0.800000
F1 score     0.438986
dtype: float64


Model for predicting the label of motor 4:
Accuracy     0.653421
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 5:
Accuracy     0.808123
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 6:
Accuracy     0.807476
Precision    0.474208
Recall       0.479070
F1 score     0.421584
dtype: float64




In [17]:
# Window size = 20 (this cell has no markdown heading of its own).
# Reuses the notebook-level `df_data` and `grid_search`. Rebinds `all_results`.
all_results = run_all_motors(df_data, grid_search, window_size=20, single_run_result=False, mdl_type='clf')

Model for predicting the label of motor 1:


Accuracy     0.885692
Precision    0.400000
Recall       0.400000
F1 score     0.400000
dtype: float64


Model for predicting the label of motor 2:
Accuracy     0.666133
Precision    0.600000
Recall       0.411912
F1 score     0.422485
dtype: float64


Model for predicting the label of motor 3:
Accuracy     0.763456
Precision    0.421133
Recall       0.800000
F1 score     0.439598
dtype: float64


Model for predicting the label of motor 4:
Accuracy     0.652520
Precision    0.600000
Recall       0.400151
F1 score     0.400301
dtype: float64


Model for predicting the label of motor 5:
Accuracy     0.80699
Precision    0.40000
Recall       0.40000
F1 score     0.40000
dtype: float64


Model for predicting the label of motor 6:
Accuracy     0.810086
Precision    0.277725
Recall       0.279070
F1 score     0.225279
dtype: float64


