## Prepare data

### Outlier removal only

Read all the data. Apply simple outlier removal by validity range.

In [2]:
from utility import read_all_test_data_from_path
import numpy as np
import pandas as pd


def remove_outliers(df: pd.DataFrame):
    ''' # Description
    Replace out-of-range readings in `df`. The dataframe is modified in place
    and nothing is returned.

    A reading is kept only if it lies inside the inclusive validity range of
    its column:
    - temperature: [0, 100]
    - voltage: [6000, 9000]
    - position: [0, 1000]

    Readings outside the range are set to NaN and then forward-filled with the
    last valid value of the same column. Invalid readings at the very start of
    a column have no previous value and therefore stay NaN.
    '''
    valid_ranges = (
        ('temperature', 0, 100),
        ('voltage', 6000, 9000),
        ('position', 0, 1000),
    )
    for column, lower, upper in valid_ranges:
        # `between` is inclusive on both ends and evaluates to False for NaN.
        in_range = df[column].between(lower, upper)
        df[column] = df[column].where(in_range, np.nan).ffill()


# Path to the training data folder, relative to this notebook.
base_dictionary = '../../projects/maintenance_industry_4_2024/dataset/training_data/'
# Load the data, passing `remove_outliers` as the pre-processing callback and disabling plots.
# NOTE(review): how the callback is applied to the raw data is defined in `utility` - confirm there.
df_data = read_all_test_data_from_path(base_dictionary, remove_outliers, is_plot=False)

## Define supporting functions for the experiments

Here we define two supporting functions:
- run_one_motor_detection: runs one motor detection experiment
- run_all_motors: runs the experiment on all six motors.

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from utility import run_cross_val


def run_one_motor_detection(motor_idx, df_data, mdl, feature_list, window_size=0, single_run_result=True, mdl_type='clf'):
    ''' Train and test the fault detection model for one motor.

    Parameters
    ----------
    motor_idx : int
        Index of the motor. The response column is `data_motor_{motor_idx}_label`.
    df_data : pd.DataFrame
        Data containing the feature columns, the `test_condition` column and
        the label column of the motor.
    mdl :
        Model (or search object) forwarded unchanged to `run_cross_val`.
    feature_list : list of str
        Names of the feature columns. The list is NOT modified by this function.
    window_size, single_run_result, mdl_type :
        Forwarded unchanged to `run_cross_val`.

    Returns
    -------
    The performance table returned by `run_cross_val`. Its column-wise mean is
    printed as a side effect.
    '''
    # Get the name of the response variable.
    y_name = f'data_motor_{motor_idx}_label'

    # Separate the features and the response variable, dropping the irrelevant columns.
    # Work on a copy: appending to the caller's list would make it grow on every
    # call and lead to duplicated `test_condition` columns on repeated runs.
    x_columns = list(feature_list)
    # `test_condition` is always passed along with the features.
    # NOTE(review): presumably `run_cross_val` uses it to split the folds by test - confirm.
    if 'test_condition' not in x_columns:
        x_columns.append('test_condition')
    df_x = df_data[x_columns]
    # Get y.
    y = df_data.loc[:, y_name]

    print(f'Model for predicting the label of motor {motor_idx}:')
    # Run cross validation.
    df_perf = run_cross_val(mdl, df_x, y, window_size=window_size, single_run_result=single_run_result, mdl_type=mdl_type)
    # Print the mean performance.
    print(df_perf.mean())
    print('\n')

    return df_perf


def run_all_motors(df_data, mdl, window_size=0, single_run_result=True, mdl_type='clf', feature_list=None):
    ''' Run the fault detection experiment for each of the six motors.

    Parameters
    ----------
    df_data : pd.DataFrame
        Data forwarded to `run_one_motor_detection`.
    mdl :
        Model forwarded to `run_one_motor_detection`.
    window_size, single_run_result, mdl_type :
        Forwarded unchanged to `run_one_motor_detection`.
    feature_list : list of str, optional
        Names of the feature columns used for every motor. When omitted, every
        column of `df_data` except `test_condition` and the columns ending in
        `_label` is used.
        NOTE(review): this default assumes all remaining columns are features - confirm.

    Returns
    -------
    list
        The results of `run_one_motor_detection` for motors 1 to 6, in that order.
    '''
    if feature_list is None:
        feature_list = [
            col for col in df_data.columns
            if col != 'test_condition' and not str(col).endswith('_label')
        ]

    all_results = []
    # Loop over all the six motors.
    for i in range(1, 7):
        # Hand over a fresh copy so that a callee which modifies its argument
        # cannot change the features used for the next motor.
        df_perf = run_one_motor_detection(i, df_data, mdl, list(feature_list), window_size=window_size, single_run_result=single_run_result, mdl_type=mdl_type)

        all_results.append(df_perf)

    return all_results

## Logistic regression

### Dataset

In this notebook, we run an experiment on motor 6. We pick five datasets: four contain failure labels and one does not.

In [4]:
# Keep only the five selected tests, identified by their `test_condition` id.
df_data = df_data[df_data['test_condition'].isin(['20240325_155003', '20240425_093699', '20240425_094425', '20240426_140055', '20240426_141190'])]

### Window size = 0

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV

# Define the steps of the pipeline
steps = [
    ('standardizer', StandardScaler()),  # Step 1: StandardScaler
    ('mdl', LogisticRegression(class_weight='balanced'))    # Step 2: Logistic regression with balanced class weights
]

# Create the pipeline
pipeline = Pipeline(steps)

# Define hyperparameters to search
param_grid = {
    'mdl__C': [0.001, 0.01, 0.1, 1, 10, 100]  # Inverse of regularization strength
}

# Initialize GridSearchCV: pick the value of C with the best F1 score over 5 folds.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1', cv=5)

# Motor 6, window_size=0: use the signals of all six motors (plus time) as features.
feature_list_all = ['time'] + [
    f'data_motor_{motor}_{signal}'
    for motor in range(1, 7)
    for signal in ('position', 'temperature', 'voltage')
]
all_results = run_one_motor_detection(
    6, df_data, grid_search, feature_list_all,
    window_size=0, single_run_result=False, mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.391385
Precision    0.129136
Recall       0.600000
F1 score     0.202448
dtype: float64




In [6]:
# Motor 6, window_size=0: use only the signals of motor 6 itself (plus time) as features.
feature_list_self = ['time', 'data_motor_6_position', 'data_motor_6_temperature', 'data_motor_6_voltage']
all_results = run_one_motor_detection(
    motor_idx=6,
    df_data=df_data,
    mdl=grid_search,
    feature_list=feature_list_self,
    window_size=0,
    single_run_result=False,
    mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.229143
Precision    0.129136
Recall       0.600000
F1 score     0.202448
dtype: float64




### Window size = 5

In [7]:
# Motor 6, window_size=5: use the signals of all six motors (plus time) as features.
feature_list_all = ['time'] + [
    f'data_motor_{motor}_{signal}'
    for motor in range(1, 7)
    for signal in ('position', 'temperature', 'voltage')
]
all_results = run_one_motor_detection(
    6, df_data, grid_search, feature_list_all,
    window_size=5, single_run_result=False, mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.393565
Precision    0.131180
Recall       0.600000
F1 score     0.204800
dtype: float64




In [8]:
# Motor 6, window_size=5: use only the signals of motor 6 itself (plus time) as features.
feature_list_self = ['time', 'data_motor_6_position', 'data_motor_6_temperature', 'data_motor_6_voltage']
all_results = run_one_motor_detection(
    motor_idx=6,
    df_data=df_data,
    mdl=grid_search,
    feature_list=feature_list_self,
    window_size=5,
    single_run_result=False,
    mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.235018
Precision    0.131180
Recall       0.600000
F1 score     0.204800
dtype: float64




### Window size = 10

In [9]:
# Motor 6, window_size=10: use the signals of all six motors (plus time) as features.
feature_list_all = ['time'] + [
    f'data_motor_{motor}_{signal}'
    for motor in range(1, 7)
    for signal in ('position', 'temperature', 'voltage')
]
all_results = run_one_motor_detection(
    6, df_data, grid_search, feature_list_all,
    window_size=10, single_run_result=False, mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.395795
Precision    0.133303
Recall       0.600000
F1 score     0.207217
dtype: float64




In [10]:
# Motor 6, window_size=10: use only the signals of motor 6 itself (plus time) as features.
feature_list_self = ['time', 'data_motor_6_position', 'data_motor_6_temperature', 'data_motor_6_voltage']
all_results = run_one_motor_detection(
    motor_idx=6,
    df_data=df_data,
    mdl=grid_search,
    feature_list=feature_list_self,
    window_size=10,
    single_run_result=False,
    mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.238048
Precision    0.133303
Recall       0.600000
F1 score     0.207217
dtype: float64




In [None]:
# Motor 6, window_size=100: use the signals of all six motors (plus time) as features.
feature_list_all = ['time'] + [
    f'data_motor_{motor}_{signal}'
    for motor in range(1, 7)
    for signal in ('position', 'temperature', 'voltage')
]
all_results = run_one_motor_detection(
    6, df_data, grid_search, feature_list_all,
    window_size=100, single_run_result=False, mdl_type='clf',
)

In [17]:
# Motor 6, window_size=100: use only the signals of motor 6 itself (plus time) as features.
feature_list_self = ['time', 'data_motor_6_position', 'data_motor_6_temperature', 'data_motor_6_voltage']
all_results = run_one_motor_detection(
    motor_idx=6,
    df_data=df_data,
    mdl=grid_search,
    feature_list=feature_list_self,
    window_size=100,
    single_run_result=False,
    mdl_type='clf',
)

Model for predicting the label of motor 6:
Accuracy     0.399745
Precision    0.185009
Recall       0.447619
F1 score     0.248517
dtype: float64


