## Install and import

In [None]:
# %pip install imblearn

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib
import warnings

## Define the useful functions

The functions `combine_csv` and `load_data` are used to automate and simplify the process of loading and combining data from multiple CSV files located in different directories.

In summary:
- `combine_csv` : combines CSV files from a specified folder into a single DataFrame
- `load_data` : loads data from multiple folders and combines them into a single DataFrame.
These functions provide a convenient way to process and combine data from multiple CSV files and folders.

In [2]:
def combine_csv(folder_path, tmp_path):
    """
    Combine all CSV files in a folder into a single DataFrame.
    :param folder_path: Path to the folder containing the CSV files
    :param seq_idx: Sequence index
    :param label: Label of the sequence (Normal - 0, Abnormal - 1)
    :return: A single DataFrame containing all the data from the CSV files
    """

    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Create an empty DataFrame to store the combined data
    combined_df = pd.DataFrame()

    # Iterate over the CSV files in the folder
    for file in csv_files:
        # Construct the full path to each CSV file
        file_path = os.path.join(folder_path, file)

        # Read each CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Drop the time. Will add later.
        df = df.drop(labels=df.columns[0], axis=1)

        # Extract the file name (excluding the extension) to use as a prefix
        file_name = os.path.splitext(file)[0]

        # Add a prefix to each column based on the file name
        df = df.add_prefix(f'{file_name}_')

        # Concatenate the current DataFrame with the combined DataFrame
        combined_df = pd.concat([combined_df, df], axis=1)

    df = pd.read_csv(file_path)
    combined_df = pd.concat([df['time'], combined_df], axis=1)
    combined_df.loc[:, 'test_condition'] = tmp_path

    return combined_df

def load_data(path_header, folder_path):
    df = pd.DataFrame()
    for tmp_path in folder_path:
        # path = path_header + tmp_path
        path = path_header + '/' + tmp_path
        tmp_df = combine_csv(path, tmp_path)
        df = pd.concat([df, tmp_df])
        df = df.reset_index(drop=True)
    return df

Next step is to be able to preprocess the data we are now able to read. The following functions allow datasets preparation and augmentation for machine learning models :

- `separate_features_and_target_variables`: This function takes a DataFrame (`df`), a list of training feature names (`training_features`), and the name of the target variable (`target_variable`). It separates the features and target variables from the DataFrame and returns them as a tuple.

- `delete_outliers`: This function identifies and removes outliers from a DataFrame (`X`) based on the z-scores of its columns. It calculates the z-scores, finds the rows where the z-scores are greater than a threshold value (`threshold`), and deletes those rows from the DataFrame. It also separates the features and target variables from the modified DataFrame and returns them.

- `add_1st_derivative_features`: This function calculates the first derivative of each column in a DataFrame (`X`) and adds the derivative features to the DataFrame. It iterates over the columns, calculates the first derivative using the `np.gradient` function, and appends the derivative features to a new DataFrame. Finally, it concatenates the new DataFrame with the original DataFrame and returns the combined DataFrame.

- `oversample_minority_class`: This function performs oversampling of the minority class using the Synthetic Minority Over-sampling Technique (SMOTE). It takes the training data (`X_train`) and training labels (`y_train`) as input, creates an SMOTE object, fits it on the training data and labels, and generates synthetic samples to balance the class distribution. It returns the oversampled training data and labels.

- `smoothen_data`: This function applies a moving average to smoothen the data in a DataFrame (`X`). It uses the `rolling` method to calculate the moving average with a specified window size (`window_size`). The smoothened data is returned as a new DataFrame.

In [3]:
def separate_features_and_target_variables(df, training_features, target_variable):
    """
    Separate the features and target variables.
    :param df: The DataFrame containing the features and target variables
    :param training_features: The names of the training features
    :param target_variable: The name of the target variable
    :return: A tuple containing the features DataFrame and the target variable Series
    """
    X = df[training_features]
    y = df[target_variable]

    return X, y

def delete_outliers(X, y, threshold=3):
    """
    Delete outliers from the DataFrame.
    :param df: The DataFrame containing the features and target variables
    :param threshold: The threshold used to identify outliers
    :return: The DataFrame with the outliers removed
    """
    print(y.value_counts())

    # Merge X and y
    df = pd.concat([X, y], axis=1)

    # Calculate the z-scores for each column
    z_scores = np.abs((df - df.mean()) / df.std())

    # Find the rows whose z-scores are greater than the threshold
    outlier_indices = np.where(z_scores > threshold)[0]

    # Delete the rows whose z-scores are greater than the threshold, with reindexing
    df = df.drop(outlier_indices, axis=0).reset_index(drop=True)

    # Separate the features and target variables
    X = df.drop(y.name, axis=1)
    y = df[y.name]
    print(y.value_counts())
    return X, y

def add_1st_derivative_features(X):
    """
    Add derivative features to the training and test data.
    :param X: dataframe in pandas format
    :return: Training and test data with the derivative features added
    """
    # Create a new DataFrame to store the derivative features
    X_new = pd.DataFrame()

    # Iterate over the columns in the DataFrame
    for col in X.columns:
        # Calculate the first derivative of the column
        first_derivative = np.gradient(X[col])

        # Create a new column name for the first derivative
        first_derivative_name = col + '_1st_der'

        # Add the first derivative to the new DataFrame
        X_new[first_derivative_name] = first_derivative

    # Concatenate the new DataFrame with the original DataFrame
    X_new = pd.concat([X, X_new], axis=1)

    return X_new

def oversample_minority_class(X_train, y_train):
    """
    Oversample the minority class using SMOTE.
    :param X_train: Training data
    :param y_train: Training labels
    :return: Oversampled training data and training labels
    """
    # Create an SMOTE object
    sm = SMOTE(random_state=42)

    # Fit the SMOTE object to the training data and labels
    X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, y_train

def smoothen_data(X, window_size):
    """
    Smoothen the data using a moving average.
    :param X: Training or test data
    :param window_size: Window size for the moving average
    :return: Smoothened training or test data
    """
    X_smoothen = X.rolling(window=window_size, min_periods=1).mean()
    return X_smoothen

## Training the models

We've taken the decision to train different models regarding on the motor we're monitoring. It means that in the end, we'll run at least 6 parallel models for failure detection. Each model will be trained to detect the failure of exactly one motor.

### Define the training and target features

For now, we know that each instance gathers 18 features : 3 (temperature, voltage, position) for each one of the 6 motors. We are not sure whether it is necessary to keep these 18 features in our models. This is why we are building different sets of input features :
- `temperature` : All temperature values from the 6 motors (6 features).
- `voltage` : All voltage values from the 6 motors (6 features).
- `position` : All position values from the 6 motors (6 features).
- `solo_motor` : The features relative to the current motor (3 features).
- `solo_temperature` : The temperature of the current motor (1 feature).
- `all_num` : All features at the same time (18 features).

We'll train the models on these 6 different sets of input features.

Of course, the target feature of the model will be the `label` of the motor matching the model.

In [4]:
training_features = dict()
training_features["temperature"] = list()
training_features["voltage"] = list()
training_features["position"] = list()
training_features["solo_motor"] = dict()
training_features["solo_temperature"] = dict()
target_feature = dict()

for i in range(1, 7):
    training_features["temperature"].append("data_motor_{}_temperature".format(i))
    training_features["voltage"].append("data_motor_{}_voltage".format(i))
    training_features["position"].append("data_motor_{}_position".format(i))
    training_features["solo_motor"][i] = list()
    training_features["solo_motor"][i].append("data_motor_{}_temperature".format(i))
    training_features["solo_motor"][i].append("data_motor_{}_voltage".format(i))
    training_features["solo_motor"][i].append("data_motor_{}_position".format(i))
    training_features["solo_temperature"][i] = list()
    training_features["solo_temperature"][i].append("data_motor_{}_temperature".format(i))
    target_feature[i] = "data_motor_{}_label".format(i)

training_features["all_num"] = training_features["temperature"] + training_features["voltage"] + training_features["position"]

### Select and read the training data

We are able to choose the training and testing dataset among the data we collected on the actual robot. We'll define the testing datasets later in this notebook, but we make sure these two datasets do not overlap : if so, the test results will be overly optimistic, as the model was using the same data during training already.

As the selection is done, we read and store the available data in Pandas DataFrames.

In [5]:
# Select the data to use for training
path_training = [
    'motor1_group2', 'motor2_group2', 'motor3_group2',
    'motor4_group2', 'motor5_group2', 'motor6_group2',
    'static_with_fault_1', 'static_with_fault_2', 'static_with_fault_3', 
    'static_with_fault_4', 'static_with_fault_5', 'static_with_fault_6',
    # 'steady_state_after_movement', 'steady_state_not_moving',
]

path_header = os.path.abspath('../data_collection/collected_data/')

# Load the data
df = load_data(path_header, path_training)

### Preprocess the data and find the best models

First, let's choose which classifiers we'll train the data on, and which input features we'll use.

In [15]:
# Create a list of classifiers to evaluate
classifiers = [
    ('LogReg', LogisticRegression(class_weight='balanced', max_iter=1000)),
    ('SVM', SVC(class_weight='balanced')),
    # ('Decision Tree', DecisionTreeClassifier(class_weight='balanced')),
    # ('Random Forest', RandomForestClassifier(class_weight='balanced')),
    # Add more classifiers here
]

# Define hyperparameters for grid search for each classifier
param_grids = [
    {'C': np.logspace(-1, 1, 5)},  # Hyperparameters for Logistic Regression
    {'C': np.logspace(-1, 1, 5), 'gamma': np.logspace(-1, 1, 5), 'kernel': ['poly'], 'degree': [2,3]}, # Hyperparameters for SVM
    # {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4]},  # Hyperparameters for Decision Tree
    # {'n_estimators': [10, 50, 100, 200], 'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4]} # Hyperparameters for Random Forest
    # Add more hyperparameters for other classifiers here
]

# Create the "models" directory if it doesn't exist
if not os.path.exists("models"):
     os.makedirs("models")

training_methods = ["all_num",
                    "solo_motor",
                    "temperature",
                    "solo_temperature",
                    # "voltage",
                    # "position",
                    ]

Now, let's train the models we chose on the training dataset.

In the process of selecting the best model, we'll not only optimize on the best F1 Score a model can get, but also on the hyperparameters, using a common strategy called grid search. Per se, grid search is a tuning technique that attempts to compute the optimum values of hyperparameters exhaustively.

Once the model is trained, we'll validate it using a part of the training dataset the model wasn't trained on. We'll store the obtained results in a Pandas DataFrame called `val_metrics`. We'll look into it right after.

In [7]:
# Create an empty list to store the results (evaluation metrics) for each classifier
val_metrics = pd.DataFrame(data=[], columns=['Motor', 'Training Features', 'Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Ignore warnings
warnings.filterwarnings('ignore')

for chosen_training_method in training_methods:

    # Select the data to use for training and testing
    X, y = dict(), dict()
    X_train, X_val, y_train, y_val = dict(), dict(), dict(), dict()

    for n_motor in range(1, 7):
        chosen_target_feature = target_feature[n_motor]
        if chosen_training_method == "solo_motor" or chosen_training_method == "solo_temperature":
            chosen_training_features = training_features[chosen_training_method][n_motor]
        else:
            chosen_training_features = training_features[chosen_training_method]
        
        X[n_motor], y[n_motor] = separate_features_and_target_variables(df, chosen_training_features, chosen_target_feature)

        # Delete outliers
        # X[n_motor], y[n_motor] = delete_outliers(X[n_motor], y[n_motor], threshold = 3)
        # It won't work because deleting outliers means keeping only the unfailed data
        # So we prefer to keep the outliers
        
        # Smoothen the data
        X[n_motor] = smoothen_data(X[n_motor], window_size=5)
        
        # Add derivative features
        X[n_motor] = add_1st_derivative_features(X[n_motor])

        # Oversample the minority class
        X[n_motor], y[n_motor] = oversample_minority_class(X[n_motor], y[n_motor])

        # Split the data into training and validating sets
        X_train[n_motor], X_val[n_motor], y_train[n_motor], y_val[n_motor] = train_test_split(X[n_motor], y[n_motor], test_size=0.2, random_state=42)

        # Iterate over the classifiers and perform grid search
        for classifier, param_grid in zip(classifiers, param_grids):

            # Rename the param_grid keys with the classifier name
            param_grid = {f'{classifier[0]}__{key}': value for key, value in param_grid.items()}

            # Create a pipeline with Standardization and the current classifier
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                classifier
            ])

            # Use GridSearchCV to find the best hyperparameters and fit the pipeline
            grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose = 1)
            grid_search.fit(X_train[n_motor], y_train[n_motor])

            # Use grid_search.predict to make predictions on the testing dataset
            y_pred = grid_search.predict(X_val[n_motor])

            # Compute evaluation metrics
            conf_matrix = confusion_matrix(y_val[n_motor], y_pred)
            accuracy = accuracy_score(y_val[n_motor], y_pred)
            precision = precision_score(y_val[n_motor], y_pred)
            recall = recall_score(y_val[n_motor], y_pred)
            f1 = f1_score(y_val[n_motor], y_pred)


            # Store the results in a Pandas DataFrame
            new_row = pd.DataFrame({
                'Motor': n_motor,
                'Training Features': chosen_training_method,
                'Classifier': classifier[0],
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1
            }, index=[0])
            val_metrics = pd.concat([val_metrics, new_row], ignore_index=True)

            # Save the best model from grid search
            best_model = grid_search.best_estimator_
            best_model_name = f'best_{classifier[0]}_motor_{n_motor}_{chosen_training_method}.model'

            # Create the file
            file_path = os.path.join("models", best_model_name)
            open(file_path, 'w').close()
            joblib.dump(best_model, file_path)

# Save the validation metrics to a CSV file
file_path = os.path.join("models", 'val_metrics.csv')
val_metrics.to_csv(file_path, index=False)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates

### Explore the validation metrics

We previously stored the `val_metrics` DataFrame in a `csv` file in the same folder as the dumped models. Let's open it and look at the metrics we got for a specific type of classifier and input features :

In [8]:
file_path = os.path.join("models", 'val_metrics.csv')
val_metrics = pd.read_csv(file_path)

classifier_name = "LogReg"
training_method = "all_num"

best_val_metric = val_metrics.loc[(val_metrics['Classifier'] == classifier_name) & (val_metrics['Training Features'] == training_method)]

print(f'Best model metrics using {training_method} method and {classifier_name} classifier, for each motor (VALIDATION DATASET):')

best_val_metric

Best model metrics using all_num method and LogReg classifier, for each motor (VALIDATION DATASET):


Unnamed: 0,Motor,Training Features,Classifier,Accuracy,Precision,Recall,F1 Score
0,1,all_num,LogReg,0.989828,0.980092,1.0,0.989946
1,2,all_num,LogReg,0.993696,0.987607,1.0,0.993765
2,3,all_num,LogReg,0.982157,0.968603,0.995964,0.982093
3,4,all_num,LogReg,0.917846,0.88287,0.960956,0.920259
4,5,all_num,LogReg,0.95173,0.94918,0.952303,0.950739
5,6,all_num,LogReg,0.959904,0.933852,0.987654,0.96


## Testing the models

### Select and read the testing data

Let's define on which data we'll test our models. Remember that the training and testing datasets must not overlap, otherwise the test won't be fair.

In [9]:
# Select the data to use for testing
path_test = [
    'task_fault',
    'fault 1', 'fault 2', 'fault 3', 'fault 4', 'fault 5', 'fault 6'
]

path_header = os.path.abspath('../data_collection/collected_data/')

# Load the data
df_test = load_data(path_header, path_test)

### Actual test of the models

Here, we load one model at a time, as they are stored in a specific folder. If the model is unavailable, we print a corresponding message to warn the user.
As we did with the validation dataset, we store the obtained metrics in a Pandas DataFrame called `test_metrics`, in which we'll look into a few cells further.

In [18]:
test_metrics = pd.DataFrame(columns=['Motor', 'Training Features', 'Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

for chosen_training_method in training_methods:
    X_test, y_test = dict(), dict()
    for n_motor in range(1, 7):
        chosen_target_feature = target_feature[n_motor]
        if chosen_training_method == "solo_motor" or chosen_training_method == "solo_temperature":
            chosen_training_features = training_features[chosen_training_method][n_motor]
        else:
            chosen_training_features = training_features[chosen_training_method]
        
        X_test[n_motor], y_test[n_motor] = separate_features_and_target_variables(df_test, chosen_training_features, chosen_target_feature)
        
        # Preprocess the test data
        X_test[n_motor] = smoothen_data(X_test[n_motor], window_size=5)
        X_test[n_motor] = add_1st_derivative_features(X_test[n_motor])
        
        # Iterate over the classifiers
        for classifier in classifiers:
            classifier_name = classifier[0]
            best_model_name = f'best_{classifier_name}_motor_{n_motor}_{chosen_training_method}.model'
            file_path = os.path.join("models", best_model_name)
            try:
                best_model = joblib.load(file_path)

                # Make predictions on the test data
                y_pred = best_model.predict(X_test[n_motor])
            
                # Compute evaluation metrics
                conf_matrix = confusion_matrix(y_test[n_motor], y_pred)
                accuracy = accuracy_score(y_test[n_motor], y_pred)
                precision = precision_score(y_test[n_motor], y_pred)
                recall = recall_score(y_test[n_motor], y_pred)
                f1 = f1_score(y_test[n_motor], y_pred)
                
                # Create a new row for the test metrics dataframe
                new_row = pd.DataFrame({
                    'Motor': n_motor,
                    'Training Features': chosen_training_method,
                    'Classifier': classifier_name,
                    'Accuracy': accuracy,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 Score': f1,
                }, index = [0])
                
                # Add the new row to the test metrics dataframe
                test_metrics = pd.concat([test_metrics, new_row], ignore_index=True)
            except FileNotFoundError:
                print("No model for {} motor {} using {} method".format(classifier_name, n_motor, chosen_training_method))

# Save the test metrics to a CSV file
file_path = os.path.join("models", 'test_metrics.csv')
test_metrics.to_csv(file_path, index=False)

No model for SVM motor 1 using solo_motor method
No model for SVM motor 2 using solo_motor method
No model for SVM motor 3 using solo_motor method
No model for SVM motor 4 using solo_motor method
No model for SVM motor 5 using solo_motor method
No model for SVM motor 6 using solo_motor method
No model for SVM motor 1 using temperature method
No model for SVM motor 2 using temperature method
No model for SVM motor 3 using temperature method
No model for SVM motor 4 using temperature method
No model for SVM motor 5 using temperature method
No model for SVM motor 6 using temperature method
No model for SVM motor 1 using solo_temperature method
No model for SVM motor 2 using solo_temperature method
No model for SVM motor 3 using solo_temperature method
No model for SVM motor 4 using solo_temperature method
No model for SVM motor 5 using solo_temperature method
No model for SVM motor 6 using solo_temperature method


### Explore the testing metrics

This is a similar exploration as we did with the validation metrics earlier.

In [19]:
file_path = os.path.join("models", 'test_metrics.csv')
test_metrics = pd.read_csv(file_path)

In [23]:
classifier_name = "LogReg"
training_method = "temperature"

best_test_metric = test_metrics.loc[(test_metrics['Classifier'] == classifier_name) & (test_metrics['Training Features'] == training_method)]

print(f'Best model metrics using {training_method} method and {classifier_name} classifier, for each motor (TESTING DATASET):')

best_test_metric

Best model metrics using temperature method and LogReg classifier, for each motor (TESTING DATASET):


Unnamed: 0,Motor,Training Features,Classifier,Accuracy,Precision,Recall,F1 Score
18,1,temperature,LogReg,0.936202,0.639262,1.0,0.779939
19,2,temperature,LogReg,0.826113,0.0,0.0,0.0
20,3,temperature,LogReg,0.8727,0.455108,0.3675,0.406639
21,4,temperature,LogReg,0.839763,0.439024,0.626087,0.516129
22,5,temperature,LogReg,0.739169,0.0,0.0,0.0
23,6,temperature,LogReg,0.829674,0.0,0.0,0.0


### Finding the best classification models

Among all the models we generated and trained, now is the time to choose the best. In order to achieve this goal, we'll take a look at the metrics of our models on the testing dataset, and choose the model having the best F1 Score for each motor.

The output DataFrame summarizes the best models we got, based on the following choices we made:
- The available classifiers
- Their possible hyperparameters
- The available input features
- The training dataset
- The testing dataset

In [22]:
best_models = pd.DataFrame(columns=['Motor', 'Training Features', 'Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

for n_motor in test_metrics["Motor"].unique():
    try:
        current_best_models = test_metrics.loc[(test_metrics["Motor"] == n_motor)]
        current_id_for_f1_max = current_best_models['F1 Score'].idxmax()
        current_best_model = test_metrics.loc[current_id_for_f1_max]
        current_best_model = pd.DataFrame({
            'Motor': current_best_model['Motor'],
            'Training Features': current_best_model['Training Features'],
            'Classifier': current_best_model['Classifier'],
            'Accuracy': current_best_model['Accuracy'],
            'Precision': current_best_model['Precision'],
            'Recall': current_best_model['Recall'],
            'F1 Score': current_best_model['F1 Score']
        }, index=[0])
        best_models = pd.concat([current_best_model, best_models], ignore_index=True)
    except:
        print(f"No model for motor {n_motor}")

# Sort the best models by motor number
best_models = best_models.sort_values('Motor')

# Save the best models to a CSV file
file_path = os.path.join("models", 'best_models.csv')
best_models.to_csv(file_path, index=False)

best_models

Unnamed: 0,Motor,Training Features,Classifier,Accuracy,Precision,Recall,F1 Score
5,1,temperature,LogReg,0.936202,0.639262,1.0,0.779939
4,2,solo_motor,LogReg,0.604154,0.043902,0.061433,0.051209
3,3,solo_motor,LogReg,0.8,0.331695,0.675,0.444811
2,4,all_num,LogReg,0.90178,0.644295,0.626087,0.635061
1,5,solo_temperature,LogReg,0.814837,0.427523,1.0,0.598972
0,6,solo_motor,LogReg,0.539763,0.209524,0.727626,0.325359
