## Install and import

In [1]:
# %pip install imblearn

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib
import warnings

## Define the useful functions

The functions `combine_csv` and `load_data` are used to automate and simplify the process of loading and combining data from multiple CSV files located in different directories.

In summary:
- `combine_csv` : combines CSV files from a specified folder into a single DataFrame
- `load_data` : loads data from multiple folders and combines them into a single DataFrame.
These functions provide a convenient way to process and combine data from multiple CSV files and folders.

In [2]:
def combine_csv(folder_path, tmp_path):
    """
    Combine all CSV files in a folder into a single DataFrame.
    :param folder_path: Path to the folder containing the CSV files
    :param seq_idx: Sequence index
    :param label: Label of the sequence (Normal - 0, Abnormal - 1)
    :return: A single DataFrame containing all the data from the CSV files
    """

    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Create an empty DataFrame to store the combined data
    combined_df = pd.DataFrame()

    # Iterate over the CSV files in the folder
    for file in csv_files:
        # Construct the full path to each CSV file
        file_path = os.path.join(folder_path, file)

        # Read each CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Drop the time. Will add later.
        df = df.drop(labels=df.columns[0], axis=1)

        # Extract the file name (excluding the extension) to use as a prefix
        file_name = os.path.splitext(file)[0]

        # Add a prefix to each column based on the file name
        df = df.add_prefix(f'{file_name}_')

        # Concatenate the current DataFrame with the combined DataFrame
        combined_df = pd.concat([combined_df, df], axis=1)

    df = pd.read_csv(file_path)
    combined_df = pd.concat([df['time'], combined_df], axis=1)
    combined_df.loc[:, 'test_condition'] = tmp_path

    return combined_df

def load_data(path_header, folder_path):
    df = pd.DataFrame()
    for tmp_path in folder_path:
        # path = path_header + tmp_path
        path = path_header + '/' + tmp_path
        tmp_df = combine_csv(path, tmp_path)
        df = pd.concat([df, tmp_df])
        df = df.reset_index(drop=True)
    return df

In [3]:
def separate_features_and_target_variables(df, training_features, target_variable):
    """
    Separate the features and target variables.
    :param df: The DataFrame containing the features and target variables
    :param training_features: The names of the training features
    :param target_variable: The name of the target variable
    :return: A tuple containing the features DataFrame and the target variable Series
    """
    X = df[training_features]
    y = df[target_variable]

    return X, y

def delete_outliers(X, y, threshold=3):
    """
    Delete outliers from the DataFrame.
    :param df: The DataFrame containing the features and target variables
    :param threshold: The threshold used to identify outliers
    :return: The DataFrame with the outliers removed
    """
    print(y.value_counts())

    # Merge X and y
    df = pd.concat([X, y], axis=1)

    # Calculate the z-scores for each column
    z_scores = np.abs((df - df.mean()) / df.std())

    # Find the rows whose z-scores are greater than the threshold
    outlier_indices = np.where(z_scores > threshold)[0]

    # Delete the rows whose z-scores are greater than the threshold, with reindexing
    df = df.drop(outlier_indices, axis=0).reset_index(drop=True)

    # Separate the features and target variables
    X = df.drop(y.name, axis=1)
    y = df[y.name]
    print(y.value_counts())
    return X, y

def add_1st_derivative_features(X):
    """
    Add derivative features to the training and test data.
    :param X: dataframe in pandas format
    :return: Training and test data with the derivative features added
    """
    # Create a new DataFrame to store the derivative features
    X_new = pd.DataFrame()

    # Iterate over the columns in the DataFrame
    for col in X.columns:
        # Calculate the first derivative of the column
        first_derivative = np.gradient(X[col])

        # Create a new column name for the first derivative
        first_derivative_name = col + '_1st_der'

        # Add the first derivative to the new DataFrame
        X_new[first_derivative_name] = first_derivative

    # Concatenate the new DataFrame with the original DataFrame
    X_new = pd.concat([X, X_new], axis=1)

    return X_new

def oversample_minority_class(X_train, y_train):
    """
    Oversample the minority class using SMOTE.
    :param X_train: Training data
    :param y_train: Training labels
    :return: Oversampled training data and training labels
    """
    # Create an SMOTE object
    sm = SMOTE(random_state=42)

    # Fit the SMOTE object to the training data and labels
    X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, y_train

def smoothen_data(X, window_size):
    """
    Smoothen the data using a moving average.
    :param X: Training or test data
    :param window_size: Window size for the moving average
    :return: Smoothened training or test data
    """
    X_smoothen = X.rolling(window=window_size, min_periods=1).mean()
    return X_smoothen

## Preprocessing of the data

### Define the training and target features

In [4]:
training_features = dict()
training_features["temperature"] = list()
training_features["voltage"] = list()
training_features["position"] = list()
training_features["solo_motor"] = dict()
target_feature = dict()

for i in range(1, 7):
    training_features["temperature"].append("data_motor_{}_temperature".format(i))
    training_features["voltage"].append("data_motor_{}_voltage".format(i))
    training_features["position"].append("data_motor_{}_position".format(i))
    training_features["solo_motor"][i] = list()
    training_features["solo_motor"][i].append("data_motor_{}_temperature".format(i))
    training_features["solo_motor"][i].append("data_motor_{}_voltage".format(i))
    training_features["solo_motor"][i].append("data_motor_{}_position".format(i))
    target_feature[i] = "data_motor_{}_label".format(i)

training_features["all_num"] = training_features["temperature"] + training_features["voltage"] + training_features["position"]

### Select and read the data

In [6]:
# Select the data to use for training and testing
path_training = [
    'static_with_fault_1', 'static_with_fault_2', 'static_with_fault_3', 
    'static_with_fault_4', 'static_with_fault_5', 'static_with_fault_6', 
    'steady_state_after_movement', 'steady_state_not_moving',
]
path_test = [
    'task_fault',
]
path_header = os.path.abspath('../data_collection/collected_data/')

# Load the data
df = load_data(path_header, path_training)
df_test = load_data(path_header, path_test)

### Preprocess the data and find the best models

In [7]:
warnings.filterwarnings('ignore')

# Create a list of classifiers to evaluate
classifiers = [
    ('LogReg', LogisticRegression(class_weight='balanced', max_iter=1000)),
    # ('SVM', SVC(class_weight='balanced')),
    # ('Decision Tree', DecisionTreeClassifier(class_weight='balanced')),
    # ('Random Forest', RandomForestClassifier(class_weight='balanced')),
    # Add more classifiers here
]

# Define hyperparameters for grid search for each classifier
param_grids = [
    {'C': np.logspace(-1, 1, 5)},  # Hyperparameters for Logistic Regression
    # {'C': np.logspace(-1, 1, 5), 'gamma': np.logspace(-1, 1, 5), 'kernel': ['poly'], 'degree': [2,3]}, # Hyperparameters for SVM
    # {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4]},  # Hyperparameters for Decision Tree
    # {'n_estimators': [10, 50, 100, 200], 'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4]} # Hyperparameters for Random Forest
    # Add more hyperparameters for other classifiers here
]

# Create an empty list to store the results (evaluation metrics) for each classifier
val_metrics = dict()

# Create the "models" directory if it doesn't exist
if not os.path.exists("models"):
     os.makedirs("models")

for chosen_training_method in ["all_num", "solo_motor", "temperature", "voltage", "position"]:

    # Initialize the results dictionary for the current training method
    val_metrics[chosen_training_method] = dict()

    # Select the data to use for training and testing
    X, y = dict(), dict()
    X_train, X_val, y_train, y_val = dict(), dict(), dict(), dict()

    for n_motor in range(1, 7):
        chosen_target_feature = target_feature[n_motor]
        if chosen_training_method == "solo_motor":
            chosen_training_features = training_features[chosen_training_method][n_motor]
        else:
            chosen_training_features = training_features[chosen_training_method]
        
        X[n_motor], y[n_motor] = separate_features_and_target_variables(df, chosen_training_features, chosen_target_feature)

        # Delete outliers
        # X[n_motor], y[n_motor] = delete_outliers(X[n_motor], y[n_motor], threshold = 3)
        # It won't work because deleting outliers means keeping only the unfailed data
        # So we prefer to keep the outliers
        
        # Smoothen the data
        X[n_motor] = smoothen_data(X[n_motor], window_size=5)
        
        # Add derivative features
        X[n_motor] = add_1st_derivative_features(X[n_motor])

        # Oversample the minority class
        X[n_motor], y[n_motor] = oversample_minority_class(X[n_motor], y[n_motor])

        # Split the data into training and validating sets
        X_train[n_motor], X_val[n_motor], y_train[n_motor], y_val[n_motor] = train_test_split(X[n_motor], y[n_motor], test_size=0.2, random_state=42)

        # Initialize the results dictionary for the current motor
        val_metrics[chosen_training_method][n_motor] = list()

        # Iterate over the classifiers and perform grid search
        for classifier, param_grid in zip(classifiers, param_grids):
            
            # Rename the param_grid keys with the classifier name
            param_grid = {f'{classifier[0]}__{key}': value for key, value in param_grid.items()}

            # Create a pipeline with Standardization and the current classifier
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                classifier
            ])

            # Use GridSearchCV to find the best hyperparameters and fit the pipeline
            grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose = 1)
            grid_search.fit(X_train[n_motor], y_train[n_motor])

            # Use grid_search.predict to make predictions on the testing dataset
            y_pred = grid_search.predict(X_val[n_motor])

            # Compute evaluation metrics
            conf_matrix = confusion_matrix(y_val[n_motor], y_pred)
            accuracy = accuracy_score(y_val[n_motor], y_pred)
            precision = precision_score(y_val[n_motor], y_pred)
            recall = recall_score(y_val[n_motor], y_pred)
            f1 = f1_score(y_val[n_motor], y_pred)

            # Store the results in a dictionary
            val_result = {
                'Classifier': classifier[0],
                'Best Parameters': grid_search.best_params_,
                'Confusion Matrix': conf_matrix,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1
            }

            # Append the result to the list of results
            val_metrics[chosen_training_method][n_motor].append(val_result)

            # Save the best model from grid search
            best_model = grid_search.best_estimator_
            best_model_name = f'best_{classifier[0]}_motor_{n_motor}_{chosen_training_method}.model'

            # Create the file
            file_path = os.path.join("models", best_model_name)
            open(file_path, 'w').close()
            joblib.dump(best_model, file_path)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: 

In [7]:
training_method = "all_num"
metric = "F1 Score"

print([val_metrics[training_method][i][0][metric] for i in range(1, 7)])

IndexError: list index out of range

In [8]:
test_metrics = dict()

for chosen_training_method in ["all_num", "solo_motor", "temperature", "voltage", "position"]:
    test_metrics[chosen_training_method] = dict()
    X_test, y_test = dict(), dict()
    for n_motor in range(1, 7):
        chosen_target_feature = target_feature[n_motor]
        if chosen_training_method == "solo_motor":
            chosen_training_features = training_features[chosen_training_method][n_motor]
        else:
            chosen_training_features = training_features[chosen_training_method]
        
        X_test[n_motor], y_test[n_motor] = separate_features_and_target_variables(df_test, chosen_training_features, chosen_target_feature)
        
        # Load the best model from grid search
        best_model_name = f'best_LogReg_motor_{n_motor}_{chosen_training_method}.model'
        file_path = os.path.join("models", best_model_name)
        best_model = joblib.load(file_path)
        
        # Preprocess the test data
        X_test[n_motor] = smoothen_data(X_test[n_motor], window_size=5)
        X_test[n_motor] = add_1st_derivative_features(X_test[n_motor])
        
        # Make predictions on the test data
        y_pred = best_model.predict(X_test[n_motor])
        
        # Initialize the results dictionary for the current motor
        test_metrics[chosen_training_method][n_motor] = list()

        # Compute evaluation metrics
        conf_matrix = confusion_matrix(y_test[n_motor], y_pred)
        accuracy = accuracy_score(y_test[n_motor], y_pred)
        precision = precision_score(y_test[n_motor], y_pred)
        recall = recall_score(y_test[n_motor], y_pred)
        f1 = f1_score(y_test[n_motor], y_pred)
        
        # Store the results in a dictionary
        test_result = {
            'Classifier': 'LogReg',
            'Confusion Matrix': conf_matrix,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }
        
        # Append the result to the list of results
        test_metrics[chosen_training_method][n_motor].append(test_result)

In [9]:
training_method = "solo_motor"
metric = "F1 Score"

print([test_metrics[training_method][i][0][metric] for i in range(1, 7)])

best_training_methods = dict()
for n_motor in range(1, 7):
    best_f1_score = 0
    best_training_method = ""

    for training_method in test_metrics.keys():
        f1 = test_metrics[training_method][n_motor][0]["F1 Score"]
        if f1 > best_f1_score:
            best_f1_score = f1
            best_training_method = training_method

    best_training_methods[n_motor] = [best_training_method, best_f1_score]

df_best_training_methods = pd.DataFrame(best_training_methods.values(), index=best_training_methods.keys(), columns=["Best Training Method", "Best F1 Score"])
print(df_best_training_methods)


[0.0, 0.0, 0.35007610350076107, 0.25384615384615383, 0.0, 0.0]
  Best Training Method  Best F1 Score
1                            0.000000
2                            0.000000
3              all_num       0.391823
4              all_num       0.578231
5                            0.000000
6                            0.000000
