In [22]:
# Requirements
%pip install pandas
%pip install numpy
%pip install sklearn
%pip install xgboost
%pip install lightgbm
%pip install scikit-optimize

# Imports
import pandas as pd
import numpy as np
import sklearn as sk
import xgboost as xgb
import lightgbm as lgb
import skopt as so

from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt import Real, Categorical, Integer

In [23]:
# Rolling Window calc function
def add_rolling_features(df, state_col, load_col, speed_col, sensors, window_sizes):
    """
    Add per-group rolling statistics of each sensor as new feature columns.

    Rows are grouped by (state_col, load_col, speed_col). Within every group a
    trailing window of each requested size is applied to each sensor column,
    following the existing row order, and the mean, min, max, median and
    standard deviation are stored in columns named '<sensor>_<stat><window>'
    (for example 'sensor1_mean25').

    Parameters:
    - df: The input DataFrame. It is not modified.
    - state_col: The column containing the state of the machine.
    - load_col: The column containing the load on the machine.
    - speed_col: The column containing the speed of the machine.
    - sensors: A list of columns containing sensor data.
    - window_sizes: A list of window sizes (in rows) to calculate statistics for.

    Returns:
    - A new DataFrame: a copy of df with the rolling statistics added as new columns.

    Notes:
    - min_periods=1 is used, so windows at the start of a group are computed
      from the rows available so far. 'std' is still NaN on the first row of
      each group, because a single sample has no sample standard deviation.
    """
    # Names of the Rolling methods to evaluate; each one also becomes part of the column name
    stats = ('mean', 'min', 'max', 'median', 'std')

    # Work on a copy so the caller's frame is left untouched
    result = df.copy()
    group_keys = [state_col, load_col, speed_col]

    for sensor in sensors:
        # The grouping only depends on the sensor, so build it once per sensor
        grouped = df.groupby(group_keys)[sensor]
        for window in window_sizes:
            for stat in stats:
                # transform() returns values aligned row-for-row with the input frame
                result[f'{sensor}_{stat}{window}'] = grouped.transform(
                    lambda values: getattr(values.rolling(window, min_periods=1), stat)()
                )

    return result

def train_and_optimize_model(X, y, model, param_grid, cross_val, num_iter, sc, num_jobs, rnd_state):
    """
    Tunes a model's hyperparameters with BayesSearchCV under cross-validation.

    Parameters:
    - X, y: Training data and labels, passed to the search's fit().
    - model: Estimator handed to BayesSearchCV as `estimator`.
    - param_grid: Search space handed to BayesSearchCV as `search_spaces`.
    - cross_val: Cross-validation strategy (`cv`).
    - num_iter: Number of search iterations (`n_iter`).
    - sc: Metric used to evaluate the candidates (`scoring`).
    - num_jobs: Number of jobs to run in parallel (`n_jobs`).
    - rnd_state: Seed for the search (`random_state`).

    Returns:
    - A dictionary containing:
      - 'best_model': The search's best_estimator_.
      - 'best_score': The search's best_score_.
      - 'best_params': The search's best_params_.
      - 'cv_results': The search's cv_results_.
    """
    # Collect the search configuration first, then build the optimizer from it
    search_settings = dict(
        estimator=model,
        search_spaces=param_grid,
        scoring=sc,
        n_iter=num_iter,
        cv=cross_val,
        n_jobs=num_jobs,
        random_state=rnd_state,
    )
    optimizer = BayesSearchCV(**search_settings)
    optimizer.fit(X, y)

    # Map each output key to the fitted-search attribute it reports
    reported_attributes = (
        ('best_model', 'best_estimator_'),
        ('best_score', 'best_score_'),
        ('best_params', 'best_params_'),
        ('cv_results', 'cv_results_'),
    )
    return {key: getattr(optimizer, attribute) for key, attribute in reported_attributes}

In [24]:
# Prepare data
# Load datasets (one CSV per gear condition)
missing_tooth = pd.read_csv('missing_tooth.csv')
tooth_chipped_fault = pd.read_csv('tooth_chipped_fault.csv')
surface_fault = pd.read_csv('surface_fault.csv')
no_fault = pd.read_csv('no_fault.csv')
root_crack = pd.read_csv('root_crack.csv')
eccentricity = pd.read_csv('eccentricity.csv')

# Concatenate datasets. ignore_index=True gives every row a unique label;
# without it each file's own 0..n-1 index is repeated in the combined frame.
df = pd.concat(
    [
        no_fault,
        missing_tooth,
        tooth_chipped_fault,
        surface_fault,
        root_crack,
        eccentricity,
    ],
    ignore_index=True,
)

# Create a new dataset to work with and adjust, so the raw frame stays available
df_work = df.copy()

# Parse time_x and express time as seconds elapsed since the first sample
# of each (gear_fault_desc, load_value, speedSet) group
df_work['time_x'] = pd.to_datetime(df_work['time_x'])
df_work['time_normalized'] = df_work.groupby(['gear_fault_desc', 'load_value', 'speedSet'])['time_x'].transform(lambda x: (x - x.min()).dt.total_seconds())
df_work = df_work.drop(columns=['time_x'])

# Encode gear_fault_desc as an integer class label: the position of the
# description in fault_labels
fault_labels = ['No fault', 'missing tooth', 'chipped tooth', 'surface defect', 'Root crack', 'eccentricity']
fault_codes = {label: code for code, label in enumerate(fault_labels)}
state = df_work['gear_fault_desc'].map(fault_codes)
if state.isna().any():
    # Fail loudly on labels outside fault_labels instead of producing NaN classes
    unknown_labels = df_work.loc[state.isna(), 'gear_fault_desc'].unique().tolist()
    raise ValueError(f'Unexpected gear_fault_desc values: {unknown_labels}')
df_work['state'] = state.astype('int64')
df_work = df_work.drop(columns=['gear_fault_desc'])

In [25]:
# The KPSS test showed the series is not stationary, so the split is made on time:
# rows with time_normalized < 4 are used for training, rows with time_normalized >= 4 for testing
in_train_period = df_work['time_normalized'] < 4
in_test_period = df_work['time_normalized'] >= 4

# Rolling mean, min, max, median and standard deviation of sensor1 and sensor2
# over 25, 50, 100, 250, 500 and 1000 rows, computed separately for every
# combination of state, load_value and speedSet.
# NaN values in the resulting features are filled with 0.
window_sizes = [25, 50, 100, 250, 500, 1000]
X_train = add_rolling_features(
    df_work[in_train_period].copy(), 'state', 'load_value', 'speedSet', ['sensor1', 'sensor2'], window_sizes
).fillna(0)
X_test = add_rolling_features(
    df_work[in_test_period].copy(), 'state', 'load_value', 'speedSet', ['sensor1', 'sensor2'], window_sizes
).fillna(0)

# Targets are the encoded fault states
Y_train, Y_test = X_train['state'], X_test['state']

# Drop the target and time_normalized (not a practical feature in this case),
# then standardize: the scaler is fitted on the training rows only and reused for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.drop(columns=['state', 'time_normalized']))
X_test = scaler.transform(X_test.drop(columns=['state', 'time_normalized']))

In [None]:
# Define the hyperparameter search space
search_spaces = {
    'learning_rate': Real(0.01, 0.5, 'log-uniform'),
    'n_estimators': Integer(50, 1000),
    'num_leaves': Integer(20, 40),
    'max_depth': Integer(3, 10),
    'min_child_weight': Real(0.001, 10, 'log-uniform'),
    'colsample_bytree': Real(0.1, 1.0, 'uniform'),
    'subsample': Real(0.5, 1.0, 'uniform')
}

# Search settings
cv=4
n_iter=32
scoring='accuracy'
n_jobs=-1
random_state=0

# Define model
# subsample_freq=1 is required for the tuned `subsample` to have any effect:
# LightGBM only performs bagging when the bagging frequency is non-zero (the default is 0).
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    subsample_freq=1,
    random_state=random_state,
)

# Train and optimize model
result = train_and_optimize_model(X_train, Y_train, model, search_spaces, cv, n_iter, scoring, n_jobs, random_state)
# The training can easily be applied to other models: just change or add the model and the search_spaces

In [None]:
# Best estimator found by the search; the result dictionary stores it under 'best_model'
print(result['best_model'])

In [None]:
# Best score reported by the hyperparameter search
best_cv_score = result['best_score']
print(best_cv_score)

In [None]:
# Hyperparameter values of the best configuration found by the search
best_hyperparameters = result['best_params']
print(best_hyperparameters)

In [None]:
# Show the search history as a table (best mean test score first) instead of
# dumping the raw dictionary of arrays
pd.DataFrame(result['cv_results']).sort_values('mean_test_score', ascending=False).head(10)