# MAFAULDA - Machinery Fault Database

Dataset source: http://www02.smt.ufrj.br/~offshore/mfs/page_01.html

In [15]:
from glob import glob
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
from sktime.classification.kernel_based import RocketClassifier
from sktime.transformations.panel.rocket import MiniRocket, MiniRocketMultivariate
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.classification.interval_based import SupervisedTimeSeriesForest
from sktime.datasets import load_from_arff_to_dataframe
from sktime.classification.interval_based import (
    CanonicalIntervalForest,
    DrCIF,
    RandomIntervalSpectralEnsemble,
    SupervisedTimeSeriesForest,
    TimeSeriesForestClassifier,
)
from sktime.datatypes._panel._convert import (
    from_3d_numpy_to_nested,
    from_3d_numpy_to_multi_index,
    from_multi_index_to_nested,
    from_multi_index_to_3d_numpy
)
from sktime.classification.kernel_based import Arsenal
from sktime.datasets import load_from_tsfile
from tsfresh.utilities.dataframe_functions import roll_time_series
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.datasets import make_multilabel_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import f1_score
from mcfly.find_architecture import find_best_architecture
import matplotlib.pyplot as plt
from collections import Counter
from timeit import default_timer as timer
from datetime import timedelta
import seaborn as sns
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries — consider narrowing the filter while developing.
warnings.simplefilter('ignore')

# Seed passed as `random_state` to the MiniROCKET transformer further down.
SEED=42

%matplotlib inline

In [10]:
# Scratch lookup of one configured data folder.
# NOTE(review): `data_paths` is only assigned in the data-loading cell that
# follows, so this cell raises NameError on a fresh kernel (Restart & Run All).
# Move it below the loading cell or remove it.
data_paths["overhang-cage-fault-20g"]

'/mnt/f/Downloads/MAFAULDA//overhang/cage-fault/20g/'

In [18]:
%%time 

# Root folder of the extracted MAFAULDA archive; each class has its own sub-folder.
data_path = "/mnt/f/Downloads/MAFAULDA/"
data_paths = {
    "normal": join(data_path, "normal/"),
    "overhang-cage-fault-20g": join(data_path, "overhang/cage-fault/20g/"),
    # "imbalance-20g": join(data_path, "imbalance/20g/"),
}

# Every recording is truncated to this many rows so all series share one length.
MAX_TIMEPOINTS = 130000

frames = []  # one long-format frame per recording, concatenated once at the end
labels = []  # class label per recording, in the same order as `frames`

for label, folder in data_paths.items():
    print("label:", label)

    # listdir() returns entries in arbitrary order; sorting keeps the instance
    # order (and therefore the later train/test split) reproducible.
    for file_name in sorted(listdir(folder)):
        if not file_name.endswith(".csv"):
            continue

        # nrows stops parsing early instead of reading the whole file and slicing.
        df_tmp = pd.read_csv(join(folder, file_name), header=None, nrows=MAX_TIMEPOINTS)
        df_tmp = df_tmp.astype("float32")

        # Instance id = file stem + class label, unique across class folders.
        df_tmp["instances"] = file_name.split(".csv")[0] + "_" + label
        df_tmp["timepoints"] = range(len(df_tmp))

        frames.append(df_tmp)
        labels.append(label)

if not frames:
    raise FileNotFoundError(f"No CSV recordings found under {data_path!r}")

# A single concat avoids re-copying the growing frame on every iteration.
df = pd.concat(frames)
del frames, df_tmp

print("Finished")

label: normal
label: overhang-cage-fault-20g
Finished
CPU times: user 26.7 s, sys: 10.6 s, total: 37.3 s
Wall time: 44.7 s


In [19]:
# The eight CSV columns are the sensor channels; the two trailing columns are
# the instance id and timepoint added while loading.
sensor_cols = [f"s{i}" for i in range(8)]
df.columns = sensor_cols + ['instances', 'timepoints']

# Index by (instance, timepoint) and keep the sensor channels as data columns.
df = df.set_index(['instances', 'timepoints'])[sensor_cols]
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,s0,s1,s2,s3,s4,s5,s6,s7
instances,timepoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12.288_normal,0,4.5595,0.1752,0.28721,-0.017751,-0.41565,0.032459,-0.11218,-0.12814
12.288_normal,1,4.6038,-0.051295,-0.19405,-0.060071,-0.41809,0.036547,-0.11043,0.11831
12.288_normal,2,4.5703,-0.96908,0.038033,-0.028329,-0.43081,0.041924,-0.14331,-0.071527
12.288_normal,3,4.587,0.89127,0.072973,0.007453,-0.40017,0.04109,-0.11984,0.043445
12.288_normal,4,4.5887,-1.716,-0.32929,-0.033063,-0.50281,0.040474,-0.2527,0.023901


In [20]:
# Check row count, column dtypes and memory footprint of the long-format frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 12740000 entries, ('12.288_normal', 0) to ('61.2352_overhang-cage-fault-20g', 129999)
Data columns (total 8 columns):
 #   Column  Dtype  
---  ------  -----  
 0   s0      float32
 1   s1      float32
 2   s2      float32
 3   s3      float32
 4   s4      float32
 5   s5      float32
 6   s6      float32
 7   s7      float32
dtypes: float32(8)
memory usage: 454.6+ MB


In [21]:
# Convert the (instances, timepoints) multi-index frame into sktime's nested
# layout: one row per instance, one column per sensor, each cell a full series.
# NOTE(review): `labels` is later paired with these rows by position — presumably
# rows follow the order in which instances were loaded; verify against sktime.
df_nested = from_multi_index_to_nested(multi_ind_dataframe=df, instance_index='instances')
df_nested.head()

Unnamed: 0,s0,s1,s2,s3,s4,s5,s6,s7
0,0 4.55950 1 4.60380 2 ...,0 0.175200 1 -0.051295 2 ...,0 0.287210 1 -0.194050 2 ...,0 -0.017751 1 -0.060071 2 ...,0 -0.415650 1 -0.418090 2 ...,0 0.032459 1 0.036547 2 ...,0 -0.11218 1 -0.11043 2 -...,0 -0.128140 1 0.118310 2 ...
1,0 -0.52828 1 -0.52552 2 -...,0 -1.68410 1 1.47330 2 -...,0 -0.029689 1 0.148640 2 ...,0 -0.001552 1 0.037974 2 ...,0 1.073900 1 1.126800 2 ...,0 -0.017822 1 -0.024118 2 ...,0 0.17262 1 0.21368 2 ...,0 -0.039353 1 -0.003820 2 ...
2,0 4.47690 1 4.46430 2 ...,0 -1.70450 1 1.72010 2 -...,0 -0.076488 1 0.273950 2 ...,0 -0.051497 1 0.021210 2 ...,0 -0.56721 1 -0.45291 2 -...,0 0.041456 1 0.039043 2 ...,0 -0.031897 1 0.069768 2 ...,0 0.040889 1 -0.033122 2 ...
3,0 4.55540 1 4.53610 2 ...,0 -2.18540 1 1.43250 2 -...,0 -0.23123 1 0.29763 2 -...,0 -0.088589 1 -0.009870 2 ...,0 1.31250 1 1.36200 2 ...,0 0.004337 1 0.002316 2 ...,0 0.300160 1 0.378970 2 ...,0 0.035949 1 -0.049696 2 ...
4,0 4.49670 1 4.62210 2 ...,0 0.972110 1 1.062900 2 ...,0 0.352170 1 0.117360 2 ...,0 0.071360 1 0.066884 2 ...,0 -0.32078 1 -0.31932 2 -...,0 0.004632 1 0.008688 2 ...,0 -0.124090 1 -0.169270 2 ...,0 -0.082553 1 -0.038890 2 ...


In [22]:
# Check the number of instances and that every sensor column is populated.
df_nested.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   s0      98 non-null     object
 1   s1      98 non-null     object
 2   s2      98 non-null     object
 3   s3      98 non-null     object
 4   s4      98 non-null     object
 5   s5      98 non-null     object
 6   s6      98 non-null     object
 7   s7      98 non-null     object
dtypes: object(8)
memory usage: 6.2+ KB


In [29]:
# Tuple concatenation: (instances, channels) + (series_length,) of the first cell.
df_nested.shape + df_nested.iloc[0,0].shape

(98, 8, 130000)

In [31]:
# Drop the long-format frame; only the nested copy is used from here on.
# NOTE(review): re-running earlier cells that read `df` now requires reloading.
del df

In [32]:
def transform_data(transformer, X_train, X_test):
    """Fit ``transformer`` on the training data and transform both splits.

    Prints the elapsed whole seconds of the fit and of each transform call.

    Parameters
    ----------
    transformer : object
        Object exposing ``fit(X)`` and ``transform(X)``.
    X_train : object
        Training data; used for fitting and then transformed.
    X_test : object
        Test data; only transformed.

    Returns
    -------
    tuple
        ``(X_train_transform, X_test_transform)``.
    """
    def _elapsed_seconds(start):
        # total_seconds() covers the whole duration; the `.seconds` attribute
        # is only the seconds component and wraps around after 24 hours.
        return int(timedelta(seconds=timer() - start).total_seconds())

    print("Starting data transformation ...")

    start = timer()
    transformer.fit(X_train)
    print(f"Elapsed time for {type(transformer).__name__} fitting:",
          _elapsed_seconds(start),
          "seconds")

    start = timer()
    X_train_transform = transformer.transform(X_train)
    print("Elapsed time for the transformation of X_train:", _elapsed_seconds(start), "seconds")

    start = timer()
    X_test_transform = transformer.transform(X_test)
    print("Elapsed time for the transformation of X_test:", _elapsed_seconds(start), "seconds \n")

    return X_train_transform, X_test_transform


def get_model_name(model):
    """Return the class name of ``model`` as a string."""
    model_cls = type(model)
    return model_cls.__name__


def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and report the elapsed time.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)``.
    X_train : object
        Training features passed to ``fit``.
    y_train : object
        Training targets passed to ``fit``.

    Returns
    -------
    object
        Whatever ``model.fit(X_train, y_train)`` returns.
    """
    model_name = get_model_name(model)

    print(f"Starting {model_name} model training ...")
    start = timer()
    model = model.fit(X_train, y_train)
    # total_seconds() covers the whole duration; the `.seconds` attribute is
    # only the seconds component and wraps around after 24 hours.
    elapsed = int(timedelta(seconds=timer() - start).total_seconds())
    print(f"Elapsed time for the {model_name} model training:",
          elapsed,
          "seconds \n")
    return model

def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and print a classification report plus F1 scores.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : object
        Features to predict on, in the representation the model was trained on.
    y_test : object
        True labels for ``X_test``.

    Returns
    -------
    None
        Results are printed only.
    """
    model_name = get_model_name(model)
    print(f"Starting {model_name} model evaluation ...")

    start = timer()
    # Predict on the argument, not on a notebook-level variable, so the
    # function evaluates whatever data the caller passes in.
    y_pred = model.predict(X_test)
    # total_seconds() covers the whole duration; the `.seconds` attribute is
    # only the seconds component and wraps around after 24 hours.
    elapsed = int(timedelta(seconds=timer() - start).total_seconds())
    print(f"Elapsed time for the {model_name} model prediction:",
          elapsed,
          "seconds \n")

    print(f"Classification Report for the {model_name} model \n")
    print(classification_report(y_test, y_pred, digits=4))

    print("")
    # print("F1 binary:\t", f1_score(y_test, y_pred))
    for average in ("micro", "macro", "weighted"):
        score = round(f1_score(y_test, y_pred, average=average), 4)
        print(f"F1 {average}:\t", score)
    print("")

In [33]:
labels = np.array(labels)

# Labels are paired with the nested rows by position, so the counts must agree.
assert len(labels) == len(df_nested), "labels and df_nested have different lengths"

# Stratified 80/20 split; the shared SEED keeps it reproducible and consistent
# with the other seeded steps of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df_nested,
    labels,
    test_size=0.2,
    random_state=SEED,
    stratify=labels,
    shuffle=True,
)

print("Data shape: (examples, channels, series_length)\n")
print("X_train shape:\t", X_train.shape + X_train.iloc[0,0].shape)
print("y_train shape:\t", y_train.shape)

print("X_test shape:\t", X_test.shape + X_test.iloc[0,0].shape)
print("y_test shape:\t", y_test.shape)

Data shape: (examples, channels, series_length)

X_train shape:	 (78, 8, 130000)
y_train shape:	 (78,)
X_test shape:	 (20, 8, 130000)
y_test shape:	 (20,)


## Data transformation using MiniROCKET

In [None]:
%%time 

# Seeded multivariate MiniROCKET transformer running on four workers.
min_rocket = MiniRocketMultivariate(random_state=SEED, n_jobs=4)

# Fit on the training split only, then transform both splits.
X_train_transform, X_test_transform = transform_data(
    min_rocket,
    X_train,
    X_test,
)

## MiniROCKET features + RidgeClassifierCV

In [None]:
%%time 

# Ridge classifier with the regularisation strength chosen by cross-validation
# over ten log-spaced alphas between 1e-3 and 1e3.
# NOTE(review): the `normalize` argument is deprecated/removed in newer
# scikit-learn releases — confirm it is accepted by the installed version.
ridge_alphas = np.logspace(-3, 3, 10)
model = train_model(
    RidgeClassifierCV(alphas=ridge_alphas, normalize=True),
    X_train_transform,
    y_train,
)
evaluate_model(model, X_test_transform, y_test)