# PHM North America challenge '23

# Data processing

In [None]:
%load_ext autoreload
%autoreload 2

from conscious_engie_icare import distance_metrics
from conscious_engie_icare.normalization import normalize_1
from conscious_engie_icare.nmf_profiling import derive_df_orders, derive_df_vib, extract_nmf_per_number_of_component
from conscious_engie_icare.util import calc_tpr_at_fpr_threshold, calc_fpr_at_tpr_threshold, calculate_roc_characteristics
from conscious_engie_icare.viz.viz import illustrate_nmf_components_for_paper
from conscious_engie_icare.viz.spectrogram import plot_stft, plot_periodogram, plot_welch
from conscious_engie_icare.data.phm_data_handler import BASE_PATH_HEALTHY, FILE_NAMES_HEALTHY, fetch_and_unzip_data, load_data

import os
import pandas as pd
from tqdm import tqdm
from sklearn.decomposition import PCA
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import numpy as np
import glob
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
import string
import pickle
from scipy.cluster.hierarchy import linkage, fcluster
from matplotlib.colors import LogNorm
from umap import UMAP
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): judging by its name, this helper downloads and unpacks the challenge data set;
# the actual behaviour is defined in conscious_engie_icare.data.phm_data_handler — confirm there.
fetch_and_unzip_data()

# Building a decomposition matrix 

1. [Load healthy data & extract FFT](#Load-all-healthy-data-and-convert-to-frequency-domain.)
2. [Convert to orders](#Order-transformation-and-binning)

> **Difference to industrial use case**: There is only one location --> matrix is 3 dimensional instead of 6 dimensional

## Load all healthy data and convert to frequency domain.

As the data is given in the time domain, we transform it to the frequency domain.
Each individual measurement is transformed to a frequency spectrum with a short-time Fourier transform (STFT).

**Short-time Fourier transform (STFT)**: Let $x(t)$ represent the original vibration signal in the time domain with the time index $t$.
The **STFT** is applied to $x(t)$ to obtain a representation $X(f,\tau)$ in the time-frequency domain, where $f$ is the frequency index and $\tau$ is the time window index.


**Welch's method**: Welch's method (also called the periodogram method) for estimating power spectra is carried out by dividing the time signal into successive blocks, forming the periodogram for each block, and averaging [source](https://ccrma.stanford.edu/~jos/sasp/Welch_s_Method.html).

In [None]:
# Spectral-estimation settings; all four are forwarded unchanged to load_data.
fs = 20480               # sampling rate of the recordings (presumably in Hz — TODO confirm)
nperseg = 10240          # number of samples per segment
noverlap = nperseg // 2  # neighbouring segments share half of their samples
nfft = None              # NOTE(review): presumably "use the default FFT length" — confirm in load_data

# data_healthy holds the healthy measurements; f is the second return value and is
# handed to derive_df_vib / derive_df_orders further down (presumably the frequency axis).
data_healthy, f = load_data(
    FILE_NAMES_HEALTHY,
    nperseg=nperseg,
    noverlap=noverlap,
    nfft=nfft,
    fs=fs,
)

We use most of the healthy data for training. 25% are held out for validation.

In [None]:
# How the healthy recordings are divided into train/test folds:
#   'V1' - one seeded shuffle followed by a single train/test cut
#   'V2' - 4 folds; each one uses a different contiguous quarter as its test part
#   'V3' - N seeded shuffles, each followed by a single train/test cut
RANDOM_SPLIT = 'V3'
# Whether the validation and test data of the challenge are added as well (meant for 'V1').
# This flag is only defined here; it is not read in this cell.
ADD_VALIDATION_AND_TEST = True
SPLIT = 0.75    # share of the healthy data that forms the training part of a fold ('V1', 'V3')
N = 100         # number of folds for 'V3'; reset to 1 for 'V1' and to 4 for 'V2'
# Exactly one of the two caching switches has to be active (checked below).
CACHE_RESULTS = False
LOAD_CACHED_RESULTS = True
CACHING_FOLDER_NAME = os.path.join('..', 'data', 'CACHED_RESULTS_300124')
assert RANDOM_SPLIT in ('V1', 'V2', 'V3')
assert CACHE_RESULTS != LOAD_CACHED_RESULTS

data_healthy_train_folds, data_healthy_test_folds = [], []
if RANDOM_SPLIT == 'V1':
    N = 1
    split_id = int(len(data_healthy) * SPLIT)
    # NOTE: shuffle() reorders data_healthy itself, not a copy.
    random.Random(42).shuffle(data_healthy)
    data_healthy_train, data_healthy_test = data_healthy[:split_id], data_healthy[split_id:]
    data_healthy_train_folds.append(data_healthy_train)
    data_healthy_test_folds.append(data_healthy_test)
elif RANDOM_SPLIT == 'V2':
    N = 4
    n_total = len(data_healthy)
    for i in range(4):
        # the i-th quarter (in the current order) is held out, the rest is training data
        split_id_start = (n_total * i) // 4
        split_id_stop = (n_total * (i + 1)) // 4
        data_healthy_test_ = data_healthy[split_id_start:split_id_stop]
        data_healthy_train_ = data_healthy[:split_id_start] + data_healthy[split_id_stop:]
        data_healthy_train_folds.append(data_healthy_train_)
        data_healthy_test_folds.append(data_healthy_test_)
elif RANDOM_SPLIT == 'V3':
    # the cut position is the same for every repetition
    split_id = int(len(data_healthy) * SPLIT)
    for i in range(N):
        # Seeded, in-place shuffle: repetition i starts from the order left behind by
        # repetition i-1, so the folds depend on all earlier seeds, and re-running this
        # cell without reloading data_healthy produces different folds.
        # NOTE(review): assumes data_healthy is a list, so that the slices below are
        # independent copies which later shuffles do not touch — confirm.
        random.Random(i).shuffle(data_healthy)
        data_healthy_train_, data_healthy_test_ = data_healthy[:split_id], data_healthy[split_id:]
        data_healthy_train_folds.append(data_healthy_train_)
        data_healthy_test_folds.append(data_healthy_test_)

# size of the training part of the first fold (shown as cell output)
len(data_healthy_train_folds[0])

## Order transformation and binning

In the order-transformed domain, each frequency component is expressed as a multiple (an *order*) of the rotational speed, which is given in rotations per minute (RPM).

In [None]:
# Band layout handed to derive_df_orders (the meaning of each key is defined there).
setup = {'start': 0.5, 'stop': 100.5, 'n_windows': 50, 'window_steps': 2, 'window_size': 2}

if not LOAD_CACHED_RESULTS:
    # Recompute everything: vibration table -> order table, one training fold at a time.
    df_vib_train_folds, df_orders_train_folds, meta_data_train_folds = [], [], []
    fold_iter_ = tqdm(data_healthy_train_folds, desc='Deriving orders on training set per fold')
    for fold, data_healthy_train_ in enumerate(fold_iter_):
        df_vib_train_ = derive_df_vib(data_healthy_train_, f)
        df_vib_train_folds.append(df_vib_train_)
        df_orders_train_, meta_data_train_ = derive_df_orders(df_vib_train_, setup, f, verbose=False)
        # attach the metadata columns to the order table of this fold
        df_orders_train_[meta_data_train_.columns] = meta_data_train_
        df_orders_train_folds.append(df_orders_train_)
        meta_data_train_folds.append(meta_data_train_)
        # inert string: disabled per-fold dump, kept for reference
        """
        fpath = os.path.join('df_nmf_models_folds_241023', f'df_orders_train_folds_{fold}.pkl')
        with open(fpath, 'wb') as file:
            pickle.dump(df_orders_train_, file)
        fpath = os.path.join('df_nmf_models_folds_241023', f'meta_data_train_folds_{fold}.pkl')
        with open(fpath, 'wb') as file:
            pickle.dump(meta_data_train_, file)
        """
    if CACHE_RESULTS:
        # NOTE(review): fpath_df_orders_train_folds and fpath_meta_data_train_folds are not
        # assigned in the visible cells (CACHING_FOLDER_NAME looks like the intended
        # location) — confirm they exist before switching CACHE_RESULTS on.
        # order tables of all training folds
        with open(fpath_df_orders_train_folds, 'wb') as file:
            pickle.dump(df_orders_train_folds, file)
        # metadata of all training folds
        with open(fpath_meta_data_train_folds, 'wb') as file:
            pickle.dump(meta_data_train_folds, file)
else:
    # Reuse previously stored order tables and metadata.
    # NOTE(review): load_cached_data is neither imported nor defined in the visible
    # cells — confirm it is available before running with LOAD_CACHED_RESULTS = True.
    df_orders_train_folds, meta_data_train_folds = load_cached_data()

# Plot the order bands of the last fold: one line per sample, one facet row per direction.
last_fold_orders_ = df_orders_train_folds[-1]
cols = last_fold_orders_.columns
BAND_COLS = cols[cols.str.contains('band')].tolist()
idx_cols = ['index', 'rotational speed [RPM]', 'torque [Nm]', 'direction',
            'unique_sample_id', 'sample_id']
cols = BAND_COLS + idx_cols
df_ = last_fold_orders_.reset_index()[cols]
# wide -> long: one row per (sample, frequency band)
df_ = df_.melt(id_vars=idx_cols, var_name='frequency band', value_name='frequency band value')
fig = px.line(
    df_,
    x='frequency band',
    y='frequency band value',
    facet_row='direction',
    color='unique_sample_id',
    hover_data=['rotational speed [RPM]', 'torque [Nm]'],
    title='Frequency bands for healthy samples, before normalisation',
    markers=True,
    width=1200,
    height=600,
)
# Optional: draw dashed vertical lines at the bands around 40 and 80 orders
# for x in [39, 79]:
#    fig.add_shape(type='line', x0=x, y0=0, x1=x, y1=2, line=dict(color='black', width=1, dash='dash'))
fig

We observe **major peaks at 40 and 80 orders**.
40 orders corresponds to the number of teeth of the driving gear (= **gear mesh frequency**), and 80 orders corresponds to a **harmonic frequency** of it.
The driven gear has 72 teeth, but no corresponding peak is visible in the order spectrum.

## Frequency-band normalization

> **Observation**, ***if there are no other sensors (y, z) present***: without normalisation, the explained variance is much higher!

In [None]:
# Apply normalize_1 to the band columns of every training fold.
df_V_train_normalized_folds = [
    normalize_1(fold_orders_, BAND_COLS) for fold_orders_ in df_orders_train_folds
]

# Plot the last fold again, this time after normalisation.
idx_vars = ['rotational speed [RPM]', 'torque [Nm]', 'direction', 'unique_sample_id', 'sample_id']
df_ = df_V_train_normalized_folds[-1].reset_index()
# Take the metadata columns from the non-normalised order table of the same fold.
# NOTE(review): this assignment is aligned on the index, and df_ has just been re-indexed
# by reset_index(); the rows only line up if the order table is indexed 0..n-1 — confirm.
df_[idx_vars] = df_orders_train_folds[-1][idx_vars]
# wide -> long: one row per (sample, frequency band)
df_ = df_.melt(
    id_vars=['index'] + idx_vars,
    var_name='frequency band',
    value_name='frequency band value',
)
fig = px.line(
    df_,
    x='frequency band',
    y='frequency band value',
    facet_row='direction',
    color='unique_sample_id',
    # hover_data=['rotational speed [RPM]', 'torque [Nm]'],
    title='Frequency bands for healthy samples, after normalisation',
    markers=True,
    width=1200,
    height=600,
)
fig.show()

In [None]:
# Continue with the normalised folds under a shorter name (same list object, no copy).
df_V_train_folds = df_V_train_normalized_folds  # alternative: df_V_train_not_normalized (not defined in the visible cells)

How large are the folds?

In [None]:
# Number of rows in each training fold, shown as a histogram.
len_V = list(map(len, df_V_train_folds))
pd.Series(len_V).plot.hist()

©, 2023, Sirris