# SKAB v0.9

* **datetime**: Represents dates and times of the moment when the value is written to the database (YYYY-MM-DD hh:mm:ss)
* **Accelerometer1RMS**: Shows a vibration acceleration (Amount of g units);
* **Accelerometer2RMS**: Shows a vibration acceleration (Amount of g units);
* **Current**: Shows the amperage on the electric motor (Ampere);
* **Pressure**: Represents the pressure in the loop after the water pump (Bar);
* **Temperature**: Shows the temperature of the engine body (The degree Celsius);
* **Thermocouple**: Represents the temperature of the fluid in the circulation loop (The degree Celsius);
* **Voltage**: Shows the voltage on the electric motor (Volt);
* **RateRMS**: Represents the circulation flow rate of the fluid inside the loop (Liter per minute);
* **anomaly**: Shows if the point is anomalous (0 or 1).
* **changepoint**: Shows if the point is a changepoint for collective anomalies (0 or 1)

In [4]:
import sys
import os
import pandas as pd
import numpy as np

import time
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GroupKFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics  import average_precision_score, make_scorer, roc_curve,f1_score, precision_score, recall_score, fbeta_score, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report,precision_recall_curve
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit

## Read data

In [5]:
# Caminho da pasta com os arquivos CSV
folder_path = "./valve1"

# Lista todos os arquivos CSV na pasta
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Lista para armazenar os DataFrames
dfs = []

# Loop para ler cada CSV
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, sep=';')
    dfs.append(df)

# Concatena todos os DataFrames
df_value1 = pd.concat(dfs, ignore_index=True)

print(f"Total de linhas concatenadas: {len(df_value1)}")


Total de linhas concatenadas: 18160


In [6]:
# Caminho da pasta com os arquivos CSV
folder_path = "./valve2"

# Lista todos os arquivos CSV na pasta
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Lista para armazenar os DataFrames
dfs = []

# Loop para ler cada CSV
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, sep=';')
    dfs.append(df)

# Concatena todos os DataFrames
df_value2 = pd.concat(dfs, ignore_index=True)

print(f"Total de linhas concatenadas: {len(df_value2)}")

Total de linhas concatenadas: 4312


In [7]:
df_value1.head()

Unnamed: 0,datetime,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS,anomaly,changepoint
0,2020-03-09 10:14:33,0.026588,0.040111,1.3302,0.054711,79.3366,26.0199,233.062,32.0,0.0,0.0
1,2020-03-09 10:14:34,0.02617,0.040453,1.35399,0.382638,79.5158,26.0258,236.04,32.0,0.0,0.0
2,2020-03-09 10:14:35,0.026199,0.039419,1.54006,0.710565,79.3756,26.0265,251.38,32.0,0.0,0.0
3,2020-03-09 10:14:36,0.026027,0.039641,1.33458,0.382638,79.6097,26.0393,234.392,32.0,0.0,0.0
4,2020-03-09 10:14:37,0.02629,0.040273,1.07851,-0.273216,79.6109,26.042,225.342,32.0,0.0,0.0


In [8]:
df_value2.head()

Unnamed: 0,datetime,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS,anomaly,changepoint
0,2020-03-09 15:56:30,0.027608,0.039203,1.29048,0.054711,68.6194,24.367,241.062,32.0362,0.0,0.0
1,2020-03-09 15:56:31,0.027166,0.03994,1.28565,0.382638,68.5923,24.366,238.709,32.9649,0.0,0.0
2,2020-03-09 15:56:32,0.027718,0.040167,1.15588,0.054711,68.5207,24.3666,226.485,32.0362,0.0,0.0
3,2020-03-09 15:56:33,0.028045,0.038026,0.971268,0.382638,68.5425,24.3634,220.378,32.9649,0.0,0.0
4,2020-03-09 15:56:34,0.027644,0.03858,1.07246,-0.273216,68.6569,24.3639,233.922,32.0,0.0,0.0


In [9]:
df_value2['valve'] = 'value2'
df_value1['valve'] = 'value1'
df = pd.concat([df_value1, df_value2], ignore_index=True)
df.shape

(22472, 12)

In [10]:
pd.crosstab(df['valve'], df['anomaly'])

anomaly,0.0,1.0
valve,Unnamed: 1_level_1,Unnamed: 2_level_1
value1,11851,6309
value2,2795,1517


In [11]:
df['datetime'] = pd.to_datetime(df['datetime'])

In [12]:
df.anomaly.value_counts()

anomaly
0.0    14646
1.0     7826
Name: count, dtype: int64

In [13]:
df.anomaly.value_counts(normalize=True)

anomaly
0.0    0.651744
1.0    0.348256
Name: proportion, dtype: float64

In [14]:
df.isnull().sum()/len(df)

datetime               0.0
Accelerometer1RMS      0.0
Accelerometer2RMS      0.0
Current                0.0
Pressure               0.0
Temperature            0.0
Thermocouple           0.0
Voltage                0.0
Volume Flow RateRMS    0.0
anomaly                0.0
changepoint            0.0
valve                  0.0
dtype: float64

In [15]:
df.columns

Index(['datetime', 'Accelerometer1RMS', 'Accelerometer2RMS', 'Current',
       'Pressure', 'Temperature', 'Thermocouple', 'Voltage',
       'Volume Flow RateRMS', 'anomaly', 'changepoint', 'valve'],
      dtype='object')

In [16]:
window = '1T'  # Ajuste para '5T', '10T', etc.

def aggregate_by_valve(data, window):
    return data.resample(window).agg({
        'Accelerometer1RMS': ['mean', 'std', 'min', 'max'],
        'Accelerometer2RMS': ['mean', 'std', 'min', 'max'],
        'Current': ['mean', 'std', 'min', 'max'],
        'Pressure': ['mean', 'std', 'min', 'max'],
        'Temperature': ['mean', 'std', 'min', 'max'],
        'Thermocouple': ['mean', 'std', 'min', 'max'],
        'Voltage': ['mean', 'std', 'min', 'max'],
        'Volume Flow RateRMS': ['mean', 'std', 'min', 'max'],
        'anomaly': 'max'  # Se houver pelo menos um evento anômalo na janela
    })

df_agg = df.groupby('valve').apply(lambda x: x.set_index('datetime').pipe(aggregate_by_valve, window)).reset_index()

# Renomeando colunas
df_agg.columns = ['_'.join(col).strip('_') for col in df_agg.columns]
df_agg.dropna(inplace=True)
df_agg = df_agg.rename(columns={"anomaly_max": "anomaly"})

  return data.resample(window).agg({
  return data.resample(window).agg({
  df_agg = df.groupby('valve').apply(lambda x: x.set_index('datetime').pipe(aggregate_by_valve, window)).reset_index()


In [18]:
df_agg.shape

(399, 35)

In [19]:
# Lista para armazenar os dados de treinamento e teste
train = list()
test = list()

data = df_agg.sort_values(by=['valve', 'datetime'])

# Iterar sobre cada poco
for poco_id, poco_data in data.groupby('valve'):
    # Calcular o índice para dividir os dados em treinamento e teste
    split_index = int(0.8 * len(poco_data))
    
    # Dividir os dados do poco em treinamento e teste
    poco_train = poco_data.iloc[:split_index]
    poco_test = poco_data.iloc[split_index:]
    
    # Adicionar os dados de treinamento e teste à lista
    train.append(poco_train)
    test.append(poco_test)

# Concatenar os dados de treinamento e teste
train = pd.concat(train)
test = pd.concat(test)

# Verificar o tamanho dos conjuntos de treinamento e teste
print("Tamanho do conjunto de treinamento:", len(train))
print("Tamanho do conjunto de teste:", len(test))

Tamanho do conjunto de treinamento: 319
Tamanho do conjunto de teste: 80


In [20]:
print(f'Treino valve: {train.valve.unique()}')
print(f'Teste valve: {test.valve.unique()}')
print(f'Tamanho do treino: {len(train)}')
print(f'Tamanho do teste: {len(test)}')

# Verificar a proporção de anomalias
print(f"Proporção de anomalias no conjunto de treino: {train.anomaly.mean():.2%}")
print(f"Proporção de anomalias no conjunto de teste: {test.anomaly.mean():.2%}")


Treino valve: ['value1' 'value2']
Teste valve: ['value1' 'value2']
Tamanho do treino: 319
Tamanho do teste: 80
Proporção de anomalias no conjunto de treino: 39.50%
Proporção de anomalias no conjunto de teste: 41.25%


In [21]:
train.to_parquet('./data/skab_train_agg.parquet.gzip',
              compression='gzip')

test.to_parquet('./data/skab_test_agg.parquet.gzip',
              compression='gzip')

## Processamento dos dados

In [None]:
import numpy as np
import pandas as pd
import pywt
from scipy.stats import skew, kurtosis


def wavelet_features_window(series, wavelet='db4', level=1):
    coeffs = pywt.wavedec(series, wavelet=wavelet, level=level)
    total_energy = sum(np.sum(c**2) for c in coeffs)
    total_energy = total_energy if total_energy > 1e-12 else 1e-12  # evita divisão por zero
    features = []
    
    for c in coeffs:
        c = np.array(c, dtype=np.float32)
        mean = np.mean(c)
        std = np.std(c)

        # Energia normalizada
        energy = np.sum(c**2) / total_energy

        # Entropia de Shannon
        abs_sum = np.sum(np.abs(c))
        if abs_sum > 1e-12:
            probs = np.abs(c) / abs_sum
            ent = -np.sum(probs * np.log(probs + 1e-12))
        else:
            ent = 0.0
        
        # Skew e kurtosis
        sk = skew(c, nan_policy='omit')
        if np.isnan(sk): sk = 0.0
        
        kurt = kurtosis(c, nan_policy='omit')
        if np.isnan(kurt): kurt = 0.0
        
        for val in [mean, std, energy, ent, sk, kurt]:
            if not np.isfinite(val):
                val = 0.0
            features.append(val)
    
    return features


In [24]:
def aggregate_by_valve(data, window='5T', columns=None, wavelet='db4', level=1):
    if columns is None:
        columns = ['Accelerometer1RMS', 'Accelerometer2RMS', 'Current',
       'Pressure', 'Temperature', 'Thermocouple', 'Voltage',
       'Volume Flow RateRMS']
    
    num_metrics_per_level = 6  # mean, std, energy, entropy, skew, kurtosis
    num_features = num_metrics_per_level * (level + 1)
    results = []

    for col in columns:
        resampled = data[col].resample(window)

        # --- Métricas padrão ---
        agg_df = resampled.agg(['mean','std','min','max'])

        # --- Métricas via Wavelet ---
        metrics_list = []
        valid_index = []
        for t, x in resampled:
            if len(x) > 0:  # <-- agora ignora janelas vazias
                metrics_list.append(
                    wavelet_features_window(x.values, wavelet=wavelet, level=level)
                )
                valid_index.append(t)  # guarda apenas timestamps válidos

        # cria DataFrame só com índices válidos
        metrics_wavelet = pd.DataFrame(metrics_list, index=valid_index)

        # Nomes das colunas para wavelet
        col_names = []
        for lvl in range(level+1):
            for m in ['mean','std','energy','entropy','skew','kurtosis']:
                col_names.append(f'{col}_{m}_L{lvl}')
        metrics_wavelet.columns = col_names

        # Concatena métricas padrão (filtradas) + wavelet
        combined = pd.concat([agg_df.loc[valid_index].add_prefix(f'{col}_'), 
                              metrics_wavelet], axis=1)
        results.append(combined)

    df_features = pd.concat(results, axis=1)

    # target só para janelas válidas
    df_features['anomaly'] = data['anomaly'].resample(window).max().loc[df_features.index]

    return df_features


In [30]:
dfs = []
for valve, group in df.groupby('valve'):
    group = group.set_index('datetime')  # garante que estamos usando datetime como índice
    df_valve = aggregate_by_valve(group, window='1T', level=1)

    df_valve = df_valve.reset_index()  # 'index' vira a coluna datetime
    df_valve.rename(columns={'index': 'datetime'}, inplace=True)

    df_valve['valve'] = valve  # mantém a identificação do poço
    dfs.append(df_valve)

  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  df_features['anomaly'] = data['anomaly'].resample(window).max().loc[df_features.index]
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  resampled = data[col].resample(window)
  df_features['anomaly'] = data['anomaly'].resample(window).max().loc[df_features.index]


In [31]:
# Concatena todos os poços
df_agg = pd.concat(dfs, ignore_index=True)

# Garante que datetime está ordenado e em datetime
df_agg['datetime'] = pd.to_datetime(df_agg['datetime'])
df_agg = df_agg.sort_values(['valve', 'datetime']).reset_index(drop=True)

In [32]:
df_agg.shape

(399, 131)

In [33]:
df_agg.head()

Unnamed: 0,datetime,Accelerometer1RMS_mean,Accelerometer1RMS_std,Accelerometer1RMS_min,Accelerometer1RMS_max,Accelerometer1RMS_mean_L0,Accelerometer1RMS_std_L0,Accelerometer1RMS_energy_L0,Accelerometer1RMS_entropy_L0,Accelerometer1RMS_skew_L0,...,Volume Flow RateRMS_skew_L0,Volume Flow RateRMS_kurtosis_L0,Volume Flow RateRMS_mean_L1,Volume Flow RateRMS_std_L1,Volume Flow RateRMS_energy_L1,Volume Flow RateRMS_entropy_L1,Volume Flow RateRMS_skew_L1,Volume Flow RateRMS_kurtosis_L1,anomaly,valve
0,2020-03-09 10:14:00,0.026203,0.000255,0.025694,0.026652,0.037062,0.00029,0.999973,2.772558,-0.188577,...,1.368828,0.287717,-0.097302,0.329389,5.7e-05,2.123774,0.114037,0.254641,0.0,value1
1,2020-03-09 10:15:00,0.026171,0.000263,0.025604,0.026711,0.037001,0.000303,0.999971,3.465702,-0.069063,...,-0.309117,1.765233,0.162456,0.524465,0.000144,3.170814,-0.041791,-1.357751,0.0,value1
2,2020-03-09 10:16:00,0.026345,0.000325,0.025715,0.027165,0.037255,0.000377,0.999957,3.465685,0.037739,...,1.348212,0.270238,-0.027028,0.34056,5.7e-05,2.313733,0.415614,2.04343,0.0,value1
3,2020-03-09 10:17:00,0.026352,0.000255,0.025888,0.026831,0.037277,0.00026,0.999959,3.465712,-0.334277,...,1.82933,2.09303,-0.020911,0.276333,3.7e-05,2.373578,0.161942,2.358521,0.0,value1
4,2020-03-09 10:18:00,0.026358,0.000275,0.025553,0.026919,0.037286,0.000305,0.999963,3.465702,-0.639194,...,0.812644,-0.901945,0.043931,0.379742,7.1e-05,2.726422,0.171413,-0.091527,0.0,value1


In [34]:
df_agg.anomaly.value_counts().sum()

399

In [36]:
df_agg[~df_agg.anomaly.isnull()].shape

(399, 131)

In [37]:
def diagnostico_colunas_zeradas(df, valve_col='valve'):
    """
    Para cada valve, mostra quais colunas estão totalmente zeradas,
    quantas estão zeradas parcialmente e o percentual de zeros.
    """
    resultados = {}

    for valve, grupo in df.groupby(valve_col):
        # Verifica colunas totalmente zeradas
        colunas_zeradas = grupo.columns[(grupo == 0).all()]
        
        # Conta zeros por coluna e calcula percentual
        contagem_zeros = (grupo == 0).sum()
        percentual_zeros = (grupo == 0).mean() * 100  # em %

        resultados[valve] = {
            "colunas_totalmente_zeradas": list(colunas_zeradas),
            "contagem_zeros": contagem_zeros.to_dict(),
            "percentual_zeros": percentual_zeros.to_dict()
        }

        print(f"\n=== valve: {valve} ===")
        print(f"Colunas totalmente zeradas: {list(colunas_zeradas)}")
        print("Top 5 colunas com mais zeros:")
        print(percentual_zeros.sort_values(ascending=False).head())

    return resultados

In [39]:
# Exemplo de uso:
resultados = diagnostico_colunas_zeradas(df_agg, valve_col='valve')



=== valve: value1 ===
Colunas totalmente zeradas: []
Top 5 colunas com mais zeros:
anomaly                     60.3125
datetime                     0.0000
Thermocouple_kurtosis_L0     0.0000
Thermocouple_max             0.0000
Thermocouple_mean_L0         0.0000
dtype: float64

=== valve: value2 ===
Colunas totalmente zeradas: []
Top 5 colunas com mais zeros:
anomaly                     59.493671
datetime                     0.000000
Thermocouple_kurtosis_L0     0.000000
Thermocouple_max             0.000000
Thermocouple_mean_L0         0.000000
dtype: float64


In [40]:
df_agg['valve'].value_counts()

valve
value1    320
value2     79
Name: count, dtype: int64

In [41]:
df_agg.anomaly.value_counts()

anomaly
0.0    240
1.0    159
Name: count, dtype: int64

In [42]:
df_agg.valve.value_counts(normalize=True)

valve
value1    0.802005
value2    0.197995
Name: proportion, dtype: float64

In [43]:
pd.crosstab(df_agg['valve'], df_agg['anomaly'])

anomaly,0.0,1.0
valve,Unnamed: 1_level_1,Unnamed: 2_level_1
value1,193,127
value2,47,32


### Feature selection

In [44]:
def select_low_correlation_columns(df, threshold=0.99):
    """
    Retorna uma lista de colunas com correlação de Pearson
    menor que o threshold entre si.

    Parâmetros:
    -----------
    df : pd.DataFrame
        DataFrame com features numéricas.
    threshold : float
        Limite máximo de correlação permitido entre duas colunas.

    Retorna:
    --------
    selected_columns : list
        Lista de colunas selecionadas (menos correlacionadas).
    """
    corr_matrix = df.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Lista de colunas a remover
    removed_columns = [col for col in upper_tri.columns if any(upper_tri[col] >= threshold)]

    # Colunas selecionadas
    selected_columns = [col for col in df.columns if col not in removed_columns]
    
    return selected_columns

In [51]:
df_agg.shape

(399, 131)

In [46]:
feature_modeling = [col for col in df_agg.columns if col not in ['datetime', 'valve', 'anomaly']]
feature_modeling

['Accelerometer1RMS_mean',
 'Accelerometer1RMS_std',
 'Accelerometer1RMS_min',
 'Accelerometer1RMS_max',
 'Accelerometer1RMS_mean_L0',
 'Accelerometer1RMS_std_L0',
 'Accelerometer1RMS_energy_L0',
 'Accelerometer1RMS_entropy_L0',
 'Accelerometer1RMS_skew_L0',
 'Accelerometer1RMS_kurtosis_L0',
 'Accelerometer1RMS_mean_L1',
 'Accelerometer1RMS_std_L1',
 'Accelerometer1RMS_energy_L1',
 'Accelerometer1RMS_entropy_L1',
 'Accelerometer1RMS_skew_L1',
 'Accelerometer1RMS_kurtosis_L1',
 'Accelerometer2RMS_mean',
 'Accelerometer2RMS_std',
 'Accelerometer2RMS_min',
 'Accelerometer2RMS_max',
 'Accelerometer2RMS_mean_L0',
 'Accelerometer2RMS_std_L0',
 'Accelerometer2RMS_energy_L0',
 'Accelerometer2RMS_entropy_L0',
 'Accelerometer2RMS_skew_L0',
 'Accelerometer2RMS_kurtosis_L0',
 'Accelerometer2RMS_mean_L1',
 'Accelerometer2RMS_std_L1',
 'Accelerometer2RMS_energy_L1',
 'Accelerometer2RMS_entropy_L1',
 'Accelerometer2RMS_skew_L1',
 'Accelerometer2RMS_kurtosis_L1',
 'Current_mean',
 'Current_std',
 'Cur

In [47]:
feature_model = select_low_correlation_columns(df_agg[feature_modeling], threshold=0.99)
print("Colunas selecionadas:", feature_model)

Colunas selecionadas: ['Accelerometer1RMS_mean', 'Accelerometer1RMS_std', 'Accelerometer1RMS_min', 'Accelerometer1RMS_max', 'Accelerometer1RMS_std_L0', 'Accelerometer1RMS_energy_L0', 'Accelerometer1RMS_entropy_L0', 'Accelerometer1RMS_skew_L0', 'Accelerometer1RMS_kurtosis_L0', 'Accelerometer1RMS_mean_L1', 'Accelerometer1RMS_std_L1', 'Accelerometer1RMS_entropy_L1', 'Accelerometer1RMS_skew_L1', 'Accelerometer1RMS_kurtosis_L1', 'Accelerometer2RMS_mean', 'Accelerometer2RMS_std', 'Accelerometer2RMS_min', 'Accelerometer2RMS_max', 'Accelerometer2RMS_std_L0', 'Accelerometer2RMS_energy_L0', 'Accelerometer2RMS_skew_L0', 'Accelerometer2RMS_kurtosis_L0', 'Accelerometer2RMS_mean_L1', 'Accelerometer2RMS_std_L1', 'Accelerometer2RMS_entropy_L1', 'Accelerometer2RMS_skew_L1', 'Accelerometer2RMS_kurtosis_L1', 'Current_mean', 'Current_std', 'Current_min', 'Current_max', 'Current_mean_L0', 'Current_std_L0', 'Current_energy_L0', 'Current_skew_L0', 'Current_kurtosis_L0', 'Current_mean_L1', 'Current_std_L1', '

In [48]:
len(feature_modeling)

128

In [49]:
len(feature_model)

105

In [50]:
len(feature_modeling) - len(feature_model)

23

In [52]:
df_agg2 = df_agg[['datetime', 'valve', 'anomaly'] + feature_model]
df_agg2.shape

(399, 108)

### Data split

In [53]:
# Lista para armazenar os dados de treinamento e teste
train = list()
test = list()

data = df_agg2.sort_values(by=['valve', 'datetime'])

# Iterar sobre cada poco
for poco_id, poco_data in data.groupby('valve'):
    # Calcular o índice para dividir os dados em treinamento e teste
    split_index = int(0.8 * len(poco_data))
    
    # Dividir os dados do poco em treinamento e teste
    poco_train = poco_data.iloc[:split_index]
    poco_test = poco_data.iloc[split_index:]
    
    # Adicionar os dados de treinamento e teste à lista
    train.append(poco_train)
    test.append(poco_test)

# Concatenar os dados de treinamento e teste
train = pd.concat(train)
test = pd.concat(test)

# Verificar o tamanho dos conjuntos de treinamento e teste
print("Tamanho do conjunto de treinamento:", len(train))
print("Tamanho do conjunto de teste:", len(test))


Tamanho do conjunto de treinamento: 319
Tamanho do conjunto de teste: 80


In [54]:
print(f'Treino valve: {train.valve.unique()}')
print(f'Teste valve: {test.valve.unique()}')
print(f'Tamanho do treino: {len(train)}')
print(f'Tamanho do teste: {len(test)}')

Treino valve: ['value1' 'value2']
Teste valve: ['value1' 'value2']
Tamanho do treino: 319
Tamanho do teste: 80


In [55]:
# Verificar a proporção de anomalias
print(f"Proporção de anomalias no conjunto de treino: {train.anomaly.mean():.2%}")
print(f"Proporção de anomalias no conjunto de teste: {test.anomaly.mean():.2%}")

Proporção de anomalias no conjunto de treino: 39.50%
Proporção de anomalias no conjunto de teste: 41.25%


In [56]:
train.head()

Unnamed: 0,datetime,valve,anomaly,Accelerometer1RMS_mean,Accelerometer1RMS_std,Accelerometer1RMS_min,Accelerometer1RMS_max,Accelerometer1RMS_std_L0,Accelerometer1RMS_energy_L0,Accelerometer1RMS_entropy_L0,...,Volume Flow RateRMS_max,Volume Flow RateRMS_std_L0,Volume Flow RateRMS_energy_L0,Volume Flow RateRMS_skew_L0,Volume Flow RateRMS_kurtosis_L0,Volume Flow RateRMS_mean_L1,Volume Flow RateRMS_std_L1,Volume Flow RateRMS_entropy_L1,Volume Flow RateRMS_skew_L1,Volume Flow RateRMS_kurtosis_L1
0,2020-03-09 10:14:00,value1,0.0,0.026203,0.000255,0.025694,0.026652,0.00029,0.999973,2.772558,...,32.9962,0.279597,0.999943,1.368828,0.287717,-0.097302,0.329389,2.123774,0.114037,0.254641
1,2020-03-09 10:15:00,value1,0.0,0.026171,0.000263,0.025604,0.026711,0.000303,0.999971,3.465702,...,32.9962,0.45572,0.999856,-0.309117,1.765233,0.162456,0.524465,3.170814,-0.041791,-1.357751
2,2020-03-09 10:16:00,value1,0.0,0.026345,0.000325,0.025715,0.027165,0.000377,0.999957,3.465685,...,32.9964,0.255478,0.999943,1.348212,0.270238,-0.027028,0.34056,2.313733,0.415614,2.04343
3,2020-03-09 10:17:00,value1,0.0,0.026352,0.000255,0.025888,0.026831,0.00026,0.999959,3.465712,...,32.9964,0.23372,0.999963,1.82933,2.09303,-0.020911,0.276333,2.373578,0.161942,2.358521
4,2020-03-09 10:18:00,value1,0.0,0.026358,0.000275,0.025553,0.026919,0.000305,0.999963,3.465702,...,32.9966,0.339662,0.999929,0.812644,-0.901945,0.043931,0.379742,2.726422,0.171413,-0.091527


In [57]:
train.tail()

Unnamed: 0,datetime,valve,anomaly,Accelerometer1RMS_mean,Accelerometer1RMS_std,Accelerometer1RMS_min,Accelerometer1RMS_max,Accelerometer1RMS_std_L0,Accelerometer1RMS_energy_L0,Accelerometer1RMS_entropy_L0,...,Volume Flow RateRMS_max,Volume Flow RateRMS_std_L0,Volume Flow RateRMS_energy_L0,Volume Flow RateRMS_skew_L0,Volume Flow RateRMS_kurtosis_L0,Volume Flow RateRMS_mean_L1,Volume Flow RateRMS_std_L1,Volume Flow RateRMS_entropy_L1,Volume Flow RateRMS_skew_L1,Volume Flow RateRMS_kurtosis_L1
378,2020-03-09 16:54:00,value2,0.0,0.027539,0.000376,0.026828,0.028403,0.000394,0.99993,3.433936,...,32.9584,0.297338,0.999893,0.285174,0.575001,0.100217,0.456791,2.734323,1.130926,1.694846
379,2020-03-09 16:55:00,value2,0.0,0.02757,0.00033,0.026735,0.028292,0.000363,0.99995,3.433944,...,32.9584,0.444783,0.999939,-0.419219,1.485768,-0.017475,0.350139,2.732703,-0.394482,0.10673
380,2020-03-09 16:56:00,value2,0.0,0.027345,0.000333,0.026849,0.028484,0.000374,0.999954,3.433941,...,32.9581,0.263289,0.999962,0.004636,2.814032,0.020873,0.279022,2.087904,0.270288,3.461863
381,2020-03-09 16:57:00,value2,0.0,0.027415,0.000343,0.026715,0.028057,0.000388,0.999947,3.465686,...,32.9581,0.334147,0.999869,-0.618868,0.405131,-0.121265,0.502193,2.76897,-0.87254,0.438202
382,2020-03-09 16:58:00,value2,0.0,0.027615,0.000318,0.027062,0.028337,0.000361,0.999955,3.433944,...,32.9579,0.379564,0.999942,0.21905,0.156503,-0.062938,0.338623,2.864726,0.09342,-0.287324


In [58]:
test.head()

Unnamed: 0,datetime,valve,anomaly,Accelerometer1RMS_mean,Accelerometer1RMS_std,Accelerometer1RMS_min,Accelerometer1RMS_max,Accelerometer1RMS_std_L0,Accelerometer1RMS_energy_L0,Accelerometer1RMS_entropy_L0,...,Volume Flow RateRMS_max,Volume Flow RateRMS_std_L0,Volume Flow RateRMS_energy_L0,Volume Flow RateRMS_skew_L0,Volume Flow RateRMS_kurtosis_L0,Volume Flow RateRMS_mean_L1,Volume Flow RateRMS_std_L1,Volume Flow RateRMS_entropy_L1,Volume Flow RateRMS_skew_L1,Volume Flow RateRMS_kurtosis_L1
256,2020-03-09 14:31:00,value1,1.0,0.027521,0.00036,0.026636,0.028223,0.000405,0.999952,3.433933,...,32.9745,1.977376,0.999925,0.941821,-0.98397,-0.037259,0.36446,2.511131,-0.733606,2.868316
257,2020-03-09 14:32:00,value1,0.0,0.027547,0.000382,0.02678,0.028437,0.000487,0.999954,3.465657,...,32.9743,0.43198,0.999911,0.053479,-0.544498,0.005531,0.427268,2.890837,0.202181,-0.422059
258,2020-03-09 14:33:00,value1,0.0,0.02749,0.000377,0.026609,0.028414,0.000464,0.999958,3.433916,...,33.0,0.46979,0.99989,0.436664,0.491496,0.128738,0.461063,2.944145,-0.168267,-0.686981
259,2020-03-09 14:34:00,value1,0.0,0.027555,0.00036,0.026849,0.028303,0.000445,0.999957,3.465671,...,32.974,0.317402,0.999939,1.039247,-0.281414,0.062664,0.350143,2.613412,0.332838,0.363288
260,2020-03-09 14:35:00,value1,0.0,0.027355,0.000557,0.026283,0.028466,0.000736,0.999949,3.465554,...,33.0,0.395182,0.999905,0.082523,-1.305361,0.165117,0.412875,3.045536,-0.084549,-0.920412


In [59]:
test.tail()

Unnamed: 0,datetime,valve,anomaly,Accelerometer1RMS_mean,Accelerometer1RMS_std,Accelerometer1RMS_min,Accelerometer1RMS_max,Accelerometer1RMS_std_L0,Accelerometer1RMS_energy_L0,Accelerometer1RMS_entropy_L0,...,Volume Flow RateRMS_max,Volume Flow RateRMS_std_L0,Volume Flow RateRMS_energy_L0,Volume Flow RateRMS_skew_L0,Volume Flow RateRMS_kurtosis_L0,Volume Flow RateRMS_mean_L1,Volume Flow RateRMS_std_L1,Volume Flow RateRMS_entropy_L1,Volume Flow RateRMS_skew_L1,Volume Flow RateRMS_kurtosis_L1
394,2020-03-09 17:10:00,value2,1.0,0.027408,0.000318,0.026671,0.028005,0.000431,0.999977,3.433925,...,29.0,0.343275,0.999825,0.128994,-0.546819,0.138425,0.518033,3.234564,-0.583329,-0.870918
395,2020-03-09 17:11:00,value2,1.0,0.027433,0.000296,0.026709,0.027959,0.000343,0.999968,3.465697,...,29.0,0.471291,0.999842,-0.178606,-0.856901,-0.084501,0.499454,3.173012,0.075588,-1.066161
396,2020-03-09 17:12:00,value2,1.0,0.027367,0.000287,0.026753,0.028076,0.000344,0.999976,3.433948,...,32.0,2.041854,0.999865,0.555438,-1.438656,0.124748,0.472641,3.045365,0.18575,-0.362831
397,2020-03-09 17:13:00,value2,1.0,0.027383,0.000319,0.026678,0.028,0.000373,0.999959,3.46569,...,32.9562,0.447332,0.999865,0.242628,-0.033591,-0.034697,0.527253,3.03582,0.247175,0.815768
398,2020-03-09 17:14:00,value2,0.0,0.027237,0.000289,0.026822,0.027582,0.00033,0.999965,2.079405,...,32.9562,0.330445,0.999923,0.223344,-1.575231,-0.088884,0.390172,1.660261,0.388925,-0.382995


### Feature Selection

In [60]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from boruta import BorutaPy

def selecionar_features_com_boruta(df: pd.DataFrame, target: str, task_type="classification"):
    X = df.drop(columns=[target]).values
    y = df[target].values
    
    if task_type == "classification":
        rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    elif task_type == "regression":
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    else:
        raise ValueError("task_type deve ser 'classification' ou 'regression'")
    
    # Inicializa Boruta
    boruta_selector = BorutaPy(
        estimator=rf,
        n_estimators="auto",  # Deixa o Boruta decidir o número ótimo de árvores
        verbose=0,
        alpha=0.05,
        random_state=42
    )
    
    boruta_selector.fit(X, y)
    
    # Cria DataFrame com resultados
    features = df.drop(columns=[target]).columns
    resultados = pd.DataFrame({
        "feature": features,
        "support": boruta_selector.support_,
        "ranking": boruta_selector.ranking_
    }).sort_values(by="ranking")
    
    selected_features = resultados[resultados["support"]]["feature"].tolist()
    
    return selected_features, resultados


In [61]:

selected, ranking = selecionar_features_com_boruta(train[feature_model + ['anomaly']], "anomaly", task_type="classification")
print("Features selecionadas:", selected)
print(ranking)

Features selecionadas: ['Volume Flow RateRMS_mean', 'Volume Flow RateRMS_std', 'Volume Flow RateRMS_min', 'Volume Flow RateRMS_max', 'Volume Flow RateRMS_energy_L0', 'Volume Flow RateRMS_std_L0']
                          feature  support  ranking
92       Volume Flow RateRMS_mean     True        1
93        Volume Flow RateRMS_std     True        1
94        Volume Flow RateRMS_min     True        1
95        Volume Flow RateRMS_max     True        1
97  Volume Flow RateRMS_energy_L0     True        1
..                            ...      ...      ...
43                   Pressure_min    False       96
58          Temperature_energy_L0    False       97
44                   Pressure_max    False       98
69         Thermocouple_energy_L0    False       99
74         Thermocouple_energy_L1    False       99

[105 rows x 3 columns]


In [63]:
len(selected)

6

In [64]:
train[selected + ['datetime', 'valve', 'anomaly']].shape

(319, 9)

In [65]:
train_processed = train[selected + ['datetime', 'valve', 'anomaly']]
train_processed.shape

(319, 9)

In [67]:
train_processed.head()

Unnamed: 0,Volume Flow RateRMS_mean,Volume Flow RateRMS_std,Volume Flow RateRMS_min,Volume Flow RateRMS_max,Volume Flow RateRMS_energy_L0,Volume Flow RateRMS_std_L0,datetime,valve,anomaly
0,32.115408,0.324407,32.0,32.9962,0.999943,0.279597,2020-03-09 10:14:00,value1,0.0
1,32.293166,0.493304,31.004,32.9962,0.999856,0.45572,2020-03-09 10:15:00,value1,0.0
2,32.105353,0.308289,32.0,32.9964,0.999943,0.255478,2020-03-09 10:16:00,value1,0.0
3,32.070182,0.256715,32.0,32.9964,0.999963,0.23372,2020-03-09 10:17:00,value1,0.0
4,32.157911,0.366335,32.0,32.9966,0.999929,0.339662,2020-03-09 10:18:00,value1,0.0


In [66]:
test_processed = test[selected + ['datetime', 'valve', 'anomaly']]
test_processed.shape

(80, 9)

In [68]:
train_processed.to_parquet('./data/skab_train_agg_dwt.parquet.gzip',
              compression='gzip')

test_processed.to_parquet('./data/skab_test_agg_dwt.parquet.gzip',
              compression='gzip')