# Instâncias reais

* **timestamp**: observations timestamps loaded into pandas DataFrame as its index;
* **P-PDG**: pressure variable at the Permanent Downhole Gauge (PDG);
* **P-TPT**: pressure variable at the Temperature and Pressure Transducer (TPT);
* **T-TPT**: temperature variable at the Temperature and Pressure Transducer (TPT);
* **P-MON-CKP**: pressure variable upstream of the production choke (CKP);
* **T-JUS-CKP**: temperature variable downstream of the production choke (CKP);
* **P-JUS-CKGL**: pressure variable upstream of the gas lift choke (CKGL);
* **T-JUS-CKGL**: temperature variable upstream of the gas lift choke (CKGL);
* **QGL**: gas lift flow rate;
* **class**: observations labels associated with three types of periods (normal, fault transient, and faulty steady state).


* **label**: instance label (event type);
* **well**: well name. Hand-drawn and simulated instances have fixed names. Real instances have names masked with incremental id;
* **id**: instance identifier. Hand-drawn and simulated instances have incremental id. Each real instance has an id generated from its first timestamp.

https://github.com/petrobras/3W

In [1]:
import sys
import os
import pandas as pd
import numpy as np

import joblib
import pickle

from pyod.models.ecod import ECOD

from pyod.models.suod import SUOD

from pyod.models.xgbod import XGBOD

from pyod.models.loda import LODA

from pyod.models.pca import PCA



import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDOneClassSVM
from sklearn.pipeline import make_pipeline
from sklearn.svm import OneClassSVM

from sklearn.model_selection import GroupKFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics  import average_precision_score, make_scorer, roc_curve,f1_score, precision_score, recall_score, fbeta_score, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report,precision_recall_curve
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit

### Function to calculate metrics

In [2]:
def metrics(y_test, y_pred):
    cm = list()
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm)                      
    plt.figure(figsize=(8,6))  
    sns.heatmap(cm_df, annot=True)
    print("Classification Report: \n", classification_report(y_test, y_pred, digits=5))  


    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]

    print("Specificity:", TN/(TN+FP))


    roc_auc = roc_auc_score(y_test, y_pred, multi_class = 'ovr', average=None)
    gini = 2*roc_auc -1
    print("Gini: ",gini)
    print("ROC AUC:: ",roc_auc)

## Read data

In [4]:
df = pd.read_parquet('./dataset/real_instances.parquet.gzip') 
df.shape

(14516197, 12)

In [5]:
df.head()

Unnamed: 0_level_0,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-02-01 02:02:07,0,WELL-00001,20170201020207,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0.0
2017-02-01 02:02:08,0,WELL-00001,20170201020207,0.0,10092000.0,119.0944,1618206.0,84.58997,1564148.0,,0.0,0.0
2017-02-01 02:02:09,0,WELL-00001,20170201020207,0.0,10091890.0,119.0944,1626612.0,84.58213,1564148.0,,0.0,0.0
2017-02-01 02:02:10,0,WELL-00001,20170201020207,0.0,10091780.0,119.0944,1635018.0,84.57429,1564148.0,,0.0,0.0
2017-02-01 02:02:11,0,WELL-00001,20170201020207,0.0,10091670.0,119.0944,1643424.0,84.56644,1564148.0,,0.0,0.0


In [6]:
df_simulated = pd.read_parquet('./dataset/simulated_instances.parquet.gzip') 
df_simulated.shape

(33902803, 12)

In [7]:
df_simulated.head()

Unnamed: 0_level_0,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-10-06 03:57:02,1,SIMULATED,1,22520410.0,13068630.0,96.93275,1049626.0,70.33402,,,,0
2018-10-06 03:57:03,1,SIMULATED,1,22520430.0,13068650.0,96.93279,1049626.0,70.33425,,,,0
2018-10-06 03:57:04,1,SIMULATED,1,22520440.0,13068640.0,96.93283,1049626.0,70.33449,,,,0
2018-10-06 03:57:05,1,SIMULATED,1,22520420.0,13068600.0,96.93287,1049626.0,70.33473,,,,0
2018-10-06 03:57:06,1,SIMULATED,1,22520390.0,13068560.0,96.9329,1049626.0,70.33496,,,,0


In [8]:
df_simulated.tail()

Unnamed: 0_level_0,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-05-19 19:01:05,8,SIMULATED,81,33586320.0,28076270.0,3.253794,4002285.0,37.74498,,,,108
2018-05-19 19:01:06,8,SIMULATED,81,33586440.0,28076420.0,3.253723,4002285.0,38.21421,,,,108
2018-05-19 19:01:07,8,SIMULATED,81,33586900.0,28076490.0,3.253835,4002278.0,38.75061,,,,108
2018-05-19 19:01:08,8,SIMULATED,81,33586480.0,28076560.0,3.253942,4002255.0,39.31084,,,,108
2018-05-19 19:01:09,8,SIMULATED,81,33587020.0,28076570.0,3.254185,4002234.0,39.83253,,,,108


In [6]:
df.id.nunique()

1013

In [7]:
df['label'].value_counts()

label
0    9903155
4    2462076
3     569152
5     552529
7     466338
1     312136
2     194233
6      56578
Name: count, dtype: int64

In [8]:
df['class'].value_counts()

class
0.0      10003293
4.0       2462076
3.0        569152
105.0      317565
107.0      283262
101.0       95658
102.0       65130
7.0         25870
2.0         16100
5.0         13031
6.0         12951
1.0         10417
106.0        6252
Name: count, dtype: int64

In [9]:
pd.crosstab(df['well'], df['label'])

label,0,1,2,3,4,5,6,7
well,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WELL-00001,1652442,58468,0,17976,261457,0,0,73033
WELL-00002,3641159,24017,9121,0,807601,0,47869,0
WELL-00003,463543,0,35406,0,0,0,0,0
WELL-00004,85505,0,0,0,307674,0,8709,0
WELL-00005,979611,0,0,0,271525,0,0,0
WELL-00006,2058403,229651,0,0,0,0,0,305517
WELL-00007,14370,0,0,0,71651,0,0,0
WELL-00008,1008122,0,0,0,0,0,0,0
WELL-00009,0,0,6738,0,0,0,0,0
WELL-00010,0,0,9809,0,592220,0,0,0


In [10]:
pd.crosstab(df['class'], df['label'])

label,0,1,2,3,4,5,6,7
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,9439612,195376,52017,0,0,127930,36319,152039
1.0,0,10417,0,0,0,0,0,0
2.0,0,0,16100,0,0,0,0,0
3.0,0,0,0,569152,0,0,0,0
4.0,0,0,0,0,2462076,0,0,0
5.0,0,0,0,0,0,13031,0,0
6.0,0,0,0,0,0,0,12951,0
7.0,0,0,0,0,0,0,0,25870
101.0,0,95658,0,0,0,0,0,0
102.0,0,0,65130,0,0,0,0,0


In [11]:
df.drop_duplicates(subset=['id'], keep='last').groupby("label")["id"].count()

label
0    588
1      5
2     22
3     32
4    344
5     11
6      6
7      5
Name: id, dtype: int64

In [12]:
df.isnull().sum()/len(df)

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000580
P-TPT         0.008007
T-TPT         0.008007
P-MON-CKP     0.077959
T-JUS-CKP     0.113279
P-JUS-CKGL    0.071501
T-JUS-CKGL    1.000000
QGL           0.191125
class         0.043775
dtype: float64

In [13]:
df[df['well']=='WELL-00005'].isnull().sum()/len(df[df['well']=='WELL-00005'])

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000000
P-TPT         0.000000
T-TPT         0.000000
P-MON-CKP     0.000026
T-JUS-CKP     0.000000
P-JUS-CKGL    0.000018
T-JUS-CKGL    1.000000
QGL           1.000000
class         0.000000
dtype: float64

In [14]:
df[df['well']=='WELL-00003'].isnull().sum()/len(df[df['well']=='WELL-00003'])

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.006151
P-TPT         0.006151
T-TPT         0.006177
P-MON-CKP     0.000884
T-JUS-CKP     1.000000
P-JUS-CKGL    1.000000
T-JUS-CKGL    1.000000
QGL           0.001309
class         1.000000
dtype: float64

In [15]:
df[df['well']=='WELL-00005'].head()

Unnamed: 0_level_0,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-03-31 06:00:14,0,WELL-00005,20170331060014,0.0,20792900.0,106.3653,11509150.0,67.44353,1325708000.0,,,0.0
2017-03-31 06:00:15,0,WELL-00005,20170331060014,0.0,20793320.0,106.3653,11509150.0,67.4435,1325647000.0,,,0.0
2017-03-31 06:00:16,0,WELL-00005,20170331060014,0.0,20793730.0,106.3653,11509150.0,67.44349,1325586000.0,,,0.0
2017-03-31 06:00:17,0,WELL-00005,20170331060014,0.0,20794150.0,106.3653,11509150.0,67.44347,1325526000.0,,,0.0
2017-03-31 06:00:18,0,WELL-00005,20170331060014,0.0,20794570.0,106.3653,11509150.0,67.44345,1325465000.0,,,0.0


In [16]:
df[df['well']=='WELL-00005']['QGL'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: QGL, dtype: float64

In [17]:
df[df['well']=='WELL-00004'].isnull().sum()/len(df[df['well']=='WELL-00004'])

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000848
P-TPT         0.001617
T-TPT         0.001590
P-MON-CKP     0.000000
T-JUS-CKP     0.001660
P-JUS-CKGL    1.000000
T-JUS-CKGL    1.000000
QGL           1.000000
class         0.001426
dtype: float64

In [18]:
df[df['well']=='WELL-00008'].isnull().sum()/len(df[df['well']=='WELL-00008'])

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000000
P-TPT         0.000000
T-TPT         0.000000
P-MON-CKP     1.000000
T-JUS-CKP     1.000000
P-JUS-CKGL    0.000011
T-JUS-CKGL    1.000000
QGL           1.000000
class         0.000000
dtype: float64

In [19]:
df = df.reset_index()
df.head()

Unnamed: 0,timestamp,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-02-01 02:02:07,0,WELL-00001,20170201020207,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0.0
1,2017-02-01 02:02:08,0,WELL-00001,20170201020207,0.0,10092000.0,119.0944,1618206.0,84.58997,1564148.0,,0.0,0.0
2,2017-02-01 02:02:09,0,WELL-00001,20170201020207,0.0,10091890.0,119.0944,1626612.0,84.58213,1564148.0,,0.0,0.0
3,2017-02-01 02:02:10,0,WELL-00001,20170201020207,0.0,10091780.0,119.0944,1635018.0,84.57429,1564148.0,,0.0,0.0
4,2017-02-01 02:02:11,0,WELL-00001,20170201020207,0.0,10091670.0,119.0944,1643424.0,84.56644,1564148.0,,0.0,0.0


In [20]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [21]:
df['target'] = 1
df.loc[(df['label']==0)&(df['class']==0), "target"] = 0

In [22]:
df.target.value_counts()

target
0    9439612
1    5076585
Name: count, dtype: int64

In [23]:
df.target.value_counts(normalize=True)

target
0    0.650281
1    0.349719
Name: proportion, dtype: float64

In [24]:
df2 = df.drop(['T-JUS-CKGL'], axis=1)
df2 = df2.dropna()

In [25]:
df2.isnull().sum()/len(df2)

timestamp     0.0
label         0.0
well          0.0
id            0.0
P-PDG         0.0
P-TPT         0.0
T-TPT         0.0
P-MON-CKP     0.0
T-JUS-CKP     0.0
P-JUS-CKGL    0.0
QGL           0.0
class         0.0
target        0.0
dtype: float64

In [26]:
df2[df2['QGL']==0].shape[0]

9732398

In [27]:
df2[df2['QGL']==0].shape[0]/len(df2)

0.8760840894710648

In [28]:
for i in ['QGL','P-PDG','P-TPT','T-TPT','P-MON-CKP','T-JUS-CKP','P-JUS-CKGL']:
    print(df2[df2[i]==0].shape[0]/len(df2))

0.8760840894710648
0.5958815109618104
0.0
0.0
0.0
0.0
0.0


In [29]:
for i in ['QGL','P-PDG','P-TPT','T-TPT','P-MON-CKP','T-JUS-CKP','P-JUS-CKGL']:
    print(df2[i].describe())

count    1.110898e+07
mean     1.669846e-01
std      4.545471e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      4.146514e+00
Name: QGL, dtype: float64
count    1.110898e+07
mean    -3.768879e+40
std      2.075011e+41
min     -1.180116e+42
25%      0.000000e+00
50%      0.000000e+00
75%      2.383022e+07
max      4.485805e+07
Name: P-PDG, dtype: float64
count    1.110898e+07
mean     2.786632e+07
std      2.052537e+08
min      4.488654e+06
25%      8.529824e+06
50%      1.412388e+07
75%      1.664426e+07
max      2.941990e+09
Name: P-TPT, dtype: float64
count    1.110898e+07
mean     1.082515e+02
std      2.157222e+01
min      7.236000e+00
25%      1.164867e+02
50%      1.170522e+02
75%      1.178645e+02
max      1.196061e+02
Name: T-TPT, dtype: float64
count    1.110898e+07
mean     4.574082e+06
std      3.594868e+06
min      7.528369e+05
25%      1.682463e+06
50%      2.493327e+06
75%      6.179458e+06
max      1.284977e+07
Name: P-MO

In [30]:
df2 = df2.drop(['QGL'], axis=1)

In [None]:
# for i in df2['well'].unique():
#     print(i)
#     print(df2[df2['well']==i].isnull().sum()/len(df2[df2['well']==i]))

In [None]:
# window = '5T'  # Ajuste para '5T', '10T', etc.

# def aggregate_by_well(data, window):
#     return data.resample(window).agg({
#         'P-PDG': ['mean', 'std', 'min', 'max'],
#         'P-TPT': ['mean', 'std', 'min', 'max'],
#         'T-TPT': ['mean', 'std', 'min', 'max'],
#         'P-MON-CKP': ['mean', 'std', 'min', 'max'],
#         'T-JUS-CKP': ['mean', 'std', 'min', 'max'],
#         'P-JUS-CKGL': ['mean', 'std', 'min', 'max'],
#         'target': 'max'  # Se houver pelo menos um evento anômalo na janela
#     })

# df_agg = df.groupby('well').apply(lambda x: x.set_index('timestamp').pipe(aggregate_by_well, window)).reset_index()

# # Renomeando colunas
# df_agg.columns = ['_'.join(col).strip('_') for col in df_agg.columns]
# df_agg.dropna(inplace=True)
# df_agg = df_agg.rename(columns={"target_max": "target"})

In [31]:
import numpy as np
import pandas as pd
import pywt
from scipy.stats import skew, kurtosis


def wavelet_features_window(series, wavelet='db4', level=1):
    coeffs = pywt.wavedec(series, wavelet=wavelet, level=level)
    total_energy = sum(np.sum(c**2) for c in coeffs)
    total_energy = total_energy if total_energy > 1e-12 else 1e-12  # evita divisão por zero
    features = []
    
    for c in coeffs:
        c = np.array(c, dtype=np.float32)
        mean = np.mean(c)
        std = np.std(c)

        # Energia normalizada
        energy = np.sum(c**2) / total_energy

        # Entropia de Shannon
        abs_sum = np.sum(np.abs(c))
        if abs_sum > 1e-12:
            probs = np.abs(c) / abs_sum
            ent = -np.sum(probs * np.log(probs + 1e-12))
        else:
            ent = 0.0
        
        # Skew e kurtosis
        sk = skew(c, nan_policy='omit')
        if np.isnan(sk): sk = 0.0
        
        kurt = kurtosis(c, nan_policy='omit')
        if np.isnan(kurt): kurt = 0.0
        
        for val in [mean, std, energy, ent, sk, kurt]:
            if not np.isfinite(val):
                val = 0.0
            features.append(val)
    
    return features


# def aggregate_by_well(data, window='5T', columns=None, wavelet='db4', level=1):
#     if columns is None:
#         columns = ['P-PDG','P-TPT','T-TPT','P-MON-CKP','T-JUS-CKP','P-JUS-CKGL']
    
#     num_metrics_per_level = 6  # mean, std, energy, entropy, skew, kurtosis
#     num_features = num_metrics_per_level * (level + 1)
#     results = []

#     for col in columns:
#         resampled = data[col].resample(window)

#         # --- Métricas padrão ---
#         agg_df = resampled.agg(['mean','std','min','max'])

#         # --- Métricas via Wavelet ---
#         metrics_list = []
#         for _, x in resampled:
#             if len(x) > 0:
#                 metrics_list.append(wavelet_features_window(x.values, wavelet=wavelet, level=level))
#             else:
#                 metrics_list.append([np.nan] * num_features)

#         metrics_wavelet = pd.DataFrame(metrics_list, index=agg_df.index)

#         # Nomes das colunas para wavelet
#         col_names = []
#         for lvl in range(level+1):
#             for m in ['mean','std','energy','entropy','skew','kurtosis']:
#                 col_names.append(f'{col}_{m}_L{lvl}')
#         metrics_wavelet.columns = col_names

#         # Concatena métricas padrão + wavelet
#         combined = pd.concat([agg_df.add_prefix(f'{col}_'), metrics_wavelet], axis=1)
#         results.append(combined)

#     df_features = pd.concat(results, axis=1)
#     df_features['target'] = data['target'].resample(window).max()
#     return df_features


In [32]:
def aggregate_by_well(data, window='5T', columns=None, wavelet='db4', level=1):
    if columns is None:
        columns = ['P-PDG','P-TPT','T-TPT','P-MON-CKP','T-JUS-CKP','P-JUS-CKGL']
    
    num_metrics_per_level = 6  # mean, std, energy, entropy, skew, kurtosis
    num_features = num_metrics_per_level * (level + 1)
    results = []

    for col in columns:
        resampled = data[col].resample(window)

        # --- Métricas padrão ---
        agg_df = resampled.agg(['mean','std','min','max'])

        # --- Métricas via Wavelet ---
        metrics_list = []
        valid_index = []
        for t, x in resampled:
            if len(x) > 0:  # <-- agora ignora janelas vazias
                metrics_list.append(
                    wavelet_features_window(x.values, wavelet=wavelet, level=level)
                )
                valid_index.append(t)  # guarda apenas timestamps válidos

        # cria DataFrame só com índices válidos
        metrics_wavelet = pd.DataFrame(metrics_list, index=valid_index)

        # Nomes das colunas para wavelet
        col_names = []
        for lvl in range(level+1):
            for m in ['mean','std','energy','entropy','skew','kurtosis']:
                col_names.append(f'{col}_{m}_L{lvl}')
        metrics_wavelet.columns = col_names

        # Concatena métricas padrão (filtradas) + wavelet
        combined = pd.concat([agg_df.loc[valid_index].add_prefix(f'{col}_'), 
                              metrics_wavelet], axis=1)
        results.append(combined)

    df_features = pd.concat(results, axis=1)

    # target só para janelas válidas
    df_features['target'] = data['target'].resample(window).max().loc[df_features.index]

    return df_features


In [33]:
dfs = []
for well, group in df2.groupby('well'):
    group = group.set_index('timestamp')  # garante que estamos usando timestamp como índice
    df_well = aggregate_by_well(group, window='5T', level=1)

    df_well = df_well.reset_index()  # 'index' vira a coluna timestamp
    df_well.rename(columns={'index': 'timestamp'}, inplace=True)

    df_well['well'] = well  # mantém a identificação do poço
    dfs.append(df_well)

In [34]:
# Concatena todos os poços
df_agg = pd.concat(dfs, ignore_index=True)

# Garante que timestamp está ordenado e em datetime
df_agg['timestamp'] = pd.to_datetime(df_agg['timestamp'])
df_agg = df_agg.sort_values(['well', 'timestamp']).reset_index(drop=True)

In [35]:
df_agg.shape

(36771, 99)

In [36]:
df_agg.head()

Unnamed: 0,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-PDG_mean_L0,P-PDG_std_L0,P-PDG_energy_L0,P-PDG_entropy_L0,P-PDG_skew_L0,...,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1,target,well
0,2014-01-24 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.361752e-11,2.291413e-13,3.080235e-33,4.127087,-7.682201,-3.0,1.0,WELL-00001
1,2014-01-24 09:35:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
2,2014-01-24 09:40:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
3,2014-01-24 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
4,2014-01-24 09:50:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001


In [37]:
# dfs = []
# for well, group in df2.groupby('well'):
#     group = group.set_index('timestamp')  # garante que estamos usando timestamp como índice
#     df_well = aggregate_by_well(group, window='5T', level=1)

#     df_well = df_well.reset_index()  # 'index' vira a coluna timestamp
#     df_well.rename(columns={'index': 'timestamp'}, inplace=True)

#     df_well['well'] = well  # mantém a identificação do poço
#     dfs.append(df_well)

In [38]:
# # Concatena todos os poços
# df_agg = pd.concat(dfs, ignore_index=True)

# # Garante que timestamp está ordenado e em datetime
# df_agg['timestamp'] = pd.to_datetime(df_agg['timestamp'])
# df_agg = df_agg.sort_values(['well', 'timestamp']).reset_index(drop=True)

In [39]:
# df_agg.head()

In [40]:
# df_agg.shape

In [69]:
# df_agg = pd.concat(dfs, ignore_index=True)
# df_agg

In [41]:
# df_agg.shape

In [42]:
# df.shape

In [43]:
df_agg.target.value_counts().sum()

36771

In [44]:
df_agg2 = df_agg[~df_agg.target.isnull()]
df_agg2.shape

(36771, 99)

In [74]:
def diagnostico_colunas_zeradas(df, well_col='well'):
    """
    Para cada well, mostra quais colunas estão totalmente zeradas,
    quantas estão zeradas parcialmente e o percentual de zeros.
    """
    resultados = {}

    for well, grupo in df.groupby(well_col):
        # Verifica colunas totalmente zeradas
        colunas_zeradas = grupo.columns[(grupo == 0).all()]
        
        # Conta zeros por coluna e calcula percentual
        contagem_zeros = (grupo == 0).sum()
        percentual_zeros = (grupo == 0).mean() * 100  # em %

        resultados[well] = {
            "colunas_totalmente_zeradas": list(colunas_zeradas),
            "contagem_zeros": contagem_zeros.to_dict(),
            "percentual_zeros": percentual_zeros.to_dict()
        }

        print(f"\n=== Well: {well} ===")
        print(f"Colunas totalmente zeradas: {list(colunas_zeradas)}")
        print("Top 5 colunas com mais zeros:")
        print(percentual_zeros.sort_values(ascending=False).head())

    return resultados

In [75]:
# Exemplo de uso:
resultados = diagnostico_colunas_zeradas(df_agg2, well_col='well')



=== Well: WELL-00001 ===
Colunas totalmente zeradas: []
Top 5 colunas com mais zeros:
P-PDG_skew_L0        99.942230
P-PDG_kurtosis_L0    99.942230
P-PDG_std            99.870017
P-PDG_std_L0         99.480069
target               80.112652
dtype: float64

=== Well: WELL-00002 ===
Colunas totalmente zeradas: []
Top 5 colunas com mais zeros:
P-PDG_kurtosis_L0    99.914333
P-PDG_skew_L0        99.914333
P-PDG_std_L0         99.677100
P-PDG_std            99.571664
P-PDG_max            99.426689
dtype: float64

=== Well: WELL-00006 ===
Colunas totalmente zeradas: []
Top 5 colunas com mais zeros:
P-PDG_skew_L0        98.177621
P-PDG_kurtosis_L0    98.177621
P-PDG_std            98.129024
target               83.817276
P-TPT_kurtosis_L0    50.091119
dtype: float64

=== Well: WELL-00007 ===
Colunas totalmente zeradas: []
Top 5 colunas com mais zeros:
target                    16.608997
P-JUS-CKGL_skew_L0         3.806228
P-JUS-CKGL_kurtosis_L0     3.806228
P-JUS-CKGL_std             3.46020

In [76]:
colunas_zeradas = df_agg2.columns[(df_agg2 == 0).all()]
print("Colunas totalmente zeradas:", list(colunas_zeradas))

Colunas totalmente zeradas: []


In [77]:
df_agg2.target.value_counts()

target
0.0    24709
1.0    12062
Name: count, dtype: int64

In [78]:
(df_agg2.isnull().sum()/len(df_agg2)).sort_values(ascending=False)

P-MON-CKP_std       0.001033
P-PDG_std           0.001033
T-TPT_std           0.001033
P-JUS-CKGL_std      0.001033
P-TPT_std           0.001033
                      ...   
P-TPT_skew_L1       0.000000
P-TPT_entropy_L1    0.000000
P-TPT_energy_L1     0.000000
P-TPT_std_L1        0.000000
well                0.000000
Length: 99, dtype: float64

In [79]:
null_pct = df_agg2.isna().mean()  # proporção de NaNs por coluna
cols_to_keep = null_pct[null_pct < 0.01].index  # apenas colunas com < 1% nulos
df_agg3 = df_agg2[cols_to_keep]
df_agg3.shape

(36771, 99)

In [80]:
(df_agg3.isnull().sum()/len(df_agg3)).sort_values(ascending=True)

timestamp              0.000000
T-JUS-CKP_energy_L0    0.000000
T-JUS-CKP_std_L0       0.000000
T-JUS-CKP_mean_L0      0.000000
T-JUS-CKP_max          0.000000
                         ...   
P-TPT_std              0.001033
T-JUS-CKP_std          0.001033
P-MON-CKP_std          0.001033
P-PDG_std              0.001033
T-TPT_std              0.001033
Length: 99, dtype: float64

In [81]:
df_agg3.head()

Unnamed: 0,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-PDG_mean_L0,P-PDG_std_L0,P-PDG_energy_L0,P-PDG_entropy_L0,P-PDG_skew_L0,...,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1,target,well
0,2014-01-24 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.361752e-11,2.291413e-13,3.080235e-33,4.127087,-7.682201,-3.0,1.0,WELL-00001
1,2014-01-24 09:35:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
2,2014-01-24 09:40:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
3,2014-01-24 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
4,2014-01-24 09:50:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001


In [82]:
df_agg4 = df_agg3[~df_agg3.isnull().any(axis=1)]
df_agg4

Unnamed: 0,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-PDG_mean_L0,P-PDG_std_L0,P-PDG_energy_L0,P-PDG_entropy_L0,P-PDG_skew_L0,...,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1,target,well
0,2014-01-24 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.361752e-11,2.291413e-13,3.080235e-33,4.127087,-7.682201,-3.000000,1.0,WELL-00001
1,2014-01-24 09:35:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.358742e-11,1.255045e-12,3.080795e-33,5.028653,-9.508595,-3.000000,1.0,WELL-00001
2,2014-01-24 09:40:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.358742e-11,1.255045e-12,3.080795e-33,5.028653,-9.508595,-3.000000,1.0,WELL-00001
3,2014-01-24 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.358742e-11,1.255045e-12,3.080795e-33,5.028653,-9.508595,-3.000000,1.0,WELL-00001
4,2014-01-24 09:50:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.358742e-11,1.255045e-12,3.080795e-33,5.028653,-9.508595,-3.000000,1.0,WELL-00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090742,2019-04-03 14:35:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.310762,-1.170065,4.484580e-02,1.093957e+00,6.220747e-15,4.329146,2.420422,28.843250,1.0,WELL-00018
1090743,2019-04-03 14:40:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.618382,-1.112516,-1.742675e-01,3.551546e+00,6.597681e-14,3.314469,-4.840377,48.990056,1.0,WELL-00018
1090744,2019-04-03 14:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.478836,-0.833645,-9.177595e-02,1.956083e+00,2.009502e-14,3.870666,-6.508611,63.449771,1.0,WELL-00018
1090745,2019-04-03 14:50:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.867637,0.042685,-9.690600e-02,3.283631e+00,5.687419e-14,3.239146,-4.932309,59.059588,1.0,WELL-00018


In [83]:
(df_agg4.isnull().sum()/len(df_agg4)).sort_values(ascending=True).to_csv('null_pct.csv')

In [84]:
df_agg4.head()

Unnamed: 0,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-PDG_mean_L0,P-PDG_std_L0,P-PDG_energy_L0,P-PDG_entropy_L0,P-PDG_skew_L0,...,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1,target,well
0,2014-01-24 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.361752e-11,2.291413e-13,3.080235e-33,4.127087,-7.682201,-3.0,1.0,WELL-00001
1,2014-01-24 09:35:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
2,2014-01-24 09:40:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
3,2014-01-24 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001
4,2014-01-24 09:50:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0,1.0,WELL-00001


In [85]:
df_agg4.target.value_counts()

target
0.0    24693
1.0    12040
Name: count, dtype: int64

In [86]:
df_agg4.target.value_counts().sum()

36733

In [87]:
df_agg4['well'].value_counts()

well
WELL-00002    15159
WELL-00006     8227
WELL-00001     6919
WELL-00014     2351
WELL-00010     2025
WELL-00017     1190
WELL-00018      294
WELL-00007      288
WELL-00015      184
WELL-00016       72
WELL-00009       24
Name: count, dtype: int64

In [88]:
df_agg4.target.value_counts()

target
0.0    24693
1.0    12040
Name: count, dtype: int64

In [89]:
df_agg4.target.value_counts(normalize=True)

target
0.0    0.672229
1.0    0.327771
Name: proportion, dtype: float64

In [90]:
df_agg4.dtypes

timestamp                 datetime64[ns]
P-PDG_mean                       float64
P-PDG_std                        float64
P-PDG_min                        float64
P-PDG_max                        float64
                               ...      
P-JUS-CKGL_entropy_L1            float64
P-JUS-CKGL_skew_L1               float64
P-JUS-CKGL_kurtosis_L1           float64
target                           float64
well                              object
Length: 99, dtype: object

In [91]:
df_agg4['well'].unique()

array(['WELL-00001', 'WELL-00002', 'WELL-00006', 'WELL-00007',
       'WELL-00009', 'WELL-00010', 'WELL-00014', 'WELL-00015',
       'WELL-00016', 'WELL-00017', 'WELL-00018'], dtype=object)

### Feature selection

In [92]:
def select_low_correlation_columns(df, threshold=0.99):
    """
    Retorna uma lista de colunas com correlação de Pearson
    menor que o threshold entre si.

    Parâmetros:
    -----------
    df : pd.DataFrame
        DataFrame com features numéricas.
    threshold : float
        Limite máximo de correlação permitido entre duas colunas.

    Retorna:
    --------
    selected_columns : list
        Lista de colunas selecionadas (menos correlacionadas).
    """
    corr_matrix = df.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Lista de colunas a remover
    removed_columns = [col for col in upper_tri.columns if any(upper_tri[col] >= threshold)]

    # Colunas selecionadas
    selected_columns = [col for col in df.columns if col not in removed_columns]
    
    return selected_columns

In [93]:
feature_modeling = [col for col in df_agg4.columns if col not in ['timestamp', 'well', 'target']]
feature_modeling

['P-PDG_mean',
 'P-PDG_std',
 'P-PDG_min',
 'P-PDG_max',
 'P-PDG_mean_L0',
 'P-PDG_std_L0',
 'P-PDG_energy_L0',
 'P-PDG_entropy_L0',
 'P-PDG_skew_L0',
 'P-PDG_kurtosis_L0',
 'P-PDG_mean_L1',
 'P-PDG_std_L1',
 'P-PDG_energy_L1',
 'P-PDG_entropy_L1',
 'P-PDG_skew_L1',
 'P-PDG_kurtosis_L1',
 'P-TPT_mean',
 'P-TPT_std',
 'P-TPT_min',
 'P-TPT_max',
 'P-TPT_mean_L0',
 'P-TPT_std_L0',
 'P-TPT_energy_L0',
 'P-TPT_entropy_L0',
 'P-TPT_skew_L0',
 'P-TPT_kurtosis_L0',
 'P-TPT_mean_L1',
 'P-TPT_std_L1',
 'P-TPT_energy_L1',
 'P-TPT_entropy_L1',
 'P-TPT_skew_L1',
 'P-TPT_kurtosis_L1',
 'T-TPT_mean',
 'T-TPT_std',
 'T-TPT_min',
 'T-TPT_max',
 'T-TPT_mean_L0',
 'T-TPT_std_L0',
 'T-TPT_energy_L0',
 'T-TPT_entropy_L0',
 'T-TPT_skew_L0',
 'T-TPT_kurtosis_L0',
 'T-TPT_mean_L1',
 'T-TPT_std_L1',
 'T-TPT_energy_L1',
 'T-TPT_entropy_L1',
 'T-TPT_skew_L1',
 'T-TPT_kurtosis_L1',
 'P-MON-CKP_mean',
 'P-MON-CKP_std',
 'P-MON-CKP_min',
 'P-MON-CKP_max',
 'P-MON-CKP_mean_L0',
 'P-MON-CKP_std_L0',
 'P-MON-CKP_energ

In [94]:
feature_model = select_low_correlation_columns(df_agg4[feature_modeling], threshold=0.99)
print("Colunas selecionadas:", feature_model)

Colunas selecionadas: ['P-PDG_mean', 'P-PDG_std', 'P-PDG_mean_L0', 'P-PDG_energy_L0', 'P-PDG_skew_L0', 'P-PDG_kurtosis_L0', 'P-PDG_std_L1', 'P-PDG_energy_L1', 'P-PDG_entropy_L1', 'P-PDG_skew_L1', 'P-PDG_kurtosis_L1', 'P-TPT_mean', 'P-TPT_std', 'P-TPT_energy_L0', 'P-TPT_entropy_L0', 'P-TPT_skew_L0', 'P-TPT_kurtosis_L0', 'P-TPT_mean_L1', 'P-TPT_std_L1', 'P-TPT_entropy_L1', 'P-TPT_skew_L1', 'P-TPT_kurtosis_L1', 'T-TPT_mean', 'T-TPT_std', 'T-TPT_energy_L0', 'T-TPT_skew_L0', 'T-TPT_kurtosis_L0', 'T-TPT_mean_L1', 'T-TPT_energy_L1', 'T-TPT_entropy_L1', 'T-TPT_skew_L1', 'T-TPT_kurtosis_L1', 'P-MON-CKP_mean', 'P-MON-CKP_std', 'P-MON-CKP_energy_L0', 'P-MON-CKP_skew_L0', 'P-MON-CKP_kurtosis_L0', 'P-MON-CKP_mean_L1', 'P-MON-CKP_std_L1', 'P-MON-CKP_entropy_L1', 'P-MON-CKP_skew_L1', 'P-MON-CKP_kurtosis_L1', 'T-JUS-CKP_mean', 'T-JUS-CKP_std', 'T-JUS-CKP_energy_L0', 'T-JUS-CKP_skew_L0', 'T-JUS-CKP_kurtosis_L0', 'T-JUS-CKP_mean_L1', 'T-JUS-CKP_std_L1', 'T-JUS-CKP_energy_L1', 'T-JUS-CKP_entropy_L1', 'T-

In [95]:
len(feature_modeling)

96

In [96]:
len(feature_model)

64

In [97]:
len(feature_modeling) - len(feature_model)

32

In [98]:
df_agg5 = df_agg4[['timestamp', 'well', 'target'] + feature_model]
df_agg5.shape

(36733, 67)

### Data split

In [99]:
# Lista para armazenar os dados de treinamento e teste
train = list()
test = list()

data = df_agg5.sort_values(by=['well', 'timestamp'])

# Iterar sobre cada poco
for poco_id, poco_data in data.groupby('well'):
    # Calcular o índice para dividir os dados em treinamento e teste
    split_index = int(0.8 * len(poco_data))
    
    # Dividir os dados do poco em treinamento e teste
    poco_train = poco_data.iloc[:split_index]
    poco_test = poco_data.iloc[split_index:]
    
    # Adicionar os dados de treinamento e teste à lista
    train.append(poco_train)
    test.append(poco_test)

# Concatenar os dados de treinamento e teste
train = pd.concat(train)
test = pd.concat(test)

# Verificar o tamanho dos conjuntos de treinamento e teste
print("Tamanho do conjunto de treinamento:", len(train))
print("Tamanho do conjunto de teste:", len(test))


Tamanho do conjunto de treinamento: 29383
Tamanho do conjunto de teste: 7350


In [100]:
print(f'Treino Poços: {train.well.unique()}')
print(f'Teste Poços: {test.well.unique()}')
print(f'Tamanho do treino: {len(train)}')
print(f'Tamanho do teste: {len(test)}')

Treino Poços: ['WELL-00001' 'WELL-00002' 'WELL-00006' 'WELL-00007' 'WELL-00009'
 'WELL-00010' 'WELL-00014' 'WELL-00015' 'WELL-00016' 'WELL-00017'
 'WELL-00018']
Teste Poços: ['WELL-00001' 'WELL-00002' 'WELL-00006' 'WELL-00007' 'WELL-00009'
 'WELL-00010' 'WELL-00014' 'WELL-00015' 'WELL-00016' 'WELL-00017'
 'WELL-00018']
Tamanho do treino: 29383
Tamanho do teste: 7350


In [101]:
# Verificar a proporção de anomalias
print(f"Proporção de anomalias no conjunto de treino: {train.target.mean():.2%}")
print(f"Proporção de anomalias no conjunto de teste: {test.target.mean():.2%}")

Proporção de anomalias no conjunto de treino: 33.32%
Proporção de anomalias no conjunto de teste: 30.63%


In [102]:
train.head()

Unnamed: 0,timestamp,well,target,P-PDG_mean,P-PDG_std,P-PDG_mean_L0,P-PDG_energy_L0,P-PDG_skew_L0,P-PDG_kurtosis_L0,P-PDG_std_L1,...,P-JUS-CKGL_std,P-JUS-CKGL_energy_L0,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1
0,2014-01-24 09:30:00,WELL-00001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.361752e-11,2.291413e-13,3.080235e-33,4.127087,-7.682201,-3.0
1,2014-01-24 09:35:00,WELL-00001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0
2,2014-01-24 09:40:00,WELL-00001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0
3,2014-01-24 09:45:00,WELL-00001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0
4,2014-01-24 09:50:00,WELL-00001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.358742e-11,1.255045e-12,3.0807950000000003e-33,5.028653,-9.508595,-3.0


In [103]:
train.tail()

Unnamed: 0,timestamp,well,target,P-PDG_mean,P-PDG_std,P-PDG_mean_L0,P-PDG_energy_L0,P-PDG_skew_L0,P-PDG_kurtosis_L0,P-PDG_std_L1,...,P-JUS-CKGL_std,P-JUS-CKGL_energy_L0,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1
1090683,2019-04-03 09:40:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4166.921226,1.0,0.069614,-1.252097,0.002945323,0.6273044,2.228172e-15,4.327048,-0.055067,10.927947
1090684,2019-04-03 09:45:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5949.398514,1.0,2.039079,2.976493,3.597291,47.10788,1.260184e-11,1.685855,6.496515,54.168772
1090685,2019-04-03 09:50:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1767.329397,1.0,6.600541,44.678471,1.828081,35.00233,6.911362e-12,1.344752,9.600895,108.592137
1090686,2019-04-03 09:55:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-6.236535e-11,4.648439e-11,3.391482e-35,4.734714,-11.090896,121.008028
1090687,2019-04-03 10:00:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,864.340021,1.0,0.054335,-1.331963,-0.08691473,2.773937,4.302943e-14,2.616352,-0.341558,3.417835


In [104]:
test.head()

Unnamed: 0,timestamp,well,target,P-PDG_mean,P-PDG_std,P-PDG_mean_L0,P-PDG_energy_L0,P-PDG_skew_L0,P-PDG_kurtosis_L0,P-PDG_std_L1,...,P-JUS-CKGL_std,P-JUS-CKGL_energy_L0,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1
376538,2017-08-23 19:40:00,WELL-00001,0.0,-6887865.0,0.0,-9740912.0,1.0,0.0,0.0,3.998808e-11,...,0.0,1.0,0.0,0.0,-1.18056e-11,3.026399e-13,3.540944e-33,5.030072,10.998622,-3.0
376539,2017-08-23 19:45:00,WELL-00001,0.0,-6887865.0,0.0,-9740912.0,1.0,0.0,0.0,3.998808e-11,...,0.0,1.0,0.0,0.0,-1.18056e-11,3.026399e-13,3.540944e-33,5.030072,10.998622,-3.0
376540,2017-08-23 19:50:00,WELL-00001,0.0,-6887865.0,0.0,-9740912.0,1.0,0.0,0.0,3.998808e-11,...,0.0,1.0,0.0,0.0,-1.18056e-11,3.026399e-13,3.540944e-33,5.030072,10.998622,-3.0
376541,2017-08-23 19:55:00,WELL-00001,0.0,-6887865.0,0.0,-9740912.0,1.0,0.0,0.0,3.998808e-11,...,0.0,1.0,0.0,0.0,-1.18056e-11,3.026399e-13,3.540944e-33,5.030072,10.998622,-3.0
376542,2017-08-23 20:00:00,WELL-00001,0.0,-6887865.0,0.0,-9740912.0,1.0,0.0,0.0,5.143786e-11,...,0.0,1.0,0.0,0.0,-1.169941e-11,9.216281e-13,3.496813e-33,5.03287,7.737584,-3.0


In [105]:
test.tail()

Unnamed: 0,timestamp,well,target,P-PDG_mean,P-PDG_std,P-PDG_mean_L0,P-PDG_energy_L0,P-PDG_skew_L0,P-PDG_kurtosis_L0,P-PDG_std_L1,...,P-JUS-CKGL_std,P-JUS-CKGL_energy_L0,P-JUS-CKGL_skew_L0,P-JUS-CKGL_kurtosis_L0,P-JUS-CKGL_mean_L1,P-JUS-CKGL_std_L1,P-JUS-CKGL_energy_L1,P-JUS-CKGL_entropy_L1,P-JUS-CKGL_skew_L1,P-JUS-CKGL_kurtosis_L1
1090742,2019-04-03 14:35:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6274.887506,1.0,-0.310762,-1.170065,0.044846,1.093957,6.220747e-15,4.329146,2.420422,28.84325
1090743,2019-04-03 14:40:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7625.624412,1.0,0.618382,-1.112516,-0.174267,3.551546,6.597681e-14,3.314469,-4.840377,48.990056
1090744,2019-04-03 14:45:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7433.448938,1.0,-0.478836,-0.833645,-0.091776,1.956083,2.009502e-14,3.870666,-6.508611,63.449771
1090745,2019-04-03 14:50:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4526.52643,1.0,0.867637,0.042685,-0.096906,3.283631,5.687419e-14,3.239146,-4.932309,59.059588
1090746,2019-04-03 14:55:00,WELL-00018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8410.627837,1.0,0.127139,-1.390843,0.015592,1.731552,1.586987e-14,3.937233,1.124709,35.281449


### Feature Selection

In [106]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from boruta import BorutaPy

def selecionar_features_com_boruta(df: pd.DataFrame, target: str, task_type="classification"):
    X = df.drop(columns=[target]).values
    y = df[target].values
    
    if task_type == "classification":
        rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    elif task_type == "regression":
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    else:
        raise ValueError("task_type deve ser 'classification' ou 'regression'")
    
    # Inicializa Boruta
    boruta_selector = BorutaPy(
        estimator=rf,
        n_estimators="auto",  # Deixa o Boruta decidir o número ótimo de árvores
        verbose=0,
        alpha=0.05,
        random_state=42
    )
    
    boruta_selector.fit(X, y)
    
    # Cria DataFrame com resultados
    features = df.drop(columns=[target]).columns
    resultados = pd.DataFrame({
        "feature": features,
        "support": boruta_selector.support_,
        "ranking": boruta_selector.ranking_
    }).sort_values(by="ranking")
    
    selected_features = resultados[resultados["support"]]["feature"].tolist()
    
    return selected_features, resultados


In [107]:

selected, ranking = selecionar_features_com_boruta(train[feature_model + ['target']], "target", task_type="classification")
print("Features selecionadas:", selected)
print(ranking)

Features selecionadas: ['P-PDG_mean', 'P-JUS-CKGL_skew_L1', 'P-MON-CKP_mean', 'P-MON-CKP_std', 'P-MON-CKP_skew_L0', 'P-MON-CKP_kurtosis_L0', 'P-MON-CKP_mean_L1', 'P-MON-CKP_std_L1', 'P-MON-CKP_entropy_L1', 'P-MON-CKP_skew_L1', 'P-MON-CKP_kurtosis_L1', 'T-JUS-CKP_mean', 'T-JUS-CKP_std', 'T-TPT_skew_L1', 'T-JUS-CKP_skew_L0', 'T-JUS-CKP_mean_L1', 'T-JUS-CKP_std_L1', 'T-JUS-CKP_entropy_L1', 'T-JUS-CKP_skew_L1', 'T-JUS-CKP_kurtosis_L1', 'P-JUS-CKGL_mean', 'P-JUS-CKGL_std', 'P-JUS-CKGL_skew_L0', 'P-JUS-CKGL_kurtosis_L0', 'P-JUS-CKGL_mean_L1', 'P-JUS-CKGL_std_L1', 'P-JUS-CKGL_entropy_L1', 'T-JUS-CKP_kurtosis_L0', 'T-TPT_entropy_L1', 'T-TPT_kurtosis_L1', 'T-TPT_mean_L1', 'P-PDG_std', 'P-PDG_mean_L0', 'P-PDG_energy_L0', 'P-PDG_skew_L0', 'P-PDG_kurtosis_L0', 'P-PDG_std_L1', 'P-PDG_entropy_L1', 'P-PDG_skew_L1', 'P-PDG_kurtosis_L1', 'P-TPT_mean', 'P-TPT_std', 'P-TPT_entropy_L0', 'P-TPT_energy_L0', 'P-JUS-CKGL_kurtosis_L1', 'P-TPT_kurtosis_L0', 'T-TPT_kurtosis_L0', 'P-TPT_mean_L1', 'P-TPT_std_L1', 

In [108]:
len(selected)

56

In [109]:
train[selected + ['target', 'timestamp', 'well']].shape

(29383, 59)

In [110]:
train_processed = train[selected + ['target', 'timestamp', 'well']]
train_processed.shape

(29383, 59)

In [111]:
test_processed = test[selected + ['target', 'timestamp', 'well']]
test_processed.shape

(7350, 59)

In [112]:
train_processed.to_parquet('./dataset/train_agg_dwt.parquet.gzip',
              compression='gzip')

test_processed.to_parquet('./dataset/test_agg_dwt.parquet.gzip',
              compression='gzip')