# Instâncias reais

* **timestamp**: observations timestamps loaded into pandas DataFrame as its index;
* **P-PDG**: pressure variable at the Permanent Downhole Gauge (PDG);
* **P-TPT**: pressure variable at the Temperature and Pressure Transducer (TPT);
* **T-TPT**: temperature variable at the Temperature and Pressure Transducer (TPT);
* **P-MON-CKP**: pressure variable upstream of the production choke (CKP);
* **T-JUS-CKP**: temperature variable downstream of the production choke (CKP);
* **P-JUS-CKGL**: pressure variable downstream of the gas lift choke (CKGL);
* **T-JUS-CKGL**: temperature variable downstream of the gas lift choke (CKGL);
* **QGL**: gas lift flow rate;
* **class**: observations labels associated with three types of periods (normal, fault transient, and faulty steady state).


* **label**: instance label (event type);
* **well**: well name. Hand-drawn and simulated instances have fixed names. Real instances have names masked with incremental id;
* **id**: instance identifier. Hand-drawn and simulated instances have incremental id. Each real instance has an id generated from its first timestamp.

Data source: the 3W dataset published by Petrobras — https://github.com/petrobras/3W

In [1]:
import sys
import os
import pandas as pd
import numpy as np

import joblib
import pickle

from pyod.models.ecod import ECOD

from pyod.models.suod import SUOD

from pyod.models.xgbod import XGBOD

from pyod.models.loda import LODA

from pyod.models.pca import PCA



import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDOneClassSVM
from sklearn.pipeline import make_pipeline
from sklearn.svm import OneClassSVM

from sklearn.model_selection import GroupKFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics  import average_precision_score, make_scorer, roc_curve,f1_score, precision_score, recall_score, fbeta_score, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report,precision_recall_curve
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit

### Function to calculate metrics

In [2]:
def metrics(y_test, y_pred):
    """Plot the confusion matrix and print summary metrics for hard predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels (not scores), same length as ``y_test``.

    Returns
    -------
    None
        Results are printed and drawn on a new matplotlib figure.

    Notes
    -----
    The specificity computation reads the confusion matrix as a 2x2 binary
    matrix with the negative class in row/column 0, so it raises ``IndexError``
    when only one class is present in the inputs.
    """
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm)

    plt.figure(figsize=(8, 6))
    # fmt="d": the cells are integer counts; seaborn's default '.2g' format
    # would render large counts in scientific notation (e.g. 2.8e+04).
    sns.heatmap(cm_df, annot=True, fmt="d")
    print("Classification Report: \n", classification_report(y_test, y_pred, digits=5))

    # Row 0 of the matrix holds the samples whose true label is the first class.
    TN = cm[0][0]
    FP = cm[0][1]

    print("Specificity:", TN/(TN+FP))

    # Computed from hard labels, so this is the AUC of a single operating point.
    roc_auc = roc_auc_score(y_test, y_pred, multi_class = 'ovr', average=None)
    gini = 2*roc_auc -1
    print("Gini: ",gini)
    print("ROC AUC:: ",roc_auc)

## Read data

In [3]:
# Load the raw observations from the local gzip-compressed parquet file.
df = pd.read_parquet('./dataset/real_instances.parquet.gzip') 

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(14516197, 12)

In [5]:
# Preview the first observations to check the index and the available columns.
df.head()

Unnamed: 0_level_0,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-02-01 02:02:07,0,WELL-00001,20170201020207,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0.0
2017-02-01 02:02:08,0,WELL-00001,20170201020207,0.0,10092000.0,119.0944,1618206.0,84.58997,1564148.0,,0.0,0.0
2017-02-01 02:02:09,0,WELL-00001,20170201020207,0.0,10091890.0,119.0944,1626612.0,84.58213,1564148.0,,0.0,0.0
2017-02-01 02:02:10,0,WELL-00001,20170201020207,0.0,10091780.0,119.0944,1635018.0,84.57429,1564148.0,,0.0,0.0
2017-02-01 02:02:11,0,WELL-00001,20170201020207,0.0,10091670.0,119.0944,1643424.0,84.56644,1564148.0,,0.0,0.0


In [6]:
# Count the distinct instance identifiers present in the data.
df['id'].nunique()

1013

In [7]:
# Number of observations per instance-level label.
df['label'].value_counts()

label
0    9903155
4    2462076
3     569152
5     552529
7     466338
1     312136
2     194233
6      56578
Name: count, dtype: int64

In [8]:
# Number of observations per observation-level class (missing classes are not counted).
df['class'].value_counts()

class
0.0      10003293
4.0       2462076
3.0        569152
105.0      317565
107.0      283262
101.0       95658
102.0       65130
7.0         25870
2.0         16100
5.0         13031
6.0         12951
1.0         10417
106.0        6252
Name: count, dtype: int64

In [9]:
# Observation counts per well (rows) and instance label (columns).
pd.crosstab(df['well'], df['label'])

label,0,1,2,3,4,5,6,7
well,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WELL-00001,1652442,58468,0,17976,261457,0,0,73033
WELL-00002,3641159,24017,9121,0,807601,0,47869,0
WELL-00003,463543,0,35406,0,0,0,0,0
WELL-00004,85505,0,0,0,307674,0,8709,0
WELL-00005,979611,0,0,0,271525,0,0,0
WELL-00006,2058403,229651,0,0,0,0,0,305517
WELL-00007,14370,0,0,0,71651,0,0,0
WELL-00008,1008122,0,0,0,0,0,0,0
WELL-00009,0,0,6738,0,0,0,0,0
WELL-00010,0,0,9809,0,592220,0,0,0


In [10]:
# Observation counts per observation-level class (rows) and instance label (columns).
pd.crosstab(df['class'], df['label'])

label,0,1,2,3,4,5,6,7
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,9439612,195376,52017,0,0,127930,36319,152039
1.0,0,10417,0,0,0,0,0,0
2.0,0,0,16100,0,0,0,0,0
3.0,0,0,0,569152,0,0,0,0
4.0,0,0,0,0,2462076,0,0,0
5.0,0,0,0,0,0,13031,0,0
6.0,0,0,0,0,0,0,12951,0
7.0,0,0,0,0,0,0,0,25870
101.0,0,95658,0,0,0,0,0,0
102.0,0,0,65130,0,0,0,0,0


In [11]:
# Instances per label: keep a single row per instance id, then count ids within each label.
df.drop_duplicates(subset=['id'], keep='last').groupby("label")["id"].count()

label
0    588
1      5
2     22
3     32
4    344
5     11
6      6
7      5
Name: id, dtype: int64

In [12]:
# Fraction of missing values in every column of the full dataset.
df.isna().mean()

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000580
P-TPT         0.008007
T-TPT         0.008007
P-MON-CKP     0.077959
T-JUS-CKP     0.113279
P-JUS-CKGL    0.071501
T-JUS-CKGL    1.000000
QGL           0.191125
class         0.043775
dtype: float64

In [13]:
# Fraction of missing values per column, restricted to WELL-00005.
df.loc[df['well'] == 'WELL-00005'].isna().mean()

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000000
P-TPT         0.000000
T-TPT         0.000000
P-MON-CKP     0.000026
T-JUS-CKP     0.000000
P-JUS-CKGL    0.000018
T-JUS-CKGL    1.000000
QGL           1.000000
class         0.000000
dtype: float64

In [14]:
# Fraction of missing values per column, restricted to WELL-00003.
df.loc[df['well'] == 'WELL-00003'].isna().mean()

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.006151
P-TPT         0.006151
T-TPT         0.006177
P-MON-CKP     0.000884
T-JUS-CKP     1.000000
P-JUS-CKGL    1.000000
T-JUS-CKGL    1.000000
QGL           0.001309
class         1.000000
dtype: float64

In [15]:
# Preview the first observations recorded for WELL-00005.
df[df['well']=='WELL-00005'].head()

Unnamed: 0_level_0,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-03-31 06:00:14,0,WELL-00005,20170331060014,0.0,20792900.0,106.3653,11509150.0,67.44353,1325708000.0,,,0.0
2017-03-31 06:00:15,0,WELL-00005,20170331060014,0.0,20793320.0,106.3653,11509150.0,67.4435,1325647000.0,,,0.0
2017-03-31 06:00:16,0,WELL-00005,20170331060014,0.0,20793730.0,106.3653,11509150.0,67.44349,1325586000.0,,,0.0
2017-03-31 06:00:17,0,WELL-00005,20170331060014,0.0,20794150.0,106.3653,11509150.0,67.44347,1325526000.0,,,0.0
2017-03-31 06:00:18,0,WELL-00005,20170331060014,0.0,20794570.0,106.3653,11509150.0,67.44345,1325465000.0,,,0.0


In [16]:
# Summary statistics of the QGL column for WELL-00005 only.
df.loc[df['well'] == 'WELL-00005', 'QGL'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: QGL, dtype: float64

In [17]:
# Fraction of missing values per column, restricted to WELL-00004.
df.loc[df['well'] == 'WELL-00004'].isna().mean()

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000848
P-TPT         0.001617
T-TPT         0.001590
P-MON-CKP     0.000000
T-JUS-CKP     0.001660
P-JUS-CKGL    1.000000
T-JUS-CKGL    1.000000
QGL           1.000000
class         0.001426
dtype: float64

In [18]:
# Fraction of missing values per column, restricted to WELL-00008.
df.loc[df['well'] == 'WELL-00008'].isna().mean()

label         0.000000
well          0.000000
id            0.000000
P-PDG         0.000000
P-TPT         0.000000
T-TPT         0.000000
P-MON-CKP     1.000000
T-JUS-CKP     1.000000
P-JUS-CKGL    0.000011
T-JUS-CKGL    1.000000
QGL           1.000000
class         0.000000
dtype: float64

In [19]:
# Turn the index into a regular column so it can be set again per well before resampling.
# NOTE(review): this cell overwrites `df`; presumably running it a second time adds an
# extra 'index' column — confirm before re-running out of order.
df = df.reset_index()
df.head()

Unnamed: 0,timestamp,label,well,id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-02-01 02:02:07,0,WELL-00001,20170201020207,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0.0
1,2017-02-01 02:02:08,0,WELL-00001,20170201020207,0.0,10092000.0,119.0944,1618206.0,84.58997,1564148.0,,0.0,0.0
2,2017-02-01 02:02:09,0,WELL-00001,20170201020207,0.0,10091890.0,119.0944,1626612.0,84.58213,1564148.0,,0.0,0.0
3,2017-02-01 02:02:10,0,WELL-00001,20170201020207,0.0,10091780.0,119.0944,1635018.0,84.57429,1564148.0,,0.0,0.0
4,2017-02-01 02:02:11,0,WELL-00001,20170201020207,0.0,10091670.0,119.0944,1643424.0,84.56644,1564148.0,,0.0,0.0


In [20]:
# Make sure 'timestamp' holds datetimes, since it is later used as the index for time-based resampling.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [21]:
# Binary target: 0 only when both the instance label and the observation class are 0,
# 1 for every other row. Rows whose class is missing compare unequal to 0, and rows
# with class 0 inside an instance whose label is not 0 fail the label test, so both
# kinds of rows receive target 1.
# NOTE(review): confirm that flagging those two kinds of rows as anomalous is intended.
df['target'] = (~(df['label'].eq(0) & df['class'].eq(0))).astype('int64')

In [22]:
# Resampling window as a pandas offset alias, e.g. '5min', '10min'.
# 'min' is used instead of the deprecated 'T' alias for minutes.
window = '5min'

def aggregate_by_well(data, window):
    """Resample sensor readings into fixed time windows with summary statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by a DatetimeIndex that contains the six sensor columns
        listed below plus a ``target`` column.
    window : str
        pandas offset alias giving the window length (e.g. ``'5min'``).

    Returns
    -------
    pandas.DataFrame
        One row per window. Columns are a two-level index of
        (column, statistic): mean/std/min/max for each sensor column and
        ``('target', 'max')``.
    """
    sensor_columns = ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL']
    sensor_stats = ['mean', 'std', 'min', 'max']

    agg_spec = {column: list(sensor_stats) for column in sensor_columns}
    # A window's target is the maximum target of the observations it contains.
    agg_spec['target'] = 'max'

    return data.resample(window).agg(agg_spec)

# Resample each well on its own timeline: within every well group the timestamp
# becomes the index and the windowed statistics are computed.
df_agg = (
    df.groupby('well')
    .apply(lambda well_rows: aggregate_by_well(well_rows.set_index('timestamp'), window))
    .reset_index()
)

# Flatten the two-level (column, statistic) header into 'column_statistic' names.
df_agg.columns = ['_'.join(levels).strip('_') for levels in df_agg.columns]

# Keep only windows where every statistic is available, then give the target its plain name.
df_agg = df_agg.dropna().rename(columns={"target_max": "target"})

In [23]:
# Fraction of missing values left in each aggregated column.
df_agg.isna().mean()

well               0.0
timestamp          0.0
P-PDG_mean         0.0
P-PDG_std          0.0
P-PDG_min          0.0
P-PDG_max          0.0
P-TPT_mean         0.0
P-TPT_std          0.0
P-TPT_min          0.0
P-TPT_max          0.0
T-TPT_mean         0.0
T-TPT_std          0.0
T-TPT_min          0.0
T-TPT_max          0.0
P-MON-CKP_mean     0.0
P-MON-CKP_std      0.0
P-MON-CKP_min      0.0
P-MON-CKP_max      0.0
T-JUS-CKP_mean     0.0
T-JUS-CKP_std      0.0
T-JUS-CKP_min      0.0
T-JUS-CKP_max      0.0
P-JUS-CKGL_mean    0.0
P-JUS-CKGL_std     0.0
P-JUS-CKGL_min     0.0
P-JUS-CKGL_max     0.0
target             0.0
dtype: float64

In [24]:
# Preview the first aggregated windows of WELL-00005.
df_agg[df_agg['well']=='WELL-00005'].head()

Unnamed: 0,well,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-TPT_mean,P-TPT_std,P-TPT_min,P-TPT_max,...,P-MON-CKP_max,T-JUS-CKP_mean,T-JUS-CKP_std,T-JUS-CKP_min,T-JUS-CKP_max,P-JUS-CKGL_mean,P-JUS-CKGL_std,P-JUS-CKGL_min,P-JUS-CKGL_max,target
1453544,WELL-00005,2017-03-31 06:00:00,0.0,0.0,0.0,0.0,20792220.0,2278.632459,20787670.0,20796870.0,...,11513320.0,67.440932,0.001504,67.43834,67.44353,1324780000.0,886986.8,1323465000.0,1326799000.0,0.0
1453545,WELL-00005,2017-03-31 06:05:00,0.0,0.0,0.0,0.0,20792330.0,1925.677344,20789970.0,20796870.0,...,11513320.0,67.435602,0.001578,67.43288,67.43832,1324890000.0,1031168.0,1322932000.0,1327199000.0,0.0
1453546,WELL-00005,2017-03-31 06:10:00,0.0,0.0,0.0,0.0,20792690.0,1925.482973,20789970.0,20796870.0,...,11514980.0,67.430145,0.001578,67.42743,67.43286,1324700000.0,654480.7,1323465000.0,1326532000.0,0.0
1453547,WELL-00005,2017-03-31 06:15:00,0.0,0.0,0.0,0.0,20792810.0,2114.330003,20789970.0,20796870.0,...,11514680.0,67.424688,0.001578,67.42197,67.42741,1324849000.0,529733.0,1323599000.0,1326265000.0,0.0
1453548,WELL-00005,2017-03-31 06:20:00,0.0,0.0,0.0,0.0,20792700.0,1798.732333,20789970.0,20794570.0,...,11516650.0,67.419231,0.001578,67.41651,67.42195,1324269000.0,1227978.0,1321865000.0,1326932000.0,0.0


In [25]:
# Number of aggregated windows available per well.
df_agg['well'].value_counts()

well
WELL-00002    15171
WELL-00006     8237
WELL-00001     6919
WELL-00005     4206
WELL-00014     2351
WELL-00010     2025
WELL-00017     1277
WELL-00018      294
WELL-00007      288
WELL-00016      203
WELL-00015      184
WELL-00009       24
Name: count, dtype: int64

In [26]:
# Number of aggregated windows per target value.
df_agg.target.value_counts()

target
0.0    27987
1.0    13192
Name: count, dtype: int64

In [27]:
# Same distribution expressed as proportions.
df_agg.target.value_counts(normalize=True)

target
0.0    0.679643
1.0    0.320357
Name: proportion, dtype: float64

In [28]:
# Column dtypes of the aggregated frame.
df_agg.dtypes

well                       object
timestamp          datetime64[ns]
P-PDG_mean                float64
P-PDG_std                 float64
P-PDG_min                 float64
P-PDG_max                 float64
P-TPT_mean                float64
P-TPT_std                 float64
P-TPT_min                 float64
P-TPT_max                 float64
T-TPT_mean                float64
T-TPT_std                 float64
T-TPT_min                 float64
T-TPT_max                 float64
P-MON-CKP_mean            float64
P-MON-CKP_std             float64
P-MON-CKP_min             float64
P-MON-CKP_max             float64
T-JUS-CKP_mean            float64
T-JUS-CKP_std             float64
T-JUS-CKP_min             float64
T-JUS-CKP_max             float64
P-JUS-CKGL_mean           float64
P-JUS-CKGL_std            float64
P-JUS-CKGL_min            float64
P-JUS-CKGL_max            float64
target                    float64
dtype: object

In [29]:
# Wells that still have at least one aggregated window.
df_agg['well'].unique()

array(['WELL-00001', 'WELL-00002', 'WELL-00005', 'WELL-00006',
       'WELL-00007', 'WELL-00009', 'WELL-00010', 'WELL-00014',
       'WELL-00015', 'WELL-00016', 'WELL-00017', 'WELL-00018'],
      dtype=object)

In [30]:
# valid = df_agg[df_agg['well'].isin(['WELL-00005'])]
# data= df_agg[~(df_agg['well'].isin(['WELL-00005']))]

In [31]:
# valid.shape

In [32]:
# data.shape

In [33]:
# Fraction of each well's windows (earliest first) that goes to the training set.
TRAIN_FRACTION = 0.8


def split_by_well(frame, train_fraction=TRAIN_FRACTION):
    """Split a frame into train/test parts, well by well, preserving row order.

    For every well, the first ``int(train_fraction * n)`` rows go to the
    training part and the remaining rows go to the test part, so ``frame``
    should already be sorted the way the split is meant to follow.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a ``well`` column.
    train_fraction : float, default TRAIN_FRACTION
        Share of each well's rows assigned to the training part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with the per-well parts concatenated.
    """
    train_parts = []
    test_parts = []
    for _, well_rows in frame.groupby('well'):
        cut = int(train_fraction * len(well_rows))
        train_parts.append(well_rows.iloc[:cut])
        test_parts.append(well_rows.iloc[cut:])
    return pd.concat(train_parts), pd.concat(test_parts)


# Order rows by well and time so that each well's split is chronological.
data = df_agg.sort_values(by=['well', 'timestamp'])
train, test = split_by_well(data)

# Every window must land in exactly one of the two sets.
assert len(train) + len(test) == len(data)

# Report the size of the training and test sets.
print("Tamanho do conjunto de treinamento:", len(train))
print("Tamanho do conjunto de teste:", len(test))


Tamanho do conjunto de treinamento: 32938
Tamanho do conjunto de teste: 8241


In [34]:
# List the wells present in each split, followed by the size of each split.
print(f'Treino Poços: {train.well.unique()}')
print(f'Teste Poços: {test.well.unique()}')
print(f'Tamanho do treino: {len(train)}')
print(f'Tamanho do teste: {len(test)}')

Treino Poços: ['WELL-00001' 'WELL-00002' 'WELL-00005' 'WELL-00006' 'WELL-00007'
 'WELL-00009' 'WELL-00010' 'WELL-00014' 'WELL-00015' 'WELL-00016'
 'WELL-00017' 'WELL-00018']
Teste Poços: ['WELL-00001' 'WELL-00002' 'WELL-00005' 'WELL-00006' 'WELL-00007'
 'WELL-00009' 'WELL-00010' 'WELL-00014' 'WELL-00015' 'WELL-00016'
 'WELL-00017' 'WELL-00018']
Tamanho do treino: 32938
Tamanho do teste: 8241


In [35]:
# Check the proportion of anomalies (mean of the target column) in each split
print(f"Proporção de anomalias no conjunto de treino: {train.target.mean():.2%}")
print(f"Proporção de anomalias no conjunto de teste: {test.target.mean():.2%}")

Proporção de anomalias no conjunto de treino: 33.05%
Proporção de anomalias no conjunto de teste: 27.97%


In [36]:
# First rows of the training set.
train.head()

Unnamed: 0,well,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-TPT_mean,P-TPT_std,P-TPT_min,P-TPT_max,...,P-MON-CKP_max,T-JUS-CKP_mean,T-JUS-CKP_std,T-JUS-CKP_min,T-JUS-CKP_max,P-JUS-CKGL_mean,P-JUS-CKGL_std,P-JUS-CKGL_min,P-JUS-CKGL_max,target
0,WELL-00001,2014-01-24 09:30:00,0.0,0.0,0.0,0.0,18433410.0,0.0,18433410.0,18433410.0,...,9397099.0,74.800278,1.9e-05,74.80025,74.80031,-300917.8,0.0,-300917.8,-300917.8,1.0
1,WELL-00001,2014-01-24 09:35:00,0.0,0.0,0.0,0.0,18433410.0,0.0,18433410.0,18433410.0,...,9397272.0,74.800168,4.6e-05,74.80009,74.80025,-300917.8,0.0,-300917.8,-300917.8,1.0
2,WELL-00001,2014-01-24 09:40:00,0.0,0.0,0.0,0.0,18433410.0,0.0,18433410.0,18433410.0,...,9397446.0,74.80001,4.7e-05,74.79993,74.80009,-300917.8,0.0,-300917.8,-300917.8,1.0
3,WELL-00001,2014-01-24 09:45:00,0.0,0.0,0.0,0.0,18433410.0,0.0,18433410.0,18433410.0,...,9397621.0,74.799852,4.7e-05,74.79977,74.79993,-300917.8,0.0,-300917.8,-300917.8,1.0
4,WELL-00001,2014-01-24 09:50:00,0.0,0.0,0.0,0.0,18433410.0,0.0,18433410.0,18433410.0,...,9397795.0,74.799693,4.7e-05,74.79961,74.79977,-300917.8,0.0,-300917.8,-300917.8,1.0


In [37]:
# Last rows of the training set.
train.tail()

Unnamed: 0,well,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-TPT_mean,P-TPT_std,P-TPT_min,P-TPT_max,...,P-MON-CKP_max,T-JUS-CKP_mean,T-JUS-CKP_std,T-JUS-CKP_min,T-JUS-CKP_max,P-JUS-CKGL_mean,P-JUS-CKGL_std,P-JUS-CKGL_min,P-JUS-CKGL_max,target
1885685,WELL-00018,2019-04-03 09:40:00,0.0,0.0,0.0,0.0,8431093.0,712.726134,8429578.0,8432542.0,...,1483602.0,74.213973,0.110205,74.0,74.36111,9397205.0,4166.921226,9390495.0,9404494.0,1.0
1885686,WELL-00018,2019-04-03 09:45:00,0.0,0.0,0.0,0.0,8430980.0,678.090498,8429752.0,8432400.0,...,1457574.0,74.0,0.0,74.0,74.0,9410833.0,5949.398514,9404543.0,9427083.0,1.0
1885687,WELL-00018,2019-04-03 09:50:00,0.0,0.0,0.0,0.0,8430743.0,911.513084,8429560.0,8433425.0,...,1457574.0,74.0,0.0,74.0,74.0,9427372.0,1767.329397,9427083.0,9442866.0,1.0
1885688,WELL-00018,2019-04-03 09:55:00,0.0,0.0,0.0,0.0,8430795.0,485.347977,8429913.0,8432119.0,...,1474926.0,74.0,0.0,74.0,74.0,9444548.0,340.325177,9444444.0,9446233.0,1.0
1885689,WELL-00018,2019-04-03 10:00:00,0.0,0.0,0.0,0.0,8430998.0,533.796551,8430143.0,8432209.0,...,1466250.0,74.043331,0.045737,74.0,74.13541,9454149.0,4561.802382,9446285.0,9461890.0,1.0


In [38]:
# First rows of the test set.
test.head()

Unnamed: 0,well,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-TPT_mean,P-TPT_std,P-TPT_min,P-TPT_max,...,P-MON-CKP_max,T-JUS-CKP_mean,T-JUS-CKP_std,T-JUS-CKP_min,T-JUS-CKP_max,P-JUS-CKGL_mean,P-JUS-CKGL_std,P-JUS-CKGL_min,P-JUS-CKGL_max,target
376538,WELL-00001,2017-08-23 19:40:00,-6887865.0,0.0,-6887865.0,-6887865.0,9686840.0,6237.435951,9675544.0,9695493.0,...,1953795.0,84.367123,0.230657,83.58832,84.70229,-140331.8,0.0,-140331.8,-140331.8,0.0
376539,WELL-00001,2017-08-23 19:45:00,-6887865.0,0.0,-6887865.0,-6887865.0,9691381.0,7063.428971,9679208.0,9703555.0,...,1950179.0,83.149974,0.159885,82.87943,83.56806,-140331.8,0.0,-140331.8,-140331.8,0.0
376540,WELL-00001,2017-08-23 19:50:00,-6887865.0,0.0,-6887865.0,-6887865.0,9710671.0,3047.495846,9703636.0,9715443.0,...,1798996.0,83.693227,0.168539,83.40104,83.95795,-140331.8,0.0,-140331.8,-140331.8,0.0
376541,WELL-00001,2017-08-23 19:55:00,-6887865.0,0.0,-6887865.0,-6887865.0,9700290.0,5400.73636,9683720.0,9708045.0,...,1835164.0,83.880155,0.040774,83.80988,83.95042,-140331.8,0.0,-140331.8,-140331.8,0.0
376542,WELL-00001,2017-08-23 20:00:00,-6887865.0,0.0,-6887865.0,-6887865.0,9687219.0,5027.774671,9675544.0,9695493.0,...,1937881.0,83.739377,0.040907,83.66887,83.80942,-140331.8,0.0,-140331.8,-140331.8,0.0


In [39]:
# Last rows of the test set.
test.tail()

Unnamed: 0,well,timestamp,P-PDG_mean,P-PDG_std,P-PDG_min,P-PDG_max,P-TPT_mean,P-TPT_std,P-TPT_min,P-TPT_max,...,P-MON-CKP_max,T-JUS-CKP_mean,T-JUS-CKP_std,T-JUS-CKP_min,T-JUS-CKP_max,P-JUS-CKGL_mean,P-JUS-CKGL_std,P-JUS-CKGL_min,P-JUS-CKGL_max,target
1885744,WELL-00018,2019-04-03 14:35:00,0.0,0.0,0.0,0.0,8488497.0,584.765582,8487240.0,8489451.0,...,1556954.0,73.778307,0.055563,73.68403,73.88296,9815721.0,6274.887506,9803767.0,9825073.0,1.0
1885745,WELL-00018,2019-04-03 14:40:00,0.0,0.0,0.0,0.0,8487855.0,421.387007,8486641.0,8488494.0,...,1561687.0,73.974298,0.044264,73.88385,74.04514,9788532.0,7625.624412,9778909.0,9803679.0,1.0
1885746,WELL-00018,2019-04-03 14:45:00,0.0,0.0,0.0,0.0,8490193.0,1484.580661,8486886.0,8492350.0,...,1561687.0,73.759228,0.105725,73.58911,73.95679,9767823.0,7433.448938,9751978.0,9778843.0,1.0
1885747,WELL-00018,2019-04-03 14:50:00,0.0,0.0,0.0,0.0,8490488.0,500.503155,8489481.0,8491817.0,...,1579039.0,73.591766,0.056312,73.50347,73.68403,9740071.0,4526.52643,9733150.0,9751838.0,1.0
1885748,WELL-00018,2019-04-03 14:55:00,0.0,0.0,0.0,0.0,8490426.0,481.073006,8489338.0,8491435.0,...,1526983.0,73.489479,0.1113,73.32291,73.65018,9719440.0,8410.627837,9706768.0,9733099.0,1.0


In [40]:
# Persist both splits as gzip-compressed parquet files under ./dataset.
for split_frame, split_path in [
    (train, './dataset/train_agg_full.parquet.gzip'),
    (test, './dataset/test_agg_full.parquet.gzip'),
]:
    split_frame.to_parquet(split_path, compression='gzip')

# valid.to_parquet('./dataset/valid_agg.parquet.gzip',
#               compression='gzip')