# Logs data filtering for DT prediction model

In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd

sys.path.insert(0, "/notebooks/goryachev/petroflow")

from petroflow import Well, WellDataset
from petroflow.batchflow import Pipeline
from petroflow.models.logs_prediction.utils import build_dataset

#### Filtering pipeline actions, explained.

* `keep_logs(PROPER_COL)` — drop all columns from logs, except `PROPER_COL`
* `drop_nans(PROPER_COL)` — split logs into non-nan segments
* `drop_short_segments(CROP_LENGTH)` — drop segments shorter than `CROP_LENGTH`
* `reindex(REINDEXATION_STEP, attrs='logs')` — reindex all logs to `REINDEXATION_STEP` step
* `interpolate(attrs="logs", limit_direction="both")` — fill nans produced by reindexation, using interpolation
* `norm_mean_std()` — normalize logs data
* `add_depth_log()` — add `DEPTH` column to logs
* `apply(lambda x: x / 1000, src='DEPTH')` — convert `DEPTH` column to kilometers
* `rename_logs({'DEPTH': 'DEPTH KM'})` — rename `DEPTH` column to avoid error on dump

In [2]:
# --- Raw data ---------------------------------------------------------------
RAW_DATASET_PATH = "../data/raw/*"
raw_dataset = WellDataset(path=RAW_DATASET_PATH, dirs=True)

# --- Logs kept for the model: inputs followed by the target -----------------
INPUTS_COL = ['GK', 'NKTD', 'GZ1']
TARGET_COL = ['DT']
PROPER_COL = [*INPUTS_COL, *TARGET_COL]

# --- Segment geometry -------------------------------------------------------
CROP_SIZE = 64
REINDEXATION_STEP = 0.1
# Threshold passed to `drop_short_segments`: CROP_SIZE * REINDEXATION_STEP.
CROP_LENGTH = REINDEXATION_STEP * CROP_SIZE


def _depth_to_km(depth):
    """Divide depth values by 1000 (used to convert `DEPTH` to kilometers)."""
    return depth / 1000


# Lazy description of the filtering steps; nothing runs until a batch is
# requested from the pipeline built below.
filtering_template = (
    Pipeline()
    # Drop every log except the ones listed in PROPER_COL.
    .keep_logs(PROPER_COL)
    # Split logs into non-nan segments.
    .drop_nans(PROPER_COL)
    # Drop segments shorter than CROP_LENGTH.
    .drop_short_segments(CROP_LENGTH)
    # Reindex all logs to the REINDEXATION_STEP step.
    .reindex(REINDEXATION_STEP, attrs='logs')
    # Fill the remaining nans in both directions.
    .interpolate(attrs="logs", limit_direction="both")
    # Normalize logs data.
    .norm_mean_std()
    # Add a `DEPTH` column to logs, then rescale it.
    .add_depth_log()
    .apply(_depth_to_km, src='DEPTH')
    # Rename `DEPTH` to avoid an error on dump.
    .rename_logs({'DEPTH': 'DEPTH KM'})
)

# Bind the template to the raw dataset and request a single batch whose size
# equals the dataset size.
filtering_pipeline = raw_dataset >> filtering_template
filtered_batch = filtering_pipeline.next_batch(raw_dataset.size)
filtered_dataset = build_dataset(filtered_batch)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



#### Save filtered dataset.

In [3]:
FILTERED_DATASET_PATH = "../data/filtered/"

# Ensure the output directory exists so the cell also runs on a fresh
# checkout. This is a no-op when the directory is already present, and
# harmless if `dump` creates directories on its own (not visible from here).
os.makedirs(FILTERED_DATASET_PATH, exist_ok=True)

# Write every filtered well under FILTERED_DATASET_PATH.
for well in filtered_dataset.wells:
    well.dump(FILTERED_DATASET_PATH)