In [1]:
import os
import json
from shutil import copytree

import numpy as np
import pandas as pd
import lasio

In [2]:
# NOTE(review): the two paths below look like placeholders -- set them before running.
# Root of the raw dataset; process_well() reads per-well files from "<RAW_DATASET_PATH>/<well name>/".
RAW_DATASET_PATH = "/path/to/raw/dataset/"
# Root of the converted dataset; passed to process_well() as `dump_path`.
PROCESSED_DATASET_PATH = "/path/to/processed/dataset/"
# Value stored under the "field" key of every well's meta.json.
FIELD = "Field"
# Names of the wells to convert; each one is passed to process_well().
WELL_LIST = ["well_1", "well_2"]

In [3]:
# Workbook shared by all wells. process_well() reads this global, keeps the rows whose
# "Well" column matches the well being processed, and derives core_properties and
# boring_intervals from them.
fes = pd.read_excel(os.path.join(RAW_DATASET_PATH, "FES_.xlsx"))

In [4]:
def process_well(well_name, dump_path):
    """Convert the raw data of one well into the processed dataset layout.

    Inputs are read from ``<RAW_DATASET_PATH>/<well_name>/``, from
    ``<RAW_DATASET_PATH>/Привязка/<well_name>.xls`` and from the rows of the
    module-level ``fes`` table whose ``"Well"`` column equals ``well_name``.
    Everything is written to ``<dump_path>/<well_name>/`` (created if missing):

    * ``logs.feather`` -- curves from ``logs.las`` with ``DEPTH`` as the first column;
    * ``meta.json`` -- well name, ``FIELD`` and the ``STRT``/``STOP`` LAS header values;
    * ``layers.feather`` -- ``DEPTH_FROM``, ``DEPTH_TO``, ``LAYER`` from ``layers.csv``;
    * ``inclination.feather`` -- ``inclination.csv`` as is;
    * ``core_properties.feather`` -- ``DEPTH``, ``POROSITY``, ``PERMEABILITY`` from ``fes``;
    * ``boring_intervals.feather`` -- ``DEPTH_FROM``, ``DEPTH_TO``, ``CORE_RECOVERY`` from ``fes``;
    * ``core_logs.feather`` -- ``DEPTH``, ``GK``, ``DENSITY`` from the ``.xls`` workbook;
    * ``samples.feather`` -- ``DEPTH_FROM``, ``DEPTH_TO``, ``SAMPLE``, ``QC`` from ``samples.csv``;
    * ``samples_dl/`` and ``samples_uv/`` -- copied recursively when present in the raw well directory.

    Depth columns are rounded to 2 decimals and every table is sorted by its
    (first) depth column before being written.

    Parameters
    ----------
    well_name : str
        Name of the well; used as the sub-directory name on both the raw and
        the processed side and as the key into ``fes["Well"]``.
    dump_path : str
        Root directory of the processed dataset.

    Returns
    -------
    None
        The function only writes files.
    """
    raw_well_path = os.path.join(RAW_DATASET_PATH, well_name)
    dump_path = os.path.join(dump_path, well_name)
    # exist_ok replaces the racy "check, then create" pair and keeps re-runs working.
    os.makedirs(dump_path, exist_ok=True)

    las = lasio.read(os.path.join(raw_well_path, "logs.las"))
    # The set_index/reset_index round trip moves DEPTH to the first column and
    # leaves a default RangeIndex before writing to feather.
    logs = las.df().reset_index().set_index("DEPTH")
    logs.reset_index().to_feather(os.path.join(dump_path, "logs.feather"))

    meta = {
        "name": well_name,
        "field": FIELD,
        "depth_from": las.header["Well"]["STRT"].value,
        "depth_to": las.header["Well"]["STOP"].value,
    }
    with open(os.path.join(dump_path, "meta.json"), "w") as f:
        json.dump(meta, f)

    layers = pd.read_csv(os.path.join(raw_well_path, "layers.csv"), sep=";", encoding="cp1251")
    layers = (layers[["DEPTH_FROM", "DEPTH_TO", "LAYER"]]
              .round({"DEPTH_FROM": 2, "DEPTH_TO": 2})
              .drop_duplicates()
              .sort_values(by=["DEPTH_FROM"])
              .set_index(["DEPTH_FROM", "DEPTH_TO"]))
    layers.reset_index().to_feather(os.path.join(dump_path, "layers.feather"))

    inclination = pd.read_csv(os.path.join(raw_well_path, "inclination.csv"), sep=";")
    inclination.reset_index(drop=True).to_feather(os.path.join(dump_path, "inclination.feather"))

    # Rows of the shared workbook for this well, excluding those that came from sheet "Лист1".
    well_fes = fes[(fes["Well"] == well_name) & (fes["Sheet"] != "Лист1")]

    # Columns are renamed through an explicit mapping (instead of assigning to
    # `.columns` on a slice of `fes`) so that each new name is tied to its source
    # column and the slice is never mutated in place.
    core_properties = (well_fes[["Глубина, м", "Пористость, %", "Прониц_парал.txt"]]
                       .rename(columns={"Глубина, м": "DEPTH",
                                        "Пористость, %": "POROSITY",
                                        "Прониц_парал.txt": "PERMEABILITY"})
                       .round({"DEPTH": 2})
                       .sort_values(by=["DEPTH"])
                       .set_index("DEPTH"))
    core_properties.reset_index().to_feather(os.path.join(dump_path, "core_properties.feather"))

    boring_intervals = (well_fes[["2_Верх_интервала.txt", "3_Низ_интервала.txt", "5_Вынос.txt"]]
                        .rename(columns={"2_Верх_интервала.txt": "DEPTH_FROM",
                                         "3_Низ_интервала.txt": "DEPTH_TO",
                                         "5_Вынос.txt": "CORE_RECOVERY"})
                        .round({"DEPTH_FROM": 2, "DEPTH_TO": 2})
                        .drop_duplicates()
                        .sort_values(by=["DEPTH_FROM"])
                        .set_index(["DEPTH_FROM", "DEPTH_TO"]))
    boring_intervals.reset_index().to_feather(os.path.join(dump_path, "boring_intervals.feather"))

    # header=2 takes column names from the third row of the sheet; [2:] then drops
    # the first two data rows.
    # NOTE(review): presumably those two rows hold units / sub-headers -- confirm
    # against the workbook, and check that the remaining columns are numeric.
    core_logs = pd.read_excel(os.path.join(RAW_DATASET_PATH, "Привязка", well_name + ".xls"), header=2)[2:]
    core_logs = (core_logs[["Глубина до привязки, м", "Общая радиоактивность, API", "Объемная плотность, г/см3"]]
                 .rename(columns={"Глубина до привязки, м": "DEPTH",
                                  "Общая радиоактивность, API": "GK",
                                  "Объемная плотность, г/см3": "DENSITY"})
                 .round({"DEPTH": 2})
                 .sort_values(by=["DEPTH"])
                 .set_index("DEPTH"))
    core_logs.reset_index().to_feather(os.path.join(dump_path, "core_logs.feather"))

    samples = pd.read_csv(os.path.join(raw_well_path, "samples.csv"), sep=";", encoding="cp1251")
    samples = (samples[["DEPTH_FROM", "DEPTH_TO", "SAMPLE", "PHOTO QC"]]
               .rename(columns={"PHOTO QC": "QC"})
               .round({"DEPTH_FROM": 2, "DEPTH_TO": 2})
               .sort_values(by=["DEPTH_FROM"])
               .set_index(["DEPTH_FROM", "DEPTH_TO"]))
    samples.reset_index().to_feather(os.path.join(dump_path, "samples.feather"))

    # Optional per-well directories are copied verbatim when they exist.
    for subdir in ("samples_dl", "samples_uv"):
        src = os.path.join(raw_well_path, subdir)
        if os.path.exists(src):
            # Without dirs_exist_ok (Python 3.8+) copytree raises FileExistsError
            # on the second run, while every other output above is simply overwritten.
            copytree(src, os.path.join(dump_path, subdir), dirs_exist_ok=True)

In [5]:
# Convert every well in WELL_LIST; each one gets its own sub-directory under PROCESSED_DATASET_PATH.
for well_name in WELL_LIST:
    process_well(well_name, PROCESSED_DATASET_PATH)