In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import collections
import matplotlib.pyplot as plt
import torch
import time
import collections
from functools import reduce

sys.path.insert(0, os.path.join("..", "..", ".."))

from petroflow import CoreBatch, CoreIndex
from petroflow.batchflow.models.torch import ResNet18
from petroflow.batchflow import Dataset, Pipeline, V, B, W, F, C, I, L
from petroflow.batchflow.research import Research, Option, KV, Results

import utils

In [2]:
# Root directory of the cropped-well data. It is passed to utils.get_input_data
# and used as the base path when locating individual sample images.
# NOTE(review): hard-coded absolute path — adjust for the local environment.
PATH = '/notebooks/data/august_dataset/cropped_wells'

## Pipelines

In [3]:
# NOTE(review): BATCH_SIZE and N_EPOCH are not referenced by the pipelines visible in
# this notebook, which pass 64 and n_epochs=100 to run_later as literals — confirm
# which values are intended before relying on these constants.
BATCH_SIZE = 8
N_EPOCH = 50
# Model input shape; SHAPE[1:] is also the crop shape passed to make_random_crops.
SHAPE = (3, 500, 250)

# Model configuration shared by the training pipeline.
# The number of classes is resolved lazily from the pipeline config ('labels_mapping'),
# and the device is taken from the pipeline config key 'device'.
model_config = {
    'initial_block/inputs': 'images',  # was listed twice in the literal; one entry is enough
    'inputs/images/shape': SHAPE,
    'inputs/labels/classes': F(lambda x: len(x.pipeline.config['labels_mapping'])),
    'optimizer': 'Adam',
    'output': ['proba', 'labels'],
    'device': C('device'),
    'loss': 'ce',  # class weighting (dict(weight=V('weight'))) is currently disabled
}

# Training pipeline template. Values wrapped in C(...) are read from the pipeline
# config; 'dataset', 'annotation' and 'labels_mapping' are set by init_research.
train_ppl = (Pipeline()
    .add_namespace(utils)
    .set_dataset(C('dataset'))
    # Load into the 'dl' component and normalize it in place.
    .load(uv=False, dst=['dl'])
    .normalize(src='dl', dst='dl')
    # presumably exposes numpy functions (concatenate, transpose) as pipeline actions — TODO confirm
    .add_namespace(np)
    # Labels come from the LITHOLOGY column of the annotation and are stored as a list.
    .create_labels(C('annotation').LITHOLOGY.loc)
    .update(B('labels'), B('labels').tolist())
    .to_array(src='dl', dst='images', dtype='float32')
    # One random crop of shape SHAPE[1:] per item, then stack the crops along axis 0.
    .make_random_crops(src=['images'], dst=['images'], channels='last', shape=SHAPE[1:], n_crops=1)
    .concatenate(B('images'), axis=0, save_to=B('images'))
    .init_variable('loss', default=None)
    # Move the channel axis from the last position to position 1.
    .transpose(B('images'), axes=(0, 3, 1, 2), save_to=B('images'))
    # Map label values through the 'labels_mapping' config entry.
    .encode(B('labels'), C('labels_mapping'), save_to=B('labels'))
    .init_model('dynamic', ResNet18, 'model', model_config)
    # Fetch the loss of each training step into the pipeline variable 'loss'.
    .train_model('model', B('images'), B('labels'), use_lock=True, fetches='loss',
             save_to=V('loss', mode='w'))
    # Deferred run settings; the first argument is presumably the batch size — TODO confirm.
    .run_later(64, n_epochs=100, shuffle=42, drop_last=True)
)

def init_research(iteration, experiment, ppl_train, ppl_test):
    """Build the inputs of one experiment and hand them to its train and test pipelines.

    The ``lithology`` option is read from the config of the train pipeline. The
    annotation and the train/test datasets are obtained from
    ``utils.get_input_data`` and the label statistics from ``utils.get_statistics``.
    The resulting values are set as config on both pipelines and additionally
    pickled to ``inputs.pkl`` in the directory of the train pipeline's experiment
    entry, so that they can be reloaded later.

    Parameters
    ----------
    iteration
        Supplied by the caller; not used here.
    experiment
        Mapping from pipeline name to an object exposing ``pipeline``,
        ``config`` and ``path``.
    ppl_train, ppl_test
        Keys of the train and test pipelines in ``experiment``.

    Returns
    -------
    None
    """
    train_unit = experiment[ppl_train]
    cfg = train_unit.config.config()
    ann, ds_train, ds_test, _ = utils.get_input_data(PATH, cfg['lithology'])
    labels_mapping, reverse_mapping, counter, weights = utils.get_statistics(ds_train, ann)

    new_cfg = {
        'lithology': cfg['lithology'],
        'annotation': ann,
        'dataset': ds_train,
        'dataset_test': ds_test,
        'labels_mapping': labels_mapping
    }

    # Both pipelines receive the same config so that they agree on data and label encoding.
    train_unit.pipeline.set_config(new_cfg)
    experiment[ppl_test].pipeline.set_config(new_cfg)

    # Persist everything needed to re-create the evaluation outside of the research run.
    with open(os.path.join(train_unit.path, 'inputs.pkl'), 'wb') as f:
        pickle.dump({
            'annotation': ann,
            'dataset': ds_train,
            'dataset_test': ds_test,
            'labels_mapping': labels_mapping,
            'reverse_mapping': reverse_mapping,
            'counter': counter,
            'weights': weights
        }, f)


def get_model(iteration, experiment, pipeline):
    """Save the model named 'model' of ``experiment[pipeline]`` as ``model.torch``.

    The file is written into the ``path`` directory of the same experiment entry.
    ``iteration`` is accepted for the caller's sake and is not used.
    """
    model = experiment[pipeline].pipeline.get_model_by_name('model')
    target = os.path.join(experiment[pipeline].path, 'model.torch')
    model.save(target)

In [4]:
# Evaluation pipeline template. It repeats the preprocessing steps of the training
# pipeline on the 'dataset_test' config entry, then predicts with an imported model.
test_ppl = (Pipeline()
    .add_namespace(utils)
    .set_dataset(C('dataset_test'))
    .load(uv=False, dst=['dl'])
    .normalize(src='dl', dst='dl')
    .add_namespace(np)
    .create_labels(C('annotation').LITHOLOGY.loc)
    .update(B('labels'), B('labels').tolist())
    .to_array(src='dl', dst='images', dtype='float32')
    # NOTE(review): crops are random at evaluation time too, so metrics may vary between runs.
    .make_random_crops(src=['images'], dst=['images'], channels='last', shape=SHAPE[1:], n_crops=1)
    .concatenate(B('images'), axis=0, save_to=B('images'))
    .init_variable('metrics', default=None)
    # Move the channel axis from the last position to position 1.
    .transpose(B('images'), axes=(0, 3, 1, 2), save_to=B('images'))
    .encode(B('labels'), C('labels_mapping'), save_to=B('labels'))
    # The model source is given by the 'import_from' config entry
    # (the research definition in this notebook passes import_from='train').
    .import_model('model', C('import_from'))
    .predict_model('model', B('images'), fetches='proba', save_to=B('proba'))
    # Accumulate metrics over batches in the pipeline variable 'metrics';
    # mode='u' presumably updates the stored metrics object — TODO confirm.
    .gather_metrics('class', targets=B('labels'), predictions=B('proba'),
                    fmt='proba', axis=-1, save_to=V('metrics', mode='u'))
    .run_later(64, shuffle=False, n_epochs=1, bar=False, drop_last=False)
)

# Research

In [5]:
# ! rm -r research

In [6]:
# Pairs of lithology classes; one binary experiment is run per pair.
options = [
    ['Песчаник', 'Аргиллит'],
    ['Песчаник', 'Алевролит'],
    ['Аргиллит', 'Алевролит'],
#    ['Аргиллит', 'Алевролит', 'Песчаник']
]

# Each pair is wrapped in KV together with its underscore-joined name; the same
# joined names are used further down to load results ('lithology_<name>').
# NOTE(review): `options` is rebound from a list to an Option here.
options = Option('lithology', [KV(item, '_'.join(item)) for item in options])

# Research plan: init_research at iteration '#0', the training pipeline, and on the
# last iteration the test pipeline, its f1_score and a dump of the trained model.
# NOTE(review): init_research is registered with returns='mapping' but has no
# return statement — confirm what Research expects here.
research = (Research()
            .add_grid(options)
            .add_function(init_research, execute='#0', ppl_train='train', ppl_test='test',
                          returns='mapping', name='init', logging=True)
            .add_pipeline(train_ppl, variables='loss', name='train')
            .add_pipeline(test_ppl, name='test', run=True, execute='last',
                          import_from='train', variables='metrics', logging=True)
            .get_metrics('test',
                         metrics_var='metrics', 
                         metrics_name='f1_score',
                         returns='f1_score',
                         execute='last')
            .add_function(get_model, execute='last', pipeline='train')
           )

# The run itself is disabled; the cells that follow read results from an existing 'research' directory.
# research.run(1, None, name='research', workers=4, gpu=[0,1,2,3], bar=True)

In [85]:
# Accuracy computed from the first stored 'test' metrics entry of the 'Аргиллит_Алевролит' experiment.
Results('research').load(lithology='Аргиллит_Алевролит', names=['test']).metrics.iloc[0].evaluate('accuracy')

0.6653386454183267

In [88]:
lithology = 'Аргиллит_Алевролит'

def get_lithology(lithology, research_path='research', device='gpu:0'):
    """Re-run a saved binary model on its test set and return per-sample predictions.

    Loads ``inputs.pkl`` and ``model.torch`` from
    ``<research_path>/results/lithology_<lithology>/0``, runs a prediction
    pipeline over the stored test dataset and attaches the predicted class to
    the matching annotation rows.

    Parameters
    ----------
    lithology : str
        Underscore-joined class pair naming the experiment, e.g. the name
        used when the research grid was defined.
    research_path : str
        Directory of the research run. Defaults to ``'research'``.
    device : str
        Device passed to the model config. Defaults to ``'gpu:0'``.

    Returns
    -------
    pandas.DataFrame
        A copy of the annotation rows of the predicted samples with an extra
        ``pred`` column holding the predicted class.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of batch indices
        collected by the pipeline.
    """
    path = os.path.join(research_path, 'results', 'lithology_{}'.format(lithology), '0')

    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(os.path.join(path, 'inputs.pkl'), 'rb') as f:
        inputs = pickle.load(f)

    annotation = inputs['annotation']
    ds = inputs['dataset_test']
    labels_mapping = inputs['labels_mapping']
    reverse_mapping = inputs['reverse_mapping']

    predict_ppl = (Pipeline()
        .add_namespace(utils)
        .set_dataset(C('dataset_test'))
        .load(uv=False, dst=['dl'])
        .normalize(src='dl', dst='dl')
        .add_namespace(np)
        .create_labels(C('annotation').LITHOLOGY.loc)
        .update(B('labels'), B('labels').tolist())
        .to_array(src='dl', dst='images', dtype='float32')
        .make_random_crops(src=['images'], dst=['images'], channels='last', shape=SHAPE[1:], n_crops=1)
        .concatenate(B('images'), axis=0, save_to=B('images'))
        .init_variable('metrics', default=None)
        .init_variable('predictions', default=[])
        .transpose(B('images'), axes=(0, 3, 1, 2), save_to=B('images'))
        .encode(B('labels'), C('labels_mapping'), save_to=B('labels'))
        .init_model('dynamic', ResNet18, 'model', config={
                    'device': device, 'load/path': os.path.join(path, 'model.torch')
        })
        .predict_model('model', B('images'), fetches='proba', save_to=B('proba'))
        # Keep the predicted class and the indices of every batch side by side.
        .update(V('predictions', mode='a'), B('proba').argmax(axis=1))
        .init_variable('ind', default=[])
        .update(V('ind', mode='a'), B().indices)
        .run_later(64, shuffle=False, n_epochs=1, drop_last=False)
    )

    predict_ppl.set_config({
        'annotation': annotation,
        'dataset_test': ds,
        'labels_mapping': labels_mapping
    })

    predict_ppl.run()

    predictions = np.concatenate(predict_ppl.v('predictions'))
    # Align predictions with the indices the pipeline actually processed instead of
    # assuming that the batch order equals ds.indices.
    indices = np.concatenate(predict_ppl.v('ind'))
    if len(predictions) != len(indices):
        raise ValueError(
            'Got {} predictions for {} samples'.format(len(predictions), len(indices))
        )

    # Explicit copy: the new column must not be written into a slice of the annotation.
    df = annotation.loc[indices].copy()
    df['pred'] = [reverse_mapping[item] for item in predictions]

    return df

In [91]:
# Prediction frames of the three binary experiments, keyed by the underscore-joined class pair.
res = {}
for pair in (
    ('Песчаник', 'Аргиллит'),
    ('Песчаник', 'Алевролит'),
    ('Аргиллит', 'Алевролит'),
):
    lithology = '_'.join(pair)
    res[lithology] = get_lithology(lithology)

In [103]:
def _correctly_classified(pair, rock):
    """Return the index of ``rock`` samples that the ``pair`` experiment predicted correctly."""
    frame = res[pair]
    hits = frame[frame.LITHOLOGY == frame.pred]
    return hits[hits.LITHOLOGY == rock].index


# Every class takes part in two binary experiments; a sample is kept as "true"
# only if it was classified correctly in both of them (intersection).
true_arg = np.intersect1d(_correctly_classified('Аргиллит_Алевролит', 'Аргиллит'),
                          _correctly_classified('Песчаник_Аргиллит', 'Аргиллит'))

true_alev = np.intersect1d(_correctly_classified('Аргиллит_Алевролит', 'Алевролит'),
                           _correctly_classified('Песчаник_Алевролит', 'Алевролит'))

true_sand = np.intersect1d(_correctly_classified('Песчаник_Аргиллит', 'Песчаник'),
                           _correctly_classified('Песчаник_Алевролит', 'Песчаник'))

In [128]:
def _misclassified(pair, rock):
    """Return the index of ``rock`` samples that the ``pair`` experiment predicted incorrectly."""
    frame = res[pair]
    misses = frame[frame.LITHOLOGY != frame.pred]
    return misses[misses.LITHOLOGY == rock].index


# Every class takes part in two binary experiments; a sample is kept as "wrong"
# if it was misclassified in at least one of them (union).
wrong_arg = np.union1d(_misclassified('Аргиллит_Алевролит', 'Аргиллит'),
                       _misclassified('Песчаник_Аргиллит', 'Аргиллит'))

wrong_alev = np.union1d(_misclassified('Аргиллит_Алевролит', 'Алевролит'),
                        _misclassified('Песчаник_Алевролит', 'Алевролит'))

wrong_sand = np.union1d(_misclassified('Песчаник_Аргиллит', 'Песчаник'),
                        _misclassified('Песчаник_Алевролит', 'Песчаник'))

In [153]:
# Upper bound on the number of examples drawn per group.
N_EXAMPLES = 50
# Dedicated, seeded generator: the sample selection is reproducible and does not
# disturb numpy's global random state.
example_rng = np.random.RandomState(42)

example_groups = [
    ('GOOD_ARG', true_arg),
    ('GOOD_ALEV', true_alev),
    ('GOOD_SAND', true_sand),
    ('BAD_ARG', wrong_arg),
    ('BAD_ALEV', wrong_alev),
    ('BAD_SAND', wrong_sand),
]

# One frame per group: up to N_EXAMPLES distinct sample names plus the group label.
results = [
    pd.DataFrame({
        'SAMPLE': example_rng.choice(samples, min(len(samples), N_EXAMPLES), replace=False),
        'LABEL': label,
    })
    for label, samples in example_groups
]

In [154]:
# Stack the per-group frames into one table.
# NOTE(review): `results` is rebound from a list of frames to a single DataFrame,
# so this cell is not idempotent — re-running it without the previous cell passes
# a DataFrame to pd.concat.
results = pd.concat(results)

In [155]:
# Write the selected examples to examples.csv in the working directory.
# NOTE(review): the frame's index is written as well (to_csv default); after the
# concat it is presumably not unique — confirm whether consumers need it.
results.to_csv('examples.csv')

In [189]:
import PIL.Image  # a bare `import PIL` does not load the Image submodule
import tqdm

EXAMPLES_DIR = '/notebooks/data/august_dataset/examples'
# Crop size follows the model input: SHAPE is (channels, height, width).
crop_height, crop_width = SHAPE[1:]

# Make the cell runnable on a fresh environment.
os.makedirs(EXAMPLES_DIR, exist_ok=True)

# NOTE(review): the destination file name drops the well prefix, so samples from
# different wells that share a depth interval would overwrite each other.
for item in tqdm.tqdm(results.SAMPLE):
    parts = item.split('_')
    # Sample names look like '<well part 1>_<well part 2>_<file name>'.
    well_dir = '_'.join(parts[:2])
    file_name = '_'.join(parts[2:])
    src = os.path.join(PATH, well_dir, 'samples_dl', file_name)
    dst = os.path.join(EXAMPLES_DIR, file_name)
    # Context manager closes the underlying file once the crop is saved.
    with PIL.Image.open(src) as img:
        # Largest valid vertical offset; 0 when the image is not taller than the crop.
        max_offset = max(img.size[1] - crop_height, 0)
        # Upper bound is exclusive, hence +1 so that the last valid offset can be drawn.
        pos = np.random.randint(max_offset + 1)
        img.crop(box=(0, pos, crop_width, pos + crop_height)).save(dst)

300it [00:23, 12.73it/s]


In [181]:
# Leftover debug output: `pos` is the crop offset variable assigned in the image export loop.
pos

2925

In [164]:
# ! mkdir '/notebooks/data/august_dataset/examples'

In [190]:
# Sanity check: number of entries in the directory the example crops are saved to.
len(os.listdir('/notebooks/data/august_dataset/examples'))

300

In [15]:
# val_metrics = test_ppl.get_variable('metrics')
# print(val_metrics._confusion_matrix)

# for m in ['specificity', 'sensitivity', 'accuracy', 'f1_score']:
#     print(m, ':', val_metrics.evaluate(m))

In [16]:
# import PIL
# import cv2

# image = PIL.Image.open(os.path.join(PATH, '9281_новопортовское', 'samples_dl', '2048.69_2049.19.png',))
# image = np.array(image).astype('uint8')
# print(image.ndim)
# # cv2.equalizeHist(image)

In [17]:
# bad = ['5144_новопортовское_2287.13_2288.04.png',
#        '9281_новопортовское_2048.69_2049.19.png',
#        '115_восточно-мессояхское_2454.25_2457.2.png',
#        '115_восточно-мессояхское_2454.25_2457.2.png']

In [23]:
# index = inputs['dataset'].index.create_subset(np.array(bad))
# ds = Dataset(index, batch_class=CoreBatch)

def my_print(x):
    """Print ``x`` unless it equals 64; values equal to 64 are silently skipped."""
    if x == 64:
        return
    print(x)

# Debug pipeline: all data and model steps are commented out; it only prints the
# number of unique indices of a batch when that number differs from 64 and stores
# the batch indices in the variable 'ind'.
# NOTE(review): this rebinds `train_ppl`, replacing the training pipeline defined
# earlier in the notebook.
train_ppl = (Pipeline()
    .init_variable('ind', None)
    .my_print(L(len)(L(np.unique)(B().indices)))
    .update(V('ind'), B().indices)
    .add_namespace(utils)
    .set_dataset(C('dataset'))
#     .load(uv=False, dst=['dl'])
#     .normalize(src='dl', dst='dl')
#     .add_namespace(np)
#     .create_labels(C('annotation').LITHOLOGY.loc)
#     .update(B('labels'), B('labels').tolist())
#     .to_array(src='dl', dst='images', dtype='float32')
#     .make_random_crops(src=['images'], dst=['images'], channels='last', shape=SHAPE[1:], n_crops=1)
#     .concatenate(B('images'), axis=0, save_to=B('images'))
#     .init_variable('loss', default=None)
#     .transpose(B('images'), axes=(0, 3, 1, 2), save_to=B('images'))
#     .encode(B('labels'), C('labels_mapping'), save_to=B('labels'))
#     .init_model('dynamic', ResNet18, 'model', model_config)
#     .train_model('model', B('images'), B('labels'), use_lock=True, fetches='loss',
#              save_to=V('loss', mode='w'))
    .run_later(64, n_epochs=100, bar=False, shuffle=42, drop_last=True)
)

# NOTE(review): `inputs` is not defined at the top level of any visible cell (it is
# only a local inside get_lithology), so this cell depends on leftover kernel state
# and would fail on Restart & Run All — load inputs.pkl explicitly before use.
train_ppl.set_config({
    'annotation': inputs['annotation'],
    'dataset': inputs['dataset'],
    'labels_mapping': inputs['labels_mapping'],
    'device': 'gpu:0'
})

train_ppl.run()

<petroflow.batchflow.batchflow.pipeline.Pipeline at 0x7fa675034ac8>

In [23]:
# [np.array(img).shape for img in b.dl]

[(1250, 250, 3), (2275, 250, 3)]

In [19]:
from collections import Counter

# Distinct occurrence counts among the indices stored in the pipeline variable 'ind'.
index_counts = Counter(train_ppl.v('ind'))
set(index_counts.values())

{1, 2}