# Lithology prediction from core images

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import collections
import matplotlib.pyplot as plt
import torch
import time

# Restrict the CUDA devices visible to this process to the one with id "1".
# NOTE(review): with a single visible device, 'gpu:0' in the model configs below
# presumably refers to this GPU — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Put the directory three levels up at the front of the import path so that the
# `petroflow` imports below resolve (presumably the repository checkout — confirm).
sys.path.insert(0, os.path.join("..", "..", ".."))

from petroflow import CoreBatch, CoreIndex
from petroflow.batchflow.models.torch import ResNet18
from petroflow.batchflow import Dataset, FilesIndex, Pipeline, V, B, inbatch_parallel, I, W, F, ImagesBatch

In [2]:
# Root directory of the cropped core-image dataset; passed to CoreIndex in the Dataset section.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
PATH = '/notebooks/data/august_dataset/cropped_wells/'

In [3]:
# load_ppl = (Pipeline()
#     .load(uv=False, dst='dl')
#     .normalize(src='dl', dst='dl')
# )

In [4]:
# import PIL, cv2

# image = PIL.Image.open('/notebooks/data/august_dataset/cropped_wells/770_воргенское/samples_dl/2634.13_2635.87.png')
# # image = np.array(image)
# # image = cv2.cvtColor(image.astype('uint8'), cv2.COLOR_RGB2YCrCb)
# # image[:,:,0] = cv2.equalizeHist(image[:,:,0])
# # image = cv2.cvtColor(image, cv2.COLOR_YCrCb2RGB)

In [5]:
# b = (load_ppl << ds).next_batch(3)

In [6]:
# np.array(b.dl[0]).shape

## Dataset

In [7]:
# Index over everything found under the dataset root.
index = CoreIndex(path=PATH)

# The annotation table lives inside the dataset directory, so its location is derived
# from PATH instead of repeating the directory as a second literal (keeps both in sync).
annotation = pd.read_feather(os.path.join(PATH, 'annotation.feather'))
# Sample ids are made unique across wells by prefixing them with the well name.
annotation['SAMPLE'] = annotation['WELL'] + '_' + annotation['SAMPLE']
annotation = annotation.set_index('SAMPLE')

# Keep only the four lithology classes the model is trained on.
annotation = annotation[annotation['LITHOLOGY'].isin(['Песчаник', 'Алевролит', 'Аргиллит', 'Уголь'])]

# Split by well (not by sample): 80% of the wells go to train, the rest to test.
# The seed is fixed right before the draw so the split is reproducible.
unique_wells = annotation.WELL.unique()
n_train_wells = int(len(unique_wells) * 0.8)

np.random.seed(42)
train_wells = np.random.choice(unique_wells, n_train_wells, replace=False)
test_wells = np.setdiff1d(unique_wells, train_wells)

# Restrict the index to annotated samples, then carve out the per-split indices.
in_train = annotation.WELL.isin(train_wells)
in_test = annotation.WELL.isin(test_wells)

index = index.create_subset(annotation.index.values)
train_index = index.create_subset(annotation[in_train].index.values)
test_index = index.create_subset(annotation[in_test].index.values)

ds = Dataset(index, CoreBatch)
ds_train = Dataset(train_index, CoreBatch)
ds_test = Dataset(test_index, CoreBatch)

In [8]:
# Sanity check of the split: number of train samples per test sample.
n_train_samples = len(ds_train.indices)
n_test_samples = len(ds_test.indices)
print('Train/test ratio:', n_train_samples / n_test_samples)

Train/test ratio: 3.79701230228471


In [9]:
# Loading steps shared by the training and evaluation pipelines in this notebook.
load_ppl = (
    Pipeline()
    # Load data into the `dl` component (`uv=False`: presumably skips the UV-light images — confirm).
    .load(uv=False, dst=['dl'])
    # Create labels using the LITHOLOGY column accessor of the annotation table
    # (presumably one lookup per batch item — confirm against petroflow docs).
    .create_labels(annotation.LITHOLOGY.loc)
    # Replace the `labels` component with the result of its `.tolist()`.
    .update(B('labels'), B('labels').tolist())
)

In [10]:
# Collects batch labels in the pipeline variable `lithology`.
# In this notebook it is only referenced by the commented-out class-counting cell below.
counter_ppl = (
    Pipeline()
    .init_variable('lithology', default=[])
    # mode='e' presumably extends the list with each batch's labels — confirm against batchflow docs.
    .update(V('lithology', mode='e'), B('labels'))
)

In [11]:
# import collections

# ppl = (load_ppl + counter_ppl << ds_train)
# (ppl
#  .after
#  .add_namespace(collections)
#  .init_variable('counter')
#  .Counter(V('lithology'), save_to=V('counter'))
# )

# ppl.run(10, bar=True, n_epochs=1)
# ppl.v('counter')

In [12]:
# labels_mapping = {i: k for k, i in enumerate(ppl.v('counter'))}

# with open('resnet/labels_mapping', 'wb') as f:
#     pickle.dump(labels_mapping, f)

# Restore the label -> class-id mapping pickled by a previous run.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.
with open('resnet/labels_mapping', 'rb') as mapping_file:
    labels_mapping = pickle.load(mapping_file)

# Inverse lookup (class id -> label), used when displaying predictions.
reverse_mapping = dict(zip(labels_mapping.values(), labels_mapping.keys()))
labels_mapping

{'Алевролит': 1, 'Аргиллит': 0, 'Песчаник': 3, 'Уголь': 2}

In [13]:
def encode(labels, mapping):
    """Translate every label into its code from `mapping`.

    Parameters
    ----------
    labels : iterable
        Labels to translate; each one must be a key of `mapping`.
    mapping : dict
        Lookup table from label to code.

    Returns
    -------
    np.ndarray
        The codes, in the same order as `labels`.

    Raises
    ------
    KeyError
        If a label is not present in `mapping`.
    """
    codes = []
    for label in labels:
        codes.append(mapping[label])
    return np.array(codes)

# NOTE(review): BATCH_SIZE and N_EPOCH are not used by the training cell below, which calls
# `run(16, n_epochs=100, ...)` with literals — confirm which values are intended.
BATCH_SIZE = 8
N_EPOCH = 50
# Model input shape, channels first: crops are taken with SHAPE[1:] and the batch is then
# transposed to put the channel axis right after the batch axis.
SHAPE = (3, 500, 250)

# ResNet18 configuration. The 'initial_block/inputs' key used to appear twice in this
# literal (same value both times); the redundant entry is removed.
model_config = {'initial_block/inputs': 'images',
                'inputs/images/shape': SHAPE,
                'inputs/labels/classes': len(labels_mapping),
                'optimizer': 'Adam',
                'output': 'proba',
                'device': 'gpu:0',
                'loss': 'ce'}

# Training steps: preprocess a loaded batch and run one optimisation step on it.
# Meant to be appended to `load_ppl`, which provides the `dl` and `labels` components.
train_tmp = (Pipeline()
    # Presumably exposes numpy functions (`concatenate`, `transpose` below) as pipeline actions — confirm.
    .add_namespace(np)
    # Convert the loaded `dl` data into float32 arrays stored in `images`.
    .to_array(src='dl', dst='images', dtype='float32')
    # One random crop of spatial size SHAPE[1:] per image, channels-last layout.
    .make_random_crops(src=['images'], dst=['images'], channels='last', shape=SHAPE[1:], n_crops=1)
    # Join the per-item crops along axis 0 into a single batch array.
    .concatenate(B('images'), axis=0, save_to=B('images'))
    # Per-iteration loss values are appended to this pipeline variable.
    .init_variable('loss', default=[])
    # Move the channel axis from last to second position (axes 0, 3, 1, 2).
    .transpose(B('images'), axes=(0, 3, 1, 2), save_to=B('images'))
    # Map labels to class ids with `labels_mapping` (presumably resolves to the
    # notebook-level `encode` function — confirm).
    .encode(B('labels'), labels_mapping, save_to=B('labels'))
    .init_model('dynamic', ResNet18, 'model', model_config)
    # Train on the batch and append the fetched loss to the `loss` variable.
    .train_model('model', B('images'), B('labels'), use_lock=True, fetches='loss',
             save_to=V('loss', mode='a'))
)

In [None]:
# Chain the loading and training steps and bind them to the training dataset.
train_ppl = (load_ppl + train_tmp << ds_train)
# NOTE(review): the batch size (16) and n_epochs (100) are literals here, while BATCH_SIZE = 8
# and N_EPOCH = 50 are defined above but never used — confirm which values are intended.
train_ppl.run(16, n_epochs=100, shuffle=42, bar=True, prefetch=3)#, bar_desc=W(V('loss')[-1]))

  0%|          | 57/27007 [00:26<5:23:47,  1.39it/s]

In [None]:
# Raw per-iteration training loss together with its 100-iteration rolling mean.
train_loss = np.array(train_ppl.v('loss'))
smoothed_train_loss = pd.Series(train_loss).rolling(100).mean()

plt.plot(train_loss)
plt.plot(smoothed_train_loss)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import numpy as np

# Reload a previously pickled loss history.
# NOTE(review): 'resnet/loss' is written by a later cell of this notebook, so on a fresh
# run this reads the file left behind by an earlier session.
with open('resnet/loss', 'rb') as loss_file:
    loss = pickle.load(loss_file)

In [None]:
# Reloaded loss history and its 100-iteration rolling mean.
smoothed_loss = pd.Series(np.array(loss)).rolling(100).mean()

plt.plot(loss)
plt.plot(smoothed_loss)

In [None]:
# Persist the trained model; the evaluation pipeline loads it from this path.
trained_model = train_ppl.get_model_by_name('model')
trained_model.save('resnet/model1.torch')

# Persist the loss history collected during training.
with open('resnet/loss', 'wb') as loss_file:
    loss_history = train_ppl.get_variable('loss')
    pickle.dump(loss_history, loss_file)

# Persist the full (train + test) dataset object.
with open('resnet/dataset', 'wb') as dataset_file:
    pickle.dump(ds, dataset_file)

In [None]:
# Evaluation steps: same preprocessing as training, then prediction with the saved model
# and accumulation of classification metrics.
# Meant to be appended to `load_ppl`, which provides the `dl` and `labels` components.
test_tmp = (Pipeline()
    # Presumably exposes numpy functions (`concatenate`, `transpose` below) as pipeline actions — confirm.
    .add_namespace(np)
    # Convert the loaded `dl` data into float32 arrays stored in `images`.
    .to_array(src='dl', dst='images', dtype='float32')
    # NOTE(review): crops are random at evaluation time too, so metrics presumably vary
    # between runs — confirm whether a fixed crop is wanted here.
    .make_random_crops(src=['images'], dst=['images'], channels='last', shape=SHAPE[1:], n_crops=1)
    # Join the per-item crops along axis 0 into a single batch array.
    .concatenate(B('images'), axis=0, save_to=B('images'))
    # Holds the metrics object updated after every batch.
    .init_variable('metrics', default=None)
    # Move the channel axis from last to second position (axes 0, 3, 1, 2).
    .transpose(B('images'), axes=(0, 3, 1, 2), save_to=B('images'))
    # Map labels to class ids with `labels_mapping`.
    .encode(B('labels'), labels_mapping, save_to=B('labels'))
    # Model is configured to load from the file written by the save cell above.
    .init_model('dynamic', ResNet18, 'model', config={
                    'device': 'gpu:0', 'load/path': 'resnet/model1.torch'
                })
    # Store the predicted class probabilities in the `proba` component.
    .predict_model('model', B('images'), fetches='proba', save_to=B('proba'))
    # Update the `metrics` variable with this batch's targets and predictions.
    .gather_metrics('class', targets=B('labels'), predictions=B('proba'),
                    fmt='proba', axis=-1, save_to=V('metrics', mode='u'))
)

In [None]:
# Chain the loading and evaluation steps, bind them to the test dataset and
# make a single pass over it in batches of 64.
test_ppl = (load_ppl + test_tmp << ds_test)
test_ppl.run(64, n_epochs=1, bar=True)

In [None]:
# Metrics accumulated over the test pass.
val_metrics = test_ppl.get_variable('metrics')
# NOTE(review): `_confusion_matrix` is a private attribute of the metrics object.
print(val_metrics._confusion_matrix)

metric_names = ('specificity', 'sensitivity', 'accuracy', 'f1_score')
for metric_name in metric_names:
    metric_value = val_metrics.evaluate(metric_name)
    print(metric_name, ':', metric_value)

In [None]:
# Separate pipeline instance on the test dataset, used only to pull one shuffled
# batch of 64 items for visual inspection in the next cell.
example_ppl = (load_ppl + test_tmp << ds_test)
b = example_ppl.next_batch(64, shuffle=True)

In [None]:
# Show every image of the example batch, titled "<true label>     <predicted label>"
# (green when they agree, red otherwise).
for i in range(len(b.images)):
    # The pipeline stores images channels-first (its transpose uses axes (0, 3, 1, 2)),
    # while imshow expects (rows, cols, channels). (1, 2, 0) undoes that transpose;
    # the previous (2, 1, 0) also swapped rows and columns, drawing the image transposed.
    image = b.images[i].transpose((1, 2, 0))
    target = reverse_mapping[b.labels[i]]
    pred = reverse_mapping[b.proba[i].argmax()]

    plt.figure(figsize=(5, 10))
    # Scale to [0, 1] for float display (assumes pixel values are in 0..255 — confirm).
    plt.imshow(image / 255)
    plt.title(target + '     ' + pred, color='g' if target == pred else 'r')
    plt.show()