In [1]:
import os
import sys
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import shutil

sys.path.insert(0, os.path.join("..", "..", ".."))

from petroflow import Well, WellBatch, WS, WellDataset
from petroflow.batchflow.models.torch import UNet, ResNet18, ResNet34
from petroflow.batchflow import Dataset, DatasetIndex, FilesIndex, Pipeline, V, B, action, inbatch_parallel, I, W, F, L, ImagesBatch, R, P

Auxiliary functions/classes

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

class MyWellBatch(WellBatch):
    """``WellBatch`` that is always constructed with ``pixels_per_cm=25`` and
    can repack per-crop data into an ``ImagesBatch``."""

    def __init__(self, *args, **kwargs):
        # The resolution is fixed here; all other arguments are forwarded untouched.
        super().__init__(*args, pixels_per_cm=25, **kwargs)

    @action
    def create_images_batch(self, core_dl, core_uv, targets):
        """Build a new ``ImagesBatch`` holding the given crops and targets.

        Parameters
        ----------
        core_dl, core_uv : sequence
            Image crops; each is converted with ``np.array(...)`` and cast to
            ``uint8`` (presumably daylight / ultraviolet photos - confirm).
        targets : sequence
            Per-crop targets, converted with ``np.array(...)`` without a cast.

        Returns
        -------
        The value returned by ``ImagesBatch.add_components`` for the
        components ``core_dl``, ``core_uv`` and ``targets``.
        """
        # The new batch is indexed by position: one item per element of core_dl.
        images_batch = ImagesBatch(DatasetIndex(len(core_dl)))

        component_names = ('core_dl', 'core_uv', 'targets')
        component_data = (
            np.array(core_dl).astype(np.uint8),
            np.array(core_uv).astype(np.uint8),
            np.array(targets),
        )
        return images_batch.add_components(component_names, component_data)

class softCrossEntropy(nn.Module):
    """Class-weighted cross-entropy for soft (non one-hot) targets.

    The loss is ``sum(weights * target * -log_softmax(inputs, dim=1)) / N``
    where ``N`` is the size of the first axis of ``target``.

    :param weights: per-class multipliers; a tensor or anything accepted by
        ``torch.as_tensor`` (e.g. a list or numpy array). Must broadcast
        against ``inputs``.
    """
    def __init__(self, weights):
        super().__init__()
        self.weights = weights

    def forward(self, inputs, target):
        """
        :param inputs: predictions (raw scores; log-softmax is taken over dim 1)
        :param target: target labels, same shape as ``inputs``; the first axis
            is treated as the sample axis
        :return: loss (scalar tensor)
        """
        # Align weights with the logits so that float64 weights (e.g. built from
        # pandas) or weights living on another device do not break the product.
        weights = torch.as_tensor(self.weights, dtype=inputs.dtype, device=inputs.device)

        # `torch.log_softmax` is used instead of the `F` alias: in this notebook
        # `F` is also imported from batchflow, so the alias depends on cell order.
        # The multiplication is out-of-place to leave the log-softmax output intact.
        weighted_nll = -torch.log_softmax(inputs, dim=1) * weights

        sample_num = target.shape[0]
        return torch.sum(weighted_nll * target) / sample_num

def mydump(iteration, path='flag'):
    """Pickle ``iteration`` into ``path``, overwriting any existing file.

    The training pipeline calls this with the current iteration counter.

    :param iteration: any picklable object
    :param path: destination file; defaults to ``'flag'`` in the current
        working directory, which is the location the original code always used
    :return: None
    """
    # Local import: `pickle` is not imported at module level before this cell.
    import pickle

    with open(path, 'wb') as flag_file:
        pickle.dump(iteration, flag_file)

Constants

In [3]:
# Batch size passed to `ppl.run`, crops requested per `random_crop` call
# (also the model microbatch), and number of training epochs.
BATCH_SIZE, N_CROPS, N_EPOCH = 32, 16, 500

# Crop length handed to the cropping actions (units not shown here - presumably metres).
LENGTH = 0.1

# Model input shape. 2500 presumably equals pixels_per_cm (25) * 100 - confirm;
# 6 presumably is the channel count after core_dl and core_uv are concatenated.
SHAPE = (6, int(LENGTH * 2500), 250)

# Columns of the grain table used as targets and for the loss weights.
CLASSES = ['GRAVEL', 'SAND', 'ALEURITE', 'CLAY']

Filter out wells without a grain table

In [4]:
# Index everything matching the core-photo glob and wrap it into a dataset.
index = FilesIndex(path='/notebooks/data/september_dataset/core_photo/*/*', dirs=True)
ds = Dataset(index=index, batch_class=MyWellBatch)

# Single unshuffled pass in batches of 10: after `has_attr('grain')` the indices
# of each batch are appended to the 'wells' pipeline variable
# (presumably `has_attr` drops wells lacking the attribute - confirm).
filter_ppl = ds.p.init_variable('wells', default=[])
filter_ppl = filter_ppl.has_attr('grain')
filter_ppl = filter_ppl.update(V('wells', mode='e'), B().indices)
filter_ppl = filter_ppl.run(10, n_epochs=1, shuffle=False, bar=True)

# Rebuild the dataset from the collected indices only, then split it;
# later cells read `ds.train` and `ds.test`.
kept_wells = filter_ppl.v('wells')
filtered_index = index.create_subset(kept_wells)
ds = Dataset(index=filtered_index, batch_class=MyWellBatch)
ds.split()

100%|██████████| 21/21 [00:00<00:00, 78.17it/s]


Compute weights for loss

In [5]:
# Mean of each CLASSES column over the whole grain table, divided by 100
# (the columns are presumably percentages - confirm).
grain_table = pd.read_feather('/notebooks/data/september_dataset/grain.feather')
class_share = grain_table[CLASSES].mean() / 100

# Inverse-share weighting, capped at 20 so that very rare classes (including a
# zero share, which yields inf) do not dominate the loss.
inverse_share = 1 / class_share
weights = np.clip(inverse_share, 0, 20)

In [6]:
# Cropping stage, built one action at a time (order matters).
crops_template = Pipeline()
# Segment by 'samples' first, then by 'grain' using windows of 2 * LENGTH.
crops_template = crops_template.create_segments(src='samples', connected=True)
crops_template = crops_template.create_segments(src='grain', length=2 * LENGTH)
# Request N_CROPS random crops of length LENGTH, then apply the LENGTH
# threshold of `drop_short_segments`.
crops_template = crops_template.random_crop(length=LENGTH, n_crops=N_CROPS)
crops_template = crops_template.drop_short_segments(LENGTH)

In [7]:
# Converts the well segments of a batch into an image batch with per-crop targets.
components_template = (Pipeline()
    # Batch component 'core' becomes a pair: (flattened core_dl, flattened core_uv).
    .update(B('core'), (WS('core_dl').ravel(), WS('core_uv').ravel()))
    # Targets are the mean of the CLASSES columns of 'grain'
    # (presumably one vector per segment - confirm WS semantics).
    .update(B('targets'), WS('grain')[CLASSES].mean().values.ravel())
    # MyWellBatch.create_images_batch builds an ImagesBatch with the components
    # 'core_dl', 'core_uv' and 'targets'; the actions below run on that batch.
    .create_images_batch(B('core')[0], B('core')[1], B('targets'))
    # Both image components are converted in place (src == dst).
    .to_pil(src=['core_dl', 'core_uv'], dst=['core_dl', 'core_uv'])
)

# Cutout augmentation applied to both image components in place (src == dst).
# The training pipeline attaches this stage as `augmentation_template @ 0.33`.
augmentation_template = (Pipeline()
    # shape:  two independent draws, each from np.arange(100, 200), i.e. integers 100..199
    #         (presumably P(...) makes the draw per batch item - confirm).
    # origin: two independent 'uniform' draws.
    # color:  fill value 0 for the cut-out region.
    .cutout(shape=P((R(np.arange(100, 200)), R(np.arange(100, 200)))),
            origin=P((R('uniform'), R('uniform'))),
            color=0, src=['core_dl', 'core_uv'], dst=['core_dl', 'core_uv'])
)

# Turns the two image components into one float32 'crops' array.
# numpy is registered as a namespace; `concatenate`, `transpose`, `nan_to_num`
# and `array` below are presumably resolved from it.
concat_template = Pipeline().add_namespace(np)
concat_template = concat_template.to_array(src=['core_dl', 'core_uv'], dst=['core_dl', 'core_uv'])
# Join core_dl and core_uv along the last axis ...
concat_template = concat_template.concatenate((B('core_dl'), B('core_uv')), axis=-1, save_to=B('crops'))
# ... then move that last axis to position 1.
concat_template = concat_template.transpose(B('crops'), axes=(0, 3, 1, 2), save_to=B('crops'))
concat_template = concat_template.nan_to_num(B('crops'), save_to=B('crops'))
# Crops and targets are both stored as float32.
concat_template = concat_template.array(B('crops'), dtype='float32', save_to=B('crops'))
concat_template = concat_template.array(B('targets'), dtype='float32', save_to=B('targets'))

# GPU index shared by the model ('gpu:N') and by the loss weights ('cuda:N'),
# so the two cannot drift apart.
GPU_ID = 1

# Loss weights as float32 to match the float32 crops/targets fed to the model;
# np.asarray accepts both a pandas Series and a plain array.
loss_weights = torch.tensor(np.asarray(weights, dtype=np.float32)).to('cuda:{}'.format(GPU_ID))

model_config = {'initial_block/inputs': 'images',
                'inputs/images/shape': SHAPE,
                # One output per target column instead of a hard-coded 4.
                'inputs/labels/classes': len(CLASSES),
                # Alternative that was tried: ('SGD', dict(lr=0.01, momentum=0.95))
                'optimizer': 'Adam',
                'device': 'gpu:{}'.format(GPU_ID),
                'output': 'proba',
                'microbatch': N_CROPS,
                'loss': (softCrossEntropy, {'weights': loss_weights})}
        
# Training stage: the loss of every iteration is appended to 'loss_history'.
train_template = Pipeline().init_variable('loss_history', default=[])
train_template = train_template.init_model('dynamic', ResNet18, 'model', model_config)
# Write the current iteration counter to disk before each training step.
train_template = train_template.mydump(I())
# Targets are divided by 100 before being passed to the model
# (presumably percentages -> fractions; confirm).
train_template = train_template.train_model('model', B('crops'), B('targets') / 100,
                                            fetches='loss', save_to=V('loss_history', mode='a'))

In [8]:
# Assemble the training pipeline stage by stage, in the same left-to-right order.
training_stages = crops_template + components_template
# The augmentation stage is attached with `@ 0.33`; `@` binds tighter than `+`.
training_stages = training_stages + augmentation_template @ 0.33
training_stages = training_stages + concat_template + train_template

# Bind the assembled template to the training part of the dataset.
ppl = training_stages << ds.train

In [9]:
#b = ppl.next_batch(16)
#b.crops.shape

In [10]:
# plt.figure(figsize=(10, 10))
# for i, crop in enumerate(np.array(b.crops_dl)):
#     plt.subplot(BATCH_SIZE, N_CROPS, i+1)
#     plt.title(("{:3.0f}, "*4)[:-2].format(*b.targets[i]))
#     plt.imshow(crop / 255)
#     plt.xticks([])
#     plt.yticks([])

In [11]:
# pd.read_feather('/notebooks/data/september_dataset/core_photo/Восточно-Мессояхское/123_восточно-мессояхское/grain.feather')

In [None]:
# Run options; the progress-bar description is the latest entry of 'loss_history'.
train_run_options = dict(
    n_epochs=N_EPOCH,
    bar=True,
    shuffle=True,
    drop_last=True,
    bar_desc=W(V('loss_history')[-1]),
)
ppl.run(BATCH_SIZE, **train_run_options)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

0.15614183:   1%|          | 12/2000 [04:09<11:17:57, 20.46s/it]

In [None]:
import pickle

# Persist the trained model through its own `save` method.
ppl.get_model_by_name('model').save('model.torch')

# Pickle the loss history and the (already split) dataset into the working directory.
artifacts = (
    ('loss', ppl.get_variable('loss_history')),
    ('dataset', ds),
)
for file_name, payload in artifacts:
    with open(file_name, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [None]:
# Read back the loss history pickled by the saving cell.
# NOTE: pickle must only be used on files this notebook produced itself.
with open('loss', 'rb') as loss_file:
    loss = pickle.loads(loss_file.read())

In [None]:
import pandas as pd

# Raw per-iteration loss plus its rolling mean over a 100-iteration window.
plt.plot(loss)
loss_series = pd.Series(np.array(loss))
smoothed_loss = loss_series.rolling(100).mean()
plt.plot(smoothed_loss)

In [None]:
# Test-time segmentation: same two `create_segments` steps as in training, but
# 'grain' windows are exactly LENGTH long and no random cropping follows.
split_test = Pipeline()
split_test = split_test.create_segments(src='samples', connected=True)
split_test = split_test.create_segments(src='grain', length=LENGTH)

# Inference settings: run on CPU and load the weights saved as 'model.torch'.
inference_config = {
    'device': 'cpu', 'load/path': 'model.torch'
}

# Collect, per segment, the first row of its 'grain' table ...
test_template = Pipeline().init_variable('lithology', default=[])
test_template = test_template.update(V('lithology', mode='e'), WS('grain').iloc[0].ravel())
# ... and the 'proba' output of the model for the batch component 'crops'.
test_template = test_template.init_variable('predictions', default=[])
test_template = test_template.init_model('dynamic', ResNet18, 'model', config=inference_config)
test_template = test_template.predict_model('model', B('crops'), fetches='proba',
                                            save_to=V('predictions', mode='e'))

In [None]:
# `reg_template` is not defined anywhere in this notebook, so the original line
# raised NameError on a fresh kernel. `predict_model` reads the batch component
# 'crops', which is produced by `components_template` followed by
# `concat_template`, so those two stages are used instead.
# NOTE(review): confirm this matches the pipeline that `reg_template` used to name.
ppl = (split_test + components_template + concat_template + test_template) << ds.test
# One well per batch, a single pass over the test split.
ppl.run(1, n_epochs=1, bar=True)

In [None]:
# Items collected in 'lithology' are placed side by side as columns, then the
# table is flipped so that each collected item becomes a row.
lithology_items = ppl.v('lithology')
target = pd.concat(lithology_items, axis=1).T

In [None]:
# Model outputs scaled by 100 (presumably back to percentages - confirm).
values = np.array(ppl.v('predictions')) * 100

# Column i of `values` goes to the i-th name below.
pred_columns = ['GRAVEL_PRED', 'SAND_PRED', 'ALEURITE_PRED', 'CLAY_PRED']
pred = pd.DataFrame({
    column_name: values[:, column_idx]
    for column_idx, column_name in enumerate(pred_columns)
})

In [None]:
# Give the targets a fresh 0..n-1 index so they line up positionally with `pred`,
# then place both tables side by side.
target_rows = target.reset_index()
results = pd.concat([target_rows, pred], axis=1)

In [None]:
# Display the combined table of targets and predictions.
results

In [None]:
# Keep only the first space-separated word of each lithology label and
# capitalise it. The vectorised `.str` accessor propagates missing values,
# whereas the previous per-row lambda raised AttributeError on them.
results['LITHOLOGY'] = results['LITHOLOGY'].str.split(' ').str[0].str.capitalize()

In [None]:
# Plot colours; `zip(classes, colors)` in the plotting cell only consumes the first three.
colors = ['r', 'g', 'b', 'y']
# Lithology labels that get plotted.
classes = ['Песчаник', 'Алевролит', 'Глина']

# One membership flag per row of `results` (not used by the visible cells below).
mask = list(map(lambda item: np.isin(item, classes), results.LITHOLOGY))

In [None]:
# grain.columns[3:7]
from sklearn.manifold import TSNE
tsne_res = TSNE(n_components=2, n_iter_without_progress=500).fit_transform(results[
    ['GRAVEL_PRED', 'SAND_PRED', 'ALEURITE_PRED', 'CLAY_PRED']
])

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 5))

# One scatter series per lithology label, coloured by position in `colors`.
for label, color in zip(classes, colors):
    label_rows = (results.LITHOLOGY == label).values
    label_points = tsne_res[label_rows]
    plt.scatter(label_points[:, 0], label_points[:, 1], color=color, label=label)

# Embedding axes carry no meaning, so ticks are hidden.
plt.title('Данные гранулометрии')
plt.legend()
plt.xticks([])
plt.yticks([])