In [1]:
import os
import sys
while not os.getcwd().endswith('ml'):
    os.chdir('..')
sys.path.insert(0, os.getcwd())

# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import math
import collections
import copy
import cv2
import pandas as pd
import numpy as np
import random
from shutil import copyfile
from pathlib import Path
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from ml_helpers.image_helpers import display_images
from ml_helpers.common_helpers import display_history_metrics

from sklearn.model_selection import train_test_split
import pickle

from kaggle_problems.rosneft_proppant.workspace.common import bins, bin2high, TARGET_SHAPE, SUB_IMG_CNT


In [3]:
DATA_DIR = "kaggle_problems/rosneft_proppant/workspace/data/"
MODEL_DIR = "kaggle_problems/rosneft_proppant/workspace/models"
GENERATED_DIR = "kaggle_problems/rosneft_proppant/data/generated/"
GENERATED_LABELS_DIR = GENERATED_DIR + "labels"
DF_RATE = 1

sources = ['bw'] #'colored']
source_to_fraction = {
    'bw': 'bw',
    'colored': 'colored',
    'threshed': 'bw'
}

fraction_sievs = {
    'bw': ['16', '18', '20', '25', '30', '35', '40']
}

COEF_COMPRESS = 4

In [4]:
def enrich_fraction(train):
    for fraction in source_to_fraction.values():
        img_numbers = set([int(img[0:-len(".jpg")]) // SUB_IMG_CNT // SUB_IMG_CNT for img in os.listdir(DATA_DIR + fraction + "_main_area") if img.endswith('.jpg')])
        train.loc[train.ImageId.isin(img_numbers), 'fraction'] = fraction
    return train

def get_fraction_sievs(data, fraction):
    data_fraction = data[data.fraction == fraction]
    result_bins = []
    for b in bins:
        if data_fraction[b].sum() > 1e-5:
            result_bins.append(b)
    return result_bins
        

def common_df_processing(data):
    data = data[~data['prop_count'].isnull()]

    data["filename"] = data['ImageId'].astype(str) + '.jpg'
    return data

def get_test(source):
    validation = pd.read_csv("{}labels/train.csv".format(DATA_DIR))
    
    validation.fraction = None
    validation = enrich_fraction(validation)

    validation = validation[~validation.fraction.isnull()]
    
    fraction = source_to_fraction[source]
    validation = validation[validation['fraction'] == source_to_fraction[source]]

    validation = common_df_processing(validation)
    return validation

def get_train(source):
    train = pd.read_csv("{}/generated_{}_train.csv".format(GENERATED_LABELS_DIR, source))
    train.prop_count = train.prop_count.astype(np.float64)
    
    train = common_df_processing(train)
    return train

### Model

In [5]:
class CntExtraction(Model):
    def __init__(self, fraction):
        super(CntExtraction, self).__init__()
        self.FilterSize1 = 10


        self.pipes = [
            tf.keras.layers.Conv2D(filters=self.FilterSize1, kernel_size=(3, 3), strides=(2, 2), activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(rate=0.3),

            tf.keras.layers.Conv2D(filters=self.FilterSize1, kernel_size=(3, 3), strides=(2, 2), activation='tanh'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(rate=0.3),

            tf.keras.layers.Conv2D(filters=self.FilterSize1, kernel_size=(3, 3), strides=(2, 2), activation='tanh'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(rate=0.3),

            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(100, activation='relu'),
            tf.keras.layers.Dense(1, activation='relu'),
        ]
        

    def call(self, x, *args, **kwargs):
        for pipe in self.pipes:
            x = pipe(x)
        return x
    

In [6]:
model = CntExtraction('bw')

#### Train Input Generator

In [7]:
def get_train_val_datagen(train, source, train_size=0.8):
    train_fraction, val_fraction = train_test_split(train, train_size=train_size, random_state=123)
    
    bins_fraction = fraction_sievs[fraction]
    
    datagen = ImageDataGenerator()

    train_generator = datagen.flow_from_dataframe(
            train_fraction.sample(n=int(len(train_fraction) * DF_RATE)),
            directory="kaggle_problems/rosneft_proppant/data/generated/{}_img".format(source),
            x_col='filename', 
            y_col='prop_count',
            target_size=(TARGET_SHAPE[0] // COEF_COMPRESS // SUB_IMG_CNT, TARGET_SHAPE[1] // COEF_COMPRESS // SUB_IMG_CNT),
            batch_size=64,
            class_mode="other"
    )
    
    val_generator = datagen.flow_from_dataframe(
        val_fraction.sample(n=int(len(val_fraction) * DF_RATE)),
        directory="kaggle_problems/rosneft_proppant/data/generated/{}_img".format(source),
        x_col='filename', 
        y_col='prop_count',
        target_size=(TARGET_SHAPE[0] // COEF_COMPRESS // SUB_IMG_CNT, TARGET_SHAPE[1] // COEF_COMPRESS // SUB_IMG_CNT),
        batch_size=64,
        class_mode="other"
    )
    return train_generator, val_generator

In [8]:
# def get_train_val_datagen(train, validation, source):
#     fraction = source_to_fraction[source]
#     bins_fraction = fraction_sievs[fraction]
    
#     datagen = ImageDataGenerator()

#     train_generator = datagen.flow_from_dataframe(
#             train.sample(n=int(len(train) * DF_RATE)),
#             directory="kaggle_problems/rosneft_proppant/data/generated/{}_img".format(source),
#             x_col='filename', 
#             y_col=bins_fraction,
#             target_size=(TARGET_SHAPE[0] // COEF_COMPRESS, TARGET_SHAPE[1] // COEF_COMPRESS),
#             batch_size=64,
#             class_mode='other',
#     )
    
#     validation_generator = datagen.flow_from_dataframe(
#             validation.sample(n=int(len(validation) * DF_RATE)),
#             directory="kaggle_problems/rosneft_proppant/data/{}_main_area".format(source),
#             x_col='filename', 
#             y_col=bins_fraction,
#             target_size=(TARGET_SHAPE[0] // COEF_COMPRESS, TARGET_SHAPE[1] // COEF_COMPRESS),
#             batch_size=64,
#             class_mode='other',
#     )
    
#     return train_generator, validation_generator

#### Input generator checking

In [9]:
# img, labels = get_train_val_datagen(train, 'bw')[0].next()
# display_images(img[0:8].astype(int), 4)

#### Callbacks

In [10]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [11]:
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', 
                                            patience=3, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.00001)

earlystop = EarlyStopping(patience=10)

callbacks = [earlystop, learning_rate_reduction]

In [12]:
EPS = 1e-5
def metric(true, predicted):
    true = tf.math.maximum(true,tf.ones_like( true ))
    return tf.keras.backend.mean(tf.math.abs(true - predicted) / true)


In [13]:
for source, i in zip(sources, range(len(sources))):
    train = get_train(source)
    fraction = source_to_fraction[source]
    
    
    model = CntExtraction(fraction)
    model.compile(
        loss='mse',
        optimizer='rmsprop',
    )

    train_datagen, val_datagen = get_train_val_datagen(train, source)
    
    history = model.fit(
        x=train_datagen,
        epochs=100,
        validation_data=val_datagen,
        callbacks=callbacks
    )
    
    Path(MODEL_DIR).mkdir(exist_ok=True, parents=True)
    
    with open(MODEL_DIR + "/history_model_benchmark_{}.pickle".format(source), 'wb') as f:
        pickle.dump(history.history, f)
        
    model.save(MODEL_DIR + "/model_benchmark_{}".format(source))



Found 720 validated image filenames.
Found 180 validated image filenames.
  ...
    to  
  ['...']
Train for 12 steps, validate for 3 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 00024: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 00033: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 00038: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 00041: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 42/100
Epoc

In [14]:
img, labels = get_train_val_datagen(train, 'bw')[0].next()

res = model.predict(x=img)



Found 720 validated image filenames.
Found 180 validated image filenames.


In [15]:
metric(labels, res)

<tf.Tensor: shape=(), dtype=float64, numpy=33.93138974280649>

In [16]:
# [(int(true), float(pred)) for (true, pred) in zip(labels, res)]

### TEST

In [17]:
def get_test_datagen(test, source):
    fraction = source_to_fraction[source]
    
    datagen = ImageDataGenerator()

    test_generator = datagen.flow_from_dataframe(
            test.sample(n=int(len(test))),
            directory="kaggle_problems/rosneft_proppant/workspace/data/{}_main_area".format(fraction),
            x_col='filename',
            y_col='ImageId',
            target_size=(TARGET_SHAPE[0] // COEF_COMPRESS // SUB_IMG_CNT, TARGET_SHAPE[1] // COEF_COMPRESS // SUB_IMG_CNT),
            batch_size=512,
            class_mode='other',
    )
    return test_generator


In [18]:
test = get_test("bw")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
sub_imgs = []
real_img = []
for img in test.ImageId:
    sub_imgs.extend(range(img * SUB_IMG_CNT * SUB_IMG_CNT, (img + 1) * SUB_IMG_CNT * SUB_IMG_CNT))
    real_img.extend([img for i in range(SUB_IMG_CNT * SUB_IMG_CNT)])
sub_imgs = [str(i) + ".jpg" for i in sub_imgs]

In [20]:
test_datagen = get_test_datagen(pd.DataFrame({"filename": sub_imgs, "ImageId": real_img}), 'bw')

Found 549 validated image filenames.


In [21]:
model = tf.keras.models.load_model(MODEL_DIR + "/model_benchmark_{}".format('bw'))

In [22]:
train = get_train('bw')
model.evaluate(get_train_val_datagen(train, 'bw')[0])

Found 720 validated image filenames.
Found 180 validated image filenames.
  ...
    to  
  ['...']


3587.0137532552085

In [23]:
preds = model.predict(test_datagen)

In [24]:
print(max(preds))

[1376.1759]


In [25]:
aggreg_cnt = collections.defaultdict(int)
for image_num, pred in zip(real_img, preds):
    aggreg_cnt[image_num] += int(pred)
    
print(aggreg_cnt)

defaultdict(<class 'int'>, {6: 2097, 31: 2315, 41: 1906, 42: 4103, 47: 1931, 69: 4786, 71: 2776, 73: 1793, 100: 3564, 112: 4058, 117: 3522, 151: 3050, 152: 3840, 161: 3964, 179: 1412, 186: 2465, 191: 3887, 199: 3896, 205: 2946, 214: 2155, 215: 1807, 216: 2658, 227: 3953, 237: 3364, 240: 4423, 307: 2826, 308: 4332, 312: 2773, 348: 1459, 351: 5161, 359: 4144, 383: 5052, 416: 2395, 424: 3729, 440: 2600, 449: 3331, 452: 3667, 492: 1929, 496: 2050, 498: 3380, 524: 3471, 547: 2900, 553: 2290, 579: 4267, 592: 4082, 616: 2951, 653: 3642, 655: 3733, 674: 2223, 678: 3143, 682: 2319, 683: 5551, 689: 3124, 709: 2221, 711: 5947, 714: 3250, 732: 4595, 736: 5183, 751: 3770, 768: 2058, 776: 2366})


In [26]:
defaultdict(<class 'int'>, {6: 7549, 31: 11275, 41: 5850, 42: 5875, 47: 11829, 69: 9829, 71: 7550, 73: 9170, 100: 10823, 112: 5889, 117: 9189, 151: 13850, 152: 8307, 161: 10690, 179: 8785, 186: 8090, 191: 13614, 199: 9164, 205: 9314, 214: 8934, 215: 11915, 216: 7945, 227: 8223, 237: 11938, 240: 8155, 307: 12821, 308: 6196, 312: 8587, 348: 8558, 351: 9712, 359: 7594, 383: 11820, 416: 9806, 424: 8824, 440: 11779, 449: 10007, 452: 8605, 492: 9129, 496: 5592, 498: 7969, 524: 10570, 547: 11009, 553: 9549, 579: 10877, 592: 7845, 616: 6783, 653: 11501, 655: 11596, 674: 12109, 678: 9458, 682: 8411, 683: 7279, 689: 8084, 709: 11495, 711: 8775, 714: 10245, 732: 11428, 736: 6242, 751: 6232, 768: 8550, 776: 6410})




SyntaxError: invalid syntax (<ipython-input-26-95d810cafca5>, line 1)

In [None]:
test

In [None]:
print(res)

In [None]:
model.summary()

In [None]:
def get_bins_metric(predicted, true):
    return 0.5 * np.sum((predicted - true) ** 2 / (predicted + true)) / predicted.shape[0]

def get_bins_metric_by_image(predicted, true):
    return np.sum(0.5 * (predicted - true) ** 2 / (predicted + true), axis=1)

def get_bins_metric_by_bins(predicted, true):
    return np.sum(0.5 * (predicted - true) ** 2 / (predicted + true), axis=0)

In [None]:
#print("Total bin loss: {}".format(get_bins_metric(predicted_labels, all_labels)))

In [None]:
# for source, i in zip(sources, range(len(sources))):
#     fraction = source_to_fraction[source]
#     print(source + "-" * 100)
#     from keras.utils.generic_utils import get_custom_objects

#     get_custom_objects().update({'metric': metric})

#     with open(MODEL_DIR + "/history_model_benchmark_{}.pickle".format(source), 'rb') as f:
#         history = pickle.load(f)

#     model = tf.keras.models.load_model(MODEL_DIR + "/model_benchmark_{}".format(source), 
#                                        compile=False)
#     model.compile(
#         loss=metric,
#         optimizer='rmsprop',
#        # metrics=['mse']
#     )                                                                               

#     display_history_metrics(history, source)
#     print(source + '-' * 100)
    
#     train_datagen = get_train_val_datagen(train, fraction)[0]

#     predicted_labels = []
#     all_labels = []
#     train_fraction = train[train['fraction'] == fraction]

#     for i in range(int(train_fraction.shape[0]) // 16):
#         imgs, labels = train_datagen.next()
#         predicted_labels.extend(model.predict(imgs))
#         all_labels.extend(labels)
#     predicted_labels = np.array(predicted_labels)
#     all_labels = np.array(all_labels)

#     losses_by_img = get_bins_metric_by_image(predicted_labels, all_labels)
#     plt.hist(losses_by_img, bins=100)
#     plt.show()

#     losses_by_bins = get_bins_metric_by_bins(predicted_labels, all_labels)
#     plt.hist(losses_by_bins, bins=100)
#     plt.show()
#     print("-" * 50)

In [None]:
!jupyter nbconvert --to script kaggle_problems/rosneft_proppant/cnn_try.ipynb