# Record the maximum value of each RGBY channel in the raw metadata

In [1]:
# Copy the mmdetection tree from the input dataset to the filesystem root.
! rsync -a /kaggle/input/mmdetection-v280/mmdetection /
# Install packages from local directories under /kaggle/input.
! pip install /kaggle/input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3/
! pip install /kaggle/input/hpapytorchzoo/pytorch_zoo-master/
! pip install /kaggle/input/hpacellsegmentation/HPA-Cell-Segmentation/
! pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

# Copy the kgl_humanprotein_data and kgl_humanprotein directories to the filesystem root.
! cp -r /kaggle/input/kgl-humanprotein-data/kgl_humanprotein_data /
! cp -r /kaggle/input/humanpro/kgl_humanprotein /

import sys
# Put the copied project directory on the module search path.
# NOTE(review): presumably /kgl_humanprotein contains the `kgl_humanprotein`
# package that the import cell relies on -- confirm the directory layout.
sys.path.append('/kgl_humanprotein/')

Processing /kaggle/input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3
Building wheels for collected packages: mmpycocotools
  Building wheel for mmpycocotools (setup.py) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for mmpycocotools: filename=mmpycocotools-12.0.3-cp37-cp37m-linux_x86_64.whl size=272908 sha256=fc7fdcf735de49f70e4ccb674846f8007c24cf1aacbcf7eee45dd81e22d45e0e
  Stored in directory: /root/.cache/pip/wheels/80/e0/da/3288fdf3965b5c9090f368462db9d28be2c82013f51821090a
Successfully built mmpycocotools
Installing collected packages: mmpycocotools
Successfully installed mmpycocotools-12.0.3
Processing /kaggle/input/hpapytorchzoo/pytorch_zoo-master
Building wheels for collected packages: pytorch-zoo
  Building wheel for pytorch-zoo (setup.py) ... [?25l- \ done
[?25h  Created wheel for pytorch-zoo: filename=pytorch_zoo-0.0.0-py3-none-any.whl size=30139 sha256=a64e73b472f697bdcb1b3ffb4272166df25e06fe8e28d0311e98f4be5ba7

In [2]:
import os
import time
from pathlib import Path
import shutil
import zipfile
import functools
import multiprocessing
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import KFold,StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
from torch.backends import cudnn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import DataParallel
import matplotlib.pyplot as plt
from tqdm import tqdm

from kgl_humanprotein.utils.common_util import *
from kgl_humanprotein.config.config import *
from kgl_humanprotein.data_process import *
from kgl_humanprotein.datasets.tool import image_to_tensor
from kgl_humanprotein.networks.imageclsnet import init_network
from kgl_humanprotein.layers.loss import *
from kgl_humanprotein.layers.scheduler import *
from kgl_humanprotein.utils.augment_util import train_multi_augment2
from kgl_humanprotein.utils.log_util import Logger
from kgl_humanprotein.run.train import *

run on 230322a9e782


In [3]:
# Switch the kernel's working directory to /kaggle.
%cd /kaggle

/kaggle


In [4]:
# dir_data is the root under which load_image looks up the per-subset image folders.
# dir_mdata is defined here but not used by the cells visible in this notebook section.
dir_data, dir_mdata = Path('/kaggle/input'), Path('/kaggle/mdata')

In [5]:
# Number of CPUs reported for this machine; used as the worker count of the
# multiprocessing pool that scans the images.
CPU_COUNT = multiprocessing.cpu_count()

In [6]:
%%time

# Number of subsets handed to combine_subsets_metadata.
# NOTE(review): the cell output ends at "Processing subset 4", so subsets are
# presumably numbered 0..4 -- confirm against the helper.
n_subsets = 5
# NOTE(review): presumably returns a single DataFrame with one row per cell
# (later cells read its `Id` and `subset` columns) -- confirm against the helper.
df_cells = combine_subsets_metadata(dir_data, n_subsets)

Processing subset 4...CPU times: user 1.84 s, sys: 1.83 s, total: 3.67 s
Wall time: 8.9 s


In [7]:
# Back-of-envelope figure derived from the number of rows in df_cells.
# NOTE(review): presumably an estimate, in hours, of the per-cell image scan,
# assuming ~20 s per 1000 cells (60**2 = seconds per hour) -- confirm.
len(df_cells) / 1000 * 20 / 60**2

2.7215055555555554

In [8]:
# Shuffle all rows and renumber the index 0..n-1.
# A fixed random_state keeps the row order -- and therefore the file written at
# the end of the notebook -- identical across Restart & Run All executions.
nsample = len(df_cells)
df_cells = df_cells.sample(nsample, random_state=0).reset_index(drop=True)

In [9]:
def load_image(r, dir_data):
    """Load the image belonging to one metadata row.

    Args:
        r: Row-like object exposing `subset` and `Id` attributes.
        dir_data: Root path (supports the `/` operator) that contains the
            `humanpro-train-cells-subset<k>` dataset folders.

    Returns:
        Whatever `load_RGBY_image` returns for the row's image folder and `Id`.
    """
    subset_root = dir_data / f'humanpro-train-cells-subset{r.subset}'
    image_dir = (subset_root
                 / f'humanpro_train_cells_subset{r.subset}'
                 / 'train'
                 / 'images_384')
    return load_RGBY_image(image_dir, r.Id)

def get_max_rgby(img):
    """Return the maximum value of every channel of `img`.

    Channels are taken along the last axis; the result is a 1-D array with one
    entry per channel, in channel order.
    """
    # Reduce over every axis except the trailing (channel) axis in one call.
    leading_axes = tuple(range(img.ndim - 1))
    return np.max(img, axis=leading_axes)

def iget_max_rgby(i):
    """Return the per-channel maxima for the image of row `i` of `df_cells`.

    Takes only a positional row index so it can be mapped over a range of
    indices; the frame and data root are read from the module-level
    `df_cells` and `dir_data`.
    """
    row = df_cells.iloc[i]
    return get_max_rgby(load_image(row, dir_data))

In [10]:
%%time

# Scan every cell image in parallel and collect its per-channel maxima.
# pool.imap yields results in task order, so row i of `chmaxs` belongs to
# row i of df_cells. An explicit chunksize hands tasks to the workers in
# batches instead of one at a time, which cuts dispatch overhead for the very
# large number of small tasks without changing the results or their order.
with multiprocessing.Pool(processes=CPU_COUNT) as pool:
    chmaxs = list(pool.imap(iget_max_rgby, range(len(df_cells)), chunksize=256))

# Stack the per-image vectors into a single (n_cells, n_channels) array.
chmaxs = np.stack(chmaxs, axis=0)

CPU times: user 1min 59s, sys: 28.6 s, total: 2min 27s
Wall time: 2h 5min 6s


In [11]:
# chmaxs holds one row per df_cells row, in the same order, and one column per
# image channel.
# NOTE(review): the column names assume load_RGBY_image returns channels in
# red, green, blue, yellow order -- confirm.
# NOTE(review): the head() output shows `max_green` ahead of `subset`, so that
# column presumably already existed in the metadata and is overwritten here.
df_cells[['max_red', 'max_green', 'max_blue', 'max_yellow']] = chmaxs

In [12]:
# Preview the first rows to check the max_* columns.
df_cells.head()

Unnamed: 0,Id,rle,bbox,Target,max_green,subset,max_red,max_blue,max_yellow
0,f0f9a766-bba2-11e8-b2b9-ac1f6b6435d0_41,{'counts': b'iVWV37io13L2N2N2aPN5fn1e0L3L4N2N1...,"[1635, 1146, 1872, 1562]",2,250,4,194,128,238
1,8ec3411e-bbbc-11e8-b2ba-ac1f6b6435d0_12,{'counts': b'eYo<W1hn22N2N4K9TO`NiRMb1Vm2^NiRM...,"[138, 1224, 486, 1525]",5|0,254,2,170,59,197
2,6d2d0024-bbc8-11e8-b2bc-ac1f6b6435d0_3,{'counts': b'\\add24d<0`CX1_a1jNojN`1oT1bNojN`...,"[1354, 310, 1846, 1098]",9|12|4|3,255,2,236,255,255
3,a5450940-bbc1-11e8-b2bb-ac1f6b6435d0_3,{'counts': b'TPdk22ko14N10001O1O1O001O1O1O1O2N...,"[1466, 0, 2048, 446]",6,153,3,255,138,255
4,0d716f64-bbc1-11e8-b2bb-ac1f6b6435d0_0,{'counts': b'Z1d1\\n10001N2N2N2N100O2N1O10O10O...,"[0, 0, 570, 424]",0,251,0,255,226,255


In [13]:
# Write the metadata, now including the per-channel maxima, to the working directory.
df_cells.to_feather('/kaggle/working/train.feather')

In [14]:
# CHANNEL_THRES = 120

# is_dark_red = df_cells.max_red < CHANNEL_THRES
# is_dark_green = df_cells.max_green < CHANNEL_THRES
# is_dark_blue = df_cells.max_blue < CHANNEL_THRES
# is_dark_yellow = df_cells.max_yellow < CHANNEL_THRES
# is_dark = is_dark_red & is_dark_green & is_dark_blue & is_dark_yellow

In [15]:
# df_dark = df_cells[is_dark].reset_index(drop=True)

In [16]:
# nplot = 10
# df = df_dark.sample(nplot).reset_index(drop=True) if len(df_dark) > nplot else df_dark.copy()

In [17]:
# ncols = 4
# nrows = (nplot - 1) // ncols + 1
# fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(4*ncols, 4*nrows))
# axs = axs.flatten()
# for ax in axs:
#     ax.axis('off')
# for ax, (_, r) in zip(axs, df.iterrows()):
#     img = load_image(r, dir_data)
#     chmaxs = get_max_rgby(img)
#     ax.imshow(img[...,[0,1,2]])
#     ax.set_title(f'{r.Target} {chmaxs}')
    
# plt.tight_layout()