# Classifier Training

In [1]:
# Copy the bundled mmdetection tree to the filesystem root.
! rsync -a /kaggle/input/mmdetection-v280/mmdetection /
# Install the remaining dependencies from local directories under /kaggle/input.
! pip install /kaggle/input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3/
! pip install /kaggle/input/hpapytorchzoo/pytorch_zoo-master/
! pip install /kaggle/input/hpacellsegmentation/HPA-Cell-Segmentation/
! pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

# Copy kgl_humanprotein_data and kgl_humanprotein from the input datasets to /.
! cp -r /kaggle/input/kgl-humanprotein-data/kgl_humanprotein_data /
! cp -r /kaggle/input/humanpro/kgl_humanprotein /

# Add the copied kgl_humanprotein directory to the module search path.
import sys
sys.path.append('/kgl_humanprotein/')

Processing /kaggle/input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3
Building wheels for collected packages: mmpycocotools
  Building wheel for mmpycocotools (setup.py) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for mmpycocotools: filename=mmpycocotools-12.0.3-cp37-cp37m-linux_x86_64.whl size=272906 sha256=8f8785e8e8d2895ed1acee813300b4f8aed66cee1bad7dd55335b6fe483c6852
  Stored in directory: /root/.cache/pip/wheels/80/e0/da/3288fdf3965b5c9090f368462db9d28be2c82013f51821090a
Successfully built mmpycocotools
Installing collected packages: mmpycocotools
Successfully installed mmpycocotools-12.0.3
Processing /kaggle/input/hpapytorchzoo/pytorch_zoo-master
Building wheels for collected packages: pytorch-zoo
  Building wheel for pytorch-zoo (setup.py) ... [?25l- done
[?25h  Created wheel for pytorch-zoo: filename=pytorch_zoo-0.0.0-py3-none-any.whl size=30139 sha256=bb75f7da229f0298f5639cae2ec4d9e45bcf6711e748ee8b4c4358875a2758e8

In [2]:
import os
import time
from pathlib import Path
import shutil
import zipfile
import functools
import multiprocessing
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import KFold,StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
from torch.backends import cudnn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import DataParallel
import matplotlib.pyplot as plt
from tqdm import tqdm

from kgl_humanprotein.utils.common_util import *
from kgl_humanprotein.config.config import *
from kgl_humanprotein.data_process import *
from kgl_humanprotein.datasets.tool import image_to_tensor
from kgl_humanprotein.datasets.protein_dataset import *
from kgl_humanprotein.networks.imageclsnet import init_network
from kgl_humanprotein.layers.loss import *
from kgl_humanprotein.layers.scheduler import *
from kgl_humanprotein.utils.augment_util import train_multi_augment2
from kgl_humanprotein.utils.log_util import Logger
from kgl_humanprotein.run.train import *

run on 8b77f733f61f


In [3]:
# Switch the notebook's working directory to /kaggle.
%cd /kaggle

/kaggle


## Combine subsets' metadata

In [4]:
# Input datasets live under /kaggle/input; generated metadata is written under /kaggle/mdata.
_kaggle_root = Path('/kaggle')
dir_data = _kaggle_root / 'input'
dir_mdata = _kaggle_root / 'mdata'
# Number of data subsets whose metadata gets combined.
n_subsets = 5

In [5]:
%%time
# Build one metadata frame from the `n_subsets` subsets found under `dir_data`.
# NOTE(review): combine_subsets_metadata is a project helper; the schema it returns
# is not visible in this notebook — confirm against its definition.
df_cells = combine_subsets_metadata(dir_data, n_subsets)

Processing subset 4...CPU times: user 1.55 s, sys: 870 ms, total: 2.42 s
Wall time: 6.98 s


In [6]:
# Persist the combined metadata as <dir_mdata>/raw/train.feather, creating the folder if needed.
dir_mdata_raw = dir_mdata.joinpath('raw')
dir_mdata_raw.mkdir(parents=True, exist_ok=True)

df_cells.to_feather(dir_mdata_raw.joinpath('train.feather'))

In [7]:
# Drop the in-memory frame; it is read back from the feather file before filtering.
del df_cells

## Filter samples

In [8]:
# Limit number of samples per label

def cap_number_per_label(df_cells, cap=10_000, idx_start=0):
    """Limit how many rows are kept for each distinct ``Target`` value.

    A label with more than ``cap`` rows keeps only the rows at positions
    ``idx_start`` to ``idx_start + cap`` of its own subset (in the frame's
    existing row order). A label with ``cap`` rows or fewer is kept whole.

    Parameters
    ----------
    df_cells : pandas.DataFrame
        Frame with a ``Target`` column.
    cap : int, default 10_000
        Maximum number of rows retained for an over-represented label.
    idx_start : int, default 0
        Offset of the retained window inside an over-represented label's rows.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh ``RangeIndex``; labels appear in order of first
        occurrence. Empty input yields an empty frame with the same columns.
    """
    parts = []
    for label in df_cells.Target.unique():
        subset = df_cells[df_cells.Target == label]
        if len(subset) > cap:
            subset = subset.iloc[idx_start:idx_start + cap]
        # A NaN label never compares equal to itself, so its subset is empty; skip it.
        if len(subset):
            parts.append(subset)

    if not parts:
        # pd.concat refuses an empty list; hand back an empty frame with the input schema.
        return df_cells.iloc[:0].reset_index(drop=True)

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(parts, ignore_index=True)

In [9]:
# Reload the combined metadata from <dir_mdata>/raw/train.feather.
df_cells = pd.read_feather(dir_mdata_raw/'train.feather')

In [10]:
# Keep only the rows whose '|'-separated Target lists more than one label.
# (number of labels == number of separators + 1)
n_labels = df_cells['Target'].map(lambda target: target.count('|') + 1)
df_cells = df_cells.loc[n_labels > 1]

In [11]:
# NOTE(review): looks like a rough run-time estimate in hours, assuming about
# 6.5 seconds per 100 samples — the constants are not explained here; confirm.
len(df_cells) / 100 * 6.5 / 60**2

4.362222222222222

In [12]:
# For testing, just take a few samples (e.g. 100); using the full length keeps
# every row and only shuffles them.
n_sample = len(df_cells)  # 100

# The global NumPy/PyTorch seeds are only set further down, so seed the shuffle
# explicitly here; otherwise the row order differs on every run.
df_cells = df_cells.sample(n_sample, random_state=0).reset_index(drop=True)

In [13]:
# Row count per distinct Target string after filtering.
df_cells.Target.value_counts()

16|0        29562
2|0         13113
12|0        11575
7|0         11129
0|13        10728
            ...  
9|1             8
16|6|2|0        8
14|1|2|0        8
9|7|13          8
16|7|3          7
Name: Target, Length: 413, dtype: int64

In [14]:
# Overwrite raw/train.feather with the filtered, shuffled rows.
df_cells.to_feather(dir_mdata_raw/'train.feather')

## One-hot encode labels

In [15]:
%%time
# NOTE(review): generate_meta is a project helper; presumably it one-hot encodes
# `Target` from raw/train.feather and writes meta/train_meta.feather, the file the
# validation dataset loads later — confirm against its definition.
generate_meta(dir_mdata, 'train.feather')

CPU times: user 1min 20s, sys: 1.09 s, total: 1min 22s
Wall time: 1min 21s


## Validation

In [16]:
# Expose only one CUDA device to this process and turn on cuDNN benchmarking.

gpu_id = '0'

os.environ.update(CUDA_VISIBLE_DEVICES=gpu_id)
cudnn.benchmark = True

# Seed NumPy and PyTorch (CPU generator and every CUDA device) with 0.
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [17]:
# Load trained model

arch = 'class_densenet121_dropout'
num_classes = len(LABEL_NAME_LIST)
in_channels = 4
resume = Path('/kaggle/input/humanpro-classifier-crop/results/models/'
              'external_crop256_focal_slov_hardlog_class_densenet121_dropout_i384_aug2_5folds/'
              'fold0/final.pth')

# Settings handed to the project's init_network factory.
model_params = {
    'architecture': arch,
    'num_classes': num_classes,
    'in_channels': in_channels,
}
model = init_network(model_params)

# Wrap for multi-GPU execution and move the network to DEVICE.
model = DataParallel(model)
model.to(DEVICE)

# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved from a
# different device (e.g. another GPU index, or GPU -> CPU) can still be restored.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(resume, map_location=DEVICE)
# The weights are loaded into the wrapped network (model.module), not the DataParallel shell.
model.module.load_state_dict(checkpoint['state_dict'])

>> Using pre-trained model.


<All keys matched successfully>

In [18]:
# Build the dataset and loader that feed the validation pass.

# Image geometry
img_size = 384
crop_size = 256

# Loader settings
batch_size = 64
workers = 3
pin_memory = True

# Metadata file describing the samples to evaluate
valid_file = dir_mdata / 'meta' / 'train_meta.feather'
assert valid_file.exists()

valid_dataset = ProteinDataset(
    dir_data,
    valid_file,
    img_size=img_size,
    is_trainset=True,
    return_label=True,
    in_channels=in_channels,
    transform=None,
    crop_size=crop_size,
    random_crop=False,
)

# SequentialSampler walks the dataset indices in order, with no shuffling.
# NOTE(review): later cells assume dataset index i matches row i of df_cells — confirm.
valid_loader = DataLoader(
    valid_dataset,
    sampler=SequentialSampler(valid_dataset),
    batch_size=batch_size,
    drop_last=False,
    num_workers=workers,
    pin_memory=pin_memory,
)

In [19]:
# loss function
# `criterion` is evaluated per batch inside validate() and again per sample afterwards;
# `focal_loss` is evaluated once inside validate() on the stacked logits and labels.

criterion = FocalSymmetricLovaszHardLogLoss().to(DEVICE)
focal_loss = FocalLoss().to(DEVICE)

In [20]:
from torch.autograd import Variable
import torch.nn.functional as F
from sklearn.metrics import f1_score

def validate(valid_loader, model, criterion, epoch, focal_loss, threshold=0.5):
    """Run one pass over ``valid_loader`` and return metrics plus the raw outputs.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()``, so the function is safe to call without an outer
    no-grad context.

    Parameters
    ----------
    valid_loader : iterable
        Yields ``(images, labels, indices)`` batches; ``indices`` is not used.
    model : callable
        Network mapping an image batch to logits; must provide ``eval()``.
    criterion : callable
        Called per batch as ``criterion(logits, labels, epoch=epoch)``.
    epoch
        Passed through to ``criterion`` unchanged.
    focal_loss : object with ``forward``
        Evaluated once on the stacked logits and labels (as CPU tensors).
    threshold : float, default 0.5
        A probability strictly greater than this counts as a positive
        prediction for the F1 score.

    Returns
    -------
    tuple
        ``(losses.avg, accuracy.avg, valid_focal_loss, kaggle_score, probs,
        y_true, logits, loss, acc)`` where ``probs``/``y_true``/``logits`` are
        the row-stacked per-sample arrays, ``loss``/``acc`` are column vectors
        with one entry per batch, and ``kaggle_score`` is the macro F1.

    Raises
    ------
    ValueError
        If ``valid_loader`` yields no batches.
    """
    losses = AverageMeter()
    accuracy = AverageMeter()

    # switch to evaluate mode
    model.eval()

    probs_list = []
    labels_list = []
    logits_list = []
    loss_list = []
    acc_list = []

    with torch.no_grad():
        for images, labels, _indices in valid_loader:
            # Tensors need no Variable wrapper; just move them to the device.
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            logits = model(images)
            loss = criterion(logits, labels, epoch=epoch)

            # torch.sigmoid replaces the deprecated F.sigmoid (same values).
            probs = torch.sigmoid(logits)
            acc = multi_class_acc(probs, labels)

            probs_list.append(probs.cpu().detach().numpy())
            labels_list.append(labels.cpu().detach().numpy())
            logits_list.append(logits.cpu().detach().numpy())
            loss_list.append(loss.item())
            acc_list.append(acc.item())

            losses.update(loss.item())
            accuracy.update(acc.item())

        if not probs_list:
            # np.vstack would fail with a cryptic message on an empty list.
            raise ValueError('valid_loader yielded no batches; nothing to validate')

        probs = np.vstack(probs_list)
        y_true = np.vstack(labels_list)
        logits = np.vstack(logits_list)
        # vstack on scalars gives one row per batch, i.e. shape (n_batches, 1).
        loss = np.vstack(loss_list)
        acc = np.vstack(acc_list)
        valid_focal_loss = focal_loss.forward(torch.from_numpy(logits), torch.from_numpy(y_true))

    y_pred = probs > threshold
    kaggle_score = f1_score(y_true, y_pred, average='macro')

    return losses.avg, accuracy.avg, valid_focal_loss, kaggle_score, probs, y_true, logits, loss, acc

In [21]:
%%time 

# Run the validation pass with autograd disabled; `epoch` is passed as 0.
# Besides the summary metrics, validate() returns the stacked probabilities, labels
# and logits, plus the per-batch loss and accuracy arrays.
with torch.no_grad():
    (valid_loss, valid_acc, valid_focal_loss, kaggle_score, 
     probs, y_true, logits, loss, acc) = validate(valid_loader, model, criterion, 0, focal_loss)

CPU times: user 8min 40s, sys: 2min 52s, total: 11min 33s
Wall time: 1h 17min 24s


In [22]:
# Per-sample loss: evaluate `criterion` on one row at a time. This replaces the
# per-batch `loss` array returned by validate().
# Convert once, outside the loop; each one-row slice keeps the 2-D (1, n_classes) shape.
logits_t = torch.from_numpy(logits)
y_true_t = torch.from_numpy(y_true)

# .item() stores plain Python floats rather than tensor objects, so building the
# array does not rely on NumPy's implicit tensor conversion (result is float64).
loss = np.array([
    criterion(logits_t[i:i + 1], y_true_t[i:i + 1]).item()
    for i in range(len(logits_t))
])

In [23]:
# Probability above which a class counts as predicted.
prediction_threshold = 0.3

# 0/1 prediction matrix, one row per sample.
preds = (probs > prediction_threshold).astype(np.int16)

# Predicted class ids per sample, joined with '|'.
ptargets = ['|'.join(map(str, np.flatnonzero(row))) for row in preds]

# Class ids that are both in the ground truth and predicted, joined with '|'
# (an empty string when there is no overlap).
ntargets = ['|'.join(map(str, np.flatnonzero(row))) for row in y_true * preds]

In [24]:
# Preserve the ground-truth labels under `original_target`. The guard keeps this
# cell re-runnable: once `Target` has been overwritten below, renaming it a second
# time would create a duplicate `original_target` column.
if 'original_target' not in df_cells.columns:
    df_cells = df_cells.rename(columns={'Target': 'original_target'})
df_cells['predicted_target'] = ptargets
df_cells['predicted_target_loss'] = loss.flatten()
# New `Target`: the labels that are both annotated and predicted (may be empty).
df_cells['Target'] = ntargets

In [25]:
# Write the frame with the added prediction columns to the notebook's working directory.
df_cells.to_feather('/kaggle/working/train.feather')

In [26]:
# Preview the first rows, including the new prediction columns.
df_cells.head()

Unnamed: 0,Id,rle,bbox,original_target,max_green,subset,predicted_target,predicted_target_loss,Target
0,3d386b44-bba3-11e8-b2b9-ac1f6b6435d0_16,{'counts': b'`o]g04ko12N2L6F>H1O1M3M3O1O1M3fNZ...,"[374, 1726, 874, 2048]",16|5,255,1,5|6,4.119574,5.0
1,e4cd80b2-bba1-11e8-b2b9-ac1f6b6435d0_16,{'counts': b'`oU3480Wo14cPNO\\o1;M3I7O1O1M3I7O...,"[50, 1955, 150, 2048]",14|0,255,4,4,1.931588,
2,d6855b58-bbbd-11e8-b2ba-ac1f6b6435d0_11,{'counts': b'YeeU2c0\\o12N2N4L4L2N2N4Lj0VO2N1O...,"[1114, 1293, 1970, 1910]",14|4,255,4,14,2.521235,14.0
3,ec7dca56-bbc4-11e8-b2bc-ac1f6b6435d0_13,{'counts': b'\\RUl0<co12N2L6G9L2N2N4K8I2N2L4M4...,"[450, 1007, 814, 1310]",2|0,255,4,0|6,2.24248,0.0
4,76bed770-bbb9-11e8-b2ba-ac1f6b6435d0_10,{'counts': b'nUUi0\\14PO0H00^m2[2M3L4K5N1O2N2N...,"[268, 851, 1150, 1517]",0|13,50,2,9|13,3.471165,13.0


In [27]:
# ntop = 5
# idxs = loss.flatten().argsort()[::-1][:ntop]

# ncols = 4
# nrows = (ntop - 1) // ncols + 1
# fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 5))
# axs = axs.flatten()
# for ax in axs:
#     ax.axis('off')

# for ax, i in zip(axs, idxs):
#     img = valid_dataset[i][0].permute(1, 2, 0).numpy()
#     ax.imshow(img[...,[0, 3, 2]])
    
#     ax.set_title(f'GT {df_cells.original_target.iloc[i]}    Pred {ptargets[i]}  Loss {loss[i].item():.2f}')