# Histopathologic Cancer Detection

## 1. Import Libraries

In [None]:
import os
import random
from glob import glob
from random import shuffle
import cv2

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Tensorflow and Keras: Deep Learning(DL)
import tensorflow as tf
print('Tensorflow Version:', tf.__version__)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D, GlobalAveragePooling2D, Flatten, Concatenate
from tensorflow.keras.applications.densenet import DenseNet169, preprocess_input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint

# 데이터 증강을 위한 패키지
import imgaug.augmenters as iaa
import imgaug as ia

# 경고 메시지 무시
import warnings
warnings.filterwarnings('ignore')

Tensorflow Version: 2.12.0


### 1-1. Fix Random Seed

In [None]:
def seed_everything(seed: int = 42):
    """Seed every random number source used in this notebook.

    Exports ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` as environment
    variables and seeds the ``random``, NumPy and TensorFlow generators
    with ``seed``.
    """
    # Environment flags first, then the three in-process generators.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything(seed=42)

## 2. Load Dataset

In [None]:
# Load the training labels from train_labels.csv and preview the first rows
df_train = pd.read_csv('train_labels.csv')
df_train.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [None]:
# Lookup table: image id -> label, built from the two columns of df_train.
id_label_map = dict(zip(df_train['id'].values, df_train['label'].values))

In [None]:
def get_id_from_file_path(file_path):
    """
    Return the image id encoded in an image file path.

    The id is the file name without its directory part and without a
    trailing ``.tif`` extension. Only a suffix is stripped, so a ``.tif``
    that appears elsewhere in the name is kept.

    Args:
        file_path: Path of an image file, e.g. ``train/<id>.tif``.

    Returns:
        The file name with the directory and the ``.tif`` suffix removed.
    """
    extension = '.tif'
    # basename() understands every separator valid on the current platform.
    file_name = os.path.basename(file_path)
    if file_name.endswith(extension):
        file_name = file_name[:-len(extension)]
    return file_name

In [None]:
# List the train and test image files.
# glob() returns paths in arbitrary order, so the lists are sorted to keep the
# seeded train/validation split and the prediction order reproducible.
# Both patterns require a real ".tif" extension.
train_files = sorted(glob('train/*.tif'))
test_files = sorted(glob('test/*.tif'))

print('[train files size (label O)]', len(train_files))
print('[test files size (label X)]', len(test_files))

[train files size (label O)] 220025
[test files size (label X)] 57458


In [None]:
# Split the labelled files into a training set and a validation set
# (test_size=0.1 holds out 10% for validation; random_state fixes the split)
train, val = train_test_split(train_files, test_size=0.1, random_state=42)

### 2-1. Data augmentation

In [None]:
def chunker(seq, size):
    """
    Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    The final slice is shorter when ``len(seq)`` is not a multiple of
    ``size``; an empty ``seq`` produces no slices.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [None]:
def get_seq():
    """
    Build and return the imgaug augmentation pipeline.

    The pipeline is an ``iaa.Sequential`` with ``random_order=True`` that
    holds two flips, an affine transform wrapped in ``Sometimes(0.5, ...)``,
    and an ``iaa.SomeOf((0, 5), ...)`` group of heavier augmenters.
    """
    # Wrap an augmenter so that it is applied with probability 0.5
    sometimes = lambda aug: iaa.Sometimes(0.5, aug)
    seq = iaa.Sequential(
        [
            # Augmentations applied to most images
            iaa.Fliplr(0.5), # flip the image horizontally (probability 0.5)
            iaa.Flipud(0.2), # flip the image vertically (probability 0.2)
            sometimes(iaa.Affine(
                scale={"x": (0.9, 1.1), "y": (0.9, 1.1)}, # scaling
                translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}, # translation
                rotate=(-10, 10), # rotation
                shear=(-5, 5), # shear
                order=[0, 1], # interpolation order, either 0 or 1 (NOTE(review): presumably nearest-neighbour / bilinear - confirm)
                cval=(0, 255), # fill value between 0 and 255, used when the mode is constant
                mode=ia.ALL # use any of scikit-image's warping modes
            )),

            # Apply only a random subset of the following list so that strong augmentations occur less often
            iaa.SomeOf((0, 5),
                [
                    sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # superpixel representation of the image
                    iaa.OneOf([
                        iaa.GaussianBlur((0, 1.0)), # Gaussian blur with sigma between 0 and 1
                        iaa.AverageBlur(k=(3, 5)), # local-mean blur with kernel size between 3 and 5
                        iaa.MedianBlur(k=(3, 5)), # local-median blur with kernel size between 3 and 5
                    ]),
                    iaa.Sharpen(alpha=(0, 1.0), lightness=(0.9, 1.1)), # sharpening
                    iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # embossing

                    # Detect edges or directed edges and blend the result with the original image using a blobby mask
                    iaa.SimplexNoiseAlpha(iaa.OneOf([
                        iaa.EdgeDetect(alpha=(0.5, 1.0)),
                        iaa.DirectedEdgeDetect(alpha=(0.5, 1.0), direction=(0.0, 1.0)),
                    ])),
                    iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.01*255), per_channel=0.5), # add Gaussian noise
                    iaa.OneOf([
                        iaa.Dropout((0.01, 0.05), per_channel=0.5), # randomly drop pixels (fraction range 0.01 to 0.05)
                        iaa.CoarseDropout((0.01, 0.03), size_percent=(0.01, 0.02), per_channel=0.2),
                    ]),
                    iaa.Invert(0.01, per_channel=True), # invert the image colors
                    iaa.Add((-2, 2), per_channel=0.5), # adjust the image brightness
                    iaa.AddToHueAndSaturation((-1, 1)), # adjust hue and saturation

                    # Adjust the brightness of the whole image or of sub-regions
                    iaa.OneOf([
                        iaa.Multiply((0.9, 1.1), per_channel=0.5),
                        iaa.FrequencyNoiseAlpha(
                            exponent=(-1, 0),
                            first=iaa.Multiply((0.9, 1.1), per_channel=True),
                            second=iaa.ContrastNormalization((0.9, 1.1))
                        )
                    ]),
                    sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally (with random strength)
                    sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))), # move parts of the image
                    sometimes(iaa.PerspectiveTransform(scale=(0.01, 0.1)))
                ],
                random_order=True
            )
        ],
        random_order=True
    )
    return seq

In [None]:
def data_gen(list_files, id_label_map, batch_size, augment=False):
    """
    Endlessly yield ``(images, labels)`` batches for training or validation.

    Every pass reshuffles the files, splits them into chunks of
    ``batch_size``, reads each image with ``cv2.imread``, optionally runs the
    ``get_seq()`` augmentation pipeline and finally applies
    ``preprocess_input``.

    Args:
        list_files: Image file paths. The sequence is copied, so the
            caller's list is never reordered.
        id_label_map: Mapping from image id (see ``get_id_from_file_path``)
            to label.
        batch_size: Maximum number of images per yielded batch.
        augment: When true, augment each batch before preprocessing.

    Yields:
        Tuple ``(X, Y)`` of NumPy arrays: the preprocessed images and their
        labels.

    Raises:
        ValueError: If ``list_files`` is empty (the loop could never yield).
        OSError: If an image file cannot be read.
    """
    seq = get_seq()
    # Private copy: shuffling must not reorder a list the caller still uses
    # (the same `train` / `val` lists are passed to several generators).
    files = list(list_files)
    if not files:
        # Without this guard the `while True` loop would spin forever.
        raise ValueError('list_files is empty; no batches can be generated')
    while True:
        shuffle(files)
        for batch in chunker(files, batch_size):
            X = []
            for path in batch:
                # NOTE: cv2.imread returns channels in BGR order. That is kept
                # as-is because inference in this notebook reads images the
                # same way.
                image = cv2.imread(path)
                if image is None:
                    # cv2.imread signals failure with None, not an exception.
                    raise OSError(f'Could not read image file: {path}')
                X.append(image)
            Y = [id_label_map[get_id_from_file_path(path)] for path in batch]
            if augment:
                X = seq.augment_images(X)
            X = [preprocess_input(x) for x in X]

            yield np.array(X), np.array(Y)

## 3. Model
- 평가지표: area under the ROC curve(AUC)

### 3-1. Model Definition
- `DenseNet169` 모델 사용

In [None]:
def get_model_classifier():
    """
    Build, compile and return the DenseNet169-based binary classifier.

    A 96x96x3 input is passed through DenseNet169 (``include_top=False``).
    The resulting feature map is summarised three ways (global max pooling,
    global average pooling and flattening). The three vectors are
    concatenated, passed through dropout (0.5) and fed to a single sigmoid
    unit named ``output_1``. The model is compiled with Adam (learning rate
    0.0001), binary cross-entropy and the ``acc`` metric, and its summary is
    printed before it is returned.
    """
    input_shape = (96, 96, 3)
    image_input = Input(input_shape)
    backbone = DenseNet169(include_top=False, input_shape=input_shape)
    feature_map = backbone(image_input)

    # Same order as the concatenation: max pool, average pool, flatten.
    pooled = [
        head()(feature_map)
        for head in (GlobalMaxPooling2D, GlobalAveragePooling2D, Flatten)
    ]
    merged = Concatenate(axis=-1)(pooled)
    merged = Dropout(0.5)(merged)
    prediction = Dense(1, activation='sigmoid', name='output_1')(merged)

    classifier = Model(image_input, prediction)
    classifier.compile(
        optimizer=Adam(0.0001),
        loss=binary_crossentropy,
        metrics=['acc'],
    )
    classifier.summary()
    return classifier

In [None]:
# Build and compile the classifier (this also prints the model summary)
model = get_model_classifier()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 96, 96, 3)]  0           []                               
                                                                                                  
 densenet169 (Functional)       (None, 3, 3, 1664)   12642880    ['input_1[0][0]']                
                                                                                                  
 global_max_pooling2d (GlobalMa  (None, 1664)        0           ['densenet169[0][0]']            
 xPooling2D)                                                                                      
                                             

### 3-2. Model Training Step
**모델 학습**
- 1단계: 배치 사이즈(batch size)=32, 에폭(epoch)=2
- 2단계: 배치 사이즈(batch size)=64, 에폭(epoch)=6

In [None]:
# Checkpoint: write the model to model.h5 whenever val_acc improves.
# The same callback instance is shared by both stages, so the best val_acc
# seen in stage 1 remains the bar to beat in stage 2.
h5_path = 'model.h5'
checkpoint = ModelCheckpoint(h5_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# Two training stages as (batch size, epochs):
#   stage 1: batch size 32, 2 epochs
#   stage 2: batch size 64, 6 epochs
training_stages = [(32, 2), (64, 6)]

# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated. After the loop `batch_size` is 64 and `history` holds the
# stage 2 history, exactly as with the two separate calls.
for batch_size, epochs in training_stages:
    history = model.fit(
        data_gen(train, id_label_map, batch_size, augment=True),
        validation_data=data_gen(val, id_label_map, batch_size),
        epochs=epochs,
        verbose=1,
        callbacks=[checkpoint],
        # The generators never end, so the epoch lengths must be explicit.
        steps_per_epoch=len(train) // batch_size,
        validation_steps=len(val) // batch_size)

Epoch 1/2
Epoch 1: val_acc improved from -inf to 0.89729, saving model to model.h5
Epoch 2/2
Epoch 2: val_acc improved from 0.89729 to 0.93146, saving model to model.h5
Epoch 1/6
Epoch 1: val_acc improved from 0.93146 to 0.94133, saving model to model.h5
Epoch 2/6
Epoch 2: val_acc improved from 0.94133 to 0.95583, saving model to model.h5
Epoch 3/6
Epoch 3: val_acc did not improve from 0.95583
Epoch 4/6
Epoch 4: val_acc improved from 0.95583 to 0.95779, saving model to model.h5
Epoch 5/6
Epoch 5: val_acc improved from 0.95779 to 0.96235, saving model to model.h5
Epoch 6/6
Epoch 6: val_acc did not improve from 0.96235


## 4. Submission

### 4-1. Model Test Step

In [None]:
# Load the weights stored in model.h5 into the model
model.load_weights('model.h5')

In [None]:
preds = []
ids = []

# Test-time augmentation: each batch is scored as-is and in three flipped
# variants; the final score is the geometric mean of the four predictions.
for batch in chunker(test_files, batch_size):
    X = np.array([preprocess_input(cv2.imread(path)) for path in batch])
    views = (
        X,
        X[:, ::-1, :, :],     # reversed along axis 1
        X[:, ::-1, ::-1, :],  # reversed along axes 1 and 2
        X[:, :, ::-1, :],     # reversed along axis 2
    )

    product = None
    for view in views:
        scores = model.predict(view).ravel()
        product = scores if product is None else product * scores

    preds.extend((product ** 0.25).tolist())
    ids.extend(get_id_from_file_path(path) for path in batch)

In [None]:
# Assemble the submission table: one row per test image id with its predicted score
submission = pd.DataFrame({'id':ids, 'label':preds})
submission.head()

Unnamed: 0,id,label
0,45bc9f78a688df66516f0fdb0d1815a10454f09d,0.998742
1,d66fa4288e74f9122e47093cfd7f09ffa31c493a,0.000114
2,328f42b7cee36aadb484c0aae20239e7a86f2360,0.001099
3,73a55794536e3051ac2de99ce4ecad04341edc6f,0.004453
4,9dead04bd738455c9698ee566c7a512413a54eda,0.545126


In [None]:
# Write the submission file without the DataFrame index column
submission.to_csv('submission.csv', index=False)