# Diabetic Process

In [None]:
# #First Do This.
# !kaggle competitions download -c diabetic-retinopathy-detection

# !unzip diabetic-retinopathy-detection.zip

# !unzip sampleSubmission.csv.zip
# !unzip trainLabels.csv.zip

# !cat test.zip* > ./test.zip
# !unzip test.zip

# !cat train.zip* > ./train.zip
# !unzip train.zip

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Display a few images from the preprocessed training folder, one figure
# per file, in the order listed below.
sample_paths = (
    './train_preprocess/10_left.jpeg',
    './train_preprocess/10_right.jpeg',
    './train_preprocess/13_left.jpeg',
    './train_preprocess/13_right.jpeg',
)
for sample_path in sample_paths:
    img = mpimg.imread(sample_path)
    plt.imshow(img)
    plt.show()

## 문제점
1. 이미지의 크기가 다르다
2. 몇몇의 이미지는 음영이 고르지 않다.
3. 몇몇의 이미지는 cup과 disc의 위치가 중앙에 잡혀있다.

## Image preprocess
1. Left image flip => 좌안의 이미지를 반전시킨다.
2. Image histogram equalization => 이미지 평활화를 통해 음영을 고르게 한다.
3. Image resize => 이미지의 크기가 제각각이므로 resize한다.
4. (Optional) ROI cut process => 만약 해당 도메인에서 관심 영역이 정해져있다면 해당 관심영역만을 절단하여 사용한다.

### Image resize

In [None]:
# Import libraries
from PIL import Image
import glob
import os
from shutil import copyfile

# Collect every entry found directly under the raw training folder
image_files_list = glob.glob('train/*')

# Make sure the output directory exists (no error if it is already there)
new_dir = 'train_preprocess/'
os.makedirs(new_dir, exist_ok=True)

# Resize every image to 448x448 and store it under `new_dir` with the
# same file name as the source.
for image_file in image_files_list:
    # The context manager closes the source file handle as soon as the
    # resized copy has been produced.
    with Image.open(image_file) as source_image:
        im = source_image.resize((448, 448))
    # os.path handles the platform's path separator, unlike split("/")
    new_image_path = os.path.join(new_dir, os.path.basename(image_file))
    im.save(new_image_path)

### Left image flip process

In [None]:
# Import libraries
from PIL import Image
import glob
import os

# Get filenames
image_files_list = glob.glob('train_preprocess/*')

# Mirror every left-eye image horizontally.
# File names are expected to look like "<id>_<side>.<ext>".
# NOTE: the flipped image overwrites the original file, so running this
# cell a second time flips the left images back again.
for image_file in image_files_list:
    name_parts = os.path.basename(image_file).split("_")
    # Skip names without an underscore instead of raising IndexError
    if len(name_parts) < 2:
        continue
    if name_parts[1].split(".")[0] != "left":
        continue
    # Close the source handle before writing back to the same path
    with Image.open(image_file) as source_image:
        im = source_image.transpose(Image.FLIP_LEFT_RIGHT)
    im.save(image_file)

### Image histogram equalization

In [None]:
# Import libraries
import glob

import cv2

# Get filenames
image_files_list = glob.glob('train_preprocess/*')

# Histogram-equalize every preprocessed image and overwrite it in place
for image_file in image_files_list:
    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread returns None (it does not raise) when the file cannot
        # be read or decoded; skip it rather than crash inside cvtColor.
        print("Skipping unreadable image:", image_file)
        continue
    # Equalize only channel 0 of the YUV representation (Y); the U and V
    # channels are left untouched, then the image is converted back to BGR.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2YUV)
    image[:, :, 0] = cv2.equalizeHist(image[:, :, 0])
    image = cv2.cvtColor(image, cv2.COLOR_YUV2BGR)
    cv2.imwrite(image_file, image)

In [None]:
!zip train_preprocess.zip ./train_preprocess/*

## Model pipeline
1. csv(data frame)을 기준으로 image를 차례차례 불러오는 data_generator를 만들기
2. image plotting을 통해 image generator 결과 보이기
3. 모델 선언 및 학습하기
4. 결과 보이기

### 1. train_df sampling
(1) train_df를 2번 label에 맞춰 각각 under & over sampling을 진행함.

In [None]:
# train_df = pd.read_csv("trainLabels.csv")

# df_class_0 = train_df[train_df['level'] == 0]
# df_class_1 = train_df[train_df['level'] == 1]
# df_class_2 = train_df[train_df['level'] == 2]
# df_class_3 = train_df[train_df['level'] == 3]
# df_class_4 = train_df[train_df['level'] == 4]

# print(train_df.level.value_counts())
# count_class = train_df.level.value_counts()[2]

# df_class_0_under = df_class_0.sample(count_class)
# df_class_1_over = df_class_1.sample(count_class, replace=True)
# df_class_3_over = df_class_3.sample(count_class, replace=True)
# df_class_4_over = df_class_4.sample(count_class, replace=True)
# train_df = pd.concat([df_class_0_under, df_class_1_over, df_class_2, df_class_3_over, df_class_4_over], axis=0)

# train_df['level'].hist(figsize = (10, 5))

# train_df['image']= train_df['image'] + '.jpeg'
# train_df = shuffle(train_df)
# train_df.to_csv('train_df.csv', sep=',', na_rep='')

(2) train_df를 정상 label에 맞춰 각각 over sampling을 진행함.

In [None]:
# train_df = pd.read_csv("trainLabels.csv")

# df_class_0 = train_df[train_df['level'] == 0]
# df_class_1 = train_df[train_df['level'] == 1]
# df_class_2 = train_df[train_df['level'] == 2]
# df_class_3 = train_df[train_df['level'] == 3]
# df_class_4 = train_df[train_df['level'] == 4]

# print(train_df.level.value_counts())
# count_class = train_df.level.value_counts()[0]

# df_class_1_over = df_class_1.sample(count_class, replace=True)
# df_class_2_over = df_class_2.sample(count_class, replace=True)
# df_class_3_over = df_class_3.sample(count_class, replace=True)
# df_class_4_over = df_class_4.sample(count_class, replace=True)
# train_df = pd.concat([df_class_0, df_class_1_over, df_class_2_over, df_class_3_over, df_class_4_over], axis=0)

# train_df['level'].hist(figsize = (10, 5))

# train_df['image']= train_df['image'] + '.jpeg'
# train_df = shuffle(train_df)
# train_df.to_csv('train_df.csv', sep=',', na_rep='')

### 2. Deep learning 진행하기
1. 성능 향상시키기
2. 딥러닝에서 발생하는 여러 문제 해결하기

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os, datetime

from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam


In [2]:
# Code to work around memory-related issues: turn on memory growth for
# every physical GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be the same on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be set before the GPUs are initialized;
        # report the problem and carry on.
        print(err)


In [3]:
# Load the train_df produced by the sampling steps above and turn the
# `level` column into strings (it is used as the categorical label column
# by the data generators further down).
train_df = pd.read_csv("train_df.csv")
train_df = train_df.astype({'level': str})

FileNotFoundError: [Errno 2] File b'train_df.csv' does not exist: b'train_df.csv'

In [None]:
# Hyper-parameters shared by the cells below
epochs = 128       # number of training epochs passed to model.fit
image_size = 448   # images are fed to the network as image_size x image_size
batch = 64         # batch size used by the data generators

# Make sure the checkpoint directory exists. os.makedirs with exist_ok=True
# avoids the check-then-create race of isdir() followed by mkdir().
new_dir = 'ckpt/'
os.makedirs(new_dir, exist_ok=True)

In [None]:
# Every generator multiplies pixel values by 1/255.
pixel_scale = 1./255

# Generator for training and validation data; 20% is reserved for the
# 'validation' subset.
train_image_generator = ImageDataGenerator(
    rescale=pixel_scale,
    validation_split=0.2,
)

# Generator for test data.
test_image_generator = ImageDataGenerator(rescale=pixel_scale)

In [None]:
# Options shared by the training and validation iterators: both read file
# names from the `image` column and labels from the `level` column of
# train_df, and load the files from ./train_preprocess.
flow_options = dict(
    dataframe=train_df,
    directory="./train_preprocess",
    x_col="image",
    y_col="level",
    class_mode="categorical",
    shuffle=True,
    target_size=(image_size, image_size),
    batch_size=batch,
)

# The two iterators differ only in which side of the validation split
# they draw from.
train_data_gen = train_image_generator.flow_from_dataframe(
    subset='training', **flow_options)

valid_data_gen = train_image_generator.flow_from_dataframe(
    subset='validation', **flow_options)

In [None]:
def plotImages(images_arr):
    """Draw images side by side in a grid of 1 row and 5 columns.

    images_arr: iterable of images that ``imshow`` can display. Only five
    axes exist, so at most five images are drawn; each axis that receives
    an image has its ticks and frame hidden.
    """
    _figure, axis_grid = plt.subplots(1, 5, figsize=(20, 20))
    axis_row = axis_grid.flatten()
    for picture, axis in zip(images_arr, axis_row):
        axis.imshow(picture)
        axis.axis('off')
    plt.tight_layout()
    plt.show()


# Pull one batch from the training iterator (labels are discarded) and
# show its first five images.
sample_training_images, _ = next(train_data_gen)
plotImages(sample_training_images[:5])

In [None]:
# Build the CNN layer by layer; the layer order matches the summary printed
# further down.
model = Sequential()

# Stage 1: 3x3 convolution on the full-resolution RGB input.
model.add(Conv2D(8, 3, padding='same', activation='relu',
                 input_shape=(image_size, image_size, 3)))
model.add(Dropout(0.5))

# Stage 2: a strided 1x1 convolution (stride 2) followed by a 3x3 and a
# 1x1 convolution.
model.add(Conv2D(16, 1, strides=2, padding='same', activation='relu'))
model.add(Conv2D(16, 3, padding='same', activation='relu'))
model.add(Conv2D(16, 1, padding='same', activation='relu'))
model.add(Dropout(0.5))

# Stage 3: same pattern with 8 filters.
model.add(Conv2D(8, 1, strides=2, padding='same', activation='relu'))
# Alternative tried for the next layer (kept for reference, disabled):
#     Conv2D(8, 3, padding='same', activation='relu', activity_regularizer=l1_l2(l1=5e-6, l2=5e-6))
model.add(Conv2D(8, 3, padding='same', activation='relu'))
model.add(Conv2D(8, 1, padding='same', activation='relu'))
model.add(Dropout(0.5))

# Classifier head: flatten, one regularized hidden layer, then a 5-way
# softmax output.
model.add(Flatten())
model.add(Dense(128, activation="relu",
                activity_regularizer=l1_l2(l1=2e-5, l2=2e-5)))
model.add(Dense(5, activation="softmax"))

In [None]:
# TensorBoard log directory named after the current local time,
# i.e. logs/<YYYYmmdd-HHMMSS>.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_stamp)

# Adam with a learning rate of 1e-4, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 448, 448, 8)       224       
_________________________________________________________________
dropout (Dropout)            (None, 448, 448, 8)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 224, 224, 16)      144       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 224, 224, 16)      2320      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 224, 224, 16)      272       
_________________________________________________________________
dropout_1 (Dropout)          (None, 224, 224, 16)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 112, 112, 8)       1

In [None]:
# Checkpointing: with save_best_only=True the model is written only when
# the monitored `val_loss` improves. The file path is fixed (it does not
# include the epoch), so each save overwrites the previous best model.
best_checkpoint = ModelCheckpoint(
    filepath="ckpt/mymodel_best.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)

# Early stopping is currently disabled. If re-enabled with the settings
# below, it would monitor `val_loss` with min_delta=1e-4 and patience=10.
# early_stopping = EarlyStopping(
#     monitor="val_loss",
#     min_delta=1e-4,
#     patience=10,
#     verbose=1,
# )

# TensorBoard logging into the timestamped `logdir`.
tensorboard_logger = TensorBoard(logdir, histogram_freq=1)

callbacks = [best_checkpoint, tensorboard_logger]

In [None]:
# Train on the training iterator, evaluate on the validation iterator, and
# keep the returned History object for the plots below.
history = model.fit(
    x=train_data_gen,
    validation_data=valid_data_gen,
    epochs=epochs,
    callbacks=callbacks,
)

In [None]:
# Per-epoch curves recorded by model.fit
recorded = history.history
acc = recorded['accuracy']
val_acc = recorded['val_accuracy']

loss = recorded['loss']
val_loss = recorded['val_loss']

# One entry per subplot:
# (training curve, training label, validation curve, validation label,
#  legend position, title)
panels = (
    (acc, 'Training Accuracy', val_acc, 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (loss, 'Training Loss', val_loss, 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
)

plt.figure(figsize=(8, 8))
for position, panel in enumerate(panels, start=1):
    train_curve, train_label, valid_curve, valid_label, corner, heading = panel
    # Accuracy goes in the left subplot, loss in the right one
    plt.subplot(1, 2, position)
    plt.plot(range(len(train_curve)), train_curve, label=train_label)
    plt.plot(range(len(valid_curve)), valid_curve, label=valid_label)
    plt.legend(loc=corner)
    plt.title(heading)
plt.show()