# Import Libraries

In [None]:
import numpy as np
import pandas as pd 
import os
import cv2
from matplotlib import pyplot as plt
from kaggle_datasets import KaggleDatasets

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers as L
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn import metrics

%matplotlib inline

# TPU setup

In [None]:
#
# TPU initialisation: detect a TPU and pick the matching distribution strategy.
#
try:
    # Ask for the TPU cluster description. The except clause below treats a
    # ValueError raised in this try body as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    # Report the address of the TPU that was found.
    print('Running on TPU:', tpu.master())
# Fallback when the lookup above raised: remember that no TPU is present.
except ValueError:
    tpu = None

# A TPU was found (Kaggle accelerator setting: TPU).
if tpu:
    # Connect to the remote cluster and initialise the TPU system.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Distribute training across the TPU replicas (data parallelism).
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
# No TPU (Kaggle accelerator setting: None): use TensorFlow's current strategy.
else:
    strategy = tf.distribute.get_strategy()

# Let tf.data choose the level of parallelism / prefetching automatically.
AUTO = tf.data.experimental.AUTOTUNE
# GCS location of the attached Kaggle dataset; image paths are built from it below.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Global batch size: 8 samples per replica of the selected strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
# Edge length (pixels) that images are resized to; also the model's input size.
IMG_SIZE = 768

print('Batch size:', BATCH_SIZE)

# Get train and test data

In [None]:
# Competition metadata: training table, test table and the submission template.
train = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/train.csv')
test = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/test.csv')
sub = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv')

print(train.head())


def _gcs_image_path(image_id):
    """Return the GCS path of the JPEG that belongs to ``image_id``."""
    return f'{GCS_DS_PATH}/images/{image_id}.jpg'


# Arrays of image locations for both splits, in the row order of the CSV files.
train_path = train['image_id'].apply(_gcs_image_path).values
test_path = test['image_id'].apply(_gcs_image_path).values
# Label matrix: every column from 'healthy' through the last column of train.csv.
train_label = train.loc[:, 'healthy':].values

# Get class weights

In [None]:
# Collapse each label row to the index of its largest entry, once, so the same
# class indices feed both `classes` and `y`.
train_class_idx = np.argmax(train_label, axis=1)

# `classes` and `y` are passed by keyword: recent scikit-learn releases declare
# them keyword-only and reject the positional form, while the keyword form is
# accepted by older releases too.
# NOTE(review): in the visible notebook these weights are only plotted; the
# model.fit call does not receive them - confirm whether that is intended.
class_weight = compute_class_weight(
    'balanced',
    classes=np.unique(train_class_idx),
    y=train_class_idx,
)

# One bar per class actually present, instead of a hard-coded count of 4.
plt.bar(range(len(class_weight)), class_weight)

# Let's see some images

In [None]:
# Preview the first four training images (Train_0 .. Train_3) in a 2x2 grid.
fig, ax = plt.subplots(2, 2)

# ax.flat walks the grid row by row, so image i lands in the same cell as before:
# (0, 0), (0, 1), (1, 0), (1, 1).
for sample_idx, axis in enumerate(ax.flat):
    sample_file = f'/kaggle/input/plant-pathology-2020-fgvc7/images/Train_{sample_idx}.jpg'
    img = cv2.imread(sample_file)
    # cv2.imread returns None instead of raising when a file cannot be read;
    # fail here with the offending path rather than inside cvtColor.
    if img is None:
        raise FileNotFoundError(sample_file)
    # OpenCV loads pixels as BGR; matplotlib expects RGB.
    axis.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# Decode images

In [None]:
# Decoding step of the input pipeline.
def decode_image(filename, label=None, image_size=(IMG_SIZE, IMG_SIZE)):
    """Read a JPEG file and return it as a resized float32 image tensor.

    Args:
        filename: Path of the JPEG file, handed to ``tf.io.read_file``.
        label: Optional label; returned unchanged alongside the image.
        image_size: Target size passed to ``tf.image.resize``. Defaults to
            ``(IMG_SIZE, IMG_SIZE)``.

    Returns:
        The image tensor when ``label`` is None, otherwise ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Cast to float32 and divide by 255 before resizing.
    scaled = tf.cast(pixels, tf.float32) / 255.0
    resized = tf.image.resize(scaled, image_size)

    if label is not None:
        return resized, label
    return resized

# Augmentation step of the input pipeline.
def data_augment(image, label=None):
    """Apply a random horizontal flip followed by a random vertical flip.

    Args:
        image: Image tensor to augment.
        label: Optional label; returned unchanged alongside the image.

    Returns:
        The augmented image when ``label`` is None, otherwise ``(image, label)``.
    """
    mirrored = tf.image.random_flip_left_right(image)
    flipped = tf.image.random_flip_up_down(mirrored)

    if label is not None:
        return flipped, label
    return flipped

In [None]:
# Training input pipeline.
train_dataset = (
    # The inputs are in-memory arrays of paths and labels, not TFRecord files,
    # so the pipeline starts from the generic tf.data.Dataset.
    tf.data.Dataset
    # One element per (image path, label row) pair.
    .from_tensor_slices((train_path, train_label))
    # Read, decode and resize the images in parallel.
    .map(decode_image, num_parallel_calls=AUTO)
    # Cache the decoded images BEFORE augmentation: caching after the random
    # flips would freeze the first pass's flips and replay them every epoch.
    .cache()
    # Random flips, re-drawn on every pass over the cached images.
    .map(data_augment, num_parallel_calls=AUTO)
    .repeat()
    .shuffle(1024)  # randomise the sample order
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Test input pipeline: decode and batch only, in the order of test_path.
# No random augmentation is applied here; randomly flipping inference inputs
# would make the predictions differ from run to run.
test_dataset = (
    # Built from an in-memory array of paths, not from TFRecord files.
    tf.data.Dataset
    .from_tensor_slices(test_path)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

# Define the parameters

In [None]:
# Number of training epochs; also the number of points in the schedule plot.
EPOCHS = 40
# Learning rate at epoch 0, where the linear ramp in lrfn starts.
LR_START = 0.0001
# Learning rate reached at the end of the ramp, scaled by the replica count.
# NOTE(review): if num_replicas_in_sync is 1 this is 0.00005, i.e. below both
# LR_START and LR_MIN, so the "ramp" would fall and the "decay" would rise -
# confirm these values are intended for non-TPU runs.
LR_MAX = 0.00005 * strategy.num_replicas_in_sync
# Value the exponential decay in lrfn converges towards.
LR_MIN = 0.0001
# Length of the linear ramp, in epochs.
LR_RAMPUP_EPOCHS = 10
# Number of epochs the rate is held at LR_MAX after the ramp.
LR_SUSTAIN_EPOCHS = 4
# Per-epoch multiplicative factor of the decay phase.
LR_EXP_DECAY = .8

def lrfn(epoch):
    """Return the learning rate for ``epoch``.

    The schedule has three phases: a linear ramp from LR_START to LR_MAX over
    LR_RAMPUP_EPOCHS epochs, a plateau at LR_MAX for LR_SUSTAIN_EPOCHS epochs,
    then an exponential approach towards LR_MIN with factor LR_EXP_DECAY.
    """
    # Phase 1: linear ramp.
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START

    # Phase 2: hold the peak rate.
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    # Phase 3: exponential decay, counted from the end of the plateau.
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN

# Wrap lrfn in a Keras LearningRateScheduler callback for use in model.fit.
lr = tf.keras.callbacks.LearningRateScheduler(lrfn)

# Plot the scheduled learning rate for every epoch of the run.
y = list(map(lrfn, range(EPOCHS)))
plt.plot(y)

# ResNet

In [None]:
# Backbone: ResNet152. keras.applications also ships ResNet50 and ResNet101
# from the same family.
from tensorflow.keras.applications import ResNet152

# Build and compile inside the strategy scope so the model's variables are
# created under the selected distribution strategy.
with strategy.scope():
    # ImageNet-pretrained convolutional base without its classification head.
    rsn152 = ResNet152(include_top=False, weights='imagenet', input_shape=(IMG_SIZE, IMG_SIZE, 3))

    # Backbone -> global average pooling -> 4-way softmax classifier.
    model_rsn152 = Sequential([
        rsn152,
        L.GlobalAveragePooling2D(),
        L.Dense(4, activation='softmax'),
    ])
    model_rsn152.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model_rsn152.summary()

# Train the model

In [None]:
# Checkpoint callback: monitors the training loss and, with save_best_only,
# writes weights-only snapshots to weights_rsn152.h5.
mc_rsn152 = tf.keras.callbacks.ModelCheckpoint(
    'weights_rsn152.h5',
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
)

# train_dataset repeats indefinitely, so the epoch length is set explicitly to
# the number of whole batches in the training set.
history = model_rsn152.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=len(train_label) // BATCH_SIZE,
    callbacks=[lr, mc_rsn152],
)

In [None]:
# Restore the weights stored in weights_rsn152.h5 (the file the ModelCheckpoint
# callback writes during training), inside the distribution-strategy scope.
with strategy.scope():
    model_rsn152.load_weights('weights_rsn152.h5')

# Predict

In [None]:
# Class probabilities for every test image, in the order of test_dataset.
probs_rsn152 = model_rsn152.predict(test_dataset, verbose=1)

# Work on a copy of the submission template: plain assignment would only alias
# `sub`, so writing the predictions would silently overwrite the template too.
sub_rsn152 = sub.copy()
# Fill every column from 'healthy' onwards with the predicted probabilities.
sub_rsn152.loc[:, 'healthy':] = probs_rsn152
sub_rsn152.to_csv('submission_rsn152.csv', index=False)
sub_rsn152.head()