**Nhóm: A-Team**

**Thành viên:**

* Vũ Quốc Anh - 20C11003
* Văn Khải Nguyên - 20C11009
* Hà Hữu Pháp - 20C11010
* Nguyễn Trần Duy Minh - 20C11041
* Phạm Anh Việt - 20C11060

# 1. Cài đặt môi trường

## 1.1 Cài đặt mạng EfficientNet

In [None]:
!pip install efficientnet

## 1.2 Import các thư viện cần thiết

In [None]:
import math, re, os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from kaggle_datasets import KaggleDatasets
import tensorflow as tf
import tensorflow.keras.layers as L
import efficientnet.tfkeras as efn
from keras.applications.densenet import DenseNet201
from sklearn import metrics
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

## 1.3 Phát hiện TPU

In [None]:
# Let tf.data choose parallelism and prefetch buffer sizes on its own.
AUTO = tf.data.experimental.AUTOTUNE

# Probe for a TPU. A ValueError raised while resolving it is treated as
# "no TPU available" rather than as a failure.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolved: fall back to TensorFlow's current default strategy.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the
    # strategy on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

## 1.4 Load dataset từ Google Cloud Storage (GCS)

In [None]:
# Data access: GCS location of the competition dataset as reported by
# KaggleDatasets; format_path() prefixes image paths with it.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('plant-pathology-2020-fgvc7')

# Configuration
# EPOCHS is the number of epochs passed to model.fit.
EPOCHS = 40
# Global batch size: 8 samples for each replica of the distribution strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
IM_Z = 768 # height and width (in pixels) that every image is resized to

In [None]:
def format_path(st):
    """Return the full path of the JPEG for image id ``st``.

    The path is ``GCS_DS_PATH`` followed by ``/images/<st>.jpg``.
    """
    return ''.join((GCS_DS_PATH, '/images/', st, '.jpg'))

# 2. Tổng quan về bộ dữ liệu

## 2.1 Đọc thông tin bộ dataset
* Số lượng mẫu bộ train và test
* Số lượng mẫu theo từng class

In [None]:
# Competition metadata: image ids for train and test (train also carries the
# label columns) plus the submission template that is filled in later.
train = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/train.csv')
test = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/test.csv')
sub = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv')

# Turn every image id into the full path of its JPEG.
train_paths = train.image_id.apply(format_path).values
test_paths = test.image_id.apply(format_path).values

N = len(train_paths)
print(f"- Độ dài bộ train: {N}")
print(f"- Độ dài bộ test: {len(test_paths)}")

# Per-class sample counts. Summing each label column gives an exact integer;
# the earlier `mean() * N` round trip produced floats (e.g. 516.0) that are
# subject to rounding error.
# Assumes the label columns hold 0/1 indicators — TODO confirm.
N_healthy = int(train['healthy'].sum())
N_multiple_diseases = int(train['multiple_diseases'].sum())
N_rust = int(train['rust'].sum())
N_scab = int(train['scab'].sum())

print(f'Healthy: {N_healthy} mẫu')
print(f'Multiple diseases: {N_multiple_diseases} mẫu')
print(f'Rust: {N_rust} mẫu')
print(f'Scab: {N_scab} mẫu')

# Last expression of the cell: the notebook displays the summary statistics.
train.describe()

## 2.2 Chia bộ train / validation và tách nhãn thành one-hot vector

In [None]:
# Split into train / validation sets.
# Labels are every column from 'healthy' onward, taken as one row vector per
# image (described as one-hot by the notebook).

train_labels = train.loc[:, 'healthy':].values

print(f"- Train labels:\n {train_labels}")

# Hold out 10% of the samples for validation; the fixed random_state keeps
# the split the same from run to run.
# Note: train_paths / train_labels are rebound to the training part only.
train_paths, valid_paths, train_labels, valid_labels = train_test_split(
    train_paths, train_labels, test_size=0.1, random_state=2020)

print(f" - Độ dài bộ train: {len(train_paths)}")
print(f" - Độ dài bộ validation: {len(valid_paths)}")

# 3. Cài đặt mô hình sử dụng Ensemble learning và Transfer learning

## 3.1 Hàm helper
* decode_image(): load hình và tiền xử lý (scale về 0->1 và resize)
* data_augment(): tăng cường data bằng cách lật theo chiều ngang và chiều dọc

In [None]:
def decode_image(filename, label=None, image_size=(IM_Z, IM_Z)):
    """Read a JPEG file and return it as a resized float32 image.

    The file is decoded with 3 channels, cast to float32, divided by 255
    and then resized to ``image_size``.

    Returns the image alone when ``label`` is None, otherwise the pair
    ``(image, label)`` with the label passed through untouched.
    """
    raw = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    resized = tf.image.resize(scaled, image_size)

    if label is not None:
        return resized, label
    return resized

def data_augment(image, label=None):
    """Apply a random left/right flip followed by a random up/down flip.

    Returns the augmented image alone when ``label`` is None, otherwise the
    pair ``(image, label)`` with the label passed through untouched.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )

    if label is not None:
        return flipped, label
    return flipped

## 3.2 Chuẩn bị data

In [None]:
# Training pipeline: (path, label) pairs -> decoded images -> augmentation ->
# endless, shuffled batches.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_paths, train_labels))
    .map(decode_image, num_parallel_calls=AUTO)
    # Cache the decoded images so decode/resize is not redone on later passes.
    # Augmentation is mapped after the cache, so the random flips are drawn
    # again each time an element is read.
    .cache()
    .map(data_augment, num_parallel_calls=AUTO)
    # The dataset repeats indefinitely; model.fit bounds each epoch through
    # steps_per_epoch.
    .repeat()
    # NOTE(review): shuffling after repeat() lets the 512-element buffer mix
    # samples across epoch boundaries — confirm this is intended.
    .shuffle(512)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: no augmentation and no shuffling; whole batches are
# cached after the first pass.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((valid_paths, valid_labels))
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: paths only (no labels), decoded and batched in file order.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

## 3.3 Các hàm callback
* Hàm thay đổi learning rate động
* Checkpoint callback

In [None]:
def build_lrfn(lr_start=0.00001, lr_max=0.000075, 
               lr_min=0.000001, lr_rampup_epochs=20, 
               lr_sustain_epochs=0, lr_exp_decay=.8):
    """Build an epoch -> learning-rate function.

    The schedule has three phases:
      * linear ramp from ``lr_start`` to the peak over ``lr_rampup_epochs``;
      * the peak value held for ``lr_sustain_epochs``;
      * exponential decay (factor ``lr_exp_decay`` per epoch) towards ``lr_min``.

    The peak is ``lr_max`` multiplied by the number of replicas of the
    module-level ``strategy``.

    Returns:
        A function taking the epoch index and returning the learning rate.
    """
    peak_lr = lr_max * strategy.num_replicas_in_sync
    sustain_end = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        # Phase 1: linear warm-up.
        if epoch < lr_rampup_epochs:
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Phase 2: hold the peak.
        if epoch < sustain_end:
            return peak_lr
        # Phase 3: exponential decay towards lr_min.
        return (peak_lr - lr_min) * lr_exp_decay**(epoch - lr_rampup_epochs - lr_sustain_epochs) + lr_min

    return lrfn

# Checkpoint callback: writes the model weights only (not the full model) to
# model_ef1.h5 during training, logging each save (verbose=1).
# NOTE(review): save_best_only is not set, so monitor='val_loss' presumably has
# no effect and the file is overwritten on every save — confirm whether
# save_best_only=True was intended.
ch_p = ModelCheckpoint(filepath="model_ef1.h5", monitor='val_loss', save_weights_only=True,
                                                 verbose=1)

## 3.4 Tạo mạng transfer learning 1 dùng pretrained model là EfficientNetB7

In [None]:
# Build and compile the model inside the strategy scope so that it is created
# under the distribution strategy selected earlier.
with strategy.scope():
    model = tf.keras.Sequential([
        # ImageNet-pretrained EfficientNetB7 without its classification head.
        efn.EfficientNetB7(
            input_shape=(IM_Z, IM_Z, 3),
            weights='imagenet',
            include_top=False
        ),
        L.GlobalAveragePooling2D(),
        # One softmax output per label column.
        L.Dense(train_labels.shape[1], activation='softmax')
    ])
        
    model.compile(
        optimizer='adam',
        loss = 'categorical_crossentropy',
        metrics=['categorical_accuracy']
    )
#     model.summary()

In [None]:
# Learning-rate schedule with the default ramp-up / decay parameters, wrapped
# in a Keras callback that logs the rate it applies (verbose=1).
lrfn = build_lrfn()
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)
# Number of full batches in the training split; needed because the training
# dataset repeats forever.
STEPS_PER_EPOCH = train_labels.shape[0] // BATCH_SIZE 

## 3.5 Train mạng transfer learning 1

In [None]:
# Switch between training and reusing saved weights:
#   0 -> train the model here;
#   1 -> skip training (weights are loaded from a saved file in a later cell).
SAVED_MODEL = 1

if SAVED_MODEL == 0:
    # `history` only exists when training actually ran.
    history = model.fit(
        train_dataset, 
        epochs=EPOCHS, 
        callbacks=[lr_schedule, ch_p],
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_data=valid_dataset
    )

In [None]:
if SAVED_MODEL == 0:
    # Training curves are only available when the model was trained in this
    # session: one figure per metric, train vs. validation.
    print(history.history.keys())

    curves = (
        # (train key, validation key, figure title, y-axis label)
        ('categorical_accuracy', 'val_categorical_accuracy',
         'model categorical accuracy', 'categorical accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for train_key, val_key, title, y_label in curves:
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()

In [None]:
# When training was skipped, restore the weights from the attached
# dataset's model_ef1.h5 file instead.
if SAVED_MODEL == 1:
    model.load_weights("../input/pathology-model-effb7/model_ef1.h5")


In [None]:
# CROPPED: 1 adds a centre-cropped prediction pass to the ensemble, 0 skips it.
CROPPED = 0
# Fraction passed to tf.image.central_crop by decode_image_crop.
CROP_RATIO = 0.9
def decode_image_crop(filename, label=None, image_size=(IM_Z, IM_Z)):
    """Read a JPEG file, centre-crop it and return a resized float32 image.

    Same steps as ``decode_image`` with one extra stage: right after
    decoding, ``tf.image.central_crop`` is applied with the module-level
    ``CROP_RATIO``; the result is then cast to float32, divided by 255 and
    resized to ``image_size``.

    Returns the image alone when ``label`` is None, otherwise the pair
    ``(image, label)`` with the label passed through untouched.
    """
    raw = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw, channels=3)
    # CROP_RATIO is only read here, so no `global` declaration is needed.
    cropped = tf.image.central_crop(pixels, CROP_RATIO)
    scaled = tf.cast(cropped, tf.float32) / 255.0
    resized = tf.image.resize(scaled, image_size)

    if label is not None:
        return resized, label
    return resized

def dummy_data_augment(image, label=None):
    """Identity "augmentation": hand the inputs back unchanged.

    Returns ``image`` alone when ``label`` is None, otherwise the pair
    ``(image, label)``.
    """
    return image if label is None else (image, label)
    
# Transformations used for test-time augmentation; each entry yields one
# extra prediction pass, and its __name__ goes into the submission file name.
# Earlier, smaller configurations are kept below for reference:
# data_augments = [dummy_data_augment]
# data_augments = [dummy_data_augment, tf.image.flip_left_right]
# data_augments = [dummy_data_augment, tf.image.flip_left_right, tf.image.flip_up_down]
data_augments = [dummy_data_augment, tf.image.flip_left_right, tf.image.flip_up_down]


In [None]:
# Test set decoded with the centre-crop variant of the loader, batched in
# file order; used for the optional crop prediction pass.
TTA_test_dataset_crop = (
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(decode_image_crop, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

In [None]:
# Optional crop-only pass: predict on the centre-cropped test set and save
# those probabilities as their own submission file.
# `prob_crop` is only defined when CROPPED == 1.
if CROPPED == 1:
    prob_crop = model.predict(TTA_test_dataset_crop)
    sub.loc[:, 'healthy':] = prob_crop
    sub.to_csv("submission_prop-crop-" + str(CROP_RATIO) + ".csv", index=False)
    # NOTE(review): inside an `if` this expression is not the cell's last
    # top-level expression, so the notebook presumably does not display it.
    sub.head()

### Visualize image after augmentation methods

In [None]:
# def decode_image_no_scale(filename, label=None, image_size=(IM_Z, IM_Z)):
#     bits = tf.io.read_file(filename)
#     image = tf.image.decode_jpeg(bits, channels=3)
#     image = tf.image.resize(image, image_size)
    
#     if label is None:
#         return image
#     else:
#         return image, label
    
# test_images = [(tf.data.Dataset.from_tensor_slices(test_paths[0:1]).map(decode_image_no_scale).map(d_augment)) for d_augment in data_augments]
# fig = plt.figure(figsize=(10, 10),)

# for dataset in test_images:
#     for elem in dataset:
#         # now later you get a new subplot; change the geometry of the existing
#         n = len(fig.axes)
#         for i in range(n):
#             fig.axes[i].change_geometry(n+1, 1, i+1)

#         # add the new
#         ax = fig.add_subplot(n+1, 1, n+1)
#         ax.imshow(elem.numpy().astype(int))

# plt.show()

## 3.6 Thực hiện TTA

In [None]:
# One test dataset per entry of data_augments. The transformation is mapped
# after batch(), so each augmentation function receives a whole batch.
TTA_test_datasets = [(
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
    .map(d_augment, num_parallel_calls=AUTO)
) for d_augment in data_augments]

In [None]:
# Run the model once per augmented test dataset; the list holds one
# prediction result per entry of TTA_test_datasets, in the same order.
combined_probs_TTAs = [model.predict(dataset) for dataset in TTA_test_datasets]

In [None]:
# Average the predicted probabilities over all test-time-augmentation passes.
# The accumulator is shaped like a single prediction array (the undefined
# name `probs_TTAs` used here before raised a NameError).
probs_TTA = np.zeros(combined_probs_TTAs[0].shape)

for p in combined_probs_TTAs:
    probs_TTA += p

if CROPPED == 1:
    # The centre-crop prediction counts as one additional view.
    probs_TTA = (probs_TTA + prob_crop) / (len(combined_probs_TTAs) + 1)
else:
    probs_TTA = probs_TTA / len(combined_probs_TTAs)

# Last expression of the cell: the notebook displays the number of TTA passes.
len(combined_probs_TTAs)

## 3.7 Lưu kết quả

In [None]:
# Build the submission file name from the augmentations that took part in
# TTA: 'submission_eff_TTA' followed by '-<function name>' for each entry.
submission_name = 'submission_eff_TTA' + "".join(
    "-" + d_aug.__name__ for d_aug in data_augments
)

### CROP
if CROPPED == 1:
    # prob_crop is already folded into probs_TTA when the TTA average is
    # computed, so only the file name has to reflect the crop here. The old
    # `props_TTA = probs_TTA + prob_crop` line wrote to a misspelled, unused
    # name and would have double-counted the crop if spelled correctly.
    submission_name += "-crop_" + str(CROP_RATIO)
### CROP

submission_name += ".csv"
# Last expression of the cell: the notebook displays the final file name.
submission_name

In [None]:
# Write the averaged TTA probabilities into every column from 'healthy'
# onward of the submission template, then save it under the generated name.
sub.loc[:, 'healthy':] = probs_TTA
sub.to_csv(submission_name, index=False)
# Last expression of the cell: the notebook displays the first rows.
sub.head()