# 1 DenseNet121 Model

烟台苹果甲天下，驰名海外，烟台苹果既好看又好吃。生活在盛产苹果的故乡，从小啃着苹果长大，与果园和果树打交道，参加果园管理，施肥、浇水、去病虫害和收获，各类苹果的味道、果树的味道、农药的味道、树枝树叶树皮的味道，对果园熟悉得不能再熟悉。<br/>
过往在果园劳动的场景历历在目，所以看到有这样一个数据集，立即被吸引了。本案例是对DenseNet121与MobileNetV2的模型性能做比较。<br/>
写得很浅，以后有时间再回来补充完善。<br/>
主要参照了作者：<a href='https://www.kaggle.com/tarunpaparaju'>Tarun Paparaju</a> 的作品，有一定的参数改良，重新进行了模块化设计，便于迁移到Pycharm等IDE环境下运行。

### Acknowledgements
### 参考：<a href = 'https://www.kaggle.com/tarunpaparaju/plant-pathology-2020-eda-models'>Plant Pathology 2020 : EDA + Models </a>

In [None]:
# Configure the runtime: use a TPU when one is available
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
# Detect a TPU; a ValueError from the resolver is treated as "no TPU present"
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
# Global batch size: 16 samples per replica
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Dataset root that format_path() prefixes to every image path
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
print("REPLICAS: ", strategy.num_replicas_in_sync)

## 1.1 数据预处理

In [None]:
"""
程序：handle_data.py
功能：数据预处理
设计：董相志，upsunny2008@163.com
日期：2021.3.12
"""
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Enable tqdm's pandas integration; display_four_examples relies on Series.progress_apply
tqdm.pandas()

# Dataset inspection
def data_observation(csv_file_path: str, image_path: str, index: int):
    """Print the head of a CSV and display one of the images it lists.

    Args:
        csv_file_path: CSV file that contains an ``image_id`` column.
        image_path: Directory prefix of the JPEG files. It is concatenated
            directly with the image id, so it must end with a path separator.
        index: Row label (in the CSV's default index) of the image to show.

    Raises:
        FileNotFoundError: If the selected image cannot be read.
    """
    data = pd.read_csv(csv_file_path)
    print(data.head())  # show the first rows of the dataset
    image_file = image_path + data['image_id'][index] + '.jpg'
    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise FileNotFoundError("Cannot read image: " + image_file)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Downscale before plotting to keep the figure small
    fig = px.imshow(cv2.resize(image, (205, 136)))
    fig.show()


# Per-category inspection
def category_observation(csv_file_path: str, image_path: str, cond=(0, 0, 0, 0), cond_cols=("healthy",)):
    """Show up to four sample images whose labels match the given filter.

    Only the first 40 rows of the CSV are considered. The matching rows are
    printed, then the first matches are drawn in a grid two columns wide and at
    most two rows tall. Only complete rows are drawn, so fewer than two matches
    produce no figure.

    Args:
        csv_file_path: CSV file with ``image_id`` and the four label columns.
        image_path: Directory prefix of the JPEG files; it is concatenated
            directly with the image id, so it must end with a path separator.
        cond: Required values for ``healthy``, ``scab``, ``rust`` and
            ``multiple_diseases``, in that order.
        cond_cols: Names of the label columns to filter on; entries of
            ``cond`` for columns not listed here are ignored, as are names
            that are not one of the four label columns.

    Raises:
        FileNotFoundError: If one of the images to display cannot be read.
    """
    SAMPLE_LEN = 40
    label_names = ("healthy", "scab", "rust", "multiple_diseases")
    expected = {name: cond[pos] for pos, name in enumerate(label_names)}

    data = pd.read_csv(csv_file_path)[:SAMPLE_LEN]
    for col in cond_cols:
        if col in expected:
            data = data.query("{} == {}".format(col, expected[col]))
    print(data)

    cols = 2
    rows = min(2, len(data) // cols)
    if rows == 0:
        # plt.subplots rejects a grid with zero rows; nothing to draw.
        return

    # squeeze=False keeps `ax` two-dimensional even for a single row, so the
    # [row, col] indexing below also works when only two images matched.
    fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(8, rows * 6 / 2), squeeze=False)
    shown_ids = data['image_id'].iloc[:rows * cols]
    for pos, image_id in enumerate(shown_ids):
        image_file = image_path + image_id + '.jpg'
        image = cv2.imread(image_file)
        if image is None:
            raise FileNotFoundError("Cannot read image: " + image_file)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Fill the grid row by row
        axis = ax[pos // cols, pos % cols]
        axis.imshow(image)
        axis.set_title(image_id)
    plt.show()


# Class distribution
def category_distribution(csv_file_path: str):
    """Plot how the target labels are distributed in the given CSV.

    Two Plotly figures are shown: a parallel-categories plot of the four
    target columns (coloured by ``healthy``) and a pie chart of the column
    sums of every column after the first one.
    """
    labels_frame = pd.read_csv(csv_file_path)

    target_columns = ["healthy", "scab", "rust", "multiple_diseases"]
    parallel_fig = px.parallel_categories(
        labels_frame[target_columns],
        color="healthy",
        color_continuous_scale="sunset",
        title="Parallel categories plot of targets",
        width=500,
        height=300,
    )
    parallel_fig.show()

    pie_trace = go.Pie(
        labels=labels_frame.columns[1:],
        values=labels_frame.iloc[:, 1:].sum().values,
    )
    pie_fig = go.Figure([pie_trace])
    pie_fig.update_layout(title_text="Pie chart of targets", template="simple_white")
    # Thin black outline around each slice
    pie_fig.data[0].marker.line.color = 'rgb(0, 0, 0)'
    pie_fig.data[0].marker.line.width = 0.5
    pie_fig.show()


# Random data augmentation
def data_augmentation(image, label=None):
    """Apply a random left/right flip followed by a random up/down flip.

    Returns the augmented image alone when ``label`` is None, otherwise the
    pair ``(image, label)`` with the label passed through untouched.
    """
    augmented = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )
    if label is not None:
        return augmented, label
    return augmented


# Build an image path
def format_path(image_id):
    """Return the path of ``<image_id>.jpg`` under the ``images`` folder of GCS_DS_PATH."""
    return "".join((GCS_DS_PATH, '/images/', image_id, '.jpg'))


# Image loading and decoding
def decode_image(filename, label=None, image_size=(512, 512)):
    """Read a JPEG file and return it as a float32 image scaled by 1/255.

    The file is decoded with three channels and resized to ``image_size``.
    Returns the image alone when ``label`` is None, otherwise
    ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    pixels = tf.image.resize(pixels, image_size)
    pixels = tf.cast(pixels, tf.float32) / 255.0
    if label is not None:
        return pixels, label
    return pixels


# Dataset split
def data_split(train_file_path: str, test_file_path: str):
    """Build the training, validation and test ``tf.data`` pipelines.

    The rows of the training CSV are split 85/15 into training and validation
    parts (``random_state=2021``). Labels are the columns from ``healthy``
    through ``scab``, cast to float32.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)``. The training
        pipeline decodes, augments, repeats indefinitely, shuffles with a
        512-element buffer, batches and prefetches. The validation pipeline
        decodes, batches, caches and prefetches. The test pipeline decodes
        and batches unlabeled images.
    """
    AUTO = tf.data.experimental.AUTOTUNE

    train_frame = pd.read_csv(train_file_path)  # training CSV
    test_frame = pd.read_csv(test_file_path)  # test CSV

    all_paths = train_frame.image_id.apply(format_path).values
    all_labels = np.float32(train_frame.loc[:, 'healthy':'scab'].values)
    test_paths = test_frame.image_id.apply(format_path).values

    fit_paths, val_paths, fit_labels, val_labels = train_test_split(
        all_paths, all_labels, test_size=0.15, random_state=2021
    )

    # Training pipeline
    train_dataset = tf.data.Dataset.from_tensor_slices((fit_paths, fit_labels))
    train_dataset = train_dataset.map(decode_image, num_parallel_calls=AUTO)
    train_dataset = train_dataset.map(data_augmentation, num_parallel_calls=AUTO)
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.shuffle(512)
    train_dataset = train_dataset.batch(BATCH_SIZE)
    train_dataset = train_dataset.prefetch(AUTO)

    # Validation pipeline
    valid_dataset = tf.data.Dataset.from_tensor_slices((val_paths, val_labels))
    valid_dataset = valid_dataset.map(decode_image, num_parallel_calls=AUTO)
    valid_dataset = valid_dataset.batch(BATCH_SIZE)
    valid_dataset = valid_dataset.cache()
    valid_dataset = valid_dataset.prefetch(AUTO)

    # Test pipeline (paths only, no labels)
    test_dataset = tf.data.Dataset.from_tensor_slices(test_paths)
    test_dataset = test_dataset.map(decode_image, num_parallel_calls=AUTO)
    test_dataset = test_dataset.batch(BATCH_SIZE)

    return train_dataset, valid_dataset, test_dataset


## 1.2 建模与评估

In [None]:
"""
程序：handle_model.py
功能：建模过程
设计：董相志，upsunny2008@163.com
日期：2021.3.12
"""
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import cv2
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D,Dropout,GlobalAveragePooling2D
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

# Model definition
def model_define(train_labels, batch_size: int=BATCH_SIZE):
    """Build and compile the DenseNet121-based classifier.

    The network is an ImageNet-initialised DenseNet121 backbone (without its
    top) for 512x512x3 inputs, followed by global average pooling and a
    softmax layer with one unit per label column.

    Args:
        train_labels: 2-D label array; only ``shape[1]`` (the number of
            classes) is used.
        batch_size: Unused; kept so existing callers keep working.

    Returns:
        The compiled Keras model. Its summary is printed as a side effect.
    """
    model = Sequential(name='MyDenseNet121')
    dense_net = DenseNet121(include_top=False, weights='imagenet', input_shape=(512, 512, 3))
    model.add(dense_net)
    model.add(GlobalAveragePooling2D())
    model.add(Dense(train_labels.shape[1], activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy'])
    model.summary()
    return model

# Learning-rate schedule factory
def build_lrfn(lr_start=0.00001, lr_max=0.00005, 
               lr_min=0.00001, lr_rampup_epochs=5, 
               lr_sustain_epochs=0, lr_exp_decay=.8):
    """Return an ``epoch -> learning rate`` function.

    The schedule ramps linearly from ``lr_start`` to the peak over
    ``lr_rampup_epochs`` epochs, holds the peak for ``lr_sustain_epochs``
    epochs, then decays exponentially towards ``lr_min`` by a factor of
    ``lr_exp_decay`` per epoch. The peak is ``lr_max`` multiplied by the
    number of replicas of the global ``strategy``.
    """
    peak = lr_max * strategy.num_replicas_in_sync
    sustain_end = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        # Linear warm-up
        if epoch < lr_rampup_epochs:
            return (peak - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Plateau at the peak rate
        if epoch < sustain_end:
            return peak
        # Exponential decay towards lr_min
        elapsed = epoch - lr_rampup_epochs - lr_sustain_epochs
        return (peak - lr_min) * lr_exp_decay**elapsed + lr_min

    return lrfn


# Model training
def model_train(model, train_dataset, valid_dataset, epochs, steps_per_epoch, saved_path):
    """Fit ``model`` and return the Keras ``History`` object.

    Three callbacks are installed, in this order: the learning-rate schedule
    from ``build_lrfn``, a checkpoint that saves the best full model to
    ``saved_path``, and early stopping with a patience of 10 epochs that
    restores the best weights. Both of the latter monitor
    ``val_categorical_accuracy``.
    """
    monitored_metric = 'val_categorical_accuracy'
    schedule = build_lrfn(lr_sustain_epochs=2, lr_exp_decay=.9)
    callbacks = [
        # Learning-rate schedule
        LearningRateScheduler(schedule, verbose=1),
        # Keep only the best model seen so far
        ModelCheckpoint(saved_path, monitor=monitored_metric,
                        verbose=1, save_best_only=True,
                        save_weights_only=False, mode='max'),
        # Stop early when validation accuracy stops improving
        EarlyStopping(monitor=monitored_metric, min_delta=0,
                      patience=10, verbose=1, restore_best_weights=True),
    ]
    return model.fit(train_dataset,
                     epochs=epochs,
                     callbacks=callbacks,
                     steps_per_epoch=steps_per_epoch,
                     validation_data=valid_dataset)


# Model evaluation
def model_estimate(history, epochs):
    """Plot training and validation categorical accuracy against the epoch number.

    The x axis runs from 1 to ``epochs``.
    NOTE(review): if training stopped early the history holds fewer than
    ``epochs`` values — confirm the plot still looks as intended.
    """
    epoch_axis = np.arange(1, epochs + 1)
    curves = (
        ('categorical_accuracy', "dodgerblue", "Train"),
        ('val_categorical_accuracy', "darkorange", "Val"),
    )
    fig = go.Figure()
    for metric_key, line_color, legend_name in curves:
        trace = go.Scatter(x=epoch_axis, y=history.history[metric_key],
                           mode='lines+markers',
                           marker=dict(color=line_color),
                           name=legend_name)
        fig.add_trace(trace)
    fig.update_layout(title_text="Accuracy vs. Epochs", yaxis_title="Accuracy",
                      xaxis_title="Epochs", width=500, height=300)
    fig.show()


def load_image(image_id):
    """Load ``<image_id>.jpg`` from the competition image folder as an RGB array.

    Raises:
        FileNotFoundError: If the file cannot be read.
    """
    file_path = '/kaggle/input/plant-pathology-2020-fgvc7/images/' + image_id + ".jpg"
    image = cv2.imread(file_path)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise FileNotFoundError("Cannot read image: " + file_path)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


def process(img):
    """Scale ``img`` by 1/255, resize it to 512x512 and reshape it to ``(-1, 512, 512, 3)``."""
    scaled = img / 255.0
    resized = cv2.resize(scaled, (512, 512))
    return resized.reshape(-1, 512, 512, 3)


# Model prediction
def model_predict(saved_model, img):
    """Load the model stored at ``saved_model`` and return its prediction for one image.

    The preprocessed image is passed through the model's first three layers
    in order; the first row of the result is returned as a NumPy array.
    """
    model = load_model(saved_model)
    activations = process(img)
    for layer_index in (0, 1, 2):
        activations = model.layers[layer_index](activations)
    preds = activations.numpy()[0]
    return preds

# Show a prediction
def displayResult(img, preds):
    """Show an image next to a bar chart of its predicted class scores.

    Args:
        img: Image array; it is resized to 205x136 for display.
        preds: 1-D NumPy array of class scores. Assumed to follow the order of
            the bar labels below (Healthy, Multiple diseases, Rust, Scab) —
            TODO confirm against the label columns used for training.

    The bar of the highest score is highlighted. The first bar (Healthy) is
    always drawn in seagreen, even when it is the top prediction.
    """
    class_names = ["Healthy", "Multiple diseases", "Rust", "Scab"]

    # Position of the top score, computed once (first position wins on ties)
    scores = preds.tolist()
    top = scores.index(max(scores))

    # Colours are assigned by bar position so the highlight always sits on
    # the bar that holds the top score.
    colors = [px.colors.qualitative.Plotly[0]] * len(class_names)
    if top < len(colors):
        colors[top] = px.colors.qualitative.Plotly[1]
    colors[0] = "seagreen"

    fig = make_subplots(rows=1, cols=2)
    fig.add_trace(go.Image(z=cv2.resize(img, (205, 136))), row=1, col=1)
    fig.add_trace(go.Bar(x=class_names,
                         y=preds, marker=dict(color=colors)), row=1, col=2)

    fig.update_layout(height=400, width=800, title_text="DenseNet Predictions",
                      showlegend=False)
    fig.show()


# Show predictions for the first four images
def display_four_examples(saved_model,train_data):
    """Predict and display the first four images listed in ``train_data``.

    The images are shown in the order 2, 0, 3, 1.
    """
    train_images = train_data["image_id"][:4].progress_apply(load_image)
    for position in (2, 0, 3, 1):
        sample = train_images[position]
        preds = model_predict(saved_model, sample)
        displayResult(sample, preds)


# Predict on the test set and save the result
def test_dataset_predict(saved_model, test_dataset):
    """Predict the test set with the saved model and write a submission file.

    Every column from ``healthy`` onward in the sample submission is
    overwritten with the predicted scores. The result is written to
    ``submission_densenet.csv`` and its first rows are printed.
    """
    classifier = load_model(saved_model)
    submission = pd.read_csv("/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv")
    submission.loc[:, 'healthy':] = classifier.predict(test_dataset, verbose=1)
    submission.to_csv('submission_densenet.csv', index=False)
    print(submission.head())


In [None]:
# Dataset inspection
train_path = '/kaggle/input/plant-pathology-2020-fgvc7/train.csv'
test_path ='/kaggle/input/plant-pathology-2020-fgvc7/test.csv'
# Keep the trailing slash: the helper functions concatenate this prefix with the image id
image_path = '/kaggle/input/plant-pathology-2020-fgvc7/images/'
print('观察训练集：')
data_observation(train_path, image_path, 3)
print('观察测试集：')
data_observation(test_path, image_path, 80)

In [None]:
# Per-category inspection: show sample leaves for each of the four labels
print('=========healthy叶片观察===========')
category_observation(train_path,image_path,cond=[1, 0, 0, 0], cond_cols=["healthy"])
print('=========scab叶片观察=========')
category_observation(train_path,image_path,cond=[0, 1, 0, 0], cond_cols=["scab"])
print('=========rust叶片观察=========')
category_observation(train_path,image_path,cond=[0, 0, 1, 0], cond_cols=["rust"])
print('========= multiple_diseases叶片观察=========')
category_observation(train_path,image_path,cond=[0, 0, 0, 1], cond_cols=["multiple_diseases"])

In [None]:
# Class distribution of the training labels
category_distribution(train_path)

In [None]:
# Random augmentation demo: show the first training image next to its randomly flipped version
data = pd.read_csv(train_path)
image = cv2.imread(image_path + data['image_id'][0] + '.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_new = data_augmentation(image)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 6))
ax[0].imshow(image)
ax[0].set_title('Original Image', fontsize=14)
ax[1].imshow(image_new)
ax[1].set_title('New Image', fontsize=14)
plt.show()

In [None]:
# Build the training / validation / test datasets (reused by both model sections)
train_dataset, valid_dataset, test_dataset=data_split(train_path,test_path)
print('完成数据集划分！')
print(valid_dataset)


In [None]:
# Create the model inside the distribution strategy's scope
with strategy.scope():
    # Model definition
    train_data = pd.read_csv(train_path)
    # Labels of every row in train.csv; model_define only uses the number of label columns
    train_labels = np.float32(train_data.loc[:, 'healthy':'scab'].values)
    batch_size=BATCH_SIZE
    model = model_define(train_labels=train_labels,batch_size=batch_size)

In [None]:
# Model training
model_save_dir = 'models'
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)
saved_path = os.path.join(os.getcwd(), model_save_dir) + '/densenet121.h5' # path of the saved model file
print(saved_path)
epochs = 20
# NOTE(review): train_labels covers every row of train.csv, while train_dataset
# was built from the 85% training split in data_split — confirm this step count is intended.
steps_per_epoch =train_labels.shape[0] // BATCH_SIZE

history = model_train(model,train_dataset,valid_dataset,epochs,steps_per_epoch,saved_path)


In [None]:
# Model evaluation: plot training vs. validation accuracy
model_estimate(history,epochs)

In [None]:
# Model prediction: show the predictions for the first four training images
display_four_examples(saved_path,train_data)


In [None]:
%%time
# Predict on the test set and save the predictions
test_dataset_predict(saved_model=saved_path,test_dataset=test_dataset)

# 2 MobileNetV2

In [None]:
"""
程序：handle_model.py
功能：建模过程
设计：董相志，upsunny2008@163.com
日期：2021.3.12
"""
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import cv2
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D, GlobalAveragePooling2D,Dropout
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler,ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

# Model definition
def mobile_model_define(train_labels, batch_size: int=BATCH_SIZE):
    """Build and compile the MobileNetV2-based classifier.

    The network is an ImageNet-initialised MobileNetV2 backbone (without its
    top) for 512x512x3 inputs, followed by global average pooling and a
    softmax layer with one unit per label column.

    Args:
        train_labels: 2-D label array; only ``shape[1]`` (the number of
            classes) is used.
        batch_size: Unused; kept so existing callers keep working.

    Returns:
        The compiled Keras model. Its summary is printed as a side effect.
    """
    # NOTE: the author left three variations disabled here — freezing the
    # backbone, and an extra Conv2D(64, 3, relu) + Dropout(0.2) head before
    # the pooling layer. The whole network is trained end to end.
    model = Sequential(name='MyMobileNetV2')
    mobile_net = MobileNetV2(include_top=False, weights='imagenet', input_shape=(512, 512, 3))
    model.add(mobile_net)
    model.add(GlobalAveragePooling2D())
    model.add(Dense(train_labels.shape[1], activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy'])
    model.summary()
    return model

# Learning-rate schedule factory
def mobile_build_lrfn(lr_start=0.00001, lr_max=0.00005, 
               lr_min=0.00001, lr_rampup_epochs=5, 
               lr_sustain_epochs=0, lr_exp_decay=.8):
    """Return an ``epoch -> learning rate`` function.

    The schedule ramps linearly from ``lr_start`` to the peak over
    ``lr_rampup_epochs`` epochs, holds the peak for ``lr_sustain_epochs``
    epochs, then decays exponentially towards ``lr_min`` by a factor of
    ``lr_exp_decay`` per epoch. The peak is ``lr_max`` multiplied by the
    number of replicas of the global ``strategy``.
    """
    peak = lr_max * strategy.num_replicas_in_sync
    sustain_end = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        # Linear warm-up
        if epoch < lr_rampup_epochs:
            return (peak - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Plateau at the peak rate
        if epoch < sustain_end:
            return peak
        # Exponential decay towards lr_min
        elapsed = epoch - lr_rampup_epochs - lr_sustain_epochs
        return (peak - lr_min) * lr_exp_decay**elapsed + lr_min

    return lrfn


# Model training
def mobile_model_train(model, train_dataset, valid_dataset, epochs, steps_per_epoch, saved_path):
    """Fit ``model`` and return the Keras ``History`` object.

    Three callbacks are installed, in this order: the learning-rate schedule
    from ``mobile_build_lrfn``, a checkpoint that saves the best full model
    to ``saved_path``, and early stopping with a patience of 10 epochs that
    restores the best weights. Both of the latter monitor
    ``val_categorical_accuracy``.
    """
    # Callback: learning-rate schedule. Uses this section's own factory so the
    # MobileNetV2 code does not depend on the DenseNet section being defined.
    lrfn = mobile_build_lrfn(lr_sustain_epochs=2,lr_exp_decay=.9)
    lr_schedule = LearningRateScheduler(lrfn, verbose=1)
    # Callback: stop training early
    early_stop = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0, 
                               patience=10, verbose=1, restore_best_weights=True)
    # Callback: save the best model
    best_model = ModelCheckpoint(saved_path, monitor='val_categorical_accuracy', 
                                 verbose=1, save_best_only=True,
                                 save_weights_only=False, mode='max')
    history = model.fit(train_dataset,epochs=epochs,
                        callbacks=[lr_schedule,best_model,early_stop],
                        steps_per_epoch=steps_per_epoch,
                        validation_data=valid_dataset)
    return history


# Model evaluation
def mobile_model_estimate(history, epochs):
    """Plot training and validation categorical accuracy against the epoch number.

    The x axis runs from 1 to ``epochs``.
    NOTE(review): if training stopped early the history holds fewer than
    ``epochs`` values — confirm the plot still looks as intended.
    """
    epoch_axis = np.arange(1, epochs + 1)
    curves = (
        ('categorical_accuracy', "dodgerblue", "Train"),
        ('val_categorical_accuracy', "darkorange", "Val"),
    )
    fig = go.Figure()
    for metric_key, line_color, legend_name in curves:
        trace = go.Scatter(x=epoch_axis, y=history.history[metric_key],
                           mode='lines+markers',
                           marker=dict(color=line_color),
                           name=legend_name)
        fig.add_trace(trace)
    fig.update_layout(title_text="Accuracy vs. Epochs", yaxis_title="Accuracy",
                      xaxis_title="Epochs", width=500, height=300)
    fig.show()


def mobile_load_image(image_id):
    """Load ``<image_id>.jpg`` from the competition image folder as an RGB array.

    Raises:
        FileNotFoundError: If the file cannot be read.
    """
    file_path = '/kaggle/input/plant-pathology-2020-fgvc7/images/' + image_id + ".jpg"
    image = cv2.imread(file_path)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise FileNotFoundError("Cannot read image: " + file_path)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


def mobile_process(img):
    """Scale ``img`` by 1/255, resize it to 512x512 and reshape it to ``(-1, 512, 512, 3)``."""
    scaled = img / 255.0
    resized = cv2.resize(scaled, (512, 512))
    return resized.reshape(-1, 512, 512, 3)


# Model prediction
def mobile_model_predict(saved_model, img):
    """Load the model stored at ``saved_model`` and return its prediction for one image.

    The preprocessed image is passed through the model's first three layers
    in order; the first row of the result is returned as a NumPy array.
    """
    model = load_model(saved_model)
    activations = mobile_process(img)
    for layer_index in (0, 1, 2):
        activations = model.layers[layer_index](activations)
    preds = activations.numpy()[0]
    return preds


# Show a prediction
def mobile_displayResult(img, preds):
    """Show an image next to a bar chart of its predicted class scores.

    Args:
        img: Image array; it is resized to 205x136 for display.
        preds: 1-D NumPy array of class scores. Assumed to follow the order of
            the bar labels below (Healthy, Multiple diseases, Rust, Scab) —
            TODO confirm against the label columns used for training.

    The bar of the highest score is highlighted. The first bar (Healthy) is
    always drawn in seagreen, even when it is the top prediction.
    """
    class_names = ["Healthy", "Multiple diseases", "Rust", "Scab"]

    # Position of the top score, computed once (first position wins on ties)
    scores = preds.tolist()
    top = scores.index(max(scores))

    # Colours are assigned by bar position so the highlight always sits on
    # the bar that holds the top score.
    colors = [px.colors.qualitative.Plotly[0]] * len(class_names)
    if top < len(colors):
        colors[top] = px.colors.qualitative.Plotly[1]
    colors[0] = "seagreen"

    fig = make_subplots(rows=1, cols=2)
    fig.add_trace(go.Image(z=cv2.resize(img, (205, 136))), row=1, col=1)
    fig.add_trace(go.Bar(x=class_names,
                         y=preds, marker=dict(color=colors)), row=1, col=2)

    fig.update_layout(height=400, width=800, title_text="MobileNetV2 Predictions",
                      showlegend=False)
    fig.show()


# Show predictions for the first four images
def mobile_display_four_examples(saved_model,train_data):
    """Predict and display the first four images listed in ``train_data``.

    The images are shown in the order 2, 0, 3, 1.
    """
    train_images = train_data["image_id"][:4].progress_apply(mobile_load_image)
    for position in (2, 0, 3, 1):
        sample = train_images[position]
        preds = mobile_model_predict(saved_model, sample)
        mobile_displayResult(sample, preds)


# Predict on the test set and save the result
def mobile_test_dataset_predict(saved_model, test_dataset):
    """Predict the test set with the saved model and write a submission file.

    Every column from ``healthy`` onward in the sample submission is
    overwritten with the predicted scores. The result is written to
    ``submission_mobilenet.csv`` and its first rows are printed.
    """
    classifier = load_model(saved_model)
    submission = pd.read_csv("/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv")
    submission.loc[:, 'healthy':] = classifier.predict(test_dataset, verbose=1)
    submission.to_csv('submission_mobilenet.csv', index=False)
    print(submission.head())


In [None]:
# Create the model inside the distribution strategy's scope
with strategy.scope():
    # Model definition
    train_data = pd.read_csv(train_path)
    # Labels of every row in train.csv; mobile_model_define only uses the number of label columns
    train_labels = np.float32(train_data.loc[:, 'healthy':'scab'].values)
    batch_size=BATCH_SIZE
    model = mobile_model_define(train_labels=train_labels,batch_size=batch_size)

In [None]:
# Model training
model_save_dir = 'models'
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)
saved_path = os.path.join(os.getcwd(), model_save_dir) + '/mobilenetv2.h5' # path of the saved model file
print(saved_path)
epochs = 20
# NOTE(review): train_labels covers every row of train.csv, while train_dataset
# was built from the 85% training split in data_split — confirm this step count is intended.
steps_per_epoch =train_labels.shape[0] // BATCH_SIZE

history = mobile_model_train(model,train_dataset,valid_dataset,epochs,steps_per_epoch,saved_path)


In [None]:
# Model evaluation: plot training vs. validation accuracy
mobile_model_estimate(history,epochs)

In [None]:
# Model prediction: show the predictions for the first four training images
mobile_display_four_examples(saved_path,train_data)

In [None]:
%%time
# Predict on the test set and save the predictions
mobile_test_dataset_predict(saved_model=saved_path,test_dataset=test_dataset)