In [None]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
import matplotlib.pyplot as plt
import matplotlib
from numpy import *
from sklearn import *
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import PIL
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import tensorflow_addons as tfa
import random
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Conv2D, Flatten, Dropout, Input, BatchNormalization, \
                                    GlobalAveragePooling2D, Concatenate
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
import logging
logging.basicConfig()
import struct
print(keras.__version__, tf.__version__)
# use keras backend (K) to force channels-last ordering
K.set_image_data_format('channels_last')
pd.set_option("display.max_columns", None)

In [None]:
# Load the competition's training table (one row per image, with a
# space-separated string of labels) and take a first look at it.
train = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
print(len(train))
print(train.columns)
# Distribution of the raw label strings.  The plot is rendered as the cell
# output; wrapping it in print() only added the Axes repr as text noise.
train['labels'].value_counts().plot.bar();

In [None]:
# Work on the first 1800 rows only.  Take an explicit copy so that later
# column assignments modify an independent frame instead of a slice of the
# full table (which triggers pandas' SettingWithCopyWarning).
train = train[:1800].copy()

In [None]:
# Turn the space-separated label string of every row into a list of labels.
# Values that are not strings (e.g. already split by a previous run of this
# cell) are kept as they are, so the cell can be re-executed safely.
train['labels'] = train['labels'].apply(
    lambda value: value.split(' ') if isinstance(value, str) else value
)
train

In [None]:
# Multi-hot encode the per-image label lists: one 0/1 column per class.
s = train['labels'].tolist()
mlb = MultiLabelBinarizer()
trainx = pd.DataFrame(mlb.fit_transform(s), index=train.index, columns=mlb.classes_)
print(trainx.columns)

# Number of images carrying each individual label.
class_totals = trainx.sum()
print(class_totals)

labels = list(class_totals.keys())
print(labels)
label_counts = class_totals.values.tolist()

# Bar chart of the per-class counts.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.barplot(x=labels, y=label_counts, ax=ax)

In [None]:
# Display the training frame as it currently stands (rich notebook output).
train

Since the dataset is very large (18632 images, about 15 GB), I use the resized version, [resized-plant2021](https://www.kaggle.com/ankursingh12/resized-plant2021) by Ankur Singh. I also use the `flow_from_dataframe` method from Keras to prevent out-of-memory issues, since it reads the images batch by batch instead of loading them all at once.

In [None]:
%%time
def add_gauss_noise(X, sigma2=0.1):  #0.05
    """Return ``X`` plus zero-mean Gaussian noise of variance ``sigma2``.

    Args:
        X: NumPy array (anything exposing ``.shape`` that supports ``+``).
        sigma2: variance of the noise; must be non-negative.

    Returns:
        A new array with the same shape as ``X``.
    """
    # np.random.normal expects the standard deviation (scale), not the
    # variance, so convert before sampling.
    return X + np.random.normal(0, np.sqrt(sigma2), X.shape)

# Build the data augmenter used for TRAINING batches.
# NOTE(review): per the Keras docs, `preprocessing_function` runs before
# `rescale` is applied, so the Gaussian noise is added on the raw pixel scale
# and then multiplied by 1/255 -- confirm that the resulting noise level is
# what was intended.
datagen = ImageDataGenerator(
    rescale=1/255.0,
    rotation_range=10,         # image rotation
    width_shift_range=0.1,     # image shifting
    height_shift_range=0.1,    # image shifting
    shear_range=0.1,           # shear transformation
    zoom_range=0.1,            # zooming
    horizontal_flip=True,
    preprocessing_function=add_gauss_noise,
    validation_split=0.1
)

# Validation batches must be deterministic: only rescale them.  Random
# rotations/shifts/noise would make every evaluation pass see different
# inputs.  The same `validation_split` keeps the two subsets disjoint.
valid_datagen = ImageDataGenerator(rescale=1/255.0, validation_split=0.1)

bsize = 16


def _flow_subset(generator, subset, shuffle):
    """Create a batch iterator over one subset ("training"/"validation") of `train`."""
    return generator.flow_from_dataframe(
        train,
        directory='../input/resized-plant2021/img_sz_512',
        x_col="image",
        y_col='labels',
        subset=subset,
        color_mode="rgb",
        target_size=(224, 224),
        class_mode="categorical",
        batch_size=bsize,
        shuffle=shuffle,
        seed=40,
    )


# Training batches are reshuffled every epoch (reproducibly, via seed=40);
# validation keeps the dataframe order so results are comparable across runs.
train_data = _flow_subset(datagen, "training", shuffle=True)
valid_data = _flow_subset(valid_datagen, "validation", shuffle=False)

In [None]:
# Key under which the tracked metric appears in `history.history`.
accname = 'f1_score'

def plot_history(history):
    """Plot the loss curves and, when recorded, the `accname` metric curves.

    Args:
        history: object with a ``history`` dict mapping a metric name to the
            list of its recorded values (as returned by ``model.fit``).

    Loss is drawn in red on the left axis and the metric in blue on a twin
    right axis; solid lines are training values, dashed lines validation
    values.  Validation curves are skipped when their key is absent.
    """
    hist = history.history
    fig, ax1 = plt.subplots()

    ax1.plot(hist['loss'], 'r', label="training loss ({:.6f})".format(hist['loss'][-1]))
    if 'val_loss' in hist:
        ax1.plot(hist['val_loss'], 'r--', label="validation loss ({:.6f})".format(hist['val_loss'][-1]))
    ax1.grid(True)
    ax1.set_xlabel('epoch')
    ax1.legend(loc="best", fontsize=9)
    ax1.set_ylabel('loss', color='r')
    ax1.tick_params('y', colors='r')

    if accname in hist:
        ax2 = ax1.twinx()

        ax2.plot(hist[accname], 'b', label="training {} ({:.4f})".format(accname, hist[accname][-1]))
        val_key = 'val_' + accname
        if val_key in hist:
            ax2.plot(hist[val_key], 'b--', label="validation {} ({:.4f})".format(accname, hist[val_key][-1]))

        ax2.legend(loc="lower right", fontsize=9)
        # label the axis with the metric that is actually plotted
        ax2.set_ylabel(accname, color='b')
        ax2.tick_params('y', colors='b')

In [None]:
from tensorflow.keras.preprocessing import image
K.clear_session()
# Seed every random source in use: Python's `random`, NumPy (which the
# Gaussian-noise augmentation draws from) and TensorFlow.
random.seed(4487); np.random.seed(4487); tf.random.set_seed(4487)

# create the base pre-trained model without the classifier,
# using global average pooling
weight_path = '../input/tf-keras-pretrained-model-weights/No Top/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
base_model = tf.keras.applications.DenseNet121(weights=weight_path, include_top=False, pooling='avg')

# start with the globally average-pooled output of DenseNet121
x = base_model.output

# Optional wider fully-connected layers (disabled):
# x = Dense(128, activation='relu')(x)
# x = Dense(64, activation='relu')(x)
# fully-connected layer
x = Dense(16, activation='relu')(x)
# finally, one independent sigmoid per class (multi-label output)
predictions = Dense(6, activation='sigmoid')(x)

In [None]:
# build the model for training
# - need to specify the input layer and the output layer
model_ft = Model(inputs=base_model.input, outputs=predictions)

# The pre-trained layers are left trainable, so the whole network is
# fine-tuned.  Uncomment to freeze them and train only the new head:
# for layer in base_model.layers:
#     layer.trainable = False

# threshold=0.5 scores every sigmoid output independently (multi-label).
# With the default threshold=None only the arg-max class would count as
# predicted, which under-reports F1 for images carrying several labels.
f1 = tfa.metrics.F1Score(num_classes=6, average='macro', threshold=0.5)

# compile the model (`learning_rate` replaces the deprecated `lr` alias)
model_ft.compile(optimizer=keras.optimizers.SGD(learning_rate=0.03,
                                decay=1e-4,  # decay LR each iteration (batch)
                                momentum=0.8, nesterov=True),
              loss='binary_crossentropy', metrics=[f1])
# model_ft.compile(optimizer=keras.optimizers.Adam(learning_rate=0.03),
#               loss='binary_crossentropy', metrics=[f1])

# early stopping on the validation F1 score; callbacks look metrics up by
# their log name (a string), not by the metric object
accearlystop = keras.callbacks.EarlyStopping(
    monitor='val_' + f1.name,
    min_delta=0.02,       # threshold to consider as no change
    patience=5,           # stop after this many epochs with no change
    verbose=1, mode='max', restore_best_weights=True
)
# early stopping on the validation loss
lossearlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.02,       # threshold to consider as no change
    patience=5,           # stop after this many epochs with no change
    verbose=1, mode='min', restore_best_weights=True
)
# shrink the learning rate when the validation loss stops improving
lrschedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.05, patience=5, verbose=1)
# callbacks are currently disabled; enable with e.g.
# callbacks_list = [lrschedule, accearlystop, lossearlystop]
callbacks_list = []


# train the model on the new data for a few epochs
# steps are counted in batches, i.e. the length of each iterator
STEP_SIZE_TRAIN = len(train_data)
STEP_SIZE_VALID = len(valid_data)
# Model.fit accepts the Keras iterators directly; fit_generator is deprecated
history = model_ft.fit(
            train_data,  # data from generator
            steps_per_epoch=STEP_SIZE_TRAIN,
            epochs=10,
            callbacks=callbacks_list,
            validation_data=valid_data,
            validation_steps=STEP_SIZE_VALID,
            verbose=True)

plot_history(history)

In [None]:
# model_ft.save('fulltrainedResnet.h5')

In [None]:
# Loss and F1 on the validation iterator.  Model.evaluate takes the iterator
# directly; evaluate_generator is a deprecated alias of it.
loss, f1score = model_ft.evaluate(valid_data, verbose=1)

In [None]:
# Show which iterator class `flow_from_dataframe` returned for the validation data.
type(valid_data)

In [None]:
def str2max(list_str,threshold):
    """Binarise a sequence of scores.

    Each score strictly greater than ``threshold`` becomes 1, every other
    score becomes 0.  The result is a plain list in the input order.
    """
    return [1 if score > threshold else 0 for score in list_str]

def evaluate(data, threshold, model=None):
    """Exact-match accuracy of thresholded predictions over one pass of `data`.

    A sample counts as correct only when every class decision
    (score > threshold) equals the corresponding ground-truth entry.

    Args:
        data: indexable sequence of batches; ``len(data)`` is the number of
            batches and ``data[i]`` yields ``(x_batch, y_batch)`` (the Keras
            iterators returned by ``flow_from_dataframe`` satisfy this).
        threshold: a class is predicted when its score is strictly greater
            than this value.
        model: object exposing ``predict(x_batch)``; defaults to the
            notebook-level ``model_ft``.

    Returns:
        dict with 'acc' (correct / total, 0.0 when there are no samples),
        'correct' and 'total' (both floats).
    """
    if model is None:
        model = model_ft

    right, total = 0., 0.
    # Visit each batch exactly once.  The batch count comes from the data
    # itself instead of a hard-coded sample count, so the function works for
    # any dataset size and batch size.
    for batch_idx in tqdm(range(len(data))):
        x_true, y_true = data[batch_idx]
        y_true = np.asarray(y_true)
        y_pred = (np.asarray(model.predict(x_true)) > threshold).astype(int)
        # a row is right only if all of its class decisions match
        right += float(np.sum(np.all(y_pred == y_true, axis=1)))
        total += len(y_true)

    score_dic = {}
    score_dic['acc'] = right / total if total else 0.0
    score_dic['correct'] = right
    score_dic['total'] = total
    return score_dic



In [None]:
# Candidate decision thresholds 0.00, 0.05, ..., 0.95 -- the same
# np.arange grid that the threshold sweep iterates over.
thresholds = np.arange(0, 1.0, 0.05)
thresholds[0]

In [None]:
# Exact-match validation accuracy for every candidate threshold
# (0.00, 0.05, ..., 0.95), in that order.
acc_list = [
    evaluate(valid_data, cutoff)['acc']
    for cutoff in np.arange(0,1.0,0.05)
]

In [None]:
# Thresholds in the same order in which `acc_list` was filled.
thresholds = np.arange(0,1.0,0.05)

# Position of the (first) highest accuracy and the threshold that produced it.
max_index = int(np.argmax(acc_list))
max_acc = acc_list[max_index]
best_threshold = thresholds[max_index]

print("best threshold is {} with the acc {}".format(best_threshold,max_acc))

In [None]:
# `final_score` was never assigned anywhere in the notebook, so this cell
# failed on a fresh kernel.  Compute it from the selected threshold.
final_score = evaluate(valid_data, best_threshold)
final_score['acc']

# Submission

In [None]:
# The sample submission lists the file names of the test images.
test = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')

# Write a 256x256 copy of every test image into the working directory.
for fname in tqdm(test['image']):
    src_path = '../input/plant-pathology-2021-fgvc8/test_images/'+str(fname)
    with PIL.Image.open(src_path) as original:
        shrunk = original.resize((256,256))
        shrunk.save(f'./{fname}')

In [None]:
# Test images are only rescaled.  Feeding them through the training
# augmenter would apply random flips/shifts/noise at inference time and make
# the predictions non-deterministic.
test_datagen = ImageDataGenerator(rescale=1/255.0)
test_data = test_datagen.flow_from_dataframe(
    test,
    directory = './',
    x_col="image",
    y_col= None,
    color_mode="rgb",
    target_size = (224,224),
    classes=None,
    class_mode=None,
    batch_size=bsize,
    shuffle=False,   # keep predictions aligned with the rows of `test`
    seed=40,
)

preds = model_ft.predict(test_data)
print(preds)
preds = preds.tolist()

# For every image keep the class indices whose score reaches the selected
# threshold; when none does, fall back to the single highest-scoring class.
indices = []
for pred in preds:
    # enumerate gives the true position of each score; list.index() returned
    # the first matching value and mislabelled classes with equal scores
    picked = [idx for idx, score in enumerate(pred) if score >= best_threshold]
    if not picked:
        picked = [int(np.argmax(pred))]
    indices.append(picked)

print(indices)

In [None]:
# Invert the generator's {label name: column index} mapping so predicted
# column indices can be translated back into label names.
labels = {idx: name for name, idx in train_data.class_indices.items()}
print(labels)

# One space-separated label string per test image (the submission format).
# The loop variable is deliberately not called `image`: that name would
# overwrite the imported `tensorflow.keras.preprocessing.image` module.
testlabels = [
    ' '.join(str(labels[idx]) for idx in image_indices)
    for image_indices in indices
]

print(testlabels)

In [None]:
# Remove every .jpg file from the working directory.
delfiles = tf.io.gfile.glob('./*.jpg')

for jpg_path in delfiles:
    os.remove(jpg_path)

In [None]:
# Fill the sample submission's `labels` column with the decoded predictions,
# write it out without the index column, and display the result.
sub = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv').assign(labels=testlabels)
sub.to_csv('submission.csv', index=False)
sub