In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import math
import cv2
from tqdm import tqdm
import os
import PIL
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from keras_preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.preprocessing.image import img_to_array

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load the competition's training table; later cells read its 'image' and 'labels' columns.
df = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
df.head()

In [None]:
# Bar chart of how often each raw 'labels' string occurs (the column is still unsplit text here).
df['labels'].sort_values().value_counts().plot.bar()

In [None]:
# Turn each space-separated label string into a list of label names.
# The isinstance guard keeps this cell idempotent: once the column holds lists,
# re-running the cell would otherwise raise AttributeError (lists have no .split).
df['labels'] = df['labels'].apply(lambda s: s.split(' ') if isinstance(s, str) else s)
df.head(10)

In [None]:
# One-hot encode the per-image label lists: one indicator column per distinct label.
mlb = MultiLabelBinarizer()
s = df['labels'].tolist()
indicator_matrix = mlb.fit_transform(s)
trainx = pd.DataFrame(indicator_matrix, index=df.index, columns=mlb.classes_)
print(trainx.columns)

# Column sums give the number of images that carry each label.
per_label_totals = trainx.sum()
print(per_label_totals)

labels = list(per_label_totals.index)
print(labels)
label_counts = per_label_totals.values.tolist()

# Bar chart of the per-label image counts.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
sns.barplot(x=labels, y=label_counts, ax=ax)

In [None]:
# Display the training frame as it stands at this point in the notebook.
df

In [None]:
# Image generator configuration consumed by flow_from_dataframe in the cells below.
datagen = ImageDataGenerator(
    rescale = 1/255.0,          # scale pixel values by 1/255
    validation_split= 0.2,      # fraction of rows reserved for subset="validation"
    rotation_range=5,           # random augmentation settings (rotation, zoom, shear, flip)
    zoom_range=0.1,
    shear_range=0.05,
    horizontal_flip=True,
)
# Batch size passed to the train/validation iterators and read by evaluate().
bsize = 32

In [None]:
# Training iterator: file names come from df['image'] (resolved under the
# pre-resized image directory) and targets from the list-valued df['labels'].
# NOTE(review): class_mode="categorical" with list-valued labels presumably
# yields multi-hot target rows -- confirm against the keras_preprocessing docs.
train_data = datagen.flow_from_dataframe(
    df,
    directory = '../input/resized-plant2021/img_sz_256',
    x_col = 'image',
    y_col = 'labels',
    subset="training",
    color_mode="rgb",
    target_size = (224,224),  # images are resized to 224x224 on load
    class_mode="categorical",
    batch_size=bsize,
    shuffle=True,
    seed=40,
)

In [None]:
# Validation should measure the model on unmodified images, so it gets its own
# generator that only rescales pixels (no rotation/zoom/shear/flip).
# validation_split is kept at 0.2, the same value the training generator uses,
# so subset="validation" selects the same held-out rows of `df`.
valid_datagen = ImageDataGenerator(
    rescale = 1/255.0,
    validation_split= 0.2,
)
valid_data = valid_datagen.flow_from_dataframe(
    df,
    directory = '../input/resized-plant2021/img_sz_256',
    x_col = 'image',
    y_col = 'labels',
    subset="validation",
    color_mode="rgb",
    target_size = (224,224),
    class_mode="categorical",
    batch_size=bsize,
    shuffle=True,
    seed=40,
)

In [None]:
# Key under which the tracked metric is stored in History.history.
accname = 'f1_score'

def plot_history(history):
    """Plot per-epoch loss and, when recorded, the metric named by ``accname``.

    Args:
        history: object returned by ``model.fit``; only its ``history`` dict
            (metric name -> list of per-epoch values) is read.

    Loss is drawn on the left axis in red and the metric on a twin right axis
    in blue. Solid lines are training curves, dashed lines validation curves,
    and every legend entry shows the value of the last epoch. Validation
    curves are skipped when the run did not record them.
    """
    logs = history.history
    fig, ax1 = plt.subplots()

    ax1.plot(logs['loss'], 'r', label="training loss ({:.6f})".format(logs['loss'][-1]))
    if 'val_loss' in logs:
        ax1.plot(logs['val_loss'], 'r--', label="validation loss ({:.6f})".format(logs['val_loss'][-1]))
    ax1.grid(True)
    # History values are recorded once per epoch, not per iteration.
    ax1.set_xlabel('epoch')
    ax1.legend(loc="best", fontsize=9)
    ax1.set_ylabel('loss', color='r')
    ax1.tick_params('y', colors='r')

    if accname in logs:
        ax2 = ax1.twinx()

        ax2.plot(logs[accname], 'b', label="training {} ({:.4f})".format(accname, logs[accname][-1]))
        val_key = 'val_' + accname
        if val_key in logs:
            ax2.plot(logs[val_key], 'b--', label="validation {} ({:.4f})".format(accname, logs[val_key][-1]))

        ax2.legend(loc="lower right", fontsize=9)
        # Label the axis with the metric actually plotted instead of a fixed 'acc'.
        ax2.set_ylabel(accname, color='b')
        ax2.tick_params('y', colors='b')

In [None]:
# DenseNet121 backbone without its top classifier, initialised from a local
# weights file; pooling='avg' makes the backbone output a pooled feature vector.
weight_path = '../input/dense121-weight/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
base_model = tf.keras.applications.DenseNet121(weights=weight_path, include_top=False, pooling='avg')
x = base_model.output
# two fully connected ReLU layers on top of the backbone features
x = Dense(512, activation='relu')(x)
x = Dense(256, activation='relu')(x)
# output layer: 6 units with sigmoid activation (independent per-label scores, not a softmax)
predictions = Dense(6, activation='sigmoid')(x)

In [None]:
# Assemble the full network (backbone input -> 6 sigmoid outputs) and compile it.
model = Model(inputs=base_model.input, outputs = predictions)
import tensorflow_addons as tfa
import keras

# The outputs are independent sigmoid scores (multi-label), so F1 must binarise
# each score on its own. With the default threshold=None, tfa's F1Score keeps
# only the arg-max class per sample, which misreports multi-label performance.
# The metric keeps its default name, so logs still use 'f1_score' / 'val_f1_score'.
f1 = tfa.metrics.F1Score(num_classes=6, average='macro', threshold=0.5)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=[f1]
             )


In [None]:
# Callbacks come from tf.keras because the model is a tf.keras Model.

# Stop when the validation F1 stops improving. `monitor` must be the log key
# (a string), not the metric object; the validation metric is logged as
# 'val_f1_score'.
accEarlyStop = tf.keras.callbacks.EarlyStopping(
    monitor='val_f1_score',   # validation macro-F1
    min_delta=0.02,           # smaller changes count as no improvement
    patience=5,               # epochs without improvement before stopping
    verbose=1,
    mode='max',               # higher F1 is better
    restore_best_weights=True
)
# Stop when the validation loss stops decreasing.
lossEarlyStop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.02,           # smaller changes count as no improvement
    patience=5,               # epochs without improvement before stopping
    verbose=1,
    mode='min',               # lower loss is better
    restore_best_weights=True
)
# Multiply the learning rate by `factor` after `patience` epochs without a
# better validation loss.
lrschedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.05,
    patience=5,
    verbose=1
)
# Only the learning-rate schedule is active; the early-stopping callbacks are
# defined above but deliberately not registered.
callbacks_list = [lrschedule]

In [None]:
# Model.fit consumes the iterators directly; fit_generator is deprecated in TF 2.x.
# Step counts are passed explicitly so one epoch is exactly one pass over each
# iterator, whether Keras treats it as a Sequence or as an endless generator.
history = model.fit(
    train_data,
    steps_per_epoch=len(train_data),
    epochs=10,
    callbacks=callbacks_list,
    validation_data=valid_data,
    validation_steps=len(valid_data),
    verbose=1,
)

In [None]:
# Draw the loss / metric curves recorded during training.
plot_history(history)

In [None]:
# Model.evaluate accepts the iterator directly; evaluate_generator is deprecated in TF 2.x.
# With the compiled loss and single metric it returns [loss, f1_score].
loss, f1score = model.evaluate(valid_data, steps=len(valid_data), verbose=1)

In [None]:
# _model = tf.keras.models.load_model('../input/dense121modelver2/dense121_model_ver2.h5')

In [None]:
def str2max(list_str,threshold):
    """Binarise a sequence of scores against a threshold.

    Args:
        list_str: iterable of numeric scores.
        threshold: cut-off value; the comparison is strict.

    Returns:
        A list of ints with 1 where the score is greater than ``threshold``
        and 0 everywhere else, in input order.
    """
    return [1 if score > threshold else 0 for score in list_str]

def evaluate(data, threshold, max_samples=180):
    """Exact-match accuracy of the global ``model`` on batches drawn from ``data``.

    A sample counts as correct only when every thresholded output equals the
    corresponding entry of its target row.

    Args:
        data: iterable yielding ``(x_batch, y_batch)`` pairs. It may be endless,
            so iteration is cut off after a fixed number of batches.
        threshold: scores strictly greater than this value are predicted as 1.
        max_samples: approximate number of samples to score. It is converted to
            a batch count with the global ``bsize``; the default of 180 keeps
            the original behaviour.

    Returns:
        dict with ``'acc'`` (correct / total, 0.0 when nothing was scored),
        ``'correct'`` and ``'total'`` (both floats).
    """
    max_batches = math.ceil(max_samples / bsize)
    total, right = 0., 0.
    for batch_idx, (x_true, y_true) in enumerate(tqdm(data), start=1):
        scores = np.asarray(model.predict(x_true))
        # Threshold the whole batch at once instead of element by element.
        y_pred = (scores > threshold).astype(int)
        row_matches = np.all(y_pred == np.asarray(y_true), axis=1)
        total += len(row_matches)
        right += float(row_matches.sum())
        # ">=" (not "==") so a cap of zero batches still terminates.
        if batch_idx >= max_batches:
            break
    score_dic = {}
    score_dic['acc'] = right / total if total else 0.0
    score_dic['correct'] = right
    score_dic['total'] = total
    return score_dic

In [None]:
# Scratch check of a candidate threshold grid: 20 evenly spaced values from 0 to 1.0.
# NOTE(review): `thresholds` is reassigned with np.arange(0, 1.0, 0.05) in a later
# cell before it is used to look up the best threshold, so this linspace grid is
# not the one that ends up being applied.
type(np.linspace(0,1.0,num=20))
thresholds = np.linspace(0,1.0,num=20)
thresholds[0]

In [None]:
# Run the network once over a fixed set of validation batches and reuse those
# scores for every threshold. Calling evaluate() per threshold would repeat the
# forward passes 20 times and, with a shuffling iterator, score every threshold
# on a different sample of images, which makes the accuracies incomparable.
n_batches = math.ceil(180 / bsize)  # same sample budget evaluate() uses
score_batches, target_batches = [], []
for batch_idx, (x_batch, y_batch) in enumerate(valid_data, start=1):
    score_batches.append(model.predict(x_batch))
    target_batches.append(y_batch)
    if batch_idx >= n_batches:
        break
val_scores = np.concatenate(score_batches)
val_targets = np.concatenate(target_batches)

# Exact-match accuracy per threshold: a sample is correct only if every
# thresholded output equals its target.
acc_list = []
for threshold in np.arange(0,1.0,0.05):
    val_pred = (val_scores > threshold).astype(int)
    acc = float(np.mean(np.all(val_pred == val_targets, axis=1)))
    acc_list.append(acc)

In [None]:
# Rebuild the grid the sweep iterated over so positions line up with acc_list.
thresholds = np.arange(0,1.0,0.05)

# Position of the first highest accuracy and the threshold stored at that position.
max_index = int(np.argmax(acc_list))
max_acc = acc_list[max_index]
best_threshold = thresholds[max_index]

print("best threshold is {} with the acc {}".format(best_threshold,max_acc))

In [None]:
# Show the threshold selected by the sweep.
best_threshold

In [None]:
# model.save('dense121_model_ver2.h5')

In [None]:
# _model.summary()

In [None]:
# Model.evaluate accepts the iterator directly; evaluate_generator is deprecated in TF 2.x.
# With the compiled loss and single metric it returns [loss, f1_score].
loss, f1score = model.evaluate(valid_data, steps=len(valid_data), verbose=1)

In [None]:
# Load the sample submission; its 'image' column lists the test files to predict.
sample_sub = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')
sample_sub.head()

In [None]:
# Write a 256x256 copy of every test image into the current working directory.
# NOTE(review): the test iterator built further down reads from the original
# test_images directory, not from these copies -- confirm they are still needed.
test_dir = '../input/plant-pathology-2021-fgvc8/test_images/'
for img_name in tqdm(sample_sub['image']):
    print(img_name)
    with PIL.Image.open(test_dir + str(img_name)) as source_img:
        source_img.resize((256,256)).save(f'./{img_name}')

In [None]:
# Test-time generator: only rescales pixel values by 1/255, no augmentation options set.
test_datagen = ImageDataGenerator(
    rescale = 1/255.0
)

In [None]:
# Unlabelled iterator over the test images listed in sample_sub['image'].
# shuffle=False is essential: flow_from_dataframe shuffles by default, which
# would return predictions in a random order that no longer lines up with the
# rows of the submission file.
test_data = test_datagen.flow_from_dataframe(
    sample_sub,
    directory='../input/plant-pathology-2021-fgvc8/test_images',
    x_col='image',
    y_col=None,
    class_mode=None,
    color_mode="rgb",
    target_size=(224,224),
    shuffle=False,
)

In [None]:
# Score every test image; each row of `preds` holds the model's per-class outputs.
preds = model.predict(test_data)
print(preds)
# Convert to nested Python lists for the post-processing cell that follows.
preds = preds.tolist()

In [None]:
# For every test image, keep the positions of all classes whose score reaches
# best_threshold; when none does, fall back to the single highest-scoring class.
# enumerate() supplies the true position of each score. Looking the position up
# with pred.index(score) returns the first match only, so tied scores would
# produce duplicated or wrong class indices.
indices = []
for pred in preds:
    selected = [class_idx for class_idx, score in enumerate(pred) if score >= best_threshold]
    if not selected:
        selected = [int(np.argmax(pred))]
    indices.append(selected)

print(indices)

In [None]:
# Invert the training iterator's name -> index mapping into index -> name.
labels = {class_idx: class_name for class_name, class_idx in train_data.class_indices.items()}
print(labels)

In [None]:
# Translate each image's class indices into a space-separated string of label names.
testlabels = [
    ' '.join(str(labels[class_idx]) for class_idx in image_indices)
    for image_indices in indices
]

print(testlabels)

In [None]:
# Fill the sample submission's 'labels' column with the predicted label strings
# and write the result to submission.csv without the index column.
submission_template = '../input/plant-pathology-2021-fgvc8/sample_submission.csv'
sub = pd.read_csv(submission_template)
sub['labels'] = testlabels
sub.to_csv('submission.csv', index=False)
sub