In [None]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
import matplotlib.pyplot as plt
import matplotlib
from numpy import *
from sklearn import *
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import PIL
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import tensorflow_addons as tfa
import random
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Conv2D, Flatten, Dropout, Input, BatchNormalization, \
                                    GlobalAveragePooling2D, Concatenate
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
import logging
logging.basicConfig()
import struct
# Report the Keras / TensorFlow versions this run is using.
print(keras.__version__, tf.__version__)
# Use the Keras backend (K) to force channels-last image ordering.
K.set_image_data_format('channels_last')
# TF1-compat session whose config asks for device-placement logging.
device_logging = tf.compat.v1.ConfigProto(log_device_placement=True)
sess = tf.compat.v1.Session(config=device_logging)
# Do not truncate columns when pandas displays a DataFrame.
pd.set_option("display.max_columns", None)

In [None]:
# Load the training table and report its size and columns.
train = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
print(len(train))
print(train.columns)

# Bar chart of how often each raw `labels` string occurs.
# The Axes is configured and shown directly; wrapping the call in print()
# only emitted the Axes repr and added nothing to the figure.
label_freq_ax = train['labels'].value_counts().plot.bar()
label_freq_ax.set(
    title='Training images per label string',
    xlabel='labels',
    ylabel='count',
)
plt.show()

In [None]:
# Turn each space-separated label string into a list of label names.
# The isinstance guard leaves values that are not strings (e.g. rows already
# split by an earlier run of this cell) untouched, so the cell can be re-run
# without raising on `list.split`.
train['labels'] = train['labels'].apply(
    lambda value: value.split(' ') if isinstance(value, str) else value
)

In [None]:
# Binarize the entries of train['labels'] into one indicator column per class.
s = train['labels'].tolist()
mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(s)
trainx = pd.DataFrame(encoded, columns=mlb.classes_, index=train.index)

# Column sums give the number of rows carrying each class.
class_totals = trainx.sum()
print(trainx.columns)
print(class_totals)

labels = list(class_totals.keys())
print(labels)
label_counts = class_totals.values.tolist()

# Bar chart of the per-class totals.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

sns.barplot(x=labels, y=label_counts, ax=ax)

In [None]:
%%time
def add_gauss_noise(X, sigma2=0.1):  # alternative value noted by the author: 0.05
    """Return ``X`` plus zero-mean Gaussian noise of the same shape.

    Args:
        X: array exposing a ``shape`` attribute and supporting ``+`` with a
            NumPy array.
        sigma2: value handed to ``np.random.normal`` as its ``scale``
            argument. NumPy interprets ``scale`` as the standard deviation,
            so despite the name this is NOT used as a variance.

    Returns:
        ``X + noise`` where ``noise`` is drawn from NumPy's global random
        state with mean 0 and the given scale.
    """
    noise = np.random.normal(loc=0, scale=sigma2, size=X.shape)
    return X + noise

# build the data augmenter (random transforms; used for the training subset)
datagen = ImageDataGenerator(
    rescale=1/255.0,
    rotation_range=10,         # image rotation
    width_shift_range=0.1,     # image shifting
    height_shift_range=0.1,    # image shifting
    shear_range=0.1,           # shear transformation
    zoom_range=0.1,            # zooming
    horizontal_flip=True,
    preprocessing_function=add_gauss_noise,
    validation_split=0.2
)
# NOTE(review): the Keras docs state that `rescale` is applied after all other
# transformations, so add_gauss_noise appears to receive un-rescaled pixel
# values -- confirm the noise scale is what was intended.

# Evaluation generator: rescaling only. Validation images must not be randomly
# rotated/shifted/flipped, otherwise the accuracy used to pick a threshold is
# measured on perturbed inputs and changes from run to run. The same
# validation_split is kept so the "validation" subset covers the same rows.
eval_datagen = ImageDataGenerator(rescale=1/255.0, validation_split=0.2)
bsize = 32

# Arguments shared by the training and validation iterators.
flow_kwargs = dict(
    dataframe=train,
    directory='../input/resized-plant2021/img_sz_512',
    x_col="image",
    y_col='labels',
    color_mode="rgb",
    target_size=(224, 224),
    class_mode="categorical",
    batch_size=bsize,
    shuffle=False,
    seed=40,
)

train_data = datagen.flow_from_dataframe(subset="training", **flow_kwargs)
valid_data = eval_datagen.flow_from_dataframe(subset="validation", **flow_kwargs)

In [None]:
# Load the saved Keras model (HDF5 file); `model_ft` is the model used by
# `evaluate` and for the test-set predictions further down.
model_ft = tf.keras.models.load_model('../input/modeldense121/my_model_dense_freezy.h5') 

In [None]:
def str2max(list_str,threshold):
    """Binarize a sequence of scores against ``threshold``.

    Args:
        list_str: iterable of numeric scores.
        threshold: cut-off; a score must be strictly greater than it to
            count as positive.

    Returns:
        A list with 1 where the score exceeds ``threshold`` and 0 elsewhere,
        in the same order as the input.
    """
    return [1 if score > threshold else 0 for score in list_str]

def evaluate(data, threshold, max_samples=180):
    """Exact-match accuracy of ``model_ft`` on the leading batches of ``data``.

    A row counts as correct only when every thresholded prediction equals the
    corresponding target value. Uses the module-level ``model_ft`` (model) and
    ``bsize`` (batch size).

    Args:
        data: iterable yielding ``(x_batch, y_batch)`` pairs. If it has a
            ``reset()`` method (as Keras iterators do) it is rewound first.
        threshold: a score strictly greater than this is predicted as 1.
        max_samples: approximate number of samples to score; iteration stops
            after ``ceil(max_samples / bsize)`` batches. Defaults to 180, the
            value previously hard-coded.

    Returns:
        dict with ``'acc'`` (correct / total, 0.0 when nothing was scored),
        ``'correct'`` and ``'total'`` (both floats).
    """
    # Keras iterators remember their position between loops. Without a rewind
    # each call would continue after the batches consumed by the previous
    # call, so different thresholds would be scored on different samples.
    rewind = getattr(data, 'reset', None)
    if callable(rewind):
        rewind()

    # np.ceil is used explicitly rather than a bare `ceil` that only exists
    # through a star import.
    batch_limit = int(np.ceil(max_samples / bsize))

    total, right = 0., 0.
    for batch_no, (x_true, y_true) in enumerate(tqdm(data), start=1):
        # Rows are samples and columns are classes, as the row-by-row
        # comparison of predictions and targets requires.
        probs = np.asarray(model_ft.predict(x_true))
        y_pred = (probs > threshold).astype(int)
        row_is_exact = np.all(np.asarray(y_true) == y_pred, axis=1)
        total += len(row_is_exact)
        right += int(row_is_exact.sum())
        # The iterator may loop forever, so stop explicitly (>= also covers
        # a non-positive max_samples).
        if batch_no >= batch_limit:
            break

    score_dic = {
        # Guard against an empty iterator instead of dividing by zero.
        'acc': right / total if total else 0.0,
        'correct': right,
        'total': total,
    }
    return score_dic

In [None]:
# Score every candidate threshold on the validation iterator.
acc_list = []
for threshold in np.arange(0,1.0,0.05):
    # The iterator resumes where the previous loop stopped; rewinding makes
    # every threshold see the same leading validation batches, so the
    # accuracies are actually comparable.
    valid_data.reset()
    acc = evaluate(valid_data,threshold)['acc']
    acc_list.append(acc)

In [None]:
# Same grid as the sweep that filled acc_list, so positions line up.
thresholds = np.arange(0,1.0,0.05)

# First position holding the highest accuracy, and the values found there.
max_index = int(np.argmax(acc_list))
max_acc = acc_list[max_index]
best_threshold = thresholds[max_index]

print("best threshold is {} with the acc {}".format(best_threshold,max_acc))

In [None]:
# `import PIL` alone does not load the Image submodule; import it explicitly
# so PIL.Image does not depend on another library having imported it first.
import PIL.Image

test = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')

# Write a 256x256 copy of every test image into the working directory.
for img_name in tqdm(test['image']):
    path = '../input/plant-pathology-2021-fgvc8/test_images/'+str(img_name)
    with PIL.Image.open(path) as img:
        # Keep the resized copy under its own name so the `with` target still
        # refers to the opened source image.
        resized = img.resize((256,256))
        resized.save(f'./{img_name}')

In [None]:
# Prediction-time generator: rescaling only. Random rotation/shift/flip/noise
# must not be applied to test images, otherwise predictions are made on
# perturbed inputs and are not reproducible.
test_datagen = ImageDataGenerator(rescale=1/255.0)
test_data = test_datagen.flow_from_dataframe(
    test,
    directory = './',
    x_col="image",
    y_col= None,
    color_mode="rgb",
    # NOTE(review): the validation iterator uses target_size (224,224) while
    # this one uses (256,256) -- confirm which size the model expects.
    target_size = (256,256),
    classes=None,
    class_mode=None,
    batch_size=bsize,
    shuffle=False,
    seed=40,
)

preds = model_ft.predict(test_data)
print(preds)
preds = preds.tolist()

# For every image keep the class positions whose score beats the threshold.
indices = []
for pred in preds:
    # enumerate gives the true position of each score; list.index(value)
    # returned the FIRST equal value and so produced wrong/duplicate indices
    # when two classes had identical scores. The strict `>` matches the
    # comparison used when the threshold was tuned.
    selected = [idx for idx, score in enumerate(pred) if score > best_threshold]
    if not selected:
        # Nothing cleared the threshold: fall back to the top-scoring class.
        selected = [int(np.argmax(pred))]
    indices.append(selected)

print(indices)

In [None]:
# Invert the iterator's {class name: index} mapping into {index: class name}.
labels = {idx: name for name, idx in train_data.class_indices.items()}
print(labels)

# One space-separated label string per test image. The loop variable is not
# called `image`, which would overwrite the imported
# tensorflow.keras.preprocessing.image module.
testlabels = [
    ' '.join(str(labels[i]) for i in image_indices)
    for image_indices in indices
]

print(testlabels)

In [None]:
# Remove exactly the resized copies written to the working directory earlier
# (one './<image name>' per row of `test`). A './*.jpg' glob would also delete
# unrelated JPEGs and would miss copies with another extension.
delfiles = [f'./{img_name}' for img_name in test['image']]

for file in delfiles:
    # Tolerate a re-run after the files are already gone.
    if os.path.exists(file):
        os.remove(file)

In [None]:
# Start from the sample submission, replace its `labels` column with the
# predicted label strings, write the CSV and display the frame.
submission_template = '../input/plant-pathology-2021-fgvc8/sample_submission.csv'
sub = pd.read_csv(submission_template).assign(labels=testlabels)
sub.to_csv('submission.csv', index=False)
sub