In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import skimage
import os
import matplotlib.pyplot as plt
import imageio
import numpy as np
import skimage.io
import skimage.transform
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization,LeakyReLU, Activation, MaxPooling2D
from sklearn.metrics import classification_report
from scipy.ndimage.filters import convolve
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MultiLabelBinarizer
import time


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Pre-final notebook

In this notebook I make big improvements compared to previous versions.

Below you can find:
* first look at the data
* some info about classes and their balance
* blanks for data processing
* blank model
* correct submission sender for this competition

+

* data processing
* better model

For data processing I use image augmentation.
I also added some layers to the model.


For future work:

* models for high leaderboard score

#### Let's take a first look at the data



In [None]:
# Load the training index: one row per image with its label string.
# 'labels' is read as a pandas categorical so the distinct label strings are
# available later through .cat.categories.
data = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv', index_col=False, dtype={'labels':'category'})
data.head()

We get two columns: 'image' and 'labels'.

'image' holds the image file names, and 'labels' holds the diseases associated with each picture.

Now let's look at the target.

In [None]:
# Folder holding the training images. The trailing slash matters because file
# names are appended to it by plain string concatenation.
img_folder = '/kaggle/input/plant-pathology-2021-fgvc8/train_images/'

# Keep only the rows whose image file is actually present on disk.
img_exists = data['image'].apply(lambda f: os.path.exists(img_folder + f))
data = data[img_exists]

# Figure size shared by the image-grid plots.
f_figsize = (16,5)

# Distinct label strings of the training CSV (the categorical's categories).
diseases = data['labels'].cat.categories
diseases

There are 12 label types, but they are not clean: some are combined labels such as 'rust complex'.

We want to split these labels to make them easier for the model, but we will do that later.

Now let's use the seminar code to show some pictures from the dataset.

In [None]:
# Every label except 'healthy' gets one panel. Building the list first keeps
# the number of axes equal to the number of labels actually drawn.
disease_labels = [s for s in diseases if s != 'healthy']
f, ax = plt.subplots(nrows=1, ncols=len(disease_labels), figsize=f_figsize)

# Draw the first image found for each disease label
for axis, s in zip(ax, disease_labels):
    file = img_folder + data[data['labels']==s].iloc[0]['image']
    axis.imshow(imageio.imread(file), resample=True)
    axis.set_title(s, fontsize=8)

plt.suptitle("Plant diseases")
plt.tight_layout()
plt.show()

# Sample some healthy plants to have a look at
ncols = 5
healthy = data[data['labels'] == 'healthy'].sample(ncols)
f, ax = plt.subplots(nrows=1, ncols=ncols, figsize=f_figsize)

for axis, image_name in zip(ax, healthy['image']):
    axis.imshow(imageio.imread(img_folder + image_name))

plt.suptitle("Healthy plants")
plt.tight_layout()
plt.show()

# Draw one randomly sampled image for each disease label
f, ax = plt.subplots(1, len(disease_labels), figsize=f_figsize)

for axis, c in zip(ax, disease_labels):
    plant = data[data['labels'] == c].sample(1).iloc[0]
    axis.imshow(imageio.imread(img_folder + plant['image']))
    axis.set_title(plant['labels'], fontsize=8)

plt.suptitle("Diseased plants")
plt.tight_layout()
plt.show()

Of course, the classes are imbalanced, but we will not do anything about it here.

In [None]:
# value_counts() orders its result by frequency, while the categorical's
# categories keep their own order. Labels and bar widths are therefore both
# taken from the same Series so that every bar is paired with its own label.
label_counts = data['labels'].value_counts()

plt.figure(figsize=(16, 6))
plt.barh(label_counts.index.astype(str), label_counts.values)
plt.tight_layout()
plt.show()

So now let's transform the classes into per-disease indicator columns.

In [None]:
# Split every label string on whitespace into its individual disease names.
classes = data['labels'].astype(str).str.split()
multi = MultiLabelBinarizer().fit(classes)

# Reuse data's index for the indicator columns: rows were filtered out of
# `data` earlier, so a fresh 0..n-1 index would not line up with
# data['image'] in the column-wise concat below.
labels = pd.DataFrame(multi.transform(classes), columns=multi.classes_, index=data.index)

labels = pd.concat([data['image'], labels], axis=1)
labels.head()

This is our final version of labels for dataset.

In [None]:
# Every column after the first ('image') is one indicator column.
labels.columns[1:]

Let's use some common seminar code for splitting.

In [None]:
def split(data):
    """Partition ``data`` into train, validation and test subsets.

    A test subset is held out first (using train_test_split's default size);
    10% of the remaining rows then become the validation subset. Both splits
    use random_state=24.

    Returns a ``(train, validation, test)`` tuple.
    """
    remainder, test_part = train_test_split(data, random_state=24)
    train_part, val_part = train_test_split(remainder, test_size=0.1, random_state=24)
    return train_part, val_part, test_part

In [None]:
# Split the indicator table into train / validation / test frames.
# NOTE(review): the "_bal" suffix suggests balancing, but no balancing step is visible in this notebook.
train_plants_bal, val_plants, test_plants = split(labels)

In [None]:
# Display the training frame (image name plus one indicator column per label).
train_plants_bal

In [None]:
# Number of training images carrying each label: count the rows equal to 1 in
# every indicator column (all columns after the first, which is 'image').
# Kept as a float array, like the np.zeros-based version it replaces.
count = train_plants_bal.iloc[:, 1:].eq(1).sum().to_numpy(dtype=float)

print(count)

In [None]:
# Horizontal bars: one per label column, sized by its training-set count.
fig, count_ax = plt.subplots(figsize=(16, 6))
count_ax.barh(labels.columns[1:], count)
fig.tight_layout()
plt.show()

Now we can continue to data reading and models.

Below are some constants that we will use.

In [None]:
# Size every image is resized to by read_img.
# NOTE(review): IMAGE_SIZE is ordered [width, height] but is passed to
# tf.image.resize, which takes (height, width); harmless while both are equal.
IMAGE_WIDTH, IMAGE_HEIGHT = 128, 128
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
# Kernel size of the convolution layers.
KERNEL_SIZE = 3
# Number of colour channels decoded per image.
IMAGE_CHANNELS = 3
# NOTE(review): not referenced in the cells visible here (split() hard-codes 24) — confirm whether it is still needed.
RANDOM_STATE = 1337
# Number of training epochs.
N_EPOCH = 30
# Batch size of the augmenting generator.
BATCH_SIZE = 64
# Pool size of the max-pooling layers.
MAX_POOL_DIM = 2
# Label columns used as the model target; their order is the order of the model outputs.
target = ['complex', 'frog_eye_leaf_spot', 'healthy', 'powdery_mildew', 'rust', 'scab']

Next is a modified version of the read_img function from the seminar.

It is modified to read images faster using TensorFlow.

Also, after some experiments in Photoshop, I found that increasing saturation and contrast can help with this problem.

In [None]:
def read_img(file, sat_factor = 1.5, cont_factor = 1.5, img_folder='/kaggle/input/plant-pathology-2021-fgvc8/train_images/'):
    """Load one JPEG as a float32 tensor resized to IMAGE_SIZE.

    Parameters
    ----------
    file : image file name, appended to ``img_folder`` by string concatenation
    sat_factor : factor passed to tf.image.adjust_saturation (1 leaves saturation unchanged)
    cont_factor : factor passed to tf.image.adjust_contrast (1 leaves contrast unchanged)
    img_folder : folder prefix; must end with a path separator

    Returns
    -------
    The processed image tensor with IMAGE_CHANNELS channels.
    """
    raw_bytes = tf.io.read_file(img_folder + file)
    decoded = tf.io.decode_jpeg(raw_bytes, channels=IMAGE_CHANNELS)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    resized = tf.image.resize(as_float, IMAGE_SIZE)
    saturated = tf.image.adjust_saturation(resized, sat_factor)
    return tf.image.adjust_contrast(saturated, cont_factor)


In [None]:
# Reference view: factors of 1 leave saturation and contrast untouched.
img  = read_img('8aa78fd5c6c0cec2.jpg', 1, 1)
plt.imshow(img)
plt.show()

In [None]:
# Same image with read_img's default saturation/contrast factors (1.5 each).
img  = read_img('8aa78fd5c6c0cec2.jpg')
plt.imshow(img)
plt.show()

Below we augment the images with Keras' ImageDataGenerator.

Now let's use ImageDataGenerator.

First, I zoom the images, because in most pictures the leaf is in the center, and some diseases have the same color as the plant branches, which are usually near the edges.

Also, let's apply some common flips, rotations and shifts.

In [None]:
def _load_split(plants, target, name):
    """Read every image of one split into a single array and pick its label columns."""
    print("Started %s" % name)
    images = np.stack(plants['image'].apply(read_img))
    return images, plants[target]


def prepare2train(train_plants, val_plants, test_plants, target):
    """Load the three splits into memory and build the augmenting generator.

    Parameters
    ----------
    train_plants, val_plants, test_plants : DataFrames with an 'image' column
        (file names readable by read_img) and the label columns named in ``target``
    target : list of label column names

    Returns
    -------
    (generator, train_X, val_X, test_X, train_y, val_y, test_y) where the
    ``*_X`` values are stacked image arrays and the ``*_y`` values are the
    selected label columns.
    """
    train_X, train_y = _load_split(train_plants, target, "train")
    val_X, val_y = _load_split(val_plants, target, "val")
    test_X, test_y = _load_split(test_plants, target, "test")

    # Random rotation / zoom / shift / flip augmentation; every data-dependent
    # normalisation is switched off.
    generator = ImageDataGenerator(
            featurewise_center=False,
            samplewise_center=False,
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,
            rotation_range=25,
            zoom_range = 0.15,
            width_shift_range=0.15,
            height_shift_range=0.15,
            horizontal_flip=True,
            vertical_flip=True)
    print("Started generator")
    # generator.fit(train_X) is intentionally not called: Keras only needs it
    # for featurewise_center, featurewise_std_normalization or zca_whitening,
    # none of which is enabled, and it would copy the whole training array.
    return (generator, train_X, val_X, test_X, train_y, val_y, test_y)

128×128 is small, so not every detail can be seen, but memory is limited, so we will do the best we can.

On the other hand, ImageDataGenerator offers the flow_from_... methods, which avoid loading everything into memory, but they are very slow.

In [None]:
# Load all three splits into memory and create the augmentation generator.
generator, train_X, val_X, test_X, train_y, val_y, test_y = prepare2train(train_plants_bal, val_plants, test_plants, target)

Now we have the data. Let's try a model with VGG16-style layers, which can give good results.

Because of the multi-label target format, we use a 'sigmoid' activation in the last Dense layer and the 'binary_crossentropy' loss.

In [None]:
# Save the best model during the traning
checkpointer1 = keras.callbacks.ModelCheckpoint('best_model1.h5',
                                                monitor='val_accuracy',
                                                verbose=1,
                                                save_best_only=True,
                                                save_weights_only=True)
# Build CNN model
model1=Sequential()
model1.add(Conv2D(16, kernel_size=KERNEL_SIZE, input_shape=(*IMAGE_SIZE,IMAGE_CHANNELS), activation='relu', padding='same'))
model1.add(Conv2D(16, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(MaxPool2D(MAX_POOL_DIM))
model1.add(Conv2D(32, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(Conv2D(32, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(Conv2D(32, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(MaxPool2D(MAX_POOL_DIM))
model1.add(Conv2D(64, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(Conv2D(64, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(Conv2D(64, kernel_size=KERNEL_SIZE, activation='relu', padding='same'))
model1.add(MaxPool2D(MAX_POOL_DIM))
model1.add(Flatten())
model1.add(Dense(units=2048,activation="relu"))
model1.add(Dense(len(target), activation='sigmoid'))
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the network.
model1.summary()

Now we train the model on augmented batches produced by the generator; the whole training set is already in memory, so a single training call is enough.

In [None]:
# Train on augmented batches. Model.fit accepts the generator iterator directly;
# fit_generator is deprecated in TensorFlow 2.x in favour of fit.
training1 = model1.fit(generator.flow(train_X, train_y, batch_size=BATCH_SIZE),
                       epochs=N_EPOCH,
                       validation_data=(val_X, val_y),
                       callbacks=[checkpointer1])
# Restore the weights of the best epoch saved by the checkpoint callback
model1.load_weights('best_model1.h5')

I also use the very useful evaluation code from the seminar.

It is very helpful for checking the model's results.

In [None]:
def eval_model(training, model, test_X, test_y, target):
    """Plot the training curves and report test-set metrics of a multi-label model.

    Parameters
    ----------
    training : object returned by the training call; its ``history`` dict must
        contain 'loss', 'val_loss', 'accuracy' and 'val_accuracy'
    model : trained model providing ``predict`` and ``evaluate``
    test_X : test images passed to the model
    test_y : DataFrame of 0/1 indicators, one column per label
    target : only used in the plot titles
    """
    history = training.history

    ## Trained model analysis and evaluation
    f, ax = plt.subplots(2,1, figsize=(6,5))
    ax[0].plot(history['loss'], label="Loss")
    ax[0].plot(history['val_loss'], label="Validation loss")
    ax[0].set_title('%s: loss' % target)
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('Loss')
    ax[0].legend()

    # Accuracy curves come from the `training` argument, not from a notebook
    # global, so the function reports on whichever run it is given.
    ax[1].plot(history['accuracy'], label="Accuracy")
    ax[1].plot(history['val_accuracy'], label="Validation accuracy")
    ax[1].set_title('%s: accuracy' % target)
    ax[1].set_xlabel('Epoch')
    ax[1].set_ylabel('Accuracy')
    ax[1].legend()
    plt.tight_layout()
    plt.show()

    # Per-label share of true positives among the actual positives.
    # NOTE(review): this is per-label recall, although the plot calls it accuracy.
    test_pred = model.predict(test_X)
    print(test_pred)
    predicted = test_pred > 0.5
    acc_by_subspecies = np.logical_and(predicted, test_y).sum()/test_y.sum()
    acc_by_subspecies.plot(kind='bar', title='Accuracy by %s' % target)
    plt.ylabel('Accuracy')
    plt.show()

    # Print metrics. The indicator matrices are passed as they are: an image can
    # carry several labels, and argmax would reduce each row to a single one.
    print("Classification report")
    print(classification_report(test_y.values, predicted.astype(int), target_names=test_y.columns))

    # Loss and the compiled metric on the test set
    test_res = model.evaluate(test_X, test_y.values, verbose=0)
    print('Loss function: %s, accuracy:' % test_res[0], test_res[1])

In [None]:
# Pass the shared `target` list rather than a second hand-written copy of it.
eval_model(training1, model1, test_X, test_y, target)

We made a simple model. Let's submit it and check the results.

In [None]:
# Sample submission: lists the test image names that need a prediction.
test_df = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')
# Test image folder under the Kaggle input root, with the trailing slash that
# read_img's string concatenation needs.
test_path = '/kaggle/input/plant-pathology-2021-fgvc8/test_images/'
test_df

In [None]:
# NOTE(review): dead debugging line. If it is ever re-enabled, the folder must be
# passed as img_folder=...; as a positional argument it would be bound to sat_factor.
#kekw = read_img(test_df['image'][1], "../input/plant-pathology-2021-fgvc8/test_images/")

Submissions for this competition are allowed only from competition notebooks, so we need a special saver for the results.

In [None]:
# Probability above which a label is written to the submission
PREDICTION_THRESHOLD = 0.3
test_img_folder = '/kaggle/input/plant-pathology-2021-fgvc8/test_images/'

submission = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')
for row in submission.index:
    image = read_img(submission.loc[row,'image'], img_folder=test_img_folder).numpy()

    # The model expects a leading batch axis; [0] takes the single prediction back out
    predict = model1.predict(image[np.newaxis, ...])[0]

    # Model outputs follow the order of `target`, the columns it was trained on
    result = [name for name, prob in zip(target, predict) if prob > PREDICTION_THRESHOLD]
    if not result:
        # Nothing passed the threshold: fall back to the most probable label
        # instead of writing an empty label string.
        result = [target[int(np.argmax(predict))]]
    submission.loc[row,'labels'] = ' '.join(result)

submission.head()

In [None]:
# Switch to the Kaggle working directory so the CSV below is written there.
os.chdir(r'/kaggle/working')

# Write the submission without the DataFrame index column.
submission.to_csv('submission.csv', index=False)