In [None]:
import datetime
# Notebook start timestamp; subtracted from time2 in the last cells to report total run time.
time1 = datetime.datetime.now()
print(time1)

In [None]:
import random
import cv2
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Flatten,Dense,Dropout,BatchNormalization
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.metrics import categorical_accuracy

from tensorflow.keras.callbacks import (EarlyStopping, ReduceLROnPlateau, 
                                        ModelCheckpoint, CSVLogger, LearningRateScheduler)
import tensorflow as tf

In [None]:
# ---- Run configuration ------------------------------------------------------
# Input tensor shape fed to the network.
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224
IMAGE_CHANNELS = 3

# Batch sizes (rows read per CSV chunk) and number of training epochs.
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 5
EPOCHS = 1

# Experiment switches.
BALANCE = False  # True -> upsample the minority class in the training split
AUG = False      # True -> train with the augmenting generator
LR = 0.005       # learning rates tried: 0.001, 0.005, 0.01

# Every output file carries the same experiment tag so that runs with
# different settings do not overwrite each other.
_run_tag = '_'.join([
    'balance' if BALANCE else 'nobalance',
    'aug' if AUG else 'noaug',
    str(LR),
])

H5_FILE_NAME = 'model' + '_' + _run_tag + '.h5'
print(H5_FILE_NAME)

TRAINING_LOG_FILE_NAME = 'training_log' + '_' + _run_tag + '.csv'
print(TRAINING_LOG_FILE_NAME)

SUBMISSION_FILE_NAME = 'submission' + '_' + _run_tag + '.csv'
print(SUBMISSION_FILE_NAME)

In [None]:
# Read the three competition tables: the labelled training rows, the test
# rows, and the sample submission template.
df_train_all = pd.read_csv('../input/plant-pathology-2020-fgvc7/train.csv')
df_test = pd.read_csv('../input/plant-pathology-2020-fgvc7/test.csv')
df_sample = pd.read_csv('../input/plant-pathology-2020-fgvc7/sample_submission.csv')

# One shape per line, in the order train / test / sample.
print(df_train_all.shape, df_test.shape, df_sample.shape, sep='\n')

In [None]:
# Identify the target class of each row in the train set

def get_class(row):
    """Return the class name flagged in one row of the training table.

    The one-hot columns are checked in a fixed priority order
    ('multiple_diseases', then 'rust', then 'scab'); the first column
    whose value equals 1 wins. If none of them is set the row is
    labelled 'healthy'.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support ``row[column_name]`` for the three disease columns.

    Returns
    -------
    str
        One of 'multiple_diseases', 'rust', 'scab' or 'healthy'.
    """
    for label in ('multiple_diseases', 'rust', 'scab'):
        if row[label] == 1:
            return label
    return 'healthy'
    
# Add a single string label column, 'target', derived row by row from the one-hot columns.
df_train_all['target'] = df_train_all.apply(get_class, axis=1)

df_train_all.head()

In [None]:
# Class distribution of the full training table
df_train_all['target'].value_counts()

In [None]:
# shuffle the full training table (fixed seed, so the order is reproducible)
df_train_all_shuffle = shuffle(df_train_all, random_state=101)
# select the column that we will use for stratification
y = df_train_all_shuffle['target']

# hold out 20% of the rows for validation, stratified on the class label
df_train, df_val = train_test_split(df_train_all_shuffle, test_size=0.2, random_state=101, stratify=y)


print(df_train.shape)
print(df_val.shape)

In [None]:
# Class distribution of the training split
df_train['target'].value_counts()

In [None]:
# Class distribution of the validation split
df_val['target'].value_counts()

In [None]:
def train_balancer(df_train, minority_class='multiple_diseases', copies=6, random_state=101):
    """Upsample one class of the training table by row duplication.

    All rows whose 'target' equals ``minority_class`` are repeated so that
    they appear ``copies`` times in total; every other row is kept once.
    The combined table is re-indexed and then shuffled.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training table with a 'target' column.
    minority_class : str, default 'multiple_diseases'
        Value of 'target' to duplicate.
    copies : int, default 6
        Total number of times each minority row appears in the result.
        Must be at least 1 (1 means no upsampling).
    random_state : int, default 101
        Seed passed to ``shuffle`` so the row order is reproducible.

    Returns
    -------
    pandas.DataFrame
        The upsampled, shuffled table.

    Raises
    ------
    ValueError
        If ``copies`` is smaller than 1 (that would silently drop the class).
    """
    if copies < 1:
        raise ValueError('copies must be >= 1, got {}'.format(copies))

    is_minority = df_train['target'] == minority_class
    df_majority = df_train[~is_minority]
    df_minority = df_train[is_minority]

    # Majority rows first, followed by `copies` repetitions of the minority
    # rows; the order only matters as the input to the seeded shuffle.
    df_train_up = pd.concat([df_majority] + [df_minority] * copies,
                            axis=0).reset_index(drop=True)

    return shuffle(df_train_up, random_state=random_state)

In [None]:
# Upsample the minority class only when the BALANCE switch is on, then show
# the class distribution of the train set that will actually be used.
if BALANCE:
    df_train = train_balancer(df_train)
df_train['target'].value_counts()

In [None]:
# Write the three tables to gzipped CSV files in the working directory;
# the batch generators read these files back in chunks.
df_train.to_csv('df_train.csv.gz', compression='gzip', index=False)
df_val.to_csv('df_val.csv.gz', compression='gzip', index=False)
df_test.to_csv('df_test.csv.gz', compression='gzip', index=False)
!ls

In [None]:
# Albumentations

import albumentations as albu


def augment_image(augmentation, image):
    """Apply one Albumentations-style transform to an image.

    Parameters
    ----------
    augmentation : callable
        A transform instance, e.g. ``HorizontalFlip(p=1)`` where ``p`` is the
        probability of the transform being executed. It is called as
        ``augmentation(image=image)`` and must return a dict that has an
        'image' entry.
    image : array
        The image to transform. The generators in this notebook pass
        resized RGB arrays (height x width x channels).

    Returns
    -------
    The value stored under the 'image' key of the transform's result,
    i.e. the augmented image.
    """
    # The transform returns a dict; only the augmented pixels are needed.
    return augmentation(image=image)['image']

In [None]:
# Define the transforms

# Modified from --> Pneumothorax - 1st place solution
# Source: https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/107824#latest-620521


# All transforms below use p=1, so each one is applied every time it is called.

# 1) Random shift / scale / rotation (up to 45 degrees).
#    interpolation and border_mode are passed as raw integer codes
#    (in OpenCV, 1 is cv2.INTER_LINEAR and 4 is cv2.BORDER_REFLECT_101).
aug_types1 = albu.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=45, 
                  interpolation=1, border_mode=4, value=None, mask_value=None, 
                  shift_limit_x=None, shift_limit_y=None, always_apply=False, 
                  p=1)

# 2) Flip.
aug_types2 = albu.Flip(p=1)

# 3) Random brightness / contrast change, each limited to 0.2.
aug_types3 = albu.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2,
                                           brightness_by_max=True, always_apply=False,p=1)

# 4) Blur.
#    NOTE(review): the upper blur_limit is the float 3.5; blur kernel sizes are
#    normally integers, so this bound looks unintended -- confirm.
aug_types4 = albu.Blur(blur_limit=(3,3.5), always_apply=False, p=1)

# 5) Exactly one of an elastic transform or a grid distortion.
aug_types5 = albu.OneOf([
                albu.ElasticTransform(alpha=1, sigma=50, alpha_affine=50,
                                       interpolation=1, border_mode=4, value=None,mask_value=None,
                                       always_apply=False, approximate=False, p=1),
                albu.GridDistortion(num_steps=5, distort_limit=0.3, interpolation=1, border_mode=4, 
                                 value=None, mask_value=None, always_apply=False, p=1)
                        ], p=1)

In [None]:
def train_generator_aug(batch_size=8,random_seed=None):
    """Yield augmented training batches forever.

    'df_train.csv.gz' is read in chunks of ``batch_size`` rows. For every
    row the image is loaded, converted to RGB and resized, and five
    augmented variants (aug_types1 .. aug_types5) are produced, so each
    yielded batch holds ``6 * len(chunk)`` images: the originals first,
    followed by one block per augmentation. The labels are repeated in the
    same block order.

    Parameters
    ----------
    batch_size : int, default 8
        Number of CSV rows (original images) per chunk.
    random_seed : int or None, default None
        When not None, Python's ``random`` module is re-seeded with this
        value at the start of every pass over the CSV file.

    Yields
    ------
    (X_train, y_train)
        X_train: pixel values divided by 255, shape
        (6 * n, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS).
        y_train: one-hot labels, shape (6 * n, 4), columns in the order
        healthy, multiple_diseases, rust, scab.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read by ``cv2.imread``.
    """
    cols = ['healthy', 'multiple_diseases', 'rust', 'scab']
    # Looked up at call time, like the original per-line references.
    augmentations = (aug_types1, aug_types2, aug_types3, aug_types4, aug_types5)
    n_variants = 1 + len(augmentations)

    while True:

        # `is not None` so that a seed of 0 is honoured as well.
        # NOTE(review): re-seeding on every pass replays the same `random`
        # sequence each epoch; whether the augmentation library draws from
        # Python's `random` depends on its version -- confirm.
        if random_seed is not None:
            random.seed(random_seed)

        # load the data in chunks (batches)
        for df in pd.read_csv('df_train.csv.gz', chunksize=batch_size):

            image_id_list = list(df['image_id'])
            n_images = len(image_id_list)

            # Labels depend only on the chunk, so build them once here
            # instead of once per image.
            y_chunk = np.asarray(df[cols])
            y_train = np.concatenate([y_chunk] * n_variants, axis=0)

            X_train = np.zeros((n_variants * n_images, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS),
                               dtype=np.uint8)

            for i, image_id in enumerate(image_id_list):
                path = '../input/plant-pathology-2020-fgvc7/images/' + image_id + '.jpg'

                image = cv2.imread(path)
                if image is None:
                    # imread returns None instead of raising on a bad path
                    raise FileNotFoundError('could not read image: ' + path)

                # OpenCV loads BGR; the model is fed RGB
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # cv2.resize expects dsize as (width, height)
                image = cv2.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT))

                # block 0 = original image, block k = k-th augmentation
                X_train[i] = image
                for k, augmentation in enumerate(augmentations, start=1):
                    X_train[i + k * n_images] = augment_image(augmentation, image)

            # Normalize the images
            X_train = X_train / 255

            yield X_train, y_train
            

In [None]:
def train_generator_no_aug(batch_size=8):
    """Yield un-augmented training batches forever.

    'df_train.csv.gz' is read in chunks of ``batch_size`` rows; each image
    is loaded, converted to RGB and resized to the configured input size.

    Parameters
    ----------
    batch_size : int, default 8
        Number of CSV rows (images) per batch.

    Yields
    ------
    (X_train, y_train)
        X_train: pixel values divided by 255, shape
        (n, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS).
        y_train: one-hot labels, shape (n, 4), columns in the order
        healthy, multiple_diseases, rust, scab.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read by ``cv2.imread``.
    """
    cols = ['healthy', 'multiple_diseases', 'rust', 'scab']

    while True:

        # load the data in chunks (batches)
        for df in pd.read_csv('df_train.csv.gz', chunksize=batch_size):

            image_id_list = list(df['image_id'])

            # Labels depend only on the chunk, so build them once here
            # instead of once per image.
            y_train = np.asarray(df[cols])

            X_train = np.zeros((len(df), IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), dtype=np.uint8)

            for i, image_id in enumerate(image_id_list):
                path = '../input/plant-pathology-2020-fgvc7/images/' + image_id + '.jpg'

                image = cv2.imread(path)
                if image is None:
                    # imread returns None instead of raising on a bad path
                    raise FileNotFoundError('could not read image: ' + path)

                # OpenCV loads BGR; the model is fed RGB
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # cv2.resize expects dsize as (width, height)
                image = cv2.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT))

                X_train[i] = image

            # Normalize the images
            X_train = X_train / 255

            yield X_train, y_train

In [None]:
# Test the generator: pull one batch and check the array shapes

# initialize (same AUG switch that the training cell uses)
if AUG:
    train_gen = train_generator_aug(batch_size=8,random_seed=123)
else:
    train_gen = train_generator_no_aug(batch_size=8)

# run the generator
X_train, y_train = next(train_gen)

print(X_train.shape)
print(y_train.shape)

In [None]:
def val_generator(batch_size=5):
    """Yield validation batches forever, in the row order of 'df_val.csv.gz'.

    The file is read in chunks of ``batch_size`` rows; each image is
    loaded, converted to RGB and resized to the configured input size.

    Parameters
    ----------
    batch_size : int, default 5
        Number of CSV rows (images) per batch.

    Yields
    ------
    (X_val, y_val)
        X_val: pixel values divided by 255, shape
        (n, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS).
        y_val: one-hot labels, shape (n, 4), columns in the order
        healthy, multiple_diseases, rust, scab.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read by ``cv2.imread``.
    """
    cols = ['healthy', 'multiple_diseases', 'rust', 'scab']

    while True:

        # load the data in chunks (batches)
        for df in pd.read_csv('df_val.csv.gz', chunksize=batch_size):

            image_id_list = list(df['image_id'])

            # Labels depend only on the chunk, so build them once here
            # instead of once per image.
            y_val = np.asarray(df[cols])

            X_val = np.zeros((len(df), IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), dtype=np.uint8)

            for i, image_id in enumerate(image_id_list):
                path = '../input/plant-pathology-2020-fgvc7/images/' + image_id + '.jpg'

                image = cv2.imread(path)
                if image is None:
                    # imread returns None instead of raising on a bad path
                    raise FileNotFoundError('could not read image: ' + path)

                # OpenCV loads BGR; the model is fed RGB
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # cv2.resize expects dsize as (width, height)
                image = cv2.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT))

                X_val[i] = image

            # Normalize the images
            X_val = X_val / 255

            yield X_val, y_val

In [None]:
# Test the generator: pull one validation batch and check the array shapes

# initialize
val_gen = val_generator(batch_size=5)

# run the generator
X_val, y_val = next(val_gen)

print(X_val.shape)
print(y_val.shape)

In [None]:
def test_generator(batch_size=1):
    """Yield test-image batches (no labels) forever, in the row order of 'df_test.csv.gz'.

    The file is read in chunks of ``batch_size`` rows; each image is
    loaded, converted to RGB and resized to the configured input size.

    Parameters
    ----------
    batch_size : int, default 1
        Number of CSV rows (images) per batch.

    Yields
    ------
    X_test
        Pixel values divided by 255, shape
        (n, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS).

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read by ``cv2.imread``.
    """
    while True:

        # load the data in chunks (batches)
        for df in pd.read_csv('df_test.csv.gz', chunksize=batch_size):

            image_id_list = list(df['image_id'])

            X_test = np.zeros((len(df), IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), dtype=np.uint8)

            for i, image_id in enumerate(image_id_list):
                path = '../input/plant-pathology-2020-fgvc7/images/' + image_id + '.jpg'

                image = cv2.imread(path)
                if image is None:
                    # imread returns None instead of raising on a bad path
                    raise FileNotFoundError('could not read image: ' + path)

                # OpenCV loads BGR; the model is fed RGB
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # cv2.resize expects dsize as (width, height)
                image = cv2.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT))

                X_test[i] = image

            # Normalize the images
            X_test = X_test / 255

            yield X_test

In [None]:
# Test the generator: pull one test batch and check the array shape

# initialize
test_gen = test_generator(batch_size=1)

# run the generator
X_test = next(test_gen)

print(X_test.shape)

In [None]:
def create_model():
    """Build and compile the classifier.

    The network is an ImageNet-pretrained MobileNetV3Large backbone without
    its classification head, followed by global average pooling, dropout
    (0.3) and a 4-unit softmax layer (one unit per class).

    The model is compiled here with the KL-divergence loss and the default
    'adam' optimizer. Note that the training cell of this notebook calls
    ``model.compile`` again with its own optimizer and loss before fitting.

    Returns
    -------
    tf.keras.Sequential
        The compiled model.
    """
    # NOTE(review): recent Keras versions of MobileNetV3 include an input
    # rescaling step and expect pixel values in [0, 255], whereas the
    # generators in this notebook divide by 255 -- confirm against the
    # installed TensorFlow version.
    pre_trained = tf.keras.applications.MobileNetV3Large(
        # use the shared constants so the model input matches the generators
        input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS),
        weights='imagenet',
        include_top=False)

    model = tf.keras.Sequential([
        pre_trained,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(4, activation='softmax')
    ])
    model.compile(
        loss = 'kullback_leibler_divergence',
        optimizer = 'adam',
        metrics = ['accuracy'])
    return model

# Build the model and print its layer summary
model = create_model()

model.summary()

In [None]:
num_train_samples = len(df_train)
num_val_samples = len(df_val)

# Number of generator batches that make up one full pass over each split.
# np.ceil returns a float; Keras documents its step-count arguments as
# integers, so convert explicitly.

# determine num train steps
train_steps = int(np.ceil(num_train_samples / TRAIN_BATCH_SIZE))

# determine num val steps
val_steps = int(np.ceil(num_val_samples / VAL_BATCH_SIZE))

In [None]:
import datetime
# Timestamp taken just before training; paired with time4 to report the training time.
time3 = datetime.datetime.now()
print(time3)

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import (EarlyStopping, ReduceLROnPlateau, 
                                        ModelCheckpoint, CSVLogger, LearningRateScheduler)
# Initialize the generators
if AUG:
    train_gen = train_generator_aug(batch_size=TRAIN_BATCH_SIZE,random_seed=123)
else:
    train_gen = train_generator_no_aug(batch_size=TRAIN_BATCH_SIZE)

val_gen = val_generator(batch_size=VAL_BATCH_SIZE)


# Re-compile with the configured learning rate and categorical cross-entropy.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(
    optimizer=Adam(learning_rate=LR),
    loss='categorical_crossentropy',
    metrics=['accuracy'])


filepath = H5_FILE_NAME

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')

# Multiply the learning rate by 0.3 after 3 epochs without improvement
# in validation accuracy.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.3, patience=3,
                              verbose=1, mode='max')

# Per-epoch metrics are written to a CSV file (overwritten on each run).
log_fname = TRAINING_LOG_FILE_NAME
csv_logger = CSVLogger(filename=log_fname,
                       separator=',',
                       append=False)

callbacks_list = [checkpoint, csv_logger, reduce_lr]

# The generators never terminate, so the step counts define one epoch;
# int() guards against float step counts.
history = model.fit(train_gen, steps_per_epoch=int(train_steps), epochs=EPOCHS,
                    validation_data=val_gen, validation_steps=int(val_steps),
                    verbose=2,
                    callbacks=callbacks_list)


In [None]:
# Timestamp taken right after training finishes.
time4 = datetime.datetime.now()
print(time4)

In [None]:
# Training time (elapsed between the time3 and time4 timestamps)
print(time4-time3)

In [None]:
# Show the metric names so we know how to unpack the values returned by model.evaluate
model.metrics_names

In [None]:
# Restore the checkpointed weights before scoring the validation split.
model.load_weights(H5_FILE_NAME)

# A fresh generator so evaluation starts from the first validation row.
val_gen = val_generator(batch_size=VAL_BATCH_SIZE)

# evaluate returns the loss followed by the compiled metric (accuracy)
val_loss, val_acc = model.evaluate(val_gen, steps=val_steps)

print('val_loss:', val_loss)
print('val_acc:', val_acc)

In [None]:
# Display the training log

# CSV file written by the CSVLogger callback during model.fit
train_log = pd.read_csv(TRAINING_LOG_FILE_NAME)

train_log.head()

In [None]:
# display the loss and accuracy curves

import matplotlib.pyplot as plt

# Per-epoch curves taken from the training log
acc = train_log['accuracy']
val_acc = train_log['val_accuracy']
loss = train_log['loss']
val_loss = train_log['val_loss']
epochs = range(1, len(acc) + 1)

# One figure per metric: training curve first, validation curve second.
for _title, _ylabel, _train_curve, _val_curve in (
        ('model accuracy', 'accuracy', acc, val_acc),
        ('model loss', 'loss', loss, val_loss)):
    plt.plot(epochs, _train_curve)
    plt.plot(epochs, _val_curve)
    plt.title(_title)
    plt.ylabel(_ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Reload the checkpointed weights and predict on the validation split one
# image per step, so that the rows of preds follow the row order of df_val.
model.load_weights(H5_FILE_NAME)

val_gen = val_generator(batch_size=1)

preds = model.predict(val_gen, steps=len(df_val), verbose=1)

In [None]:
# get y_pred as index values (column position of the largest predicted value in each row)

y_pred = np.argmax(preds, axis=1)
print(y_pred)

In [None]:
# get y_true as index values

cols = ['healthy', 'multiple_diseases', 'rust', 'scab']
# one-hot label columns of the validation rows -> position of the set column
y_true = df_val[cols].to_numpy().argmax(axis=1)
print(y_true)

In [None]:
# NOTE(review): this repeats the load_weights / predict step of the earlier
# prediction cell; preds and y_pred are overwritten with the new results.
model.load_weights(H5_FILE_NAME)
val_gen = val_generator(batch_size=1)

preds = model.predict(val_gen, steps=len(df_val), verbose=1)


# class index per row; only the first 50 are printed
y_pred = np.argmax(preds, axis=1)
print(y_pred[:50])

In [None]:
# Put the preds into a dataframe (one probability column per class)

df_preds = pd.DataFrame(preds, columns=['healthy', 'multiple_diseases', 'rust', 'scab'])

# attach the image ids of the validation rows the predictions were made for
df_preds['image_id'] = df_val['image_id'].copy().values

df_preds.head()

In [None]:
# Create a submission csv file
# NOTE: the rows written here are the predictions for the validation split
# (df_val) together with the true 'target' label, not test-set predictions.

df_results = pd.DataFrame({'image_id': df_preds.image_id,
                            'healthy': df_preds.healthy,
                               'multiple_diseases': df_preds.multiple_diseases,
                               'rust': df_preds.rust,
                               'scab': df_preds.scab,
                           'target':df_val['target'].values
                           }).set_index('image_id')


# create a submission csv file
df_results.to_csv(SUBMISSION_FILE_NAME) 
df_results.head()

In [None]:
# Notebook end timestamp
time2 = datetime.datetime.now()
print(time2)

In [None]:
# Total time (elapsed since the time1 timestamp taken in the first cell)
print(time2 - time1)

In [None]:
# List the working directory to check which output files were written
!ls