# The shortest way to baseline.

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
import tensorflow as tf
from PIL import Image
import os
import matplotlib.pyplot as plt


## Version history:
* Ver. 7 Accuracy 0.748. Time 4559 s. Limit (32400)
* Ver. 10 Change photo resolution to 224x224 from 150x300. And EfficientNetB3 from EfficientNetB0. Accuracy 0.787. Time 4559 s.
* Ver. 11 Change photo resolution to 380x380 from 224x224. And EfficientNetB4 from EfficientNetB3. Accuracy 0.786. Time 16816 s.
* Ver. 12 Change photo resolution to 260x260 from 380x380. And EfficientNetB2 from EfficientNetB4. And add EarlyStopping. Also change learning_rate to 0.0005 from 0.001. Not improving.
* Ver. 13(14) Add photo rotation and change learning_rate to 0.001 from 0.0005. Not improving.
* Ver. 18 Add class_weight. Quality got worse.
* Ver. 22. Back to 260x260 and 20 epochs, remove EarlyStopping. Accuracy 0.817. Time 6239 s. New baseline.
* Ver. 23. Add 2 Dense layer to model. Not improving.
* Continue working...



In [None]:
# Root directory of the competition data. The trailing slash matters because
# other cells build file paths from it by plain string concatenation.
general_path = '../input/cassava-leaf-disease-classification/'

# Read train data

In [None]:
# Load the training table and store the labels with pandas' string dtype
# (presumably so the column can serve as a categorical y_col for the image
# generators -- confirm), then show five random rows as a sanity check.
train_csv_path = general_path + 'train.csv'
train = pd.read_csv(train_csv_path).astype({'label': 'string'})
train.sample(5)

In [None]:
# Label-number -> disease-name mapping, read from the competition JSON file
# as a pandas Series; the bare name on the last line displays it in the notebook.
names_of_disease = pd.read_json(general_path + 'label_num_to_disease_map.json', typ='series')
names_of_disease

## Pictures

In [None]:
# Preview the first nine training images in a 3x3 grid, each titled with
# the disease name that corresponds to its label.
train_image_dir = general_path + 'train_images/'
plt.figure(figsize=(16, 12))
for cell_index in range(9):
    sample = train.iloc[cell_index]
    plt.subplot(3, 3, cell_index + 1)
    pixels = np.array(Image.open(train_image_dir + sample['image_id']))
    plt.imshow(pixels)
    disease_name = names_of_disease[int(sample['label'])]
    plt.title(f'{disease_name}')
plt.show()
    

In [None]:
# Inspect every 250th training image (starting at row 1) and report the
# distinct array shapes that occur among them.
sizes = [
    np.array(Image.open(general_path + 'train_images/' + train.iloc[row]['image_id'])).shape
    for row in range(1, len(train), 250)
]
print('Picture size', set(sizes))

In [None]:
# Resolution that images are resized to, both by the training/validation
# generators and by the submission loop.
img_width, img_height = 380, 380

# Start Training

In [None]:
# Shared image generator: reserves 20% of the rows for validation and
# applies random vertical and horizontal flips.
datagen = ImageDataGenerator(validation_split=0.2,
                             vertical_flip=True,
                             horizontal_flip=True)
# Training subset, in batches of 20.
# Keras expects target_size as (height, width), so the order matters as soon
# as a non-square resolution is used.
train_datagen_flow = datagen.flow_from_dataframe(
    dataframe=train,
    directory=general_path + 'train_images',
    x_col='image_id',
    y_col='label',
    target_size=(img_height, img_width),
    batch_size=20,
    subset='training',
    seed=12345)

In [None]:
# Validation subset drawn from the same generator (and therefore the same
# 20% split and the same random flips) as the training flow.
# Keras expects target_size as (height, width), so the order matters as soon
# as a non-square resolution is used.
valid_datagen_flow = datagen.flow_from_dataframe(
    dataframe=train,
    directory=general_path + 'train_images',
    x_col='image_id',
    y_col='label',
    target_size=(img_height, img_width),
    batch_size=20,
    subset='validation',
    seed=12345)

## Adjust class balance.

In [None]:
# Relative frequency of each label in the training table (counts normalized
# so they sum to 1); the bare name displays the result in the notebook.
current_balance = train['label'].value_counts(normalize=True)
current_balance

In [None]:
# Per-class loss weights: (1 - class share) divided by (1 - smallest share),
# so the rarest class gets weight 1 and more frequent classes get less.
rarest_complement = 1 - current_balance.min()
class_weight = {
    class_index: (1 - current_balance[str(class_index)]) / rarest_complement
    for class_index in range(5)
}

class_weight

## Implement EarlyStopping 

In [None]:
# Stop training after 5 epochs without improvement in validation accuracy.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
# Write 'best_model.h5' only when validation loss reaches a new minimum.
# NOTE(review): stopping is driven by val_accuracy while the checkpoint is
# chosen by val_loss -- confirm the two criteria are meant to differ.
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)

In [None]:
# Build the classifier: an EfficientNetB4 backbone without pretrained weights
# and with average pooling, followed by a 5-way softmax layer.
model = Sequential()
# `learning_rate` is the supported keyword; the `lr` alias is deprecated.
optimizer = Adam(learning_rate=0.00105)
backbone = EfficientNetB4(include_top=False,
                          weights=None,
                          pooling='avg')
model.add(backbone)
model.add(Dense(5, activation='softmax'))
model.compile(loss="categorical_crossentropy",
              optimizer=optimizer,
              metrics=["accuracy"])
# `Model.fit` accepts generators directly; `fit_generator` is deprecated.
# Training runs for up to 50 epochs with the class weights and the
# early-stopping / checkpoint callbacks defined in earlier cells.
model.fit(train_datagen_flow,
          validation_data=valid_datagen_flow,
          epochs=50,
          class_weight=class_weight,
          callbacks=[early_stopping, mc],
          verbose=2)

## Load the best model saved during training (after EarlyStopping)

In [None]:
# Reload the checkpoint file that the ModelCheckpoint callback writes
# during training.
saved_model = load_model('best_model.h5')

# Submission

In [None]:
# Predict a label for every file in the test directory.
# Rows are collected in a list and converted to a DataFrame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
test_dir = general_path + 'test_images'
predictions = []
for image_name in os.listdir(test_dir):
    image_path = os.path.join(test_dir, image_name)
    image = tf.keras.preprocessing.image.load_img(image_path)
    # PIL's resize takes (width, height).
    resized_image = image.resize((img_width, img_height))
    # Add a leading batch axis so the model receives a batch of one image.
    numpied_image = np.expand_dims(resized_image, 0)
    tensored_image = tf.cast(numpied_image, tf.float32)
    # `predict_classes` is no longer available on Keras models; take the
    # argmax over the predicted class probabilities instead.
    class_probabilities = saved_model.predict(tensored_image)
    predictions.append({'image_id': image_name,
                        'label': int(np.argmax(class_probabilities, axis=-1)[0])})
# Passing `columns` keeps the expected header even if the directory is empty.
submission = pd.DataFrame(predictions, columns=['image_id', 'label'])

submission

In [None]:
# Write the predictions to the Kaggle working directory, without the
# DataFrame index column.
submission.to_csv('/kaggle/working/submission.csv', index=False)