In [None]:
import pandas as pd
import numpy as np
import cv2
import os
import re
from PIL import Image
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import seaborn as sns # plotting
import random

## Future works
* Correct the label imbalance (using augmentation)
* Add grayscale augmentation
* Resize images efficiently (without using the Keras package)
* Adjust the CutMix code
* Use the latest model architecture

## Data loading

In [None]:
# Dataset root: the training images and the label CSV both live under it.
path = '../input/plant-pathology-2021-fgvc8'
dir_train = os.path.join(path, 'train_images')

# train.csv is the label table; later cells read its 'image' and 'labels' columns.
train_df = pd.read_csv(
    os.path.join(path, 'train.csv'),
)
train_df.head(n=5)

In [None]:
# Both values are columns of the same frame, so the two shapes are equal by construction.
print('Number of images: {}'.format(train_df.loc[:, 'image'].shape))
print('Number of labels: {}'.format(train_df.loc[:, 'labels'].shape))

The number of images matches the number of labels.<br>
There are no missing labels.

In [None]:
# Preview the row with index label 3: print its label string, then draw the image.
print('Label of image: {}'.format(train_df.loc[3, 'labels']))
fig, ax = plt.subplots(nrows=1)
ax.imshow(plt.imread(os.path.join(dir_train, train_df.loc[3, 'image'])))
plt.show()

#### Label Check

In [None]:
# NOTE: despite its name, image_ids holds the distinct label strings, not image ids.
image_ids = train_df['labels'].unique()
# One row per image, so the row count is the image count.
print("Total number of images = ",train_df.shape[0])
print("Number of Unique labels = ",len(image_ids))

In [None]:
# Images per raw label string, most frequent first.
train_df['labels'].value_counts(sort=True, ascending=False)

In [None]:
#fig, ax = plt.subplots(1,1, figsize=(8,30))
#categories = train_df['labels'].unique()
#plt.pie(train_df['labels'].value_counts())
#plt.legend(categories, loc='best')
#plt.show()

#### Encoding the labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct label string to an integer id and store it in a new
# 'label' column; the fitted encoder is kept so ids can be mapped back later.
encoder = LabelEncoder()
train_df['label'] = encoder.fit_transform(train_df['labels'])
train_df.head()

In [None]:
# Bar chart of how many images carry each integer-encoded label.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
label_x = train_df['label'].value_counts()
# seaborn >= 0.12 accepts x/y as keywords only; the old positional call raises
# a TypeError there. Passing ax= draws on the figure created above.
sns.barplot(x=label_x.index, y=label_x.values, ax=ax)
ax.set(xlabel='Encoded label', ylabel='Number of images',
       title='Images per encoded label')
plt.show()

In [None]:
# Frequency table keyed by the raw label string.
labels_num = train_df.loc[:, 'labels'].value_counts()
print('Number of label: \n{}'.format(labels_num))

In [None]:
# Frequency table keyed by the integer-encoded label (used by the CutMix demo below).
label_num = train_df.loc[:, 'label'].value_counts()
print('Number of label: \n{}'.format(label_num))

#### Check image size
I have already checked the image sizes.<br>
All images are the same size: 2672x4000.

In [None]:
#sample_img = (os.path.join(dir_train,(train_df['image'][3])))
#image = cv2.imread(sample_img, cv2.IMREAD_COLOR)
## convert imreaded image BGR to RGB
#image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
#image /= 255.0

## Model Setting

In [None]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SequentialSampler

#### Cutmix (Ongoing)

In [None]:
def generate_cutmix(df, img_dir, beta, n_label):
    """Paste a shrunken copy of one random image onto the corner of another.

    Two distinct row positions are drawn from the first ``n_label`` rows of
    ``df``. The second image is resized to ``beta`` times the height and width
    of the first image and written over the first image's top-left corner.

    Args:
        df: DataFrame with an 'image' column of file names.
        img_dir: Directory that contains those image files.
        beta: Fraction of each side of the base image covered by the patch;
            must satisfy ``0 < beta <= 1``.
        n_label: Number of leading rows to sample from. Values larger than
            ``len(df)`` are clamped to ``len(df)``.

    Returns:
        The first image as an RGB array with the resized second image pasted
        into its top-left corner.

    Raises:
        ValueError: If ``beta`` is outside ``(0, 1]`` or fewer than two rows
            are available to sample from.
        FileNotFoundError: If OpenCV cannot read one of the selected files.
    """
    # Validate first: beta <= 0 would request an empty resize and beta > 1
    # would make the patch larger than the region it is pasted into.
    if not 0 < beta <= 1:
        raise ValueError('beta must be in (0, 1], got {}'.format(beta))

    # Never sample positions past the end of the frame.
    pool = min(int(n_label), len(df))
    if pool < 2:
        raise ValueError(
            'need at least 2 rows to sample from, got {}'.format(pool))
    im1, im2 = random.sample(range(0, pool), 2)

    images = []
    for position in (im1, im2):
        # .iloc: the sampled values are row positions, not index labels.
        img_path = os.path.join(img_dir, df['image'].iloc[int(position)])
        bgr = cv2.imread(img_path)
        if bgr is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                'could not read image: {}'.format(img_path))
        images.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    img1, img2 = images

    # Patch height/width; keep at least one pixel so cv2.resize stays valid.
    lam1 = max(1, int(beta * img1.shape[0]))
    lam2 = max(1, int(beta * img1.shape[1]))
    # cv2.resize takes dsize as (width, height).
    img2 = cv2.resize(img2, dsize=(lam2, lam1), interpolation=cv2.INTER_AREA)
    img1[:lam1, :lam2, :] = img2
    return img1
# Work-in-progress CutMix demo. The pool size passed as n_label is the number
# of images whose encoded label is 1, so only that many leading rows are sampled.
new_img = generate_cutmix(df=train_df, img_dir=dir_train, beta=0.4,
                          n_label=label_num[1])
fig, ax = plt.subplots(nrows=1)
ax.imshow(new_img)
plt.show()

## Prepare data

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the file name and the integer-encoded label.
dfs = train_df.loc[:, ['image', 'label']]

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training pipeline: scale pixels to [0, 1] and augment with random horizontal
# flips and zoom; 20% of the rows are reserved for validation.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    zoom_range=0.2,
    validation_split=0.2,
)

# Validation pipeline: same scaling and same split fraction, no augmentation.
test_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
)

In [None]:
# Rebinds dir_train to a separate dataset of pre-resized images (presumably
# 256x256, going by the directory name -- TODO confirm).
dir_train = '../input/resized-plant2021/img_sz_256'
# The Keras generators below take the raw label strings, not the encoded ids.
dfs = train_df.loc[:, ['image', 'labels']]
dfs

In [None]:
# Settings shared by the training and validation iterators; only the data
# generator and the subset differ between the two.
_flow_args = dict(
    dataframe=dfs,
    directory=dir_train,
    x_col='image',
    y_col='labels',
    target_size=(256, 256),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=128,
)

# Augmented batches drawn from the training part of the split.
train_generator = train_datagen.flow_from_dataframe(subset='training',
                                                    **_flow_args)

# Non-augmented batches drawn from the held-out validation part.
test_generator = test_datagen.flow_from_dataframe(subset='validation',
                                                  **_flow_args)

## Build model (Keras)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout
from tqdm import tqdm
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Baseline CNN for 256x256 RGB inputs: two Conv -> BatchNorm -> MaxPool ->
# Dropout stages followed by a small dense head ending in a 12-way softmax.
model = tf.keras.Sequential()

# Stage 1 (declares the input shape for the whole network).
model.add(Conv2D(32, (3,3), activation='relu', input_shape=[256, 256, 3]))
model.add(BatchNormalization())
model.add(MaxPooling2D(2,2))
model.add(Dropout(0.3))

# Stage 2.
model.add(Conv2D(32, (3,3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2,2))
model.add(Dropout(0.3))

# A third stage (Conv2D(32) -> BatchNorm -> MaxPool -> Dropout(0.2)) and a
# Dropout(0.5) in the head are currently disabled.

# Classifier head.
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(12, activation='softmax'))

In [None]:
# Multi-class setup: one-hot targets from the generators with cross-entropy loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Stop training once the default monitored quantity (val_loss) has not
# improved for 3 consecutive epochs.
earlystop = EarlyStopping(patience=3)

# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy'. The previous monitor name 'val_acc' never appears
# in the logs, which meant the learning rate was never reduced. mode='max' is
# explicit because higher accuracy is better.
lr_schedule = ReduceLROnPlateau(monitor='val_accuracy', mode='max', patience=2,
                                verbose=1, factor=0.5, min_lr=0.0001)
callbacks = [earlystop, lr_schedule]

In [None]:
# Train for at most 10 epochs; the callbacks may stop earlier or lower the LR.
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=10,
    callbacks=callbacks,
)

## Test and Predict

In [None]:
import os

# Same dataset root as in the data-loading section, re-declared for this section.
path = '../input/plant-pathology-2021-fgvc8'
dir_test = os.path.join(path, 'test_images')

In [None]:
# Inference pipeline: pixel scaling only, no augmentation and no split.
test_datagen = ImageDataGenerator(rescale = 1./255)
# Treat 'test_images' as the single "class" folder under the dataset root.
# shuffle=False is essential: the default (True) yields batches in random
# order, so predictions could not be matched back to file names.
# class_mode=None yields images only, since the test set has no labels.
# target_size is spelled out to match the model's 256x256 input.
test_set = test_datagen.flow_from_directory(path,
                                            classes=['test_images'],
                                            target_size=(256, 256),
                                            class_mode=None,
                                            shuffle=False)

In [None]:
# Class-probability matrix: one row per test image, one column per class.
output = model.predict(test_set)
# Show floats with three decimals when arrays are printed.
np.set_printoptions(formatter={'float': "{0:0.3f}".format})
print(test_generator.class_indices)
# Index of the most probable class for every image.
y_output = output.argmax(axis=1)

In [None]:
# Invert the generator's {label string: class index} mapping so that predicted
# indices can be translated back into label strings.
lbs = {class_id: class_name
       for class_name, class_id in test_generator.class_indices.items()}
print(lbs)

preds = [lbs[class_id] for class_id in y_output]
print(preds[:10])

#### Export to csv

In [None]:
# Use the provided sample submission as the template for the output file.
path = '../input/plant-pathology-2021-fgvc8'
test_df = pd.read_csv(os.path.join(path, 'sample_submission.csv'))
# Work on an independent copy so the template itself stays untouched.
pred_df = test_df.copy(deep=True)
pred_df.head()

In [None]:
# NOTE(review): this assumes preds is in the same row order as
# sample_submission.csv -- confirm against the order the test iterator yields.
pred_df = pred_df.assign(labels=preds)
pred_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
pred_df.to_csv(path_or_buf='submission.csv', index=False)