In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install tensorflow-gpu==2.0.0-alpha0
# !pip install tensorflow-gpu==2.4.1

In [None]:
# %tensorflow_version 2.x
import tensorflow as tf

# Look up the GPU device name reported by TensorFlow and echo it.
device_name = tf.test.gpu_device_name()
print(device_name)

# Stop right away unless the expected first GPU was reported; later cells
# place their work on the GPU explicitly.
if not device_name == '/device:GPU:0':
    raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

In [None]:
# Display the version string of the imported TensorFlow package.
tf.__version__

In [None]:
# from tensorflow.python.client import device_lib
# device_lib.list_local_devices()

In [None]:
# !cat /proc/meminfo

> # Include packages and libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from keras_preprocessing.image import ImageDataGenerator

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

> # About the data

## Read file data 

In [None]:
# Load the competition's training table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/plant-pathology-2021-fgvc8/train.csv')
df.head(n=5)

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
df.info()

The data contains 2 columns.  
The first column, '**image**', holds the *image file names*; the second column, '**labels**', holds the labels of each image in the dataset.  
There are 18632 images and 18632 labels, so there are no null values in this dataset.

> # Process the data

## Visualize the data

In [None]:
# Bar chart of how often each raw label string occurs.
# The Axes is kept in a variable and the figure is shown explicitly; wrapping
# the plot call in print() only wrote the Axes object's text repr to the output.
label_ax = df['labels'].value_counts().plot.bar()
label_ax.set_title('Images per label combination')
label_ax.set_xlabel('labels')
label_ax.set_ylabel('number of images')
plt.show()

## **NOTE:**  
### As we can see here, there are 12 labels, but some labels are combinations of other labels.  
### So there are actually 5 diseases, which are:  
* rust
* scab
* complex
* frog_eye_leaf_spot
* powdery_mildew

### And another label is:  
* healthy

In the description of the challenge, it is said that "**Unhealthy leaves with too many diseases to classify visually will have the complex class, and may also have a subset of the diseases identified.**"  
But in the visualization of the data above, the label 'complex' also appears together with 'rust', 'frog_eye_leaf_spot' and 'powdery_mildew'.  
So I suppose the 'complex' label is not a combination of the remaining labels; it is an independent label that can be combined with other labels. 

### Because one image (leaf) can have multiple diseases, this task is a multi-label classification problem!!!

Convert the 'labels' column of the dataset from strings to lists, so that each image's entry contains all of its labels. 

In [None]:
# Turn each space-separated label string into a list of individual labels.
# Values that are already lists are passed through unchanged, so re-running
# this cell does not fail with AttributeError ('list' has no 'split').
df['labels'] = df['labels'].apply(
    lambda value: value.split(' ') if isinstance(value, str) else value
)
df.head()

Continuing to process the data, I use MultiLabelBinarizer to convert all the labels into a pandas DataFrame named '**data**'.  
In this '**data**' table each disease label is a column; if an image (leaf) has that disease, the value of its cell in that column is 1, otherwise 0.   

In [None]:
# Encode the per-image label lists as a 0/1 indicator table, one column per
# distinct label, aligned with the rows of `df`.
_labels = df['labels'].tolist()
mlb = MultiLabelBinarizer()
data = pd.DataFrame(mlb.fit_transform(_labels), columns=mlb.classes_, index=df.index)

# Column sums = number of images carrying each label.
label_totals = data.sum()
print(label_totals)

labels = label_totals.index.tolist()
print(labels)
label_counts = label_totals.values.tolist()

fig, ax = plt.subplots(1, 1, figsize=(20, 6))
sns.barplot(x=labels, y=label_counts, ax=ax)

# Write each bar's count centred on top of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_height, int(bar_height), ha='center')

plt.title('THE AMOUNT OF EACH LABEL')

In [None]:
# Put the image file names in front of the indicator columns.
# The membership check keeps the cell safe to re-run: inserting with
# allow_duplicates=True a second time would add another 'image' column and
# shift every positional slice taken from `data` afterwards.
if 'image' not in data.columns:
    data.insert(0, 'image', df['image'])
data

Now we have a table of the images' names and their disease indicators.  
Let's get the targets of all images in the dataset.

In [None]:
# Collect the multi-hot target rows (every column except 'image') in a single
# vectorised conversion instead of one `iloc` lookup per row. Selecting by
# column name rather than by position keeps the result independent of where
# the 'image' column sits.
target = data.drop(columns=['image']).values.tolist()

len(target), target[:5]

In [None]:
# Convert the list of per-image target rows into a NumPy array and preview
# the first five rows.
target = np.array(target)
target[:5]

## Using Image Data Generator to load the image data from directory

Call an ImageDataGenerator

In [None]:
# Generator configuration shared by the training and validation loaders:
# a rescale factor of 1/255 and a validation split of 0.2.
image_generator = ImageDataGenerator(
    rescale=1.0 / 255,
    validation_split=0.2,
    # preprocessing_function=tf.keras.applications.vgg16.preprocess_input,
)

Define important arguments

In [None]:
# Settings consumed by the data generators and the model's input layer.
HEIGHT = 32 # image height used for target_size / input_shape (alternatives noted by the author: 128, 64)
WIDTH = 32 # image width used for target_size / input_shape (alternatives noted by the author: 128, 64)
SEED = 42 # seed passed to both flow_from_dataframe calls
BATCH_SIZE = 32 # number of images per generated batch

Create a train_generator and a validation_generator to load the images from the train_images directory for training and validation.

In [None]:
# Inspect the frame that is handed to the generators.
# At this point the 'labels' column holds lists, which are unhashable and
# cannot be counted directly, so each list is joined back into its
# space-separated string before counting. String values pass through as-is.
label_keys = df['labels'].apply(
    lambda value: value if isinstance(value, str) else ' '.join(value)
)
type(df), label_keys.value_counts()

In [None]:
# Arguments common to both loaders; only `subset` differs between them.
_flow_kwargs = dict(
    dataframe=df,
    directory='../input/plant-pathology-2021-fgvc8/train_images',
    x_col='image',
    y_col='labels',
    batch_size=BATCH_SIZE,
    seed=SEED,
    class_mode='categorical',
    target_size=(HEIGHT, WIDTH),
    shuffle=True,
)

with tf.device('/GPU:0'):
    # Training portion of the validation_split configured on image_generator.
    train_generator = image_generator.flow_from_dataframe(subset='training', **_flow_kwargs)
    # Held-out portion of the same split.
    validation_generator = image_generator.flow_from_dataframe(subset='validation', **_flow_kwargs)

Create a CNN model

In [None]:
# Small CNN for multi-label classification.
#
# The generators are fed list-valued labels, so every target row is a
# multi-hot vector with one slot per individual class (not one slot per label
# combination). The output layer therefore has one unit per generator class,
# each with its own sigmoid, and the loss is binary cross-entropy. A fixed
# 12-way softmax with categorical cross-entropy would neither match the target
# width nor allow several classes to be active at once.
NUM_CLASSES = len(train_generator.class_indices)

with tf.device('/GPU:0'):
    model = Sequential()
    # Three conv / max-pool stages extract features from the HEIGHT x WIDTH RGB input.
    model.add(Conv2D(64, kernel_size=4, activation='relu', input_shape=(HEIGHT, WIDTH, 3)))
    model.add(MaxPooling2D(2, 2))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(2, 2))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D(2, 2))
    # Classifier head: flatten, regularise with dropout, one sigmoid per class.
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

# with tf.device('/gpu:0'):
#     model = Sequential()
#     model.add(Conv2D(64, kernel_size=4, activation='relu', input_shape=(HEIGHT, WIDTH, 3)))
#     model.add(MaxPooling2D(2,2))
#     model.add(Conv2D(64, kernel_size=4, activation='relu'))
#     model.add(MaxPooling2D(2,2))
#     model.add(Dropout(0.5))
#     model.add(Conv2D(128, kernel_size=4, activation='relu'))
#     model.add(MaxPooling2D(2,2))
#     model.add(Conv2D(128, kernel_size=4, activation='relu'))
#     model.add(MaxPooling2D(2,2))
#     model.add(Dropout(0.5))
#     model.add(Conv2D(256, kernel_size=4, activation='relu'))
#     model.add(MaxPooling2D(2,2))
#     model.add(Conv2D(256, kernel_size=4, activation='relu'))
#     model.add(MaxPooling2D(2,2))
#     model.add(Flatten())
#     model.add(Dropout(0.5))
# #     model.add(Dense(512, activation='relu'))
#     model.add(Dense(6, activation='softmax'))

#     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])

# Print the layer-by-layer summary of the model built above.
model.summary()


Compile the model

In [None]:
# model.compile(
#     optimizer='adam',
#     loss='categorical_crossentropy',
#     metrics=['accuracy'])
# model.summary()

Define arguments 

In [None]:
# Number of batches needed to see every sample once per epoch.
# The quotient is rounded up to a whole number so the final, partially filled
# batch is counted and the values are valid integer step counts (plain true
# division yields floats).
TRAIN_STEP_SIZE = int(np.ceil(train_generator.samples / train_generator.batch_size))
VALIDATION_STEP_SIZE = int(np.ceil(validation_generator.samples / validation_generator.batch_size))

In [None]:
# tf.test.is_gpu_available(), tf.test.gpu_device_name()

Fit the model

In [None]:
# Train for 5 epochs on the GPU, validating on the held-out generator.
# `Model.fit` accepts generators directly; `fit_generator` is deprecated in
# TensorFlow 2.x. Step counts are left to be inferred from the generators.
with tf.device('/gpu:0'):
    model_history = model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=5,
    )
                                
            

In [None]:
# Display the version string of the imported TensorFlow package.
tf.__version__