In [None]:
import numpy as np
import pandas as pd
import os

# import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import tempfile
import pprint

from tqdm import tqdm
from PIL import Image, ImageOps
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import glob
import io
import os
import yaml

import IPython.display as display

%matplotlib inline

In [None]:
# Pick the tf.distribute strategy: TPU strategy when a TPU resolves, default otherwise.
try:
    # TPUClusterResolver() takes no arguments when the TPU_NAME environment
    # variable is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # Default TensorFlow strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Notebook-wide shortcuts used by the later cells.
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print("REPLICAS: ", REPLICAS)

In [None]:
from kaggle_datasets import KaggleDatasets

# Ask Kaggle for the GCS path of the competition dataset. The image paths used
# for training and prediction are built by appending '/images/<id>.jpg' to it.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('plant-pathology-2020-fgvc7')
GCS_DS_PATH

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.SEED``)."""

    # --- reproducibility / cross-validation ---
    SEED = 6718      # random_state of the KFold split and default seed of data_augment
    N_SPLITS = 5     # number of KFold folds

    # --- training ---
    epochs = 10
    batch_size = 32 * REPLICAS   # batch size passed to build_dataset; scales with replica count

    # --- images and paths ---
    IMG_SIZE = 512   # images are resized to IMG_SIZE x IMG_SIZE
    OUTPUT_DIR = ''
    IMG_PATH = '../input/plant-pathology-2020-fgvc7/images/'

In [None]:
# Load the sample submission. Every column after the first one is treated as a
# label column; `label_cols` is reused for the training targets and the output.
test_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/sample_submission.csv')
label_cols = list(test_df.columns[1:])

print(test_df.shape)
test_df.head()

In [None]:
def decode_img_from_file_path(path, label=None):
    """Read a JPEG file and return it as a resized float32 image tensor.

    Pixel values are left in the [0, 255] range on purpose: the
    ``tf.keras.applications`` EfficientNet models rescale and normalise their
    inputs internally and expect raw [0, 255] pixels, so dividing by 255 here
    would rescale the image twice.

    Args:
        path: Scalar string tensor with the file name of a JPEG image.
        label: Optional label that is passed through unchanged. It defaults to
            ``None`` so the function can also be mapped over a dataset that
            yields only paths (unlabelled data).

    Returns:
        ``(img, label)`` when ``label`` is given, otherwise just ``img``.
        ``img`` is a float32 tensor resized to ``CFG.IMG_SIZE x CFG.IMG_SIZE``
        with 3 channels.
    """
    file_bytes = tf.io.read_file(path)  # raw bytes of the file
    img = tf.image.decode_jpeg(file_bytes, channels=3)  # uint8, 3 channels
    img = tf.image.resize(img, [CFG.IMG_SIZE, CFG.IMG_SIZE])
    img = tf.cast(img, tf.float32)  # keep [0, 255]; the model does its own rescaling

    if label is None:
        return img
    return img, label


def data_augment(image, label=None, seed=CFG.SEED):
    """Randomly mirror ``image`` left-to-right and pass ``label`` through.

    Args:
        image: Image tensor to augment.
        label: Optional label returned unchanged next to the image.
        seed: Seed forwarded to ``tf.image.random_flip_left_right``; the default
            is ``CFG.SEED`` as evaluated when the function is defined.

    Returns:
        ``(image, label)`` if a label was supplied, otherwise only the image.
    """
    flipped = tf.image.random_flip_left_right(image, seed=seed)

    if label is not None:
        return flipped, label
    return flipped

In [None]:
def build_dataset(paths, labels=None, bsize=32, mode='train'):
    """Build a batched, prefetched ``tf.data`` pipeline of decoded images.

    Args:
        paths: Array-like of image file paths.
        labels: Optional array-like of labels aligned with ``paths``. When it is
            ``None`` the dataset yields paths only, which requires
            ``decode_img_from_file_path`` to be callable with a single argument.
        bsize: Batch size.
        mode: ``'train'`` enables shuffling and random augmentation; any other
            value (e.g. ``'valid'``, ``'test'``) yields the images in their
            original order without augmentation.

    Returns:
        A ``tf.data.Dataset`` of batches, prefetched with AUTOTUNE.
    """
    AUTO = tf.data.experimental.AUTOTUNE
    is_training = mode == 'train'
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)

    if is_training:
        # Shuffle the lightweight path/label records *before* decoding so the
        # shuffle buffer holds strings instead of up to 1024 decoded images.
        dset = dset.shuffle(1024)

    dset = dset.map(decode_img_from_file_path, num_parallel_calls=AUTO)

    if is_training:
        # Random flips are a training-time augmentation only; validation and
        # test data must stay deterministic so metrics and predictions are stable.
        dset = dset.map(data_augment, num_parallel_calls=AUTO)

    # prefetch overlaps data preprocessing with model execution
    return dset.batch(bsize).prefetch(AUTO)

In [None]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model


def create_model(input_shape, output_shape):
    """Build an ImageNet-pretrained EfficientNetB0 with a small dense head.

    The head is: global average pooling -> Dense(32, relu) ->
    Dense(output_shape, sigmoid).

    Note: the ``tf.keras.applications`` EfficientNet models include their own
    input rescaling and expect pixel values in the [0, 255] range.

    Args:
        input_shape: Shape of one input image, e.g. ``(height, width, 3)``.
        output_shape: Number of output units (one sigmoid per label).

    Returns:
        An uncompiled ``tf.keras`` ``Model``.
    """
    backbone = EfficientNetB0(weights='imagenet', include_top=False, input_shape=input_shape)

    pooled = GlobalAveragePooling2D()(backbone.output)
    hidden = Dense(32, activation="relu")(pooled)
    predictions = Dense(output_shape, activation="sigmoid")(hidden)

    return Model(inputs=backbone.input, outputs=predictions)

In [None]:
# Load the training table and give every row a 'kfold' id.
train = pd.read_csv("../input/plant-pathology-2020-fgvc7/train.csv")


y = train[label_cols].values

# NOTE(review): plain KFold does not use `y` when splitting (see the scikit-learn
# docs), so the folds are shuffled but not stratified by label.
kf = KFold(n_splits=CFG.N_SPLITS,random_state=CFG.SEED, shuffle=True)
for fold, (train_idx, valid_idx) in enumerate(kf.split(train,y)):
    # Rows in `valid_idx` are the validation rows of fold `fold`. The positional
    # indices match the .loc labels as long as `train` keeps the default index
    # produced by read_csv above.
    train.loc[valid_idx, 'kfold'] = fold

# The column is created by the .loc assignments above; cast it to int.
train['kfold'] = train['kfold'].astype(int)
train.head()

In [None]:
# Persist the fold assignment; train_loop reads 'folds.csv' back from disk.
train.to_csv('folds.csv', index=False)

In [None]:
def train_loop(fold):
    """Train the model on one cross-validation fold.

    Rows of ``folds.csv`` whose ``kfold`` equals ``fold`` form the validation
    set; all other rows form the training set. The best model (lowest
    ``val_loss``) is written to ``fold-<fold>.h5`` by the checkpoint callback.

    Args:
        fold: Integer id of the fold to hold out for validation.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.

    Raises:
        ValueError: If ``fold`` leaves the training or the validation split
            empty (e.g. a fold id that does not occur in ``folds.csv``).
    """
    train_df = pd.read_csv("folds.csv")
    train_fold = train_df[train_df.kfold != fold]
    valid_fold = train_df[train_df.kfold == fold]

    # Fail early with a clear message instead of fitting without validation data
    # (every callback below monitors 'val_loss').
    if train_fold.empty or valid_fold.empty:
        raise ValueError(
            "fold=%r must select both training and validation rows; "
            "folds.csv contains folds %s" % (fold, sorted(train_df.kfold.unique()))
        )

    X_train_idx = train_fold['image_id'].values
    X_valid_idx = valid_fold['image_id'].values

    # Element-wise string concatenation on the id arrays gives one path per image.
    train_paths = GCS_DS_PATH + '/images/' + X_train_idx + '.jpg'
    valid_paths = GCS_DS_PATH + '/images/' + X_valid_idx + '.jpg'

    y_train = train_fold[label_cols].values
    y_valid = valid_fold[label_cols].values

    train_ds = build_dataset(train_paths, y_train, bsize=CFG.batch_size, mode='train')
    valid_ds = build_dataset(valid_paths, y_valid, bsize=CFG.batch_size, mode='valid')

    # Divide the learning rate by 10 after 2 epochs without val_loss improvement.
    rlr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 2, verbose = 0,
                            min_delta = 1e-4, min_lr = 1e-6, mode = 'min')

    # Keep only the best model of this fold on disk.
    ckp = ModelCheckpoint('fold-%i.h5'%fold, monitor = 'val_loss',
                          verbose = 0, save_best_only = True, mode = 'min')

    # Stop after 5 epochs without improvement and roll back to the best weights.
    es = EarlyStopping(monitor = 'val_loss', min_delta = 1e-4, patience = 5, mode = 'min',
                       restore_best_weights = True, verbose = 0)

    # Model creation and compilation must happen inside the strategy scope.
    with strategy.scope():
        model = create_model(input_shape = (CFG.IMG_SIZE, CFG.IMG_SIZE, 3), output_shape = len(label_cols))
        model.compile(optimizer = "Adam", loss = "binary_crossentropy", metrics = ["AUC"])

    history = model.fit(train_ds,
                        validation_data=valid_ds,
                        epochs=CFG.epochs,
                        callbacks=[rlr,es,ckp])

    return history

### Training

In [None]:
# Train a single fold and save its per-epoch metrics to 'history_fold<fold>.csv'.
fold = 0

history = train_loop(fold)
hist = pd.DataFrame(history.history)  # one row per epoch, one column per metric
hist.to_csv(f'history_fold{fold}.csv', index=False)

In [None]:
# Preview the first epochs of the training history.
hist.head()

### Prediction

In [None]:
# Build the test pipeline for the fold whose checkpoint will be loaded.
fold = 0
sub_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/sample_submission.csv')


X_test_idx = sub_df['image_id'].values
test_paths = GCS_DS_PATH + '/images/' + X_test_idx + '.jpg'

# The label columns of the sample submission are passed along as labels so the
# dataset has the same (image, label) structure as in training.
# NOTE(review): these are presumably placeholder values, not ground truth - confirm.
y_test = sub_df[label_cols].values

test_ds = build_dataset(test_paths, y_test, bsize=CFG.batch_size, mode='test')

In [None]:
# Rebuild the same architecture inside the strategy scope and load the weights
# from the 'fold-<fold>.h5' file written during training.
with strategy.scope():
    model = create_model(input_shape = (CFG.IMG_SIZE, CFG.IMG_SIZE, 3), output_shape = len(label_cols))
    model.compile(optimizer = "Adam", loss = "binary_crossentropy", metrics = ["AUC"])
    print('Loading best model...')
    model.load_weights('fold-%i.h5'%fold)

In [None]:
# Run inference over the test dataset and show the shape of the prediction array.
test_preds = model.predict(test_ds, verbose=1)
test_preds.shape

In [None]:
# Write the predictions into the label columns of the submission frame.
# Assumes the row order of test_preds matches sub_df (the test dataset is built
# from sub_df's image ids).
sub_df[label_cols] = test_preds
sub_df.head()

In [None]:
# Save the submission file without the DataFrame index.
sub_df.to_csv('submission.csv', index=False)