In [None]:
'''# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'''

In [None]:
import os
import sys, math
import zipfile
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import numpy as np
from kaggle_datasets import KaggleDatasets

In [None]:
# detect and init the TPU
# TPUClusterResolver() is called without arguments, so the TPU to use is
# auto-detected — presumably from the Kaggle runtime environment.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
# Later cells build/load the model inside `tpu_strategy.scope()`.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Resolve the storage path of the "plant-pathology-train-tfrecords" dataset;
# the *.tfrec files are globbed from this path further down.
GCS_PATH = KaggleDatasets().get_gcs_path("plant-pathology-train-tfrecords")

In [None]:
# Show the resolved dataset path as a quick sanity check.
print(GCS_PATH)

In [None]:
# Image size as [height, width]; the same numbers appear in the model's
# input_shape=(1365, 2048, 3).
TARGET_SIZE = [1365,2048]
AUTO = tf.data.experimental.AUTOTUNE # used in tf.data.Dataset API
#GCS_PATTERN = '../input/plant-pathology-2020-fgvc7/images/Train_*.jpg'
# NOTE(review): GCS_OUTPUT and SHARDS are not referenced anywhere else in the
# visible notebook — presumably left over from the TFRecord-writing step.
GCS_OUTPUT = './'  # prefix for output file names
SHARDS = 16

def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature`` (``bytes_list``)."""
    bytes_list = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=bytes_list)

def _int_feature(list_of_ints):
    """Wrap a list of integers in a ``tf.train.Feature`` (``int64_list``)."""
    int64_list = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=int64_list)

def _float_feature(list_of_floats):
    """Wrap a list of floats in a ``tf.train.Feature`` (``float_list``, float32)."""
    float_list = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_list)

In [None]:
def read_tfrecord(example, target_size=None):
    """Parse one serialized TFRecord example into a resized image plus metadata.

    Args:
        example: scalar string tensor holding one serialized example with the
            fields "image" (JPEG bytes), "label" (4 int64 values) and
            "size" (2 int64 values: height, width).
        target_size: optional ``[height, width]`` to resize the image to.
            Defaults to the module-level ``TARGET_SIZE``.

    Returns:
        Tuple ``(image, label, height, width)``: the image resized with
        Lanczos-3 interpolation (a float tensor as produced by
        ``tf.image.resize``; pixel values are NOT rescaled here), the 4-element
        int64 label vector, and the original height and width stored in the
        record.
    """
    if target_size is None:
        # Resolved at call time so the function uses the notebook-wide
        # constant instead of shadowing it with a private copy.
        target_size = TARGET_SIZE

    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        # additional (not very useful) fields to demonstrate TFRecord writing/reading of different types of data
        "label": tf.io.FixedLenFeature([4], tf.int64),  # 4 integers
        "size": tf.io.FixedLenFeature([2], tf.int64),  # two integers
    }
    # decode the TFRecord
    example = tf.io.parse_single_example(example, features)

    # FixedLenFeature fields are now ready to use: example['size']
    # VarLenFeature fields require additional sparse_to_dense decoding
    image = tf.image.decode_jpeg(example['image'], channels=3)

    label  = example['label']
    height = example['size'][0]
    width  = example['size'][1]

    # Re-assert the stored dimensions on the decoded image before resizing.
    image = tf.reshape(image, [height, width, 3])

    image = tf.image.resize(image, target_size, method=tf.image.ResizeMethod.LANCZOS3)
    return image, label, height, width
    
# read from TFRecords. For optimal performance, read from multiple
# TFRecord files at once and set the option experimental_deterministic = False
# to allow order-altering optimizations.

option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

filenames = tf.io.gfile.glob(GCS_PATH+"/*.tfrec")
print(filenames)
if not filenames:
    # Fail loudly here rather than building an empty dataset that only
    # surfaces as a confusing error (or a hang) once training starts.
    raise FileNotFoundError("No .tfrec files found under " + GCS_PATH)

dataset4 = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
dataset4 = dataset4.with_options(option_no_order)
# Shuffle the *serialized* records before decoding. Shuffling after the map
# would keep 300 decoded 1365x2048x3 float32 images (~32 MiB each, ~10 GB in
# total) in the shuffle buffer; the compressed records are far smaller and the
# resulting element order is randomised in exactly the same way.
dataset4 = dataset4.shuffle(300)
dataset4 = dataset4.map(read_tfrecord, num_parallel_calls=AUTO)

# Peek at one decoded element as a sanity check.
for x in dataset4:
    print(x)
    break

In [None]:
# Keep only the (image, label) pair; the original height/width returned by
# read_tfrecord are dropped here.
final_dataset = dataset4.map(lambda img, lbl, _height, _width: (img, lbl))

# Peek at a single (image, label) element.
for x in final_dataset.take(1):
    print(x)

In [None]:
def data_augment(image, labels):
    """Randomly augment one image and divide its pixel values by 255.

    Data augmentation runs inside the tf.data pipeline. Thanks to the
    dataset.prefetch(AUTO) call applied to the pipeline, this happens
    essentially for free on TPU: data pipeline code is executed on the "CPU"
    part of the TPU while the TPU itself is computing gradients.

    Args:
        image: image tensor, as produced by the upstream dataset.
        labels: label tensor; passed through unchanged.

    Returns:
        Tuple ``(image, labels)`` where the image has been randomly flipped
        left/right, had its saturation scaled by a random factor between
        0 and 2, and finally been divided by 255.
    """
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_saturation(image, 0, 2)
    # Scalar divisor: same result as the former one-element list [255],
    # without relying on broadcasting of a rank-1 tensor.
    image = tf.math.divide(image, 255.0)
    return image, labels

# Augment, repeat indefinitely, and prefetch the next elements while training
# (autotuned prefetch buffer size).
# NOTE: this rebinds `final_dataset` from itself, so running this cell twice
# applies data_augment (including the division by 255) twice.
final_dataset = (
    final_dataset
    .map(data_augment, num_parallel_calls=AUTO)
    .repeat()
    .prefetch(AUTO)
)

# Peek at a single augmented (image, label) element.
for x in final_dataset.take(1):
    print(x)

def to_float32(image, label,x,y):
    """Cast image and label to float32; the extra arguments are ignored.

    ``x`` and ``y`` receive the remaining two elements of a ``dataset4``
    item (height and width) and are discarded.
    """
    print(image.get_shape())
    image_f32 = tf.cast(image, tf.float32)
    label_f32 = tf.cast(label, tf.float32)
    return image_f32, label_f32

# NOTE(review): `training_dataset` is not referenced again in the visible
# notebook.
training_dataset = dataset4.map(to_float32)

base_dir = '../input/plant-pathology-2020-fgvc7/images/'
train_csv = pd.read_csv('../input/plant-pathology-2020-fgvc7/train.csv')
test_csv = pd.read_csv('../input/plant-pathology-2020-fgvc7/test.csv')

# Shuffle the rows before splitting. The fixed seed makes the train/validation
# split reproducible across "Restart & Run All".
SPLIT_SEED = 42
#train_csv = train_csv.astype(str)
train_csv = train_csv.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
print(len(train_csv.index))

# The image files on disk carry a .jpg extension that the CSV ids lack.
train_csv["image_id"] = train_csv["image_id"]+'.jpg'
test_csv["image_id"] = test_csv["image_id"]+'.jpg'

# Hold out the last 20% of the shuffled rows for validation.
split_index = int(len(train_csv.index)*0.2)
# Split on a positive position: with negative slicing a split_index of 0
# would put every row in the validation set and none in the training set.
train_count = len(train_csv.index) - split_index
# .copy() gives both frames their own data, so later column assignments do
# not raise SettingWithCopyWarning.
val_csv = train_csv.iloc[train_count:].copy()
print(len(train_csv.index), train_count)
train_csv = train_csv.iloc[:train_count].copy()
print(len(train_csv.index),len(val_csv.index),split_index)
train_csv.head(10)
#train_dir = os.path.join(base_dir, 'train')
#validation_dir = os.path.join(base_dir, 'validation')

# Directory with our training cat pictures
#train_cats_dir = os.path.join(train_dir, 'cats')

# Directory with our training dog pictures
#train_dogs_dir = os.path.join(train_dir, 'dogs')

# Directory with our validation cat pictures
#validation_cats_dir = os.path.join(validation_dir, 'cats')

# Directory with our validation dog pictures
#validation_dogs_dir = os.path.join(validation_dir, 'dogs')

In [None]:
'''train_csv = train_csv.astype(str)
val_csv = val_csv.astype(str)
print(train_csv.iloc[5,2],type(train_csv.iloc[5,2]))'''

In [None]:
'''mask = train_csv["healthy"] == '1'
train_csv.loc[mask,"healthy"] = 1'''

In [None]:
'''print(train_csv.iloc[2,1],type(train_csv.iloc[2,1]))'''

# Make sure the four label columns are integers in both frames.
# astype() with a dtype mapping returns a new frame, so rebinding the names
# avoids column assignment on `val_csv` (a slice of `train_csv`), which would
# trigger pandas' SettingWithCopyWarning.
label_dtypes = {
    "multiple_diseases": int,
    "rust": int,
    "scab": int,
    "healthy": int,
}
train_csv = train_csv.astype(label_dtypes)
val_csv = val_csv.astype(label_dtypes)

In [None]:
def createAndCompileModel(input_shape=(1365, 2048, 3), num_classes=4):
    """Build and compile the convolutional classifier.

    The network is a stack of seven Conv2D + MaxPooling2D pairs followed by
    Flatten, Dropout(0.2), Dense(32, relu) and a sigmoid output layer.

    Args:
        input_shape: ``(height, width, channels)`` of the input images.
            Defaults to ``(1365, 2048, 3)``.
        num_classes: number of units in the sigmoid output layer.
            Defaults to 4.

    Returns:
        A compiled ``tf.keras`` Sequential model using binary cross-entropy
        loss, RMSprop with an exponentially decaying learning rate, and the
        'accuracy' metric.
    """
    # (filters, kernel size, stride, layer name). The names are kept exactly
    # as before (there is no "conv2D_5") — presumably saved weights refer to
    # them; verify before renaming.
    conv_specs = [
        (512, 9, 2, "conv2D_1"),
        (246, 7, 1, "conv2D_2"),
        (128, 7, 1, "conv2D_3"),
        (128, 5, 1, "conv2D_4"),
        (90, 3, 1, "conv2D_6"),
        (64, 3, 1, "conv2D_7"),
        (32, 3, 1, "conv2D_8"),
    ]

    layers = []
    for index, (filters, kernel, stride, name) in enumerate(conv_specs):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {"input_shape": input_shape} if index == 0 else {}
        layers.append(tf.keras.layers.Conv2D(
            filters, (kernel, kernel),
            strides=(stride, stride),
            activation='relu',
            name=name,
            **first_layer_kwargs))
        layers.append(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

    layers.extend([
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='sigmoid'),
    ])
    model = tf.keras.models.Sequential(layers)

    # Learning rate starts at 1e-4 and is multiplied by 0.96 every
    # 12000 optimizer steps (staircase, i.e. in discrete jumps).
    lr_schedule = ExponentialDecay(
        1e-4,
        decay_steps=12000,
        decay_rate=0.96,
        staircase=True)

    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(learning_rate=lr_schedule, momentum = 1e-4),
                  metrics=['accuracy'])
    return model

# instantiating the model in the strategy scope creates the model on the TPU
# NOTE(review): a later cell rebinds `model` to a network loaded from
# "../input/plant-pathology-2020-tpu/my_modelv5.h5", so this freshly built
# model is only the one trained if that cell is skipped.
with tpu_strategy.scope():
    model = createAndCompileModel() # define your model normally

In [None]:
# Number of replicas the distribution strategy keeps in sync.
print(tpu_strategy.num_replicas_in_sync)

# All images will be rescaled by 1./255
_rescale_factor = 1./255

# Random geometric augmentations, applied to the training images only.
_train_augmentation = dict(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(rescale=_rescale_factor, **_train_augmentation)

# Validation and test images are only rescaled, never augmented.
val_datagen = ImageDataGenerator(rescale=_rescale_factor)
test_datagen = ImageDataGenerator(rescale=_rescale_factor)
'''
# Flow training images in batches of 20 using train_datagen generator
train_generator = train_datagen.flow_from_directory(
        train_dir,  # This is the source directory for training images
        target_size=(150, 150),  # All images will be resized to 150x150
        batch_size=20,
        # Since we use binary_crossentropy loss, we need binary labels
        class_mode='binary')

# Flow validation images in batches of 20 using test_datagen generator
validation_generator = test_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=20,
        class_mode='binary')
'''
batchSize = 4


def _flow_from_frame(datagen, frame, labelled=True, shuffle=True):
    """Create a dataframe-backed image iterator with the notebook's settings.

    Args:
        datagen: the ImageDataGenerator to draw batches from.
        frame: DataFrame with an 'image_id' column of file names relative to
            ``base_dir`` (plus the four label columns when ``labelled``).
        labelled: when True, yield ``(images, labels)`` batches with the raw
            label columns; when False, yield image batches only.
        shuffle: whether the iterator shuffles the rows.

    Returns:
        The iterator returned by ``datagen.flow_from_dataframe``.
    """
    return datagen.flow_from_dataframe(
        frame,
        directory=base_dir,
        x_col='image_id',
        y_col=['healthy','multiple_diseases','rust','scab'] if labelled else [],
        # Must match the image size the model is trained on. The former
        # (750, 750) did not match the 1365x2048 images of the TFRecord
        # pipeline, so predicting from these generators would fail on shape.
        target_size=tuple(TARGET_SIZE),
        batch_size=batchSize,
        class_mode='raw' if labelled else None,
        shuffle=shuffle,
    )


# NOTE(review): feeding Python generators to a model placed on a TPU strategy
# may not be supported by every TensorFlow version — confirm before relying
# on these for predict()/evaluate().
train_generator = _flow_from_frame(train_datagen, train_csv)
validation_generator = _flow_from_frame(val_datagen, val_csv)
# Unlabelled and unshuffled, so predictions stay in the row order of test_csv.
test_generator = _flow_from_frame(test_datagen, test_csv, labelled=False, shuffle=False)

In [None]:
# Load a previously saved model inside the strategy scope. This rebinds
# `model`, replacing the network returned by createAndCompileModel() earlier.
with tpu_strategy.scope():
    model = tf.keras.models.load_model("../input/plant-pathology-2020-tpu/my_modelv5.h5")

In [None]:
# Print the layer-by-layer architecture of the current `model`.
model.summary()

In [None]:
'''
Epoch 45/45 #1000x1000 input
364/364 [==============================] - 695s 2s/step - loss: 0.1818 - accuracy: 0.8809 - val_loss: 0.2045 - val_accuracy: 0.8819

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_27 (Conv2D)           (None, 994, 994, 246)     36408     
_________________________________________________________________
max_pooling2d_26 (MaxPooling (None, 497, 497, 246)     0         
_________________________________________________________________
conv2d_28 (Conv2D)           (None, 493, 493, 246)     1513146   
_________________________________________________________________
max_pooling2d_27 (MaxPooling (None, 246, 246, 246)     0         
_________________________________________________________________
conv2d_29 (Conv2D)           (None, 242, 242, 128)     787328    
_________________________________________________________________
max_pooling2d_28 (MaxPooling (None, 121, 121, 128)     0         
_________________________________________________________________
conv2d_30 (Conv2D)           (None, 119, 119, 128)     147584    
_________________________________________________________________
max_pooling2d_29 (MaxPooling (None, 59, 59, 128)       0         
_________________________________________________________________
conv2d_31 (Conv2D)           (None, 57, 57, 90)        103770    
_________________________________________________________________
max_pooling2d_30 (MaxPooling (None, 28, 28, 90)        0         
_________________________________________________________________
conv2d_32 (Conv2D)           (None, 26, 26, 90)        72990     
_________________________________________________________________
max_pooling2d_31 (MaxPooling (None, 13, 13, 90)        0         
_________________________________________________________________
conv2d_33 (Conv2D)           (None, 11, 11, 64)        51904     
_________________________________________________________________
max_pooling2d_32 (MaxPooling (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_34 (Conv2D)           (None, 3, 3, 32)          18464     
_________________________________________________________________
max_pooling2d_33 (MaxPooling (None, 1, 1, 32)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               4224      
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 516       
=================================================================
Total params: 2,736,334
Trainable params: 2,736,334
Non-trainable params: 0
_________________________________________________________________

'''

In [None]:
'''
Epoch 30/30
364/364 [==============================] - 534s 1s/step - loss: 0.2676 - accuracy: 0.8259 - val_loss: 0.3444 - val_accuracy: 0.7967

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 994, 994, 246)     36408     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 497, 497, 246)     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 493, 493, 128)     787328    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 246, 246, 128)     0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 244, 244, 128)     147584    
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 122, 122, 128)     0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 120, 120, 90)      103770    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 60, 60, 90)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 58, 58, 64)        51904     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 29, 29, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 53824)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               6889600   
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 516       
=================================================================
Total params: 8,017,110
Trainable params: 8,017,110
Non-trainable params: 0
_________________________________________________________________

'''

In [None]:
'''
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 998, 998, 246)     6888      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 499, 499, 246)     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 497, 497, 128)     283520    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 248, 248, 128)     0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 246, 246, 128)     147584    
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 123, 123, 128)     0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 121, 121, 64)      73792     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 60, 60, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 60, 60, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 230400)            0         
_________________________________________________________________
dense (Dense)                (None, 512)               117965312 
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 516       
=================================================================
Total params: 118,658,348
Trainable params: 118,658,348
Non-trainable params: 0
_________________________________________________________________
'''

In [None]:
#trainX = np.load("../input/notebookd1c779b8a4/trainDataXNumpy.npy")
#trainY = np.load("../input/notebookd1c779b8a4/trainDataYNumpy.npy")
#valX = np.load("../input/notebookd1c779b8a4/valDataXNumpy.npy")
#valY = np.load("../input/notebookd1c779b8a4/valDataYNumpy.npy")

# Peek at a single element of the training dataset before fitting.
for x in final_dataset.take(1):
    print(x)

In [None]:
# Four times the number of replicas in sync, displayed as the cell output —
# presumably a candidate global batch size (4 images per replica).
4 * tpu_strategy.num_replicas_in_sync

In [None]:
# Number of training images; used only to derive the steps per epoch, since
# the repeated dataset has no length of its own.
TRAIN_IMAGE_COUNT = 1821
batchSize = 32

# drop_remainder=True gives every batch a statically known size. The dataset
# is repeated indefinitely, so no data is actually dropped.
final_dataset2 = final_dataset.batch(batchSize, drop_remainder=True)
final_dataset2 = final_dataset2.prefetch(AUTO)
TRAIN_STEPS = TRAIN_IMAGE_COUNT // batchSize
print("TRAINING IMAGES: ", TRAIN_IMAGE_COUNT, ", STEPS PER EPOCH: ", TRAIN_STEPS)

# The dataset already yields batches, so `batch_size` must not be passed to
# fit() (some TensorFlow versions raise a ValueError), and `shuffle` is
# ignored for dataset inputs. Batching and shuffling are done in the pipeline.
history = model.fit(
      final_dataset2,
      steps_per_epoch=TRAIN_STEPS,
      epochs=40)
#Epoch 100/100
#8/8 [==============================] - 58s 7s/step - loss: 0.1785 - accuracy: 0.8737 - val_loss: 0.4360 - val_accuracy: 0.6486
#Epoch 100/100
#8/8 [==============================] - 59s 7s/step - loss: 0.1752 - accuracy: 0.8579 - val_loss: 0.3450 - val_accuracy: 0.7143
# val_accuracy: 0.75 maybe
#Epoch 99/100
#29/29 [==============================] - 75s 3s/step - loss: 0.2268 - accuracy: 0.9360 - val_loss: 0.5634 - val_accuracy: 0.7543
#Epoch 5/50
#91/91 [==============================] - 80s 874ms/step - loss: 0.4055 - accuracy: 0.6225 - val_loss: 0.4009 - val_accuracy: 0.6108
#Epoch 100/100 #500x500 input
#91/91 [==============================] - 154s 2s/step - loss: 0.1259 - accuracy: 0.9181 - val_loss: 0.1227 - val_accuracy: 0.9460
#Epoch 55/55 #1000x1000 input
#364/364 [==============================] - 470s 1s/step - loss: 0.2441 - accuracy: 0.8293 - val_loss: 0.2559 - val_accuracy: 0.8819
#Epoch 30/30 #1000x1000 input
#364/364 [==============================] - 534s 1s/step - loss: 0.2676 - accuracy: 0.8259 - val_loss: 0.3444 - val_accuracy: 0.7967
#Epoch 45/45 #1000x1000 input
#364/364 [==============================] - 695s 2s/step - loss: 0.1818 - accuracy: 0.8809 - val_loss: 0.2045 - val_accuracy: 0.8819

In [None]:
# Save the current `model` to the working directory as an .h5 file.
model.save('./my_modelv6.h5')

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training metrics recorded by model.fit().
acc = history.history['accuracy']
loss = history.history['loss']
# Validation curves only exist when fit() was given validation data, so they
# are looked up optionally instead of being commented in and out by hand.
val_acc = history.history.get('val_accuracy')
val_loss = history.history.get('val_loss')

epochs = range(len(acc))

# Accuracy figure.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'bo', label='Training accuracy')
if val_acc is not None:
    ax_acc.plot(epochs, val_acc, 'b', label='Validation accuracy')
ax_acc.set_title('Training accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
# Each figure needs its own legend call; previously only the loss figure
# showed one.
ax_acc.legend()

# Loss figure.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training Loss')
if val_loss is not None:
    ax_loss.plot(epochs, val_loss, 'b', label='Validation Loss')
ax_loss.set_title('Training loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

In [None]:
'''test_csv = pd.read_csv('../input/plant-pathology-2020-fgvc7/test.csv')
test_csv["image_id"] = test_csv["image_id"] + '.jpg'
test_csv.head()'''

In [None]:
'''test_generator = test_datagen.flow_from_dataframe(
        test_csv,
        directory = base_dir,
        x_col = 'image_id',
        y_col = ['healthy','multiple_diseases','rust','scab'],
        target_size=(150, 150),
        batch_size=200,
        class_mode='raw'
        )
'''

# Predict the four class scores for every test image.
p = model.predict(test_generator)
print(p)

# Re-read the ids without the '.jpg' suffix for the submission file. A
# separate name is used so `test_csv` (with suffixed file names) stays valid
# if the generator cell is re-run.
submission_ids = pd.read_csv('../input/plant-pathology-2020-fgvc7/test.csv')

# concat aligns on the index; a length mismatch (e.g. the generator skipped
# missing files) would otherwise silently produce NaN rows in the submission.
if len(p) != len(submission_ids.index):
    raise ValueError(
        "Got %d predictions for %d test rows" % (len(p), len(submission_ids.index)))

p_df = pd.DataFrame(p,columns=['healthy','multiple_diseases','rust','scab'])
result = pd.concat([submission_ids, p_df], axis=1)

result.to_csv("./submission.csv",index = False)

result.head()

In [None]:
'''p[:5]'''

In [None]:
#model = tf.keras.models.load_model('../input/notebook45bc751087/my_modelv4.h5')

In [None]:
#model.evaluate(validation_generator)