In [None]:
import os
import cv2
import pandas as pd
from tqdm.notebook import tqdm


# Root directory of the competition input data.
DATA_PATH = '/kaggle/input/cassava-leaf-disease-classification'

# Directory whose files are listed and read by the TFRecord-writing cell.
JPEG_PATH = os.path.join(DATA_PATH, 'train_images')
# Not referenced by the visible cells; the makedirs call that would create
# this directory is commented out below.
JPEG_SAVE_PATH = '/kaggle/train_images_jpeg'

# CSV loaded into train_df at the end of this cell.
CSV_PATH = os.path.join(DATA_PATH, 'train.csv')

# Side length (pixels) that every image is resized to before JPEG encoding.
RESIZE = 227
# Number of images written into each TFRecord file (the name is misspelled,
# but other cells reference it under this exact spelling).
NUM_TFREDORDS = 1338
# JPEG quality passed to cv2.imencode when re-encoding resized images.
IMG_QUALITY = 95
# Not referenced by the visible cells.
DEBUG = False


#os.makedirs(JPEG_SAVE_PATH, exist_ok=True)
# Training table; its 'label' column is read when writing TFRecords.
train_df = pd.read_csv(CSV_PATH)

## TFRecords

In [None]:
import math
import tensorflow as tf
import matplotlib.pyplot as plt

# Training image file names in deterministic (lexicographic) order.
files = os.listdir(JPEG_PATH)
files.sort()

# Number of TFRecord files needed to hold every image when each file takes
# at most NUM_TFREDORDS of them (the last file may be smaller).
num_iter = math.ceil(len(files) / NUM_TFREDORDS)


def _bytes_feature(value):
  """Wrap a string / bytes value (or an eager tensor holding one) in a tf.train.Feature."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack an EagerTensor, so pull the raw value out first.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)


def _int64_feature(value):
  """Wrap a single bool / enum / int / uint value in a tf.train.Feature (int64_list)."""
  int_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int_list)


def serialize_example(feature0, feature1, feature2):
  """Serialize one training record into a tf.train.Example byte string.

  Args:
    feature0: encoded image bytes, stored under the key 'image'.
    feature1: image file name as bytes, stored under 'image_name'.
    feature2: integer label, stored under 'target'.

  Returns:
    The serialized tf.train.Example.
  """
  features = tf.train.Features(feature={
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
      'target': _int64_feature(feature2),
  })
  return tf.train.Example(features=features).SerializeToString()



# Look labels up by file name rather than by row position, so a mismatch
# between the sorted directory listing and the CSV row order cannot silently
# attach the wrong label to an image.
# Assumes train.csv has an 'image_id' column holding the image file names.
label_by_image = train_df.set_index('image_id')['label']

for i in range(num_iter):
    # Files belonging to shard i; the final shard may hold fewer images.
    start = i * NUM_TFREDORDS
    shard_files = files[start:start + NUM_TFREDORDS]
    cnt = len(shard_files)
    tf_filename = f'ld_train{str(i).zfill(2)}-{cnt}.tfrec'
    print(f'Writing TFRecord: {i} {tf_filename}')

    with tf.io.TFRecordWriter(tf_filename) as wf:
        for img_id in shard_files:
            img_path = os.path.join(JPEG_PATH, img_id)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise IOError(f'Could not read image: {img_path}')
            img = cv2.resize(img, (RESIZE, RESIZE))

            # No BGR->RGB conversion here: cv2.imencode expects OpenCV's BGR
            # layout and writes a standard JPEG, which decodes as RGB.
            ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))
            if not ok:
                raise IOError(f'Could not JPEG-encode image: {img_path}')

            # tobytes() replaces the deprecated ndarray.tostring().
            img_bytes = encoded.tobytes()
            target = int(label_by_image[img_id])

            example = serialize_example(img_bytes, img_id.encode(), target)
            wf.write(example)

### image checking

In [None]:
import numpy as np


def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 tensor scaled by 1/255."""
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    return tf.cast(pixels, tf.float32) / 255.0


def parse_example(example):
    """Parse one serialized record into (image, image_name, target).

    The image is decoded through decode_image; the name and target are
    returned as the parsed scalar tensors.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_name': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['image_name'], parsed['target']


def display_one(image, title, target, subplot, red=False, titlesize=16):
    """Draw one image into the given subplot slot and return the next slot.

    Args:
        image: image data handed to plt.imshow.
        title, target: rendered together as the subplot title "<title>: <target>".
        subplot: (rows, cols, index) triple selecting the slot to draw into.
        red, titlesize: accepted for interface compatibility; currently unused.

    Returns:
        (rows, cols, index + 1), i.e. the slot for the following image.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    plt.title(f'{title}: {target}')
    n_rows, n_cols, slot = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, slot + 1)


def display_batch_of_images(databatch):
    """Plot a batch of images in a square-ish grid, one subplot per image.

    Args:
        databatch: (images, labels, targets) triple. Each element may be an
            eager tensor (anything exposing ``.numpy()``) or an already
            materialised sequence. ``labels`` and ``targets`` may be None.
            Byte-string labels are decoded as UTF-8 for the title.

    Images that do not fit into the rows x cols grid are dropped. An empty
    batch is a no-op.
    """
    images, labels, targets = databatch

    def _materialize(batch):
        # Convert tensors to numpy; leave None and plain sequences untouched.
        return batch.numpy() if hasattr(batch, 'numpy') else batch

    images = _materialize(images)
    labels = _materialize(labels)
    targets = _materialize(targets)

    if len(images) == 0:
        # Nothing to draw; also avoids a zero-row grid (division by zero).
        return

    if labels is None:
        labels = [None] * len(images)
    if targets is None:
        targets = [None] * len(images)

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows
    shown = rows * cols

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.2
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))

    # magic formula tested to work from 1x1 to 10x10 images (loop-invariant)
    dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3

    # display
    for image, label, target in zip(images[:shown], labels[:shown], targets[:shown]):
        title = label.decode('utf-8') if isinstance(label, bytes) else label
        subplot = display_one(image, title, target, subplot, False, titlesize=dynamic_titlesize)

    # layout
    plt.tight_layout()
    plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Flatten(input_shape=(128,128,3)),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(5)
# ])

# model.compile(optimizer='adam',
#               loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#               metrics=['accuracy'])

def alexnet(in_shape=(227,227,3), n_classes=5, opt='sgd'):
    """Build and compile an AlexNet-style classifier.

    The network here has four convolution layers (96, 256, 384, 256 filters)
    with three max-pooling stages, followed by two 4096-unit dense layers with
    dropout and a softmax output.

    Args:
        in_shape: shape of one input image, (height, width, channels).
        n_classes: number of output classes.
        opt: optimizer (name or instance) passed to ``model.compile``.

    Returns:
        The compiled ``tf.keras`` model, using sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    in_layer = tf.keras.layers.Input(in_shape)
    conv1 = tf.keras.layers.Conv2D(96, 11, strides=4, activation='relu')(in_layer)
    pool1 = tf.keras.layers.MaxPool2D(3, 2)(conv1)
    conv2 = tf.keras.layers.Conv2D(256, 5, strides=1, padding='same', activation='relu')(pool1)
    pool2 = tf.keras.layers.MaxPool2D(3, 2)(conv2)
    conv3 = tf.keras.layers.Conv2D(384, 3, strides=1, padding='same', activation='relu')(pool2)
    conv4 = tf.keras.layers.Conv2D(256, 3, strides=1, padding='same', activation='relu')(conv3)
    pool3 = tf.keras.layers.MaxPool2D(3, 2)(conv4)
    flattened = tf.keras.layers.Flatten()(pool3)
    dense1 = tf.keras.layers.Dense(4096, activation='relu')(flattened)
    drop1 = tf.keras.layers.Dropout(0.5)(dense1)
    dense2 = tf.keras.layers.Dense(4096, activation='relu')(drop1)
    drop2 = tf.keras.layers.Dropout(0.5)(dense2)
    preds = tf.keras.layers.Dense(n_classes, activation='softmax')(drop2)

    model = tf.keras.models.Model(in_layer, preds)
    # The output layer already applies softmax, so the loss receives
    # probabilities, not logits; from_logits must therefore be False.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])
    return model

model = alexnet()
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()

In [None]:

# Writing TFRecord: 0 ld_train00-1338.tfrec
# Writing TFRecord: 1 ld_train01-1338.tfrec
# Writing TFRecord: 2 ld_train02-1338.tfrec
# Writing TFRecord: 3 ld_train03-1338.tfrec
# Writing TFRecord: 4 ld_train04-1338.tfrec
# Writing TFRecord: 5 ld_train05-1338.tfrec
# Writing TFRecord: 6 ld_train06-1338.tfrec
# Writing TFRecord: 7 ld_train07-1338.tfrec
# Writing TFRecord: 8 ld_train08-1338.tfrec
# Writing TFRecord: 9 ld_train09-1338.tfrec
# Writing TFRecord: 10 ld_train10-1338.tfrec
# Writing TFRecord: 11 ld_train11-1338.tfrec
# Writing TFRecord: 12 ld_train12-1338.tfrec
# Writing TFRecord: 13 ld_train13-1338.tfrec
# Writing TFRecord: 14 ld_train14-1338.tfrec
# Writing TFRecord: 15 ld_train15-1327.tfrec

# image_list=[]
# target_list=[]
# tfrec_list = [
#                 "ld_train00-1338.tfrec",
#                 "ld_train01-1338.tfrec",
#                 "ld_train02-1338.tfrec",
#                 "ld_train03-1338.tfrec",
#                 "ld_train04-1338.tfrec",
#                 "ld_train05-1338.tfrec",
#                 "ld_train06-1338.tfrec",
#                 "ld_train07-1338.tfrec",
#                 "ld_train08-1338.tfrec",
#                 "ld_train09-1338.tfrec",
#                 "ld_train10-1338.tfrec",
#                 "ld_train11-1338.tfrec",
#                 "ld_train12-1338.tfrec",
#                 "ld_train13-1338.tfrec",
#                 "ld_train14-1338.tfrec",
#                 "ld_train15-1327.tfrec"
# ]


    
# Read every TFRecord file written above, in file order.
dataset = tf.data.TFRecordDataset([
    "ld_train00-1338.tfrec",
    "ld_train01-1338.tfrec",
    "ld_train02-1338.tfrec",
    "ld_train03-1338.tfrec",
    "ld_train04-1338.tfrec",
    "ld_train05-1338.tfrec",
    "ld_train06-1338.tfrec",
    "ld_train07-1338.tfrec",
    "ld_train08-1338.tfrec",
    "ld_train09-1338.tfrec",
    "ld_train10-1338.tfrec",
    "ld_train11-1338.tfrec",
    "ld_train12-1338.tfrec",
    "ld_train13-1338.tfrec",
    "ld_train14-1338.tfrec",
    "ld_train15-1327.tfrec",
])
dataset = dataset.map(parse_example)
dataset = dataset.batch(10000)  #1338 #21397

# Only the first batch (at most 10000 examples) is pulled into memory.
data = iter(dataset)
images, labels, targets = next(data)

# Drop references that are no longer needed; image names are not used later.
del data, dataset, labels

print(len(images))
#images = images.numpy()
#labels = labels.numpy()
#targets = targets.numpy()


#model.fit(images.numpy(), targets.numpy(), epochs=10)


# for resize_file in tfrec_list:
#     if resize_file == "ld_train15-1327.tfrec":
#         dataset = tf.data.TFRecordDataset([resize_file]).map(parse_example).batch(1327) #1327
#     else:
#         dataset = tf.data.TFRecordDataset([resize_file]).map(parse_example).batch(1338) #1338

#     data = iter(dataset)
#     images, labels, targets = next(data)
#     print(resize_file, len(images))
#     images = images.numpy()
#     labels = labels.numpy()
#     targets = targets.numpy()

#     for i in images:
#         image_list.append(i)

#     for i in targets:
#         target_list.append(i)



In [None]:
# Train for a single epoch on the batch of images/targets loaded in the
# previous cell.
model.fit(images, targets, epochs=1)
#model.fit(images, targets, epochs=10)

In [None]:
image_path = DATA_PATH + "/test_images/" + "2216849948.jpg"
# target_size is (height, width); the channel count is not part of it.
image = tf.keras.preprocessing.image.load_img(image_path, target_size=(RESIZE, RESIZE))
input_arr = tf.keras.preprocessing.image.img_to_array(image)
# The model was trained on pixels scaled to [0, 1] (see decode_image), while
# img_to_array yields values in 0..255, so apply the same scaling here.
input_arr = input_arr / 255.0
input_arr = np.expand_dims(input_arr, axis=0)  # Convert single image to a batch.
predictions = model.predict(input_arr)


In [None]:
# Show the raw per-class scores, then the index of the highest-scoring class
# for each row of the batch (displayed as the cell's output).
print(predictions)
predictions.argmax(axis=1)
#model.predict_classes(input_arr)

In [None]:
import pandas as pd
# The label map is a JSON document, so parse it as JSON rather than CSV.
# typ="series" assumes a flat {label number: disease name} object -- confirm
# against the file if its structure differs.
path = DATA_PATH+"/label_num_to_disease_map.json"
data = pd.read_json(path, typ="series")
data

In [None]:
import pandas as pd

# Load the sample submission file and print its column / dtype summary.
# Note: this rebinds the name `data` used by earlier cells.
data = pd.read_csv(DATA_PATH+"/sample_submission.csv")
data.info()

In [None]:
# Build a one-row submission frame and write it without the index column.
# NOTE(review): the label is hard-coded to 4; the commented-out line below
# would use the model's prediction instead -- confirm which is intended.
#df = pd.DataFrame({'image_id': ['2216849948.jpg'], 'label': [np.argmax(predictions, axis=1)[0]] })
df = pd.DataFrame({'image_id': ['2216849948.jpg'], 'label': 4 })
df.to_csv("submission.csv",index=False)
df.info()

In [None]:
# Read the written submission back and display it to check its contents.
data = pd.read_csv("submission.csv")
data

In [None]:
# Write the same frame again, this time without index=False, so the row index
# is written as an extra leading column; then read it back and display it.
# NOTE(review): looks like an experiment comparing against submission.csv --
# confirm whether this cell is still needed.
df.to_csv("submission2.csv")
data2 = pd.read_csv("submission2.csv")
data2