# Simple CNN using keras

Create an algorithm that can identify a specific type of cactus in aerial imagery.  
Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.  

**Contents:**

1. View and load image data
2. Build and train CNN model
3. View model performance
4. Create predictions on test data

In [None]:
# Import modules

import os
import zipfile
import random
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import cv2
from tensorflow import keras

# from keras.applications.vgg16 import VGG16

from tensorflow.keras.layers import Conv2D, Dense, Flatten, Dropout, Activation
from tensorflow.keras.layers import BatchNormalization, Reshape, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Show which files ship with the competition dataset
input_files = os.listdir("../input/aerial-cactus-identification/")
print(input_files)

## Load labelled image data into generators

In [None]:
# Unpack the image archives by hand
# (zip archives are supposed to be extracted automatically, but that did not seem to happen here)

# The images go to the /kaggle/temp/ directory (instead of /kaggle/working/);
# otherwise submission.csv cannot be submitted and output visualisations of the images are generated instead

extraction_targets = [
    ("../input/aerial-cactus-identification/train.zip", "/kaggle/temp/"),
    # test images need to be in a subdirectory (i.e. test/test/) for flow_from_directory to work
    ("../input/aerial-cactus-identification/test.zip", "/kaggle/temp/test/"),
]
for zip_path, destination in extraction_targets:
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(destination)

# for dirname, _, filenames in os.walk("./train"):
#     for filename in filenames[:5]:
#         print(os.path.join(dirname, filename))

# Count the entries in the extracted train and test folders
for image_dir in ("../temp/train", "../temp/test/test"):
    print(len(os.listdir(image_dir)))

In [None]:
# Locations of the extracted images and the training labels

train_dir = "../temp/train"
test_dir = "../temp/test"

labels = pd.read_csv('../input/aerial-cactus-identification/train.csv')

# Classes must be str and not int, so cast the target column
labels['has_cactus'] = labels['has_cactus'].astype(str)

# Number of rows per class
class_counts = labels['has_cactus'].value_counts()
print(class_counts)

In [None]:
# Plot random sample of training images

# Display a single example image instead:
# Image(os.path.join(train_dir, labels.iloc[0,0]), width=250, height=250)

rand_images = random.sample(os.listdir(train_dir), 16)

fig = plt.figure(figsize=(16,4))
for i, image_name in enumerate(rand_images):
    plt.subplot(2, 8, i+1)
    # cv2.imread returns the channels in BGR order while plt.imshow expects RGB,
    # so convert before plotting; otherwise red and blue are swapped on screen.
    im = cv2.cvtColor(cv2.imread(os.path.join(train_dir, image_name)), cv2.COLOR_BGR2RGB)
    plt.imshow(im)
    plt.axis('off')
plt.show()

In [None]:
# Randomly split the labelled data into a training and a validation subset
# (sklearn.model_selection.train_test_split could be used instead)

# NOTE: despite its name, this is the fraction of rows that ends up in the *training* subset
validation_split = 0.8

n_samples = len(labels)
shuffled_positions = np.random.permutation(range(n_samples))
# Boolean mask: True where the shuffled position falls below the cut-off
idxs = shuffled_positions < validation_split*n_samples

train_labels = labels[idxs]
val_labels = labels[~idxs]
print(len(train_labels), len(val_labels))

In [None]:
# Process image JPEGs into tensors
# Pixel values rescaled from [0,255] to [0,1]

# Training batches: tensor image data with real-time data augmentation (horizontal and vertical flips)
train_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1/255, horizontal_flip=True, vertical_flip=True)

# Validation batches: rescaling only. Feeding randomly flipped images to the validation
# pass makes val_loss / val_acc fluctuate between epochs for reasons unrelated to the
# model, and those values drive EarlyStopping and ReduceLROnPlateau.
val_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1/255)

batch_size = 128

train_generator = train_datagen.flow_from_dataframe(train_labels,directory=train_dir,x_col='id',
                                                    y_col='has_cactus',class_mode='binary',batch_size=batch_size,
                                                    target_size=(32,32))
# shuffle=False: the validation images are evaluated in a fixed order
val_generator = val_datagen.flow_from_dataframe(val_labels,directory=train_dir,x_col='id',
                                                y_col='has_cactus',class_mode='binary',batch_size=batch_size,
                                                target_size=(32,32),shuffle=False)

## Build and train CNN model

In [None]:
# Define the CNN: four Conv2D + MaxPooling2D stages followed by a dense classifier head

input_shape = (32, 32, 3)

model = keras.models.Sequential([
    # Convolutional stages (32 -> 64 -> 128 -> 128 filters), each followed by 2x2 max pooling
    Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=128, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=128, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Classifier head: a single sigmoid unit for the binary target
    Flatten(),
    Dense(units=512, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

model.summary()

# Alternative model (incl. batch normalization, dropout and global average pooling)

# model = keras.models.Sequential()

# model.add(Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(MaxPooling2D())

# model.add(Conv2D(64, (3, 3), padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling2D())

# model.add(Conv2D(128, (3, 3), padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling2D())

# model.add(Conv2D(256, (3, 3), padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling2D())

# model.add(GlobalAveragePooling2D())

# model.add(Dense(256))
# model.add(Activation('relu'))
# model.add(Dropout(0.5))

# model.add(Dense(1))
# model.add(Activation('sigmoid'))

# model.summary()

In [None]:
# Configure training: loss, optimizer, metrics and callbacks

model.compile(optimizer='adam',
              loss=keras.losses.binary_crossentropy,
              metrics=['acc'])

# Stop training when the validation loss has stopped improving and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1, restore_best_weights=True)
# Reduce learning rate when a metric has stopped improving
reduce_lr = ReduceLROnPlateau(patience=10, verbose=1)

callbacks = [
    early_stopping,
    reduce_lr,
    # ModelCheckpoint(filepath='best_model.h5', monitor='val_loss',
    #                 verbose=0, save_best_only=True)
]

In [None]:
# Fit the model on the training generator, passing the validation generator as validation data

epochs = 100

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    callbacks=callbacks,
    verbose=1,
    # class_weight=class_weights,
)

In [None]:
# Report the best epochs (the best weights are restored automatically)

# The restored model is the one with the lowest validation loss,
# which is not necessarily the one with the best validation accuracy

val_loss_history = history.history['val_loss']
val_acc_history = history.history['val_acc']

# First the epoch with the highest validation accuracy, then the epoch with the lowest validation loss
for idx in (np.argmax(val_acc_history), np.argmin(val_loss_history)):
    print(val_loss_history[idx], val_acc_history[idx])

## Plot model performance

In [None]:
# Plot the training curves: accuracy on the left, loss on the right

perf_fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(16,4))
training_log = history.history

# Left panel: training vs. validation accuracy
acc_ax.plot(training_log['acc'], label='training accuracy')
acc_ax.plot(training_log['val_acc'], label="validation accuracy")
acc_ax.set_xlabel('# epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title("Accuracy evolution")
acc_ax.legend()
acc_ax.set_ylim(0.9, 1.01)

# Right panel: training vs. validation loss
loss_ax.plot(training_log['loss'], label='training loss')
loss_ax.plot(training_log['val_loss'], label="validation loss")
loss_ax.set_xlabel('# epochs')
loss_ax.set_ylabel('Loss - Binary Cross Entropy')
loss_ax.set_title("Loss evolution")
loss_ax.legend()
loss_ax.set_ylim(-0.01, 0.1)

plt.show()

## Make submission file with prediction on test data

In [None]:
# Run the trained model on the test images

# Same [0,255] -> [0,1] rescaling as for the labelled data, without augmentation
test_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1/255)

# No labels (class_mode=None) and no shuffling
test_generator = test_datagen.flow_from_directory(
    test_dir,
    class_mode=None,
    shuffle=False,
    batch_size=1,
    target_size=(32, 32),
)

probabilities = model.predict(test_generator)

In [None]:
# Create submission file

# The predictions follow the order in which flow_from_directory reads the test images,
# so the ids are taken from the generator itself (test_generator.filenames holds paths
# relative to test_dir) rather than assuming that the rows of sample_submission.csv
# happen to be in the same order.
predictions = pd.DataFrame({
    'id': [os.path.basename(name) for name in test_generator.filenames],
    # the model ends in Dense(1), so flatten the per-image output into a single column
    'has_cactus': np.ravel(probabilities),
})

# Keep the row order of the sample submission and make sure every id received a prediction
sample_submission = pd.read_csv('../input/aerial-cactus-identification/sample_submission.csv')
df = sample_submission[['id']].merge(predictions, on='id', how='left', validate='one_to_one')
if df['has_cactus'].isna().any():
    raise ValueError("Some ids in sample_submission.csv have no matching test image prediction")
df.to_csv("submission.csv", index=False)