### What's the problem
Train a model on a set of labelled images so that it can determine whether a given picture contains a cactus.
### Review point
* Preparation of CNN input data
* Basic CNN for **yes or no** problems

In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


In [None]:
# imports used in this project

# keras
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint
# plotting
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [None]:
def _read_csv_as_text(csv_path):
    """Load a CSV file with every column read as a string."""
    return pd.read_csv(csv_path, dtype="str")


# Load the CSV tables. All columns are kept as strings
# (presumably because the binary image generators below expect string labels).
train_data = _read_csv_as_text("/kaggle/input/aerial-cactus-identification/train.csv")
test_data = _read_csv_as_text("/kaggle/input/aerial-cactus-identification/sample_submission.csv")

### Understand the data

In [None]:
# train csv: preview the first rows of the training table
# (the `id` and `has_cactus` columns are the ones used by the cells below)
train_data.head()

In [None]:
# unzip the images
!mkdir "/kaggle/data/"
!mkdir "/kaggle/data/aerial-cactus-identification"
!unzip "/kaggle/input/aerial-cactus-identification/train.zip" -d "/kaggle/data/aerial-cactus-identification/train" > /dev/null
!unzip "/kaggle/input/aerial-cactus-identification/test.zip" -d "/kaggle/data/aerial-cactus-identification/test" > /dev/null

train_dir = "/kaggle/data/aerial-cactus-identification/train/train"
test_dir = "/kaggle/data/aerial-cactus-identification/test/test"

In [None]:
# Show a 4x5 grid of training images:
# slots 1-10 hold images labelled as cactus, slots 11-20 images labelled as no cactus.
images_with_cactus = train_data[train_data.has_cactus == '1'][:10]
images_without_cactus = train_data[train_data.has_cactus == '0'][:10]
for first_slot, sample in ((1, images_with_cactus), (11, images_without_cactus)):
    for i in range(10):
        plt.subplot(4, 5, first_slot + i)
        img_data = mpimg.imread(train_dir + '/' + sample.iloc[i]["id"])
        plt.imshow(img_data)

### Create image data using [ImageDataGenerator](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator)

In [None]:
# create images data
# Augmenting generator: used for the training images only.
datagen = ImageDataGenerator(rescale=1./255,# make each pixel 0~1
    rotation_range=20,# rotate the images
    width_shift_range=0.2, # shift the images
    height_shift_range=0.2, # shift the images
    horizontal_flip=True) # flip the images
# Plain generator: only rescales the pixels. Validation and test images must
# not be randomly rotated/shifted/flipped, otherwise the validation metrics
# are noisy and the test predictions change from run to run.
eval_datagen = ImageDataGenerator(rescale=1./255)

# number of leading rows used for training; the remaining rows are used for validation
n_train = 15000

# training data (rows 0 .. n_train-1, so no row is shared with the validation split)
train_generator = datagen.flow_from_dataframe(
        dataframe=train_data[:n_train],
        directory=train_dir,
        x_col='id',
        y_col='has_cactus',
        shuffle=True,
        class_mode='binary',# yes or no problem
        batch_size=150,
        target_size=(32, 32))
# validating data (rows n_train .. end)
validation_generator = eval_datagen.flow_from_dataframe(
        dataframe=train_data[n_train:],
        directory=train_dir,
        x_col='id',
        y_col='has_cactus',
        class_mode='binary',
        batch_size=50,
        target_size=(32, 32))
# testing data
test_generator = eval_datagen.flow_from_dataframe(
        dataframe=test_data,
        directory=test_dir,
        x_col='id',
        y_col='has_cactus',
        class_mode=None,# no target value
        shuffle=False,# do not mess the image order
        target_size=(32, 32)
)

### Create CNN model

In [None]:
def _conv_bn_relu(filters, kernel_size, **conv_kwargs):
    """Return one Conv2D -> BatchNormalization -> ReLU group as a list of layers.

    The convolution uses "same" padding; extra keyword arguments
    (e.g. ``input_shape``) are forwarded to ``Conv2D``.
    """
    return [
        keras.layers.Conv2D(filters, kernel_size, padding="same", **conv_kwargs),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
    ]


# Convolutional part. Shapes in the comments are height*width*channels.
feature_layers = (
    # 1st group: 32*32*3 becomes 32*32*32
    _conv_bn_relu(32, (5, 5), input_shape=(32, 32, 3))
    # 2nd group + pooling: 32*32*32 becomes 16*16*32
    + _conv_bn_relu(32, (3, 3))
    + [keras.layers.MaxPooling2D()]
    # 3rd group: 16*16*32 becomes 16*16*64
    + _conv_bn_relu(64, (3, 3))
    # 4th group + pooling: 16*16*64 becomes 8*8*64
    + _conv_bn_relu(64, (3, 3))
    + [keras.layers.MaxPooling2D(padding="same")]
    # 5th group + pooling: 8*8*64 becomes 4*4*128
    + _conv_bn_relu(128, (3, 3))
    + [keras.layers.MaxPooling2D(padding="same")]
)

# Fully connected part: 4*4*128 is flattened to 2048, then 1024 -> 128 -> 1.
classifier_layers = [
    keras.layers.Flatten(),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.2),
    # a single sigmoid unit outputs the cactus probability
    keras.layers.Dense(1, activation="sigmoid"),
]

model = keras.models.Sequential(feature_layers + classifier_layers)
# binary cross-entropy matches the single sigmoid output
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# uncomment to print a summary of the model
#model.summary()

### Fitting the model

In [None]:
# callback for each epoch
# Weights loaded for prediction further down (produced by an earlier training run).
model_path = '/kaggle/input/review-of-cactus-recognition-basic-cnn/best_model.h5'
# Where checkpoints are written during training. /kaggle/input is read-only,
# so checkpoints have to go to the writable working directory.
# NOTE: after retraining, point model_path at this file to use the new weights.
checkpoint_path = '/kaggle/working/best_model.h5'
callbacks = [
    # save the model whenever the validation loss improves
    ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True),
    # stop when the validation loss has not improved for 20 epochs
    EarlyStopping(monitor='val_loss',patience=20)
]

In [None]:
# training the model
# training was done in version 4; this version loads the saved weights instead of retraining
# model.fit_generator(
#     train_generator,
#     epochs=100,
#     validation_data=validation_generator,
#     callbacks=callbacks,
#     shuffle=True# shuffle data to get more stable fitting
# )

### Get test results

In [None]:
# load the saved weights and predict a cactus probability for every test image
model.load_weights(model_path)
# The model ends in a single sigmoid unit, so the prediction has shape (n_images, 1).
# Flatten it to 1-D so that each row explicitly receives one scalar probability
# instead of relying on pandas' implicit handling of a 2-D array.
res = model.predict_generator(test_generator).ravel()
test_data["has_cactus"] = res
# write the submission file (no index column)
test_data.to_csv("/kaggle/working/submission.csv",index=False)

In [None]:
# preview the first test rows; has_cactus holds the values assigned from the model's predictions
test_data.head()

In [None]:
# Show a 4x5 grid of randomly drawn test images:
# slots 1-10 were predicted as cactus (score >= 0.5), slots 11-20 as no cactus (score < 0.5).
images_with_cactus = np.random.choice(test_data[test_data.has_cactus >= 0.5]['id'], 10)
images_without_cactus = np.random.choice(test_data[test_data.has_cactus < 0.5]['id'], 10)
for first_slot, picked in ((1, images_with_cactus), (11, images_without_cactus)):
    for i in range(10):
        plt.subplot(4, 5, first_slot + i)
        img_data = mpimg.imread(test_dir + '/' + picked[i])
        plt.imshow(img_data)

### Well, some of the images are still misclassified...