In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
from keras_preprocessing.image import ImageDataGenerator
from zipfile import ZipFile

# Data extraction

In [None]:
# Root folder of the competition files; later cells append archive names to it.
path = "/kaggle/input/aerial-cactus-identification/"

# Read every column as text: later cells index the label counts with the
# strings '0' and '1'.
files_dataframe = pd.read_csv(os.path.join(path, "train.csv"), dtype=str)
files_dataframe.head()

In [None]:
# Names of the labelled images as they are requested from train.zip
# ("train/<id>"); the same strings are reused later to build image paths.
training_files = "train/" + files_dataframe["id"]
print("Training sample:")
print(training_files.head(2))

# Layout on disk after extraction (later cells read ./training/train/ and
# ./test/test/):
# .
# |_ training/train/img_name.jpg
# |                 ....
# |_ test/test/img_name.jpg
#              ....

# (archive name, destination folder, members to extract; None means "everything")
extraction_plan = (
    ("train.zip", "./training/", training_files),
    ("test.zip", "./test/", None),
)
for archive_name, destination, members in extraction_plan:
    with ZipFile(path + archive_name, 'r') as archive:
        archive.extractall(destination, members)

# Data visualisation

In [None]:
# Number of samples per label value, drawn as a bar chart.
class_reparts = files_dataframe['has_cactus'].value_counts()
ax = class_reparts.plot(kind="bar")

Let's define weights for both classes that will serve to balance the classes during the training phase:

In [None]:
# Each class weight is total / (2 * class_count), so the rarer class weighs more.
total_samples = len(files_dataframe['has_cactus'])
print("Total number of samples: ", total_samples)

# The counts are indexed by the label strings '0' and '1'.
no_cactus_weight = total_samples / (2 * class_reparts.loc['0'])
has_cactus_weight = total_samples / (2 * class_reparts.loc['1'])

# Keys are the integer class indices expected by `class_weight=` during training.
class_weights = dict([(0, no_cactus_weight), (1, has_cactus_weight)])
print("Class weights: ", class_weights)

### Observe some images

In [None]:
import matplotlib.pyplot as plt
from matplotlib.image import imread
# Show 20 randomly drawn training images on a 4 x 5 grid, titled with their label.
plt.figure(figsize=(36, 12))
sampled_rows = np.random.randint(0, len(files_dataframe), size=(20, ))
for position, row in enumerate(sampled_rows, start=1):
    plt.subplot(4, 5, position)
    image_path = "./training/" + training_files.iloc[row]
    plt.imshow(imread(image_path))
    plt.title("Label :" + str(files_dataframe["has_cactus"].iloc[row]))

## Enhance the image quality
We'll use skimage to apply percentile-based contrast stretching to the images in order to improve their contrast, since contrast appears to be a determining factor in the examples above.

In [None]:
import skimage.exposure as exposure

def preprocess(img, low_pct=3, high_pct=97):
    """Stretch the contrast of ``img`` between two intensity percentiles.

    The intensities found at the ``low_pct`` and ``high_pct`` percentiles of
    the whole array are used as the input range of
    ``skimage.exposure.rescale_intensity``. The output range is left to that
    function's default.

    Args:
        img: Image array; percentiles are computed over all of its values.
        low_pct: Lower percentile (0-100) mapped to the bottom of the range.
        high_pct: Upper percentile (0-100) mapped to the top of the range.

    Returns:
        The rescaled image returned by ``rescale_intensity``.
    """
    low, high = np.percentile(img, (low_pct, high_pct))
    return exposure.rescale_intensity(img, in_range=(low, high))

# Side-by-side view of the first training image before and after `preprocess`.
plt.figure(figsize=(12, 24))
plt.subplot(121)
img = imread("./training/" + training_files.iloc[0])
plt.imshow(img)
plt.title("Before histogram equalization")

plt.subplot(122)
img_enhanced = preprocess(img)
plt.imshow(img_enhanced)
plt.title("After histogram equalization")

# Random 4 x 5 grid of training images. Each image goes through `preprocess`
# so the grid can be compared with the raw grid displayed earlier.
plt.figure(figsize=(36, 12))
for i, k in enumerate(np.random.randint(0, len(files_dataframe), size=(20, ))):
    plt.subplot(4, 5, i + 1)
    plt.imshow(preprocess(imread("./training/" + training_files.iloc[k])))
    plt.title("Label :" + str(files_dataframe["has_cactus"].iloc[k]))

## Building the generators

### Main generator and Data augmentation parameters

In [None]:

generator = ImageDataGenerator(samplewise_center=True,
                               samplewise_std_normalization=True,
                               vertical_flip=True,
                               horizontal_flip=True,
                               validation_split=0.1,
                               preprocessing_function=preprocess)

noPreprocessGenerator = ImageDataGenerator(samplewise_center=True,
                               samplewise_std_normalization=True,
                               vertical_flip = True,
                               horizontal_flip = True,
                               rotation_range = 45)

# Generator with no data augmentation, used for validation and predictions
noAugmentationGenerator = ImageDataGenerator(samplewise_center=True,
                               samplewise_std_normalization=True,
                               vertical_flip = False,
                               horizontal_flip = False,
                               preprocessing_function=preprocess)

### Training and validation generators

In [None]:

training_generator = generator.flow_from_dataframe(dataframe=files_dataframe, directory="./training/train/", x_col="id", y_col="has_cactus",
                                                   class_mode="categorical", target_size=(32, 32), batch_size=32, subset="training",
                                                   shuffle=True)

validation_generator = generator.flow_from_dataframe(dataframe=files_dataframe, directory="./training/train/",x_col="id", y_col="has_cactus",
                                                     class_mode="categorical", target_size=(32, 32), batch_size=32, subset='validation',
                                                   shuffle=True)


noproc_training_generator = generator.flow_from_dataframe(dataframe=files_dataframe, directory="./training/train/", x_col="id", y_col="has_cactus",
                                                   class_mode="categorical", target_size=(32, 32), batch_size=32, subset="training",
                                                   shuffle=True)

noproc_validation_generator = generator.flow_from_dataframe(dataframe=files_dataframe, directory="./training/train/",x_col="id", y_col="has_cactus",
                                                     class_mode="categorical", target_size=(32, 32), batch_size=32, subset='validation',
                                                   shuffle=True)

In [None]:
import tensorflow as tf

# Display one batch drawn from the validation iterator on a 7 x 5 grid.
plt.figure(figsize=(36, 12))
imgs, labels = next(validation_generator)
for plotindx, (img, label) in enumerate(zip(imgs, labels), start=1):
    plt.subplot(7, 5, plotindx)
    plt.imshow(img)
    # The displayed label is 1 exactly when the first slot of the label vector is 0.
    labl = int(label[0] == 0)
    plt.title("Label :" + str(labl))

# Building the model(s)

In [None]:
from tensorflow.keras import datasets, layers, models
from keras.models import clone_model

In [None]:
# Small convolutional network for 32x32 RGB inputs with a 2-way softmax output.
model = models.Sequential([
    # Three 3x3 convolutions that keep the spatial size ('same' padding).
    layers.Conv2D(8, kernel_size=(3, 3), activation='relu', padding='same', input_shape=(32, 32, 3)),
    layers.Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2), strides=2),
    layers.Dropout(0.3),
    # One 5x5 convolution with the default padding, followed by another pooling stage.
    layers.Conv2D(32, kernel_size=(5, 5), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2), strides=2),
    layers.Dropout(0.3),
    # Dense classification head.
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dense(2, activation='softmax')
])

# A softmax output trained on one-hot ("categorical") labels pairs with
# categorical cross-entropy; for two classes this gives the same loss value as
# the element-wise binary cross-entropy used before. `metrics` is passed as a
# list, which is the documented form (newer Keras rejects a bare string).
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

# Training

In [None]:
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

# Reduce the learning rate when the validation loss stops improving.
# `min_lr` has to be lower than the optimizer's starting rate: the model is
# compiled with the default "adam" optimizer, whose default rate is 0.001, so
# the previous floor of 0.001 made any reduction impossible.
# NOTE(review): `reduce_lr` only takes effect once it is listed in the
# `callbacks=` argument of a training call; no visible training call does so.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=1e-5)

# Keep, for each training phase, the model with the lowest validation loss.
first_model_save = ModelCheckpoint("/tmp/checkpoint_phase1",
                                   monitor="val_loss",
                                   mode="min",
                                   save_best_only=True)
second_model_save = ModelCheckpoint("/tmp/checkpoint_phase2",
                                    monitor="val_loss",
                                    mode="min",
                                    save_best_only=True)

### First training phase
The version of the model that achieves the lowest validation loss is saved and used for a second training phase.

In [None]:
# First training phase. `Model.fit` consumes generators directly (the notebook
# already relies on this for `model.predict`); `fit_generator` is deprecated.
# steps_per_epoch uses floor division, so a trailing partial batch is not
# counted in an epoch.
history = model.fit(training_generator, validation_data=validation_generator,
                    steps_per_epoch=training_generator.n // training_generator.batch_size,
                    verbose=1, epochs=32,
                    class_weight=class_weights,
                    callbacks=[first_model_save])

### Second Training phase
We now train a second time and once again save the model that achieves the lowest validation loss.

In [None]:
# Second-phase pipeline: per-sample normalisation and `preprocess`, with no
# flip arguments, and a 0.1 validation split.
generator_p2 = ImageDataGenerator(
    samplewise_center=True,
    samplewise_std_normalization=True,
    validation_split=0.1,
    preprocessing_function=preprocess,
)

# Arguments common to the two phase-2 iterators; only `subset` differs.
flow_options_p2 = dict(
    dataframe=files_dataframe,
    directory="./training/train/",
    x_col="id",
    y_col="has_cactus",
    class_mode="categorical",
    target_size=(32, 32),
    batch_size=32,
    shuffle=True,
)

training_generator_p2 = generator_p2.flow_from_dataframe(subset="training", **flow_options_p2)
validation_generator_p2 = generator_p2.flow_from_dataframe(subset='validation', **flow_options_p2)

In [None]:
from keras.models import load_model
# Replace the in-memory model with the checkpoint stored at the path that
# `first_model_save` writes to, before starting the second training phase.
model = load_model("/tmp/checkpoint_phase1")

In [None]:
# Second training phase on the phase-2 iterators. `Model.fit` accepts
# generators directly; `fit_generator` is deprecated.
history = model.fit(training_generator_p2, validation_data=validation_generator_p2,
                    steps_per_epoch=training_generator_p2.n // training_generator_p2.batch_size,
                    verbose=1, epochs=16,
                    class_weight=class_weights,
                    callbacks=[second_model_save])

In [None]:
# Replace the in-memory model with the checkpoint stored at the path that
# `second_model_save` writes to.
model = load_model("/tmp/checkpoint_phase2")

## Training on the test set
#### Predict for the test set and create a dataframe from the predictions

In [None]:
# Unlabelled iterator over the extracted test images. Shuffling is disabled so
# the order of the batches matches `test_generator.filenames`.
test_generator = noAugmentationGenerator.flow_from_directory(
    directory="./test/",
    target_size=(32, 32),
    batch_size=32,
    class_mode=None,
    shuffle=False,
)

In [None]:
# Model outputs for every test image, then the index of the largest output per row.
probas = model.predict(test_generator)
firstPredictions = tf.math.argmax(input=probas, axis=1)

## Use those predictions to train on the test data

### Adds the test images to the training directory


In [None]:
import shutil
# Copy every file found under ./test/ into the training image folder.
test_image_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('./test/')
    for name in names
]
for test_image_path in test_image_paths:
    shutil.copy(test_image_path, "./training/train")

Checking that there are now 21 500 images (17 500 for training + 4000 test images) in ./training/train/

In [None]:
# Collect the path of every file now present under ./training/train, then
# print how many there are and a short sample.
images = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('./training/train')
    for name in names
]
print(len(images))
print(images[:5])

# Create the dataframe with the model's predictions taken as the labels


In [None]:
# Build the pseudo-labelled rows for the test images.
# The ids must come from `test_generator.filenames`: the predictions were
# produced by iterating that (unshuffled) generator, so this is the only order
# guaranteed to line up with `firstPredictions`. `os.listdir` returns entries in
# arbitrary order and could attach labels to the wrong files.
test_ids = [os.path.basename(name) for name in test_generator.filenames]
test_files_data = pd.DataFrame({"id": test_ids,
                                "has_cactus": np.asarray(firstPredictions)}).astype(str)
print(test_files_data.head())

# Append the pseudo-labelled rows to the labelled ones; a fresh index avoids
# duplicated row labels in the combined frame.
final_training_dataframe = pd.concat((files_dataframe, test_files_data), ignore_index=True)
print(final_training_dataframe.head())

In [None]:
# Pipeline for the final phase (labelled + pseudo-labelled images).
# `subset=` relies on `validation_split`; it was missing here, so no rows were
# actually reserved for the "validation" subset requested below. The value
# matches the 0.1 split used by the other training pipelines.
generator_final = ImageDataGenerator(samplewise_center=True,
                                     samplewise_std_normalization=True,
                                     vertical_flip=True,
                                     horizontal_flip=True,
                                     validation_split=0.1,
                                     preprocessing_function=preprocess)

# Arguments common to the two final iterators; only `subset` differs.
final_flow_options = dict(dataframe=final_training_dataframe, directory="./training/train",
                          x_col="id", y_col="has_cactus", class_mode="categorical",
                          target_size=(32, 32), batch_size=32, shuffle=True)

final_training_generator = generator_final.flow_from_dataframe(subset="training", **final_flow_options)
final_training_valid = generator_final.flow_from_dataframe(subset="validation", **final_flow_options)

## Train with the training + test data

In [None]:
# Final training phase on the combined data. `Model.fit` accepts generators
# directly; `fit_generator` is deprecated. No class weights or checkpoint
# callbacks are used in this phase.
history = model.fit(final_training_generator, validation_data=final_training_valid,
                    steps_per_epoch=final_training_generator.n // final_training_generator.batch_size,
                    verbose=1, epochs=16)

# Final predictions 

#### Load the model that achieved the best performance during the training phase

In [None]:
# model = load_model("/tmp/checkpoint_phase2")

In [None]:
# Predicted class index for every test image, as a plain NumPy array so that
# pandas builds an ordinary integer column from it.
predictions = np.asarray(tf.argmax(model.predict(test_generator), axis=1))

# `test_generator.filenames` are paths relative to ./test/ (sub-folder + file
# name); keep only the file name, whatever the separator or nesting depth.
output = pd.DataFrame({"id": [os.path.basename(name) for name in test_generator.filenames],
                       "has_cactus": predictions})
output.to_csv("submission.csv", index=False)

# Delete all extracted images

In [None]:
import shutil
# Remove the two directories created by the extraction step.
for directory, label in (("test", "Test"), ("training", "Training")):
    try:
        shutil.rmtree(directory)
    except FileNotFoundError:
        # Nothing to delete: the directory is already gone.
        print(label + " files already erased")
    except OSError as error:
        # Stay best-effort, but report the real cause instead of claiming the
        # files were already erased.
        print("Could not remove " + directory + ": " + str(error))