# Capstone Project - Kaggle Invasive Species Monitoring

In [1]:
# Import the required libraries 

import matplotlib.pyplot as ply
import cv2
from sklearn.datasets import load_files
from skimage.io import imread_collection 
from keras.utils import np_utils
import numpy as np
import pandas as pd 
from glob import glob
import os
import math

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split

path = "Datasets/train/"
# Function to load the training image paths and their labels

def load_training(path, labels_path=None):
    """Load the training image paths together with their labels.

    Parameters
    ----------
    path : str
        Directory that holds the training images. Every file in it must be
        named ``<integer>.<extension>`` (for example ``12.jpg``).
    labels_path : str, optional
        CSV file with an ``invasive`` column, one row per image, ordered by
        increasing image number. When omitted, ``train_labels.csv`` located
        next to the image directory is used (``Datasets/train/`` ->
        ``Datasets/train_labels.csv``).

    Returns
    -------
    train_files : list of str
        Image paths sorted by their numeric file name.
    train_targets_data : pandas.DataFrame
        The label table, with its ``name`` column replaced by the file paths.
    train_targets : numpy.ndarray
        The ``invasive`` column as an array.

    Raises
    ------
    ValueError
        If a file name in ``path`` is not an integer, or (raised by pandas)
        if the number of files differs from the number of label rows.
    """
    if labels_path is None:
        # normpath drops a trailing separator so dirname yields the parent folder
        dataset_root = os.path.dirname(os.path.normpath(path))
        labels_path = os.path.join(dataset_root, 'train_labels.csv')

    # Load in training labels as a Pandas dataframe
    train_targets_data = pd.read_csv(labels_path, index_col = False)

    # Label column as a Numpy array
    train_targets = np.array(train_targets_data['invasive'])

    # Glob is unordered: sort on the integer file stem (1, 2, ..., 10) rather
    # than on a fixed character offset, so any directory name / separator works
    train_files = sorted(
        glob(os.path.join(path, '*')),
        key = lambda name: int(os.path.splitext(os.path.basename(name))[0]))

    # Insert filepath as name for training set
    train_targets_data['name'] = train_files

    return train_files, train_targets_data, train_targets
    
train_files, train_targets_data, train_targets = load_training(path)

In [3]:
train_targets_data.head()

Unnamed: 0,name,invasive
0,Datasets/train\1.jpg,0
1,Datasets/train\2.jpg,0
2,Datasets/train\3.jpg,1
3,Datasets/train\4.jpg,0
4,Datasets/train\5.jpg,1


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled images as a validation set (fixed seed so the
# split is repeatable).
# NOTE: this rebinds train_files / train_targets, so running this cell a second
# time without re-running the loading cell splits the already reduced set again.
train_files, valid_files, train_targets, valid_targets = train_test_split(
    train_files,
    train_targets,
    test_size = 0.2,
    random_state = 42,
)

In [5]:
path = "Datasets/test"

def load_testing(path, submission_path=None):
    """Load the test image paths and the sample submission table.

    Parameters
    ----------
    path : str
        Directory that holds the test images. Every file in it must be named
        ``<integer>.<extension>`` (for example ``12.jpg``).
    submission_path : str, optional
        CSV file to read as the submission template. When omitted,
        ``sample_submission.csv`` located next to the image directory is used
        (``Datasets/test`` -> ``Datasets/sample_submission.csv``).

    Returns
    -------
    test_files : list of str
        Image paths sorted by their numeric file name.
    test_targets : pandas.DataFrame
        The submission template exactly as read from the CSV.

    Raises
    ------
    ValueError
        If a file name in ``path`` is not an integer.
    """
    if submission_path is None:
        # normpath drops a trailing separator so dirname yields the parent folder
        dataset_root = os.path.dirname(os.path.normpath(path))
        submission_path = os.path.join(dataset_root, 'sample_submission.csv')

    # Load in the submission template as a Pandas dataframe
    test_targets = pd.read_csv(submission_path)

    # Glob is unordered: sort on the integer file stem (1, 2, ..., 10) rather
    # than on a fixed character offset, so any directory name / separator works
    test_files = sorted(
        glob(os.path.join(path, '*')),
        key = lambda name: int(os.path.splitext(os.path.basename(name))[0]))

    return test_files, test_targets

test_files, test_targets = load_testing(path)

In [6]:
test_targets.head()

Unnamed: 0,name,invasive
0,1,0.5
1,2,0.5
2,3,0.5
3,4,0.5
4,5,0.5


### File Statistics

In [7]:
# Report how many image paths ended up in each split

n_total = len(np.hstack([train_files, valid_files, test_files]))
n_train = len(train_files)
n_valid = len(valid_files)
n_test = len(test_files)

print('There are %s total forest images.\n' % n_total)
print('There are %d training forest images.' % n_train)
print('There are %d validation forest images.' % n_valid)
print('There are %d test forest images.'% n_test)

There are 3826 total forest images.

There are 1836 training forest images.
There are 459 validation forest images.
There are 1531 test forest images.


### Display Dataset Visualisation 

In [None]:
# Code to create a visual plot of the above statistics

%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')

total_forest = len(np.hstack([train_files, valid_files, test_files]))
total_train = len(train_files)
total_valid = len(valid_files)
total_test = len(test_files)

from matplotlib import pyplot as plt
import numpy as np
plt.style.use('ggplot')

objects = ['Total Images','Training Images','Validation Images','Testing Images']
y_pos = np.arange(len(objects))
count = [total_forest, total_train, total_valid, total_test]


plt.barh(y_pos, count, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.xlabel('Image Count')
plt.title('Dataset Evaluation')

plt.show()

### Pre-process the Data - Conversion into 4D Tensor

In [8]:
# Functions to pre-process image files into 4D arrays

size_image = (400,400)

from keras.preprocessing import image
from tqdm import tqdm

def path_tensor(img_path):
    """Load one image file as a 4D array with a leading batch axis of size 1.

    The file is opened with ``image.load_img`` and resized to the
    notebook-level ``size_image``; ``image.img_to_array`` converts it to a 3D
    array, and a new first axis is then prepended.
    """
    pil_image = image.load_img(img_path, target_size = size_image)
    pixels = image.img_to_array(pil_image)

    # (rows, cols, channels) -> (1, rows, cols, channels)
    return pixels[np.newaxis, ...]

def paths_tensor(img_paths):
    """Load every image in ``img_paths`` and stack them along the first axis.

    Each path is converted with ``path_tensor`` (progress shown by ``tqdm``);
    the per-image arrays are then joined with ``np.vstack``.
    """
    per_image = []
    for single_path in tqdm(img_paths):
        per_image.append(path_tensor(single_path))
    return np.vstack(per_image)
    

### Run Data Pre-processing

In [9]:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Load each split as a float32 array and rescale it by 1/255.
# The generator is consumed in order: training, validation, test.
train_tensors, valid_tensors, test_tensors = (
    paths_tensor(split_files).astype('float32') / 255
    for split_files in (train_files, valid_files, test_files)
)

100%|██████████| 1836/1836 [00:53<00:00, 34.43it/s]
100%|██████████| 459/459 [00:13<00:00, 36.18it/s]
100%|██████████| 1531/1531 [00:41<00:00, 36.71it/s]


### Check Shape of Inputs

In [10]:
# Caption / array pairs, printed in the same order as the pipeline builds them
shape_report = (
    ("The shape of the training tensor is:", train_tensors),
    ("The shape of the training targets is:", train_targets),
    ("The shape of the validation tensor is:", valid_tensors),
    ("The shape of the validation targets is:", valid_targets),
    ("The shape of the testing tensor is:", test_tensors),
    ("The shape of the testing targets is:", test_targets),
)

for caption, array in shape_report:
    print(caption, array.shape)

The shape of the training tensor is: (1836, 400, 400, 3)
The shape of the training targets is: (1836,)
The shape of the validation tensor is: (459, 400, 400, 3)
The shape of the validation targets is: (459,)
The shape of the testing tensor is: (1531, 400, 400, 3)
The shape of the testing targets is: (1531, 2)


## Obtain Bottleneck Features

In [11]:
from keras import applications
from keras.preprocessing import image

img_width, img_height, img_channels = 400, 400, 3

### Obtain Bottleneck Features - VGG16

In [12]:
# Instantiate the VGG16 convolutional base: ImageNet weights, top layers left
# out (include_top=False). Despite its name, this variable holds a model whose
# .input / .output are used further down, not precomputed features.

VGG16_bottleneck_features = applications.VGG16(
    weights = 'imagenet',
    include_top = False,
    input_shape = (img_width, img_height, img_channels),
)

### Obtain Bottleneck Features - VGG19

In [None]:
# Instantiate the VGG19 convolutional base: ImageNet weights, top layers left
# out (include_top=False). Despite its name, this variable holds a model whose
# .input / .output are used further down, not precomputed features.

VGG19_bottleneck_features = applications.VGG19(
    weights = 'imagenet',
    include_top = False,
    input_shape = (img_width, img_height, img_channels),
)

### Obtain Bottleneck Features - Xception

In [None]:
# Instantiate the Xception convolutional base: ImageNet weights, top layers left
# out (include_top=False). Despite its name, this variable holds a model whose
# .input / .output are used further down, not precomputed features.

Xception_bottleneck_features = applications.Xception(
    weights = 'imagenet',
    include_top = False,
    input_shape = (img_width, img_height, img_channels),
)

### Obtain Bottleneck Features - Inception V3

In [15]:
# Instantiate the Inception V3 convolutional base: ImageNet weights, top layers
# left out (include_top=False). Despite its name, this variable holds a model
# whose .input / .output are used further down, not precomputed features.

Inception_bottleneck_features = applications.InceptionV3(
    weights = 'imagenet',
    include_top = False,
    input_shape = (img_width, img_height, img_channels),
)

### Obtain Bottleneck Features - ResNet50

In [37]:
# Instantiate the ResNet50 convolutional base: ImageNet weights, top layers left
# out (include_top=False). Despite its name, this variable holds a model whose
# .input / .output are used further down, not precomputed features.

ResNet50_bottleneck_features = applications.ResNet50(
    weights = 'imagenet',
    include_top = False,
    input_shape = (img_width, img_height, img_channels),
)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


## Model Architecture

In [13]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential
from keras.models import Sequential, Model, load_model
from keras import optimizers

### Model Architecture VGG16

In [14]:
# Classifier head for VGG16: flatten the base output, one hidden layer with
# dropout, and a single sigmoid unit for the binary invasive / not-invasive label

VGG16_model = Sequential([
    Flatten(input_shape = VGG16_bottleneck_features.output_shape[1:]),
    Dense(256, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
])

# Chain the convolutional base and the head into one end-to-end model
VGG16_head_output = VGG16_model(VGG16_bottleneck_features.output)
model_VGG16 = Model(inputs = VGG16_bottleneck_features.input, outputs = VGG16_head_output)

### Model Architecture VGG19

In [None]:
# Classifier head for VGG19: flatten the base output, one hidden layer with
# dropout, and a single sigmoid unit for the binary invasive / not-invasive label

VGG19_model = Sequential([
    Flatten(input_shape = VGG19_bottleneck_features.output_shape[1:]),
    Dense(256, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
])

# Chain the convolutional base and the head into one end-to-end model
VGG19_head_output = VGG19_model(VGG19_bottleneck_features.output)
model_VGG19 = Model(inputs = VGG19_bottleneck_features.input, outputs = VGG19_head_output)

### Model Architecture Xception

In [None]:
# Classifier head for Xception: flatten the base output, one hidden layer with
# dropout, and a single sigmoid unit for the binary invasive / not-invasive label

Xception_model = Sequential([
    Flatten(input_shape = Xception_bottleneck_features.output_shape[1:]),
    Dense(256, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
])

# Chain the convolutional base and the head into one end-to-end model
Xception_head_output = Xception_model(Xception_bottleneck_features.output)
model_Xception = Model(inputs = Xception_bottleneck_features.input, outputs = Xception_head_output)

### Model Architecture Inception V3

In [19]:
# Classifier head for Inception V3: flatten the base output, one hidden layer with
# dropout, and a single sigmoid unit for the binary invasive / not-invasive label

Inception_model = Sequential([
    Flatten(input_shape = Inception_bottleneck_features.output_shape[1:]),
    Dense(256, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
])

# Chain the convolutional base and the head into one end-to-end model
Inception_head_output = Inception_model(Inception_bottleneck_features.output)
model_Inception = Model(inputs = Inception_bottleneck_features.input, outputs = Inception_head_output)

### Model Architecture ResNet50

In [None]:
# Classifier head for ResNet50: flatten the base output, one hidden layer with
# dropout, and a single sigmoid unit for the binary invasive / not-invasive label

ResNet50_model = Sequential([
    Flatten(input_shape = ResNet50_bottleneck_features.output_shape[1:]),
    Dense(256, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
])

# Chain the convolutional base and the head into one end-to-end model
ResNet50_head_output = ResNet50_model(ResNet50_bottleneck_features.output)
model_ResNet50 = Model(inputs = ResNet50_bottleneck_features.input, outputs = ResNet50_head_output)

## Compile the Model(s)

In [15]:
# Set values for learning rate and momentum
lr = 1e-4
momentum = 0.9 

### Compile the VGG16 Model

In [16]:
# SGD with the shared learning rate / momentum; binary cross-entropy matches the
# single sigmoid output.
# NOTE(review): this is the only model compiled with nesterov = True - confirm
# that the difference from the other four models is intended.
model_VGG16.compile(
    optimizer = optimizers.SGD(lr = lr, momentum = momentum, nesterov = True),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

### Compile the VGG19 Model

In [None]:
# SGD with the shared learning rate / momentum; binary cross-entropy matches the
# single sigmoid output
model_VGG19.compile(
    optimizer = optimizers.SGD(lr = lr, momentum = momentum),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

### Compile the Xception Model

In [None]:
# SGD with the shared learning rate / momentum; binary cross-entropy matches the
# single sigmoid output
model_Xception.compile(
    optimizer = optimizers.SGD(lr = lr, momentum = momentum),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

### Compile the Inception V3 Model

In [21]:
# SGD with the shared learning rate / momentum; binary cross-entropy matches the
# single sigmoid output
model_Inception.compile(
    optimizer = optimizers.SGD(lr = lr, momentum = momentum),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

### Compile the ResNet 50 model

In [None]:
# SGD with the shared learning rate / momentum; binary cross-entropy matches the
# single sigmoid output
model_ResNet50.compile(
    optimizer = optimizers.SGD(lr = lr, momentum = momentum),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

## Save the Best Performing Model(s)

In [17]:
from keras.callbacks import ModelCheckpoint

### Save the Best Performing Model VGG16

In [18]:
# Older Keras releases do not create a missing output folder, which makes the
# first checkpoint write fail; create it up front (no-op when it already exists)
os.makedirs('saved_models', exist_ok = True)

# save_best_only = True keeps only the best VGG16 weights seen so far, judged on
# the callback's default monitored quantity (val_loss unless overridden - see Keras docs)
VGG16_checkpointer = ModelCheckpoint(
    filepath = 'saved_models/weights.best.VGG16.hdf5',
    verbose = 1,
    save_best_only = True)

### Save the Best Performing Model VGG19

In [None]:
# Older Keras releases do not create a missing output folder, which makes the
# first checkpoint write fail; create it up front (no-op when it already exists)
os.makedirs('saved_models', exist_ok = True)

# save_best_only = True keeps only the best VGG19 weights seen so far, judged on
# the callback's default monitored quantity (val_loss unless overridden - see Keras docs)
VGG19_checkpointer = ModelCheckpoint(
    filepath = 'saved_models/weights.best.VGG19.hdf5',
    verbose = 1,
    save_best_only = True)

### Save the Best Performing Model Xception

In [None]:
# Older Keras releases do not create a missing output folder, which makes the
# first checkpoint write fail; create it up front (no-op when it already exists)
os.makedirs('saved_models', exist_ok = True)

# save_best_only = True keeps only the best Xception weights seen so far, judged on
# the callback's default monitored quantity (val_loss unless overridden - see Keras docs)
Xception_checkpointer = ModelCheckpoint(
    filepath = 'saved_models/weights.best.Xception.hdf5',
    verbose = 1,
    save_best_only = True)

### Save the Best Performing Model Inception V3

In [24]:
# Older Keras releases do not create a missing output folder, which makes the
# first checkpoint write fail; create it up front (no-op when it already exists)
os.makedirs('saved_models', exist_ok = True)

# save_best_only = True keeps only the best Inception V3 weights seen so far, judged on
# the callback's default monitored quantity (val_loss unless overridden - see Keras docs)
Inception_checkpointer = ModelCheckpoint(
    filepath = 'saved_models/weights.best.Inception.hdf5',
    verbose = 1,
    save_best_only = True)

### Save the Best Performing Model ResNet50

In [None]:
# Older Keras releases do not create a missing output folder, which makes the
# first checkpoint write fail; create it up front (no-op when it already exists)
os.makedirs('saved_models', exist_ok = True)

# save_best_only = True keeps only the best ResNet50 weights seen so far, judged on
# the callback's default monitored quantity (val_loss unless overridden - see Keras docs)
ResNet50_checkpointer = ModelCheckpoint(
    filepath = 'saved_models/weights.best.ResNet50.hdf5',
    verbose = 1,
    save_best_only = True)

### Image Augmentation

In [19]:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

# Random geometric augmentation applied on the fly to every training batch:
# rotations up to 40 degrees, shifts / shear / zoom up to 20%, and random
# horizontal and vertical flips; pixels exposed by a transform are filled
# from the nearest edge value.
train_datagen = ImageDataGenerator(
    rotation_range = 40,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True,
    vertical_flip = True,
    fill_mode = 'nearest'
)

# No train_datagen.fit(train_tensors) call here: per the Keras documentation
# fit() only computes the statistics needed by featurewise_center,
# featurewise_std_normalization and zca_whitening. None of them is enabled
# above, so the call changed nothing while copying the whole training tensor.

## Fit the Model(s)

In [20]:
# Set the values for number of epochs and batch size 
epochs = 50
batch_size = 32

### Fit the VGG16 Model

In [21]:
# Augmented training batches for VGG16; validation uses the untouched hold-out set
VGG16_train_batches = train_datagen.flow(train_tensors, train_targets, batch_size = batch_size)

VGG16_history = model_VGG16.fit_generator(
    VGG16_train_batches,
    steps_per_epoch = len(train_tensors) // batch_size,
    epochs = epochs,
    validation_data = (valid_tensors, valid_targets),
    callbacks = [VGG16_checkpointer],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Fit the VGG19 Model

In [None]:
# Train VGG19 on augmented batches; validation uses the untouched hold-out set.
# The callback must be VGG19_checkpointer: passing VGG16_checkpointer here would
# overwrite saved_models/weights.best.VGG16.hdf5 with VGG19 weights and never
# write the VGG19 checkpoint file.
VGG19_history = model_VGG19.fit_generator(
        train_datagen.flow(train_tensors, train_targets, batch_size = batch_size),
        epochs = epochs,
        steps_per_epoch = train_tensors.shape[0] // batch_size,
        validation_data = (valid_tensors, valid_targets),
        callbacks=[VGG19_checkpointer])

### Fit the Xception Model

In [None]:
# Augmented training batches for Xception; validation uses the untouched hold-out set
Xception_train_batches = train_datagen.flow(train_tensors, train_targets, batch_size = batch_size)

Xception_history = model_Xception.fit_generator(
    Xception_train_batches,
    steps_per_epoch = len(train_tensors) // batch_size,
    epochs = epochs,
    validation_data = (valid_tensors, valid_targets),
    callbacks = [Xception_checkpointer],
)

### Fit the Inception V3 Model

In [27]:
# Augmented training batches for Inception V3; validation uses the untouched hold-out set
Inception_train_batches = train_datagen.flow(train_tensors, train_targets, batch_size = batch_size)

Inception_history = model_Inception.fit_generator(
    Inception_train_batches,
    steps_per_epoch = len(train_tensors) // batch_size,
    epochs = epochs,
    validation_data = (valid_tensors, valid_targets),
    callbacks = [Inception_checkpointer],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Fit the ResNet50 Model

In [None]:
# Train ResNet50 on augmented batches; validation uses the untouched hold-out set.
# The callback must be ResNet50_checkpointer: passing Xception_checkpointer here
# would overwrite saved_models/weights.best.Xception.hdf5 with ResNet50 weights
# and never write the ResNet50 checkpoint file.
ResNet50_history = model_ResNet50.fit_generator(
        train_datagen.flow(train_tensors, train_targets, batch_size = batch_size),
        epochs = epochs,
        steps_per_epoch = train_tensors.shape[0] // batch_size,
        validation_data = (valid_tensors, valid_targets),
        callbacks=[ResNet50_checkpointer])

## Load the Best Saved Weights

In [22]:
# Restore the best checkpoint written during training for the model to evaluate.
# Keep exactly one line active: uncomment the model you want and comment out the
# others. The file must exist, i.e. that model has to have been trained with its
# own checkpointer beforehand.

model_VGG16.load_weights('saved_models/weights.best.VGG16.hdf5')
#model_VGG19.load_weights('saved_models/weights.best.VGG19.hdf5')
#model_Xception.load_weights('saved_models/weights.best.Xception.hdf5')
#model_Inception.load_weights('saved_models/weights.best.Inception.hdf5')
#model_ResNet50.load_weights('saved_models/weights.best.ResNet50.hdf5')

In [23]:
# Run the chosen model on the test tensors. Keep exactly one line active and make
# sure it is the same model whose weights were loaded in the previous cell.
# The rows of `predictions` follow the order of test_tensors, i.e. the test files
# sorted by numeric file name.

predictions = model_VGG16.predict(test_tensors)
#predictions = model_VGG19.predict(test_tensors)
#predictions = model_Xception.predict(test_tensors)
#predictions = model_Inception.predict(test_tensors)
#predictions = model_ResNet50.predict(test_tensors)

In [24]:


# Rebuild the test image ids and file paths from the sample submission

sample_submission = pd.read_csv("Datasets/sample_submission.csv")
img_path = "Datasets/test/"

# Take the first column (the image ids) as a whole instead of looking rows up
# one at a time with .loc[i][0]: positional [] on a labelled row is deprecated
# in recent pandas and silently turned the integer ids into floats.
test_names = sample_submission.iloc[:, 0].values

# One path per submission row, e.g. "Datasets/test/1.jpg"
file_paths = [img_path + str(int(name)) + '.jpg' for name in test_names]

In [25]:
# Create the submission file based on the predictions

sample_submission = pd.read_csv("Datasets/sample_submission.csv")

# The network ends in a single sigmoid unit, so predictions is (n_images, 1);
# flatten it to one score per image and keep the float64 dtype of the column
flat_predictions = np.ravel(predictions).astype('float64')

# Fail loudly instead of writing a partially filled or misaligned submission
if len(flat_predictions) != len(test_names):
    raise ValueError('Got %d predictions for %d test names'
                     % (len(flat_predictions), len(test_names)))

# test_names[i] belongs to predictions[i]; one dictionary lookup per row replaces
# the previous boolean-mask scan of the whole table for every single name
prediction_by_name = dict(zip(test_names, flat_predictions))
sample_submission['invasive'] = sample_submission['name'].map(prediction_by_name)

sample_submission.to_csv("submit_VGG16_run7.csv", index=False)