# Can we find out the name of a tree from the shape of its leaf?

### This is the second notebook. The first notebook, leaf_id_CNN.ipynb, can be found in the same folder as this.

Dataset from: https://www.kaggle.com/c/leaf-classification/data

#### Description
The dataset consists of approximately 1,584 images of leaf specimens (16 samples each of 99 species) which have been converted to binary black leaves against white backgrounds. Three sets of features are also provided per image: a shape contiguous descriptor, an interior texture histogram, and a fine-scale margin histogram. For each feature, a 64-attribute vector is given per leaf sample.

#### Previous conclusion about data
99 classes, 16 samples from each class.
data per image - id, species, margin (64), shape (64), texture (64)

In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.image as mpimg       # reading images to numpy arrays
import matplotlib.pyplot as plt        # to plot any graph
import matplotlib.patches as mpatches  # to draw a circle at the mean contour

from skimage import measure            # to find shape contour
from skimage import color
import scipy.ndimage as ndi            # to determine shape centrality

# matplotlib setup
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = (6, 6)      # setting default size of plots

from IPython.display import Image
import tensorflow as tf
import numpy as np

# Show INFO-level TensorFlow log messages.
# NOTE(review): `tf.logging` is the TensorFlow 1.x logging API - confirm the installed TF version provides it.
tf.logging.set_verbosity(tf.logging.INFO)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

from __future__ import print_function
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Convolution2D, MaxPooling2D, Flatten, Input, Concatenate
from keras.layers import Conv2D, MaxPooling2D
import os

Using TensorFlow backend.


## Basic CNN
Convolution layer, pooling, Convolution layer, pooling, dense layer, another dense layer for output.

In [None]:
def load_images(folder):
    """Read every image file directly inside ``folder`` into a list of arrays.

    Files are read in sorted filename order so that the returned list is
    reproducible between runs (``os.listdir`` gives no ordering guarantee).
    Entries that are not regular files (e.g. sub-directories) are skipped.

    Parameters
    ----------
    folder : str
        Directory containing the image files.

    Returns
    -------
    list
        One entry per file, as returned by ``mpimg.imread``.

    Raises
    ------
    OSError
        If ``folder`` cannot be listed. Errors raised by ``mpimg.imread`` for
        unreadable files are propagated unchanged.
    """
    images = []
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            continue
        # imread signals failure by raising, so no "is None" check is needed here.
        images.append(mpimg.imread(path))
    return images

# Load every raw leaf image from the Kaggle image folder into memory.
all_image_data = load_images('./data/kaggle_images/')

In [None]:
# How many images were loaded, followed by the (rows, columns) of the first two
# images so their sizes can be compared.
print(len(all_image_data))
for idx in (0, 1):
    print(len(all_image_data[idx]), len(all_image_data[idx][0]))

## Turns out the images all have different sizes!
## This has to be dealt with before feeding them to the CNN, because the network also uses dense layers
## - for the convolutional (filter) layers alone it would not matter.

In [None]:
# Parts of code from here: https://www.kaggle.com/abhmul/keras-convnet-lb-0-0052-w-visualization/comments
# This loads and resizes images to uniform size

from keras.preprocessing.image import img_to_array, load_img

def resize_img(img, max_dim=96):
    """Scale ``img`` so that its longest side is ``max_dim``, keeping aspect ratio.

    Parameters
    ----------
    img : object exposing ``size`` as ``(width, height)`` and ``resize((w, h))``
        (e.g. the image returned by ``load_img``).
    max_dim : int
        Target length of the longest side, in pixels.

    Returns
    -------
    Whatever ``img.resize`` returns for the computed ``(width, height)``.
    """
    width, height = img.size[0], img.size[1]
    longest = max(width, height)
    scale = max_dim / float(longest)

    def _scaled(side):
        # The longest side is pinned to max_dim exactly; computing it as
        # int(side * scale) can truncate to max_dim - 1 through float error.
        if side == longest:
            return int(max_dim)
        # Never emit a zero-length side for very elongated images.
        return max(1, int(side * scale))

    return img.resize((_scaled(width), _scaled(height)))

def load_image_data(ids, data_path, max_dim=96, center=True):
    """Load ``<id>.jpg`` images as a uniform, binarized 4-D array.

    Each image is loaded in grayscale, resized with ``resize_img`` so it fits
    inside a ``max_dim`` x ``max_dim`` square, and copied into a zero-filled
    canvas of that size.

    Parameters
    ----------
    ids : sequence
        Image ids; image ``i`` is read from ``data_path/<id>.jpg``.
    data_path : str
        Directory containing the image files.
    max_dim : int
        Side length of the square output canvas.
    center : bool
        If True the image is centred on the canvas, otherwise it is placed in
        the top-left corner.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(ids), max_dim, max_dim, 1)`` whose values are
        ``X / 255`` rounded with ``np.around``.
    """
    # Zero-initialise the canvas: only the image region is written below, so
    # with np.empty the padding would contain uninitialised memory.
    X = np.zeros((len(ids), max_dim, max_dim, 1))
    for i, idee in enumerate(ids):
        img_path = os.path.join(data_path, str(idee) + '.jpg')
        x = resize_img(load_img(img_path, grayscale=True), max_dim=max_dim)
        x = img_to_array(x)
        # Extent of the resized image along the first two array axes
        length = x.shape[0]
        width = x.shape[1]
        if center:
            # Split the unused space evenly on both sides
            h1 = int((max_dim - length) / 2)
            h2 = h1 + length
            w1 = int((max_dim - width) / 2)
            w2 = w1 + width
        else:
            h1, w1 = 0, 0
            h2, w2 = (length, width)
        X[i, h1:h2, w1:w2, 0:1] = x
    return np.around(X / 255.0)

In [None]:
# Split the pre-extracted features and the images into train / validation / test
# subsets. The image ids come from train.csv, so the feature rows and the images
# loaded for each subset refer to the same leaves.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

all_data = pd.read_csv('./data/train.csv')
label_col = all_data.pop('species')
id_col = all_data.pop('id')

#SPLIT
# Pick the row positions of every subset *before* scaling, so the scaler can be
# fitted on the training rows only (fitting it on all rows would leak
# validation / test statistics into the training features).
row_positions = np.arange(len(all_data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.1, random_state=1)
train_rows, val_rows = train_test_split(train_rows, test_size=0.1, random_state=1)

# standardize the features: mean 0 and std 1, using training-row statistics
scaler = StandardScaler().fit(all_data.iloc[train_rows])
data_standardized = pd.DataFrame(scaler.transform(all_data))
data_standardized.insert(0, 'id', id_col, True)
data_standardized.insert(1, 'species', label_col, True)

data_train = data_standardized.iloc[train_rows]
data_test = data_standardized.iloc[test_rows]
data_val = data_standardized.iloc[val_rows]

y_all = data_standardized['species']

# Fit the label binarizer once on every species name so that all three label
# matrices share the same column order.
lb = LabelBinarizer().fit(y_all)
y_train = lb.transform(data_train['species'])
y_test = lb.transform(data_test['species'])
y_val = lb.transform(data_val['species'])
print(y_train.shape, y_test.shape, y_val.shape)

X_train = data_train.drop(['id', 'species'], axis=1)
X_test = data_test.drop(['id', 'species'], axis=1)
X_val = data_val.drop(['id', 'species'], axis=1)
print(X_train.shape, X_test.shape, X_val.shape)

# Read the images that correspond to each subset
img_data_path = './data/kaggle_images/'
X_train_images = load_image_data(data_train['id'], img_data_path)
X_test_images = load_image_data(data_test['id'], img_data_path)
X_val_images = load_image_data(data_val['id'], img_data_path)
print(X_train_images.shape, X_test_images.shape, X_val_images.shape)

In [None]:
# Copied from https://keras.io/examples/cifar10_cnn/
# Two (conv, conv, max-pool, dropout) stages followed by a dense classifier.

x_train_model = X_train_images
x_test_model = X_test_images
x_val_model = X_val_images
Y_train_model = y_train
Y_test_model = y_test
Y_val_model = y_val

# One output unit per column of the one-hot label matrix
num_classes = Y_train_model.shape[1]

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train_model.shape[1:]))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# initiate RMSprop optimizer
# Use the canonical class name; the lowercase `rmsprop` alias is not available
# in every Keras release.
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)

In [None]:
# Train the basic CNN with the RMSprop optimizer, save it, then score it.

# Hyper-parameters and bookkeeping values
batch_size, epochs = 32, 100
num_classes = 99
data_augmentation = False
# num_predictions = 20
model_name = 'keras_leaf_combined_model_v2.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')

model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The "test" split is passed as validation data during training.
model.fit(
    x_train_model,
    Y_train_model,
    validation_data=(x_test_model, Y_test_model),
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True
)

# Persist the model (architecture + weights), creating the folder on first use.
model_path = os.path.join(save_dir, model_name)
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Final score: computed on the "val" split, which is not passed to fit() above.
scores = model.evaluate(x_val_model, Y_val_model, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

## Add given numerical features to training

In [None]:
def combined_model(image_shape=(96, 96, 1), num_numerical=192, num_classes=99):
    """Build and compile a two-input network: a small convnet plus numeric features.

    The image branch is two (5x5 convolution, ReLU, 2x2 max-pool) stages whose
    flattened output is concatenated with the pre-extracted numerical features
    and passed through a dense layer, dropout and a softmax output layer.

    Parameters
    ----------
    image_shape : tuple
        Shape of one input image, fed through the input named ``'image'``.
    num_numerical : int
        Length of the feature vector fed through the input named ``'numerical'``.
    num_classes : int
        Number of softmax output units.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the ``'rmsprop'``
        optimizer and the accuracy metric.
    """
    image = Input(shape=image_shape, name='image')
    # First convolutional layer.
    # Keras 2 signature: kernel size as a tuple and `padding=`. The positional
    # form `(8, 5, 5)` means kernel_size=5, strides=5 when the Keras 1
    # compatibility shim is absent.
    x = Conv2D(8, (5, 5), padding='same')(image)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Second convolutional layer
    x = Conv2D(32, (5, 5), padding='same')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Flatten the feature maps into one vector per sample
    x = Flatten()(x)
    # Define the pre-extracted feature input
    numerical = Input(shape=(num_numerical,), name='numerical')
    # Concatenate the output of the convnet with the pre-extracted features
    concatenated = Concatenate()([x, numerical])

    # Fully connected layer
    x = Dense(100, activation='relu')(concatenated)
    x = Dropout(.5)(x)

    # Get the final output
    out = Dense(num_classes, activation='softmax')(x)
    # Functional API: a model is defined by its input and output tensors
    model = Model(inputs=[image, numerical], outputs=out)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    return model

# Build (and compile) the two-input network once; the cells below train this instance.
print('Creating the model...')
comb_model = combined_model()
print('Model created!')


In [None]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

# Fit the two-input model on images + numerical features (no validation data
# is passed in this run).
print('Training model...')
history = comb_model.fit(
    x={'image': X_train_images, 'numerical': X_train},
    y=y_train,
    epochs=100,
    batch_size=100,
    verbose=1
)

# Save model and weights, creating the output folder on first use
combined_model_name = "leafnet_combined.h5"
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_path = os.path.join(save_dir, combined_model_name)
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
comb_model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score the trained model on the "val" split, which fit() above never sees.
scores = comb_model.evaluate(x={'image': X_val_images, 'numerical': X_val},
                             y=Y_val_model,
                             verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

In [None]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

batch_size = 32
num_classes = 99
epochs = 100

# Train the two-input model with a smaller batch size and with the "test" split
# passed as validation data.
# NOTE(review): `comb_model` is not re-created here, so this run continues from
# whatever weights the instance already has - confirm that is intended.
print('Training model...')
history = comb_model.fit(x = {'image': X_train_images, 'numerical': X_train},
                         y = y_train,  # targets are required; they were missing from this call
                         batch_size=batch_size,
                         epochs=epochs,
                         validation_data=({'image': X_test_images, 'numerical': X_test}, Y_test_model),
                         shuffle=True,
                         verbose = 1
                        )

# Save model and weights
save_dir = os.path.join(os.getcwd(), 'saved_models')
combined_model_name = "leafnet_combined_2.h5"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, combined_model_name)
comb_model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score trained model on the "val" split, which is not passed to fit() above.
scores = comb_model.evaluate(x = {'image': X_val_images, 'numerical': X_val}, y = Y_val_model, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])