In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Conv1D, Dropout
from keras.layers import Conv2D, MaxPooling2D, Input, merge
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.preprocessing.image import img_to_array, load_img
import tensorflow as tf
import os

train = pd.read_csv('train.csv')

train_copy = pd.read_csv('train.csv')

Using TensorFlow backend.


Preprocessing data
================

In [2]:
### load the 3 features from csv file which are MARGIN, SHAPE, TEXTURE
def load_num_data(train):
    
    # get the id and label from the our training data
    ID = train.pop('id')
    label = train.pop('species')
    
    #labe our training data from 0~99 and standardize the data
    y = LabelEncoder().fit(label).transform(label)    
    X = StandardScaler().fit(train).transform(train)
    

    return ID, X, y

Extract the image data from image files
===============================

In [3]:
### Using AbhijeetMulgund's image processing function
###cited:https://www.kaggle.com/abhmul/keras-convnet-lb-0-0052-w-visualization

def resize_img(img, max_dim=80):
    """
    Resize the image to so the maximum side is of size max_dim
    Returns a new image of the right size
    """
    # Get the axis with the larger dimension
    max_ax = max((0, 1), key=lambda i: img.size[i])
    # Scale both axes so the image's largest dimension is max_dim
    scale = max_dim / float(img.size[max_ax])
    return img.resize((int(img.size[0] * scale), int(img.size[1] * scale)))
    


def load_image_data(ids, max_dim=80, center=True):
    """
    Takes as input an array of image ids and loads the images as numpy
    arrays with the images resized so the longest side is max-dim length.
    If center is True, then will place the image in the center of
    the output array, otherwise it will be placed at the top-left corner.
    """
    # Initialize the output array
    # NOTE: Theano users comment line below and
    X = np.empty((len(ids), max_dim, max_dim, 1))
    # X = np.empty((len(ids), 1, max_dim, max_dim)) # uncomment this
    for i, idee in enumerate(ids):
        # Turn the image into an array
        x = resize_img(load_img(os.path.join('images', str(idee) + '.jpg'), grayscale=True), max_dim=max_dim)
        x = img_to_array(x)
        # Get the corners of the bounding box for the image
        # NOTE: Theano users comment the two lines below and
        length = x.shape[0]
        width = x.shape[1]
        # length = x.shape[1] # uncomment this
        # width = x.shape[2] # uncomment this
        if center:
            h1 = int((max_dim - length) / 2)
            h2 = h1 + length
            w1 = int((max_dim - width) / 2)
            w2 = w1 + width
        else:
            h1, w1 = 0, 0
            h2, w2 = (length, width)
        # Insert into image matrix
        # NOTE: Theano users comment line below and
        X[i, h1:h2, w1:w2, 0:1] = x
        # X[i, 0:1, h1:h2, w1:w2] = x  # uncomment this
    # Scale the array values so they are between 0 and 1
    return np.around(X / 255.0)

Get our image data and spilt the data to 90% of training data and 0.1 for valiation data
============================================

In [4]:
ID, X_num_tr, y = load_num_data(train)
   
X_img_tr = load_image_data(ID)
    
sss = StratifiedShuffleSplit(train_size=0.9, random_state=23)
tr_index, te_index= next(sss.split(X_num_tr, y))
X_num_val, X_img_val, y_val = X_num_tr[te_index], X_img_tr[te_index], y[te_index]
X_num_tr, X_img_tr, y_tr = X_num_tr[tr_index], X_img_tr[tr_index], y[tr_index]
   

y_tr = np_utils.to_categorical(y_tr, 99)
y_val = np_utils.to_categorical(y_val, 99)



Build our CNN model with keras functional API
======================================
the reason we use function API not Sequential is that we have to different input here one is image data and another is 3 different attritube(MARGIN, SHAPE, TEXTURE) 

In [5]:
from keras.layers.merge import concatenate

input1 = Input(shape=(80, 80, 1))

input2 = Input(shape=(192,))

# create our first layer
x = Conv2D(5, 3, 3, input_shape=(80, 80, 1), border_mode='same')(input1)
x = (Activation('relu'))(x)
x = (MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))(x)

#  the second layer
x = (Conv2D(32, 5, 5, border_mode='same'))(x)
x = (Activation('relu'))(x)
x = (MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))(x)

x = Flatten()(x)


# Concatenate the output of our convnet for input2
concatenated = concatenate([x, input2])

x = Dense(101, activation='relu')(concatenated)
x = Dropout(.4)(x)

out = Dense(99, activation='softmax')(x)

# create our models for two inputs
model = Model(input=[input1, input2], output=out)



print(y_tr.shape)
print(y_val.shape)

(891, 99)
(99, 99)


  
  del sys.path[0]


Experiments and Evaluation
========================

In [6]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

#sgd = SGD(lr=0.001, decay=1e-4, momentum=0.9)
#we have try out different kind of optimizers(Adam, sgd, rmsprop) and our best model is using the Adam
optim = Adam(lr=0.005, beta_1=0.9,beta_2=0.999, epsilon= None, decay=0.0, amsgrad=True)
model.compile(loss='categorical_crossentropy',optimizer=optim,metrics=['accuracy'])

# we store our best model in leaf_model.h5 files
best_leaf_clafier = "leaf_model.h5"
best_model = ModelCheckpoint(best_leaf_clafier, monitor='val_loss', verbose=1, save_best_only=True)
#here is the paramenter we find for the best score
nb_epoch = 40
batch_size=64

model.fit([X_img_tr, X_num_tr], y_tr, nb_epoch=nb_epoch, validation_data=([X_img_val, X_num_val], y_val), batch_size=32, callbacks=[best_model])

model = load_model(best_leaf_clafier)

  app.launch_new_instance()


Train on 891 samples, validate on 99 samples
Epoch 1/40

Epoch 00001: val_loss improved from inf to 1.21325, saving model to leaf_model.h5
Epoch 2/40

Epoch 00002: val_loss improved from 1.21325 to 0.26967, saving model to leaf_model.h5
Epoch 3/40

Epoch 00003: val_loss improved from 0.26967 to 0.12999, saving model to leaf_model.h5
Epoch 4/40

Epoch 00004: val_loss did not improve from 0.12999
Epoch 5/40

Epoch 00005: val_loss improved from 0.12999 to 0.11174, saving model to leaf_model.h5
Epoch 6/40

Epoch 00006: val_loss did not improve from 0.11174
Epoch 7/40

Epoch 00007: val_loss did not improve from 0.11174
Epoch 8/40

Epoch 00008: val_loss did not improve from 0.11174
Epoch 9/40

Epoch 00009: val_loss did not improve from 0.11174
Epoch 10/40

Epoch 00010: val_loss improved from 0.11174 to 0.09054, saving model to leaf_model.h5
Epoch 11/40

Epoch 00011: val_loss improved from 0.09054 to 0.08263, saving model to leaf_model.h5
Epoch 12/40

Epoch 00012: val_loss did not improve fro

Run on testing dataset
==================

In [7]:
test = pd.read_csv('test.csv')
te_ID = test.pop('id')

x_num_te = StandardScaler().fit(test).transform(test)

x_img_te = load_image_data(te_ID)
yPred = model.predict([x_img_te, x_num_te])
yPred = pd.DataFrame(yPred,index=te_ID,columns=sorted(train_copy.species.unique()))
fp = open('submission_nn_kernel1.csv','w')
fp.write(yPred.to_csv())

803953