In [1]:
%matplotlib inline

In [53]:
from keras_utilities import *
from keras_utilities.models.vgg16 import Vgg16
from keras_utilities.models.vgg16bn import Vgg16BN
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Input, Dropout, BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
import bcolz
import os
import numpy as np
import pandas as pd

In [3]:
# Constants
PROD = True

if not PROD:
    dataset = 'sample'
else:
    dataset = 'train'

# Moving Data Around

In [4]:
%cd ~/Fisheries/

/home/ubuntu/Fisheries


In [5]:
%ls

analysis.ipynb  sample_submission_stg1.csv.zip  test_stg1.zip  train.zip
[0m[01;34mdata[0m/           sample_submission_stg2.csv.zip  test_stg2.7z   [01;34mval[0m/
[01;34msample[0m/         [01;34mtest_stg1[0m/                      [01;34mtrain[0m/


In [6]:
%ls train/

[0m[01;34mALB[0m/  [01;34mBET[0m/  [01;34mDOL[0m/  [01;34mLAG[0m/  [01;34mNoF[0m/  [01;34mOTHER[0m/  [01;34mSHARK[0m/  [01;34mYFT[0m/


In [7]:
# ! rm -R /home/ubuntu/Fisheries/sample/

In [8]:
# create_data_sample('train', 'sample')

In [9]:
%ls train/

[0m[01;34mALB[0m/  [01;34mBET[0m/  [01;34mDOL[0m/  [01;34mLAG[0m/  [01;34mNoF[0m/  [01;34mOTHER[0m/  [01;34mSHARK[0m/  [01;34mYFT[0m/


In [10]:
%ls sample/

[0m[01;34mALB[0m/  [01;34mBET[0m/  [01;34mDOL[0m/  [01;34mLAG[0m/  [01;34mNoF[0m/  [01;34mOTHER[0m/  [01;34mSHARK[0m/  [01;34mYFT[0m/


In [11]:
! ls -l train/ALB/ | wc -l

1372


In [12]:
! ls -l sample/ALB | wc -l 

427


We're good on counts, and seem to have successfully partitioned out 25% of the data to represent a training sample. Next, we have to work on coming up with a good validation set. Do the training set and validation sets come from different boats? If so, we might want to think about splitting out a few of the training boats to serve as validation boats.

Maybe a new way to handle validation set creation is to allow two types of partitioning:

1. A random percentage of the training data
2. Some subset of the training data that meets a certain criteria (we could start with something simply based on the file's name)

For cases where individual samples are totally iid we can use method 1, whereas if there is some relationship between examples (like some fish coming from the same boat, in this case) we can use method 2.

In [13]:
# TODO: a function for recursively creating a validation set from a training directory
# for subdir in os.listdir('train'):
#     train_path = 'train/' + subdir + '/'
#     val_path = 'val' + '/' + subdir + '/'
#     move_data_subset(train_path, val_path, subset_pct=.2, method='move')

In [14]:
! ls -l val/ALB | wc -l

348


In [15]:
! ls -l train/ALB | wc -l

1372


# Extracting Data

In [16]:
gen = ImageDataGenerator()

In [17]:
train_gen = gen.flow_from_directory(dataset, 
                                    target_size = (224,224), 
                                    batch_size = 32, 
                                    class_mode = 'categorical')

Found 2997 images belonging to 8 classes.


In [18]:
val_gen = gen.flow_from_directory('val',
                                  target_size = (224,224),
                                  batch_size = 32,
                                  class_mode = 'categorical')

Found 784 images belonging to 8 classes.


In [19]:
mdl = Vgg16()

In [20]:
mdl.model.pop()
for layer in mdl.model.layers: layer.trainable = False
mdl.model.add(Dense(8, activation='softmax'))
mdl.compile()

In [21]:
# mdl.model.fit_generator(train_gen, 
#                         samples_per_epoch = train_gen.nb_sample, 
#                         nb_epoch = 1, 
#                         validation_data = val_gen, 
#                         nb_val_samples=val_gen.nb_sample)

# Pre-Computing Convolutional Layers

This section is computationally intensive, so everything is commented out.

In [22]:
# mdl = Vgg16BN()

In [23]:
# layer_types = map(lambda layer: type(layer), mdl.model.layers)

In [24]:
# first_dense_index = layer_types.index(Dense)

In [25]:
# new_layers = mdl.model.layers[:first_dense_index]

In [26]:
# conv_out_mdl = Sequential(new_layers)

In [27]:
# conv_out_data = conv_out_mdl.predict_generator(train_gen, train_gen.nb_sample)

In [28]:
# conv_out_data.shape

In [29]:
# bcolz.carray(conv_out_data, rootdir='data/train')

In [30]:
# val_conv_out_data = conv_out_mdl.predict_generator(val_gen, val_gen.nb_sample)

In [31]:
# bcolz.carray(val_conv_out_data, rootdir='data/val')

In [32]:
# ! mv data/train data/sample

In [34]:
# conv_out_unflattened_layers = conv_out_mdl.layers[:-1]

In [35]:
# conv_out_unflattened_layers

In [36]:
# conv_out_unflattened = Sequential(conv_out_unflattened_layers)

In [37]:
# conv_out_unflattened_trn = conv_out_unflattened.predict_generator(train_gen, train_gen.nb_sample)

In [38]:
# conv_out_unflattened_trn.shape

In [39]:
# bcolz.carray(conv_out_unflattened_trn, rootdir='data/train_unflattened')

In [40]:
# conv_out_unflattened_val = conv_out_unflattened.predict_generator(val_gen, val_gen.nb_sample)

In [41]:
# conv_out_unflattened_val.shape

In [42]:
# bcolz.carray(conv_out_unflattened_val, rootdir='data/val_unflattened')

In [43]:
! ls data

sample	train  train_unflattened  val  val_unflattened


# Modeling

## Single dense layer, just to get things up and running

In [61]:
y = train_gen.classes

In [62]:
y

array([0, 0, 0, ..., 7, 7, 7], dtype=int32)

In [44]:
trn = bcolz.open('data/train')

In [106]:
trn.shape

(2997, 25088)

In [107]:
inp = Input(shape=(25088,))

In [116]:
d1 = Dense(512, activation='relu')(inp)
d2 = Dense(512, activation='relu')(d1)
d3 = Dense(512, activation='relu')(d2)
d4 = Dense(8, activation='softmax')(d3)

In [117]:
appended_mdl = Model(input=inp, output=d2)

In [118]:
appended_mdl.compile(optimizer=Adam(lr=0.1), loss='sparse_categorical_crossentropy')

In [119]:
appended_mdl.fit(x=trn, y=y, nb_epoch=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6710c98450>

gradient explosionnnnnnnn

In [126]:
# TODO: start with the weights from the vgg dense layers and fine tune those