### Creating .csv files

In [1]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics  # load the submodule explicitly so `sk.metrics` always resolves
%matplotlib inline

In [61]:
def img_to_row(img, index):
    """Flatten an image matrix of shape (n, m) into a float vector of shape (n*m, ).

    Parameters
    ----------
    img : array exposing ``flatten`` (e.g. ``numpy.ndarray``)
        The image to convert.
    index : any
        Unused; kept so existing call sites keep working.

    Returns
    -------
    numpy.ndarray
        A flattened copy of ``img`` cast to ``float``.
    """
    flat_pixels = img.flatten()
    return flat_pixels.astype(float)

def img_dir_to_matrix(root_path, extension = '', need_names = False):
    """Load every image under a directory tree into one matrix (row = image, column = pixel).

    Parameters
    ----------
    root_path : str
        Directory that is walked recursively with ``os.walk``.
    extension : str
        Only files whose name ends with this suffix are loaded. The default
        ``''`` matches every file.
    need_names : bool
        When true, the bare file names (without directory) are returned too,
        in the same order as the rows.

    Returns
    -------
    data, or (data, filenames) when ``need_names`` is true.
    ``data`` is a 2-D float array with one row per image, or ``None`` when no
    file matched.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot decode a matching file (it returns ``None``).
    """
    rows = []
    filenames = []
    for (path, dirs, files) in os.walk(root_path):
        for filename in files:
            if not filename.endswith(extension):
                continue
            fullpath = os.path.join(path, filename)
            # Flag 0 asks OpenCV for a single-channel (grayscale) image.
            img = cv2.imread(fullpath, 0)
            if img is None:
                # Fail loudly with the offending path instead of an opaque
                # AttributeError on ``None.flatten()`` further down.
                raise IOError('Could not read image: ' + fullpath)
            rows.append(img_to_row(img, len(rows)))
            if need_names:
                filenames.append(filename)
            if len(rows) % 5000 == 0:
                # Progress marker; the parenthesised form works on Python 2 and 3.
                print(len(rows))
    # Stack once at the end: stacking inside the loop re-copies the whole
    # matrix for every image, which is quadratic in the number of images.
    data = np.vstack(rows) if rows else None
    return (data, filenames) if need_names else data

def export_to_csv(path, array, head = None):
    """Write ``array`` to a comma-separated file without the row index.

    Parameters
    ----------
    path : str
        Destination file.
    array : array-like
        Data accepted by ``pandas.DataFrame``.
    head : sequence of column names, or None
        When given, used as column names and written as the header row.
        When ``None``, no header row is written.
    """
    # Identity test, not ``!=``: comparing a numpy array with None is
    # elementwise and would hand to_csv an array of booleans as the header.
    has_header = head is not None
    frame = pd.DataFrame(array, columns=head)
    frame.to_csv(path, sep=',', header=has_header, index=False)

In [62]:
def generate_train_class(root_path, extension, class_value):
    """Load one class of training images and label every row with ``class_value``.

    Returns a ``(features, labels)`` tuple: ``features`` is whatever
    ``img_dir_to_matrix`` returns for ``root_path``/``extension``, and
    ``labels`` is a 1-D array holding ``class_value`` once per feature row.
    """
    features = img_dir_to_matrix(root_path, extension)
    row_count = features.shape[0]
    labels = np.array([class_value for _ in range(row_count)])
    return (features, labels)
    
def generate_train_sample(root_path, extension, class_true_dir, class_false_dir):
    """Build the labelled training set from two class sub-directories.

    Parameters
    ----------
    root_path : str
        Directory containing both class sub-directories.
    extension : str
        File-name suffix of the images to load.
    class_true_dir : str
        Sub-directory whose images are labelled 1.
    class_false_dir : str
        Sub-directory whose images are labelled 0.

    Returns
    -------
    (trainX, trainY)
        Feature rows of the true class followed by those of the false class,
        and the matching label vector.
    """
    # os.path.join copes with a missing or present trailing separator, and
    # unlike manual '/' concatenation does not turn an empty root into '/<dir>'.
    print('Generating training sample for True class')
    train_true, train_true_y = generate_train_class(
        os.path.join(root_path, class_true_dir), extension, 1)
    print('Generating training sample for False class')
    train_false, train_false_y = generate_train_class(
        os.path.join(root_path, class_false_dir), extension, 0)

    print('Merging samples')
    trainX = np.concatenate((train_true, train_false), 0)
    trainY = np.concatenate((train_true_y, train_false_y), 0)
    return (trainX, trainY)

def generate_test_sample(root_path, extension):
    """Load the test images into a DataFrame with one row per image.

    Parameters
    ----------
    root_path : str
        Directory that is searched recursively for test images.
    extension : str
        File-name suffix of the images to load.

    Returns
    -------
    pandas.DataFrame
        One column per pixel plus an ``'id'`` column holding the integer that
        precedes the first ``'.'`` in each file name.

    Raises
    ------
    ValueError
        If a matching file name does not start with an integer id.
    """
    print('Generating test sample')
    # No trailing-slash normalisation needed: the path is only walked, never
    # concatenated with another component.
    (data, filenames) = img_dir_to_matrix(root_path, extension, need_names=True)
    test_ids = np.array([int(filename.split('.')[0]) for filename in filenames])
    frame = pd.DataFrame(data)
    frame['id'] = test_ids
    return frame

In [64]:
# Build the labelled training set: images under 'Hieroglyph' get label 1,
# images under 'Other' get label 0.
%time trainX, trainY = generate_train_sample('data/train/', '.tif', 'Hieroglyph', 'Other')
# Build the test frame: one row of pixels per image plus its integer 'id'.
%time test_frame = generate_test_sample('data/test/', '.tif')

# Cache everything as CSV. The training files are written without a header
# row; the test file keeps its header so the 'id' column can be found by name.
export_to_csv('data/train/trainX.csv', trainX)
export_to_csv('data/train/trainY.csv', trainY)
test_frame.to_csv('data/test/testX.csv', index=False)

Generating training sample for True class
5000
10000
15000
20000
25000
30000
35000
40000
Generating training sample for False class
5000
10000
15000
20000
25000
30000
35000
40000
Merging samples
CPU times: user 10min 54s, sys: 4min 16s, total: 15min 11s
Wall time: 15min 13s
5000
10000
15000
20000
25000
30000
CPU times: user 3min 54s, sys: 1min 35s, total: 5min 29s
Wall time: 5min 30s


### Reading input data

In [2]:
%time trainX = pd.read_csv('data/train/trainX.csv', header=None, dtype=np.float32).values.reshape((-1, 1, 20, 20))
%time trainY = pd.read_csv('data/train/trainY.csv', header=None, dtype=np.int32).values.reshape((-1))
%time test_info = pd.read_csv('data/test/testX.csv')
testIds = test_info['id'].values.astype(int)
testX = test_info.drop('id', 1).values.astype(np.float32).reshape((-1, 1, 20, 20))

CPU times: user 2.89 s, sys: 99.1 ms, total: 2.99 s
Wall time: 3 s
CPU times: user 6.94 ms, sys: 0 ns, total: 6.94 ms
Wall time: 6.74 ms
CPU times: user 1.21 s, sys: 19.6 ms, total: 1.23 s
Wall time: 1.23 s


### Building convolutional neural net
Architecture based on the `build_cnn` function of the Lasagne MNIST example:  
https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py  
Same setup as #2, but trained for more epochs.

In [3]:
import lasagne
import lasagne.layers as layers
from lasagne.nonlinearities import softmax, rectify
from lasagne.updates import adam
from lasagne.updates import nesterov_momentum

from nolearn.lasagne import NeuralNet, TrainSplit

In [4]:
# Layer stack as (layer class, keyword arguments) pairs, in forward order.
layers0 = [
    # Input: batches of 1x20x20 images (batch size left open).
    (layers.InputLayer, dict(shape=(None, 1, 20, 20))),

    # Two identical conv (32 filters, 5x5, ReLU) + 2x2 max-pool stages.
    (layers.Conv2DLayer, dict(num_filters=32, filter_size=5, nonlinearity=rectify)),
    (layers.MaxPool2DLayer, dict(pool_size=2)),

    (layers.Conv2DLayer, dict(num_filters=32, filter_size=5, nonlinearity=rectify)),
    (layers.MaxPool2DLayer, dict(pool_size=2)),

    # Fully connected head; dropout uses the layer's default settings.
    (layers.DenseLayer, dict(num_units=256, nonlinearity=rectify)),
    (layers.DropoutLayer, dict()),
    (layers.DenseLayer, dict(num_units=10, nonlinearity=rectify)),

    # Two-unit softmax output, one unit per class.
    (layers.DenseLayer, dict(num_units=2, nonlinearity=softmax)),
]

In [5]:
# Wrap the layer stack in a nolearn NeuralNet with its training settings.
net0 = NeuralNet(
    layers=layers0,
    # Optimiser: Adam with a fixed learning rate.
    update=adam,
    update_learning_rate=0.0002,
    # presumably the L2 weight-penalty coefficient; confirm against nolearn
    objective_l2=0.0025,
    # presumably holds out 16% of the fit data for validation; confirm
    train_split=TrainSplit(eval_size=0.16),
    # NOTE(review): the fit cell passes epochs=200; confirm which value wins.
    max_epochs=500,
    verbose=2,
)

In [7]:
# Shuffle the samples before fitting, using one random ordering for both the
# features and the labels so they stay aligned.
random_indices = np.random.permutation(trainY.size)
shuffled_X = trainX[random_indices]
shuffled_Y = trainY[random_indices]
net0.fit(shuffled_X, shuffled_Y, epochs=200)

# Neural Network with 62080 learnable parameters

## Layer information

name        size        total    cap.Y    cap.X    cov.Y    cov.X
----------  --------  -------  -------  -------  -------  -------
input0      1x20x20       400   100.00   100.00   100.00   100.00
conv2d1     32x16x16     8192   100.00   100.00    25.00    25.00
maxpool2d2  32x8x8       2048   100.00   100.00    25.00    25.00
conv2d3     32x4x4        512    76.92    76.92    65.00    65.00
maxpool2d4  32x2x2        128    76.92    76.92    65.00    65.00
dense5      256           256   100.00   100.00   100.00   100.00
dropout6    256           256   100.00   100.00   100.00   100.00
dense7      10             10   100.00   100.00   100.00   100.00
dense8      2               2   100.00   100.00   100.00   100.00

Explanation
    X, Y:    image dimensions
    cap.:    learning capacity
    cov.:    coverage of image
    [35mmagenta[0m: capacity too low (<1/6)
    [36mcyan[0m:    image coverage too high (>100

NeuralNet(X_tensor_type=None,
     batch_iterator_test=<nolearn.lasagne.base.BatchIterator object at 0x7f440780b490>,
     batch_iterator_train=<nolearn.lasagne.base.BatchIterator object at 0x7f440780b390>,
     check_input=True, custom_scores=None,
     layers=[(<class 'lasagne.layers.input.InputLayer'>, {'shape': (None, 1, 20, 20)}), (<class 'lasagne.layers.conv.Conv2DLayer'>, {'filter_size': 5, 'nonlinearity': <function rectify at 0x7f4408585f50>, 'num_filters': 32}), (<class 'lasagne.layers.pool.MaxPool2DLayer'>, {'pool_size': 2}), (<class 'lasa....layers.dense.DenseLayer'>, {'num_units': 2, 'nonlinearity': <function softmax at 0x7f4408585b90>})],
     loss=None, max_epochs=500, more_params={},
     objective=<function objective at 0x7f440780ef50>, objective_l2=0.0025,
     objective_loss_function=<function categorical_crossentropy at 0x7f440840f050>,
     on_batch_finished=[],
     on_epoch_finished=[<nolearn.lasagne.handlers.PrintLog instance at 0x7f44078101b8>],
     on_training

In [8]:
%time trainY_predicted = net0.predict(trainX)
print sk.metrics.accuracy_score(trainY_predicted, trainY)

CPU times: user 1min 55s, sys: 6min 23s, total: 8min 18s
Wall time: 1min 31s
0.997137857768


In [9]:
# Predict a class label for every test image with the trained network (timed).
%time testY = net0.predict(testX)

CPU times: user 49.3 s, sys: 2min 44s, total: 3min 33s
Wall time: 39.3 s


### Output

In [10]:
# Assemble the submission table (one row per test image), order it by id and
# write it out without the row index.
testY_frame = pd.DataFrame(
    {'Id': testIds, 'Prediction': testY},
    columns=['Id', 'Prediction'],
).sort_values(by='Id')
testY_frame.to_csv('results/res4.csv', index=None)