In [None]:
import numpy as np
import pandas as pd
np.random.seed(2017)
import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.constraints import maxnorm
from sklearn.metrics import log_loss
from keras import __version__ as keras_version

from PIL import ImageFilter, ImageStat
from PIL import Image, ImageDraw
from sklearn.preprocessing import MinMaxScaler

# Shared scaler that rescales every image-statistic column to the 0-255 range.
# load_train() fits it (fit_transform) and load_test() only applies it
# (transform), so load_train() has to run before load_test().
min_max_s = MinMaxScaler(feature_range=(0, 255), copy=True)

def im_stats(im_stats_df):
    """Add image-size and per-band pixel statistics columns to a frame of paths.

    Every file listed in the ``path`` column is opened with PIL and measured
    with ``ImageStat.Stat``. The frame is modified in place and also returned.

    Args:
        im_stats_df: pandas DataFrame with a ``path`` column of image file
            paths. Every image must have at least three bands, because bands
            0, 1 and 2 are read from each statistic.

    Returns:
        The same DataFrame with these columns appended, in this order:
        ``size_0``, ``size_1`` and then ``<stat>_0`` .. ``<stat>_2`` for each
        of ``sum``, ``mean``, ``rms``, ``var`` and ``stddev``.
    """
    stat_names = ('sum', 'mean', 'rms', 'var', 'stddev')
    im_stats_d = {}
    # Iterate over the path values themselves instead of ``df['path'][i]``:
    # the latter is a label lookup and fails on any index other than 0..n-1.
    for im_path in im_stats_df['path']:
        if im_path in im_stats_d:
            continue  # duplicate path, already measured
        # The context manager releases the file handle as soon as the
        # statistics have been read out.
        with Image.open(im_path) as im:
            stat = ImageStat.Stat(im)
            row = {'size_0': im.size[0], 'size_1': im.size[1]}
            for name in stat_names:
                per_band = getattr(stat, name)
                for band in range(3):
                    row['{}_{}'.format(name, band)] = per_band[band]
        im_stats_d[im_path] = row

    # Column order matters: callers feed these columns positionally to a scaler.
    columns = ['size_0', 'size_1'] + [
        '{}_{}'.format(name, band) for name in stat_names for band in range(3)
    ]
    for column in columns:
        im_stats_df[column] = im_stats_df['path'].map(
            lambda p, column=column: im_stats_d[p][column])
    return im_stats_df

def get_im_cv2(path):
    """Read the image at ``path`` with OpenCV and resize it to 32x32.

    Args:
        path: file path handed to ``cv2.imread``.

    Returns:
        The resized image array produced by ``cv2.resize``.

    Raises:
        IOError: if OpenCV cannot read the file.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read image: {}'.format(path))
    # The interpolation flag has to be passed by keyword: the third positional
    # parameter of cv2.resize is ``dst``, not ``interpolation``.
    return cv2.resize(img, (32, 32), interpolation=cv2.INTER_LINEAR)

def load_train():
    """Load all training images, label them by folder and embed image statistics.

    Images are read from ``../input/train/<class>/*.jpg`` through
    ``get_im_cv2``. The statistics from ``im_stats`` are scaled to 0-255 with
    the module-level ``min_max_s`` (which is fitted here) and written into the
    first three columns of each image, one image row per statistic: channel 0
    receives the value, channels 1 and 2 are zeroed.

    Returns:
        tuple: ``(X_train, y_train, X_train_id)`` - the list of image arrays,
        the list of class indices (position of the folder in the class list)
        and the list of file base names, all in the same order.

    Raises:
        ValueError: if no training image is found.
    """
    start_time = time.time()
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    X_train = []
    X_train_id = []
    y_train = []
    train_paths = []

    print('Read train images')
    for index, fld in enumerate(folders):
        print('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join('..', 'input', 'train', fld, '*.jpg')
        # glob returns files in arbitrary order; sorting keeps the sample order
        # (and therefore the seeded cross-validation split) reproducible.
        files = sorted(glob.glob(path))
        train_paths.extend(files)
        for fl in files:
            X_train.append(get_im_cv2(fl))
            X_train_id.append(os.path.basename(fl))
            y_train.append(index)

    if not train_paths:
        raise ValueError('No training images found under {}'.format(
            os.path.join('..', 'input', 'train')))

    stats = im_stats(pd.DataFrame(train_paths, columns=['path']))
    scaled = min_max_s.fit_transform(
        stats[[c for c in stats.columns if c != 'path']]).astype(int)

    # Overwrite pixels (row i, columns 0..2) of every image with
    # [statistic i, 0, 0].
    for img, img_stats in zip(X_train, scaled):
        n_stats = len(img_stats)
        img[:n_stats, :3, 0] = img_stats[:, np.newaxis]
        img[:n_stats, :3, 1:] = 0

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id

def load_test(limit=100):
    """Load test images and embed their scaled image statistics.

    Images are read from ``../input/test_stg1/*.jpg`` in sorted order. The
    module-level ``min_max_s`` is only applied here (``transform``), so it
    must already have been fitted, which ``load_train`` does.

    Args:
        limit: upper slice bound applied to the sorted file list. The default
            of 100 keeps the original behaviour; pass ``None`` to load every
            test file.

    Returns:
        tuple: ``(X_test, X_test_id)`` - the list of image arrays and the list
        of file base names, in the same order.

    Raises:
        ValueError: if no test image is found.
    """
    path = os.path.join('..', 'input', 'test_stg1', '*.jpg')
    files = sorted(glob.glob(path))[:limit] #limited
    if not files:
        raise ValueError('No test images found matching {}'.format(path))

    X_test = [get_im_cv2(fl) for fl in files]
    X_test_id = [os.path.basename(fl) for fl in files]

    stats = im_stats(pd.DataFrame(files, columns=['path']))
    scaled = min_max_s.transform(stats[[c for c in stats.columns if c != 'path']])
    # The scaler was fitted on the training set only, so test values can fall
    # outside 0-255; clip them so they do not wrap around when stored in the
    # uint8 image arrays.
    scaled = np.clip(scaled, 0, 255).astype(int)

    # Overwrite pixels (row i, columns 0..2) of every image with
    # [statistic i, 0, 0].
    for img, img_stats in zip(X_test, scaled):
        n_stats = len(img_stats)
        img[:n_stats, :3, 0] = img_stats[:, np.newaxis]
        img[:n_stats, :3, 1:] = 0
    return X_test, X_test_id

def create_submission(predictions, test_id, info):
    """Write per-class predictions to a timestamped submission CSV.

    Args:
        predictions: rows of eight values, one per class, in the column order
            ALB, BET, DOL, LAG, NoF, OTHER, SHARK, YFT.
        test_id: image identifiers, one per prediction row; stored in the
            trailing ``image`` column.
        info: string placed in the file name between ``submission_`` and the
            timestamp.

    The file is written to the current working directory as
    ``submission_<info>_<YYYY-mm-dd-HH-MM>.csv`` without the index column.
    """
    class_names = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    submission = pd.DataFrame(predictions, columns=class_names)
    submission['image'] = pd.Series(test_id, index=submission.index)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    sub_file = '_'.join(['submission', info, timestamp]) + '.csv'
    submission.to_csv(sub_file, index=False)

def read_and_normalize_train_data():
    """Load the training set and convert it into model-ready arrays.

    Returns:
        tuple: ``(train_data, train_target, train_id)`` where ``train_data``
        is a float32 array divided by 255 with the last image axis moved to
        position 1, ``train_target`` is the 8-class one-hot encoding of the
        labels and ``train_id`` is passed through from ``load_train``.
    """
    images, labels, train_id = load_train()

    print('Convert to numpy...')
    images = np.array(images, dtype=np.uint8)
    labels = np.array(labels, dtype=np.uint8)

    print('Reshape...')
    # Axis order (sample, 3, 1, 2): the last axis of each image goes first.
    images = np.transpose(images, (0, 3, 1, 2))

    print('Convert to float...')
    images = images.astype('float32') / 255
    labels = np_utils.to_categorical(labels, 8)

    sample_count = images.shape[0]
    print('Train shape:', images.shape)
    print(sample_count, 'train samples')
    return images, labels, train_id

def read_and_normalize_test_data():
    """Load the test set and convert it into a model-ready array.

    Returns:
        tuple: ``(test_data, test_id)`` where ``test_data`` is a float32 array
        divided by 255 with the last image axis moved to position 1, and
        ``test_id`` is passed through from ``load_test``.
    """
    started = time.time()
    images, test_id = load_test()

    # uint8 array, then axis order (sample, 3, 1, 2), then floats divided by 255.
    images = np.transpose(np.array(images, dtype=np.uint8), (0, 3, 1, 2))
    images = images.astype('float32') / 255

    sample_count = images.shape[0]
    print('Test shape:', images.shape)
    print(sample_count, 'test samples')
    elapsed = round(time.time() - started, 2)
    print('Read and process test data time: {} seconds'.format(elapsed))
    return images, test_id

def dict_to_list(d):
    """Return the values of ``d`` as a list, in the order ``d.items()`` yields them."""
    return [pair[1] for pair in d.items()]

def merge_several_folds_mean(data, nfolds):
    """Average the first ``nfolds`` prediction arrays element-wise.

    Args:
        data: sequence of equally shaped array-likes, one entry per fold.
        nfolds: number of leading entries of ``data`` to average.

    Returns:
        list: the element-wise mean as (nested) Python lists.

    Raises:
        ValueError: if ``nfolds`` is smaller than 1.
        IndexError: if ``data`` holds fewer than ``nfolds`` entries.
    """
    if nfolds < 1:
        raise ValueError('nfolds must be at least 1, got {}'.format(nfolds))
    # np.array copies, so the caller's first fold is never modified.
    total = np.array(data[0])
    if not np.issubdtype(total.dtype, np.inexact):
        # Integer/bool accumulators cannot be divided in place; promote them.
        total = total.astype(float)
    for fold in range(1, nfolds):
        total += np.array(data[fold])
    total /= nfolds
    return total.tolist()

def create_model():
    """Build and compile the small convolutional classifier.

    Layer stack, in order: zero padding of 1 on a (3, 32, 32) input, one 3x3
    convolution with 8 filters (ReLU), dropout 0.2, flatten, a 16-unit dense
    layer (ReLU), dropout 0.2 and an 8-unit softmax output.

    Returns:
        The Sequential model compiled with the adadelta optimizer and
        categorical cross-entropy loss.
    """
    layer_stack = [
        ZeroPadding2D((1, 1), input_shape=(3, 32, 32), dim_ordering='th'),
        Convolution2D(8, 3, 3, activation='relu', dim_ordering='th', init='he_uniform'),
        Dropout(0.2),
        Flatten(),
        Dense(16, activation='relu', init='he_uniform'),
        Dropout(0.2),
        Dense(8, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer='adadelta', loss='categorical_crossentropy')
    return model

def get_validation_predictions(train_data, predictions_valid):
    """Collect ``predictions_valid[i]`` for every position ``i`` of ``train_data``.

    ``predictions_valid`` only has to support lookup by the integers
    ``0 .. len(train_data) - 1``.
    """
    return [predictions_valid[pos] for pos in range(len(train_data))]

def run_cross_validation_create_models(nfolds=5):
    """Train one model per cross-validation fold on the training set.

    Args:
        nfolds: number of shuffled KFold splits (fixed random_state of 0).

    Returns:
        tuple: ``(info_string, models)`` where ``info_string`` encodes the
        sample-weighted mean validation log loss, the fold count and the epoch
        count, and ``models`` holds the fitted model of every fold.
    """
    batch_size = 20
    nb_epoch = 4
    random_state = 0

    train_data, train_target, train_id = read_and_normalize_train_data()

    yfull_train = {}
    models = []
    sum_score = 0
    folds = KFold(len(train_id), n_folds=nfolds, shuffle=True, random_state=random_state)
    for num_fold, (train_index, test_index) in enumerate(folds, start=1):
        model = create_model()
        X_train, Y_train = train_data[train_index], train_target[train_index]
        X_valid, Y_valid = train_data[test_index], train_target[test_index]

        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0)
        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
                  shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
                  callbacks=[early_stopping])

        predictions_valid = model.predict(X_valid.astype('float32'), batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        # Weight each fold's loss by its validation-set size.
        sum_score += score * len(test_index)

        # Keep the out-of-fold predictions keyed by sample index.
        yfull_train.update(zip(test_index, predictions_valid))

        models.append(model)

    score = sum_score / len(train_data)
    print("Log_loss train independent avg: ", score)

    info_string = ''.join([
        '_', str(np.round(score, 3)),
        '_flds_', str(nfolds),
        '_eps_', str(nb_epoch),
    ])
    return info_string, models

def run_cross_validation_process_test(info_string, models):
    """Predict the test set with every fold model and write the averaged submission.

    Args:
        info_string: run description used in the submission file name; it is
            wrapped as ``loss_<info_string>_folds_<number of models>``.
        models: fitted models, one per fold. Their predictions are averaged
            with ``merge_several_folds_mean``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    batch_size = 20
    nfolds = len(models)
    if nfolds == 0:
        raise ValueError('models must contain at least one trained model')

    # The test set does not depend on the fold, so read and normalise it once
    # instead of once per model.
    test_data, test_id = read_and_normalize_test_data()

    yfull_test = []
    for num_fold, model in enumerate(models, start=1):
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        test_prediction = model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    info_string = 'loss_' + info_string + '_folds_' + str(nfolds)
    create_submission(test_res, test_id, info_string)

if __name__ == '__main__':
    # Entry point: report the installed Keras version, train one model per
    # cross-validation fold, then average the fold models' test predictions
    # into a submission CSV.
    print('Keras version: {}'.format(keras_version))
    # Number of folds, and therefore of models, to train.
    num_folds = 2
    info_string, models = run_cross_validation_create_models(num_folds)
    run_cross_validation_process_test(info_string, models)

In [None]:
# To blend several submissions, run the following on your own result files.

import numpy as np

# Blend two submission files by averaging their class columns per image.
df1 = pd.read_csv('../input/sample_submission_stg1.csv') #change these
df2 = pd.read_csv('../input/sample_submission_stg1.csv') #change these

# Suffix every class column of the second file with '_' so both sets of
# predictions survive the merge on 'image'.
df2.columns = [name if name == 'image' else name + '_' for name in df2.columns]
df = pd.merge(df1, df2, on='image', how='inner')

class_columns = [name for name in df1.columns if name != 'image']
for name in class_columns:
    df[name] = (df[name] + df[name + '_']) / 2

# Write the columns back in the first file's original order.
df[df1.columns].to_csv('z11_sub_blend01.csv', index=False)