In [None]:
import numpy as np
np.random.seed(2017)

%matplotlib inline
import matplotlib.pyplot as plt
import os 
import glob
import datetime
import time
import pandas as pd
import cv2
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from skimage import transform, util
from skimage import filters, color

from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras import __version__ as keras_version
keras_version

In [None]:
# Side length, in pixels, that every image is resized to (images become square)
img_size = 128
# Number of color channels per image
num_channels = 3
# Height and width of each convolution filter
f_row = f_col = 3
# Number of filters in the first pair of convolution layers
num_filter1 = 16
# Number of filters in the second pair of convolution layers
num_filter2 = 36

#folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

## Reading and resizing images using OpenCV (cv2)

In [None]:
def get_img_cv2(path):
    """Read an image from disk and return it as a square RGB array.

    The image is decoded as 3-channel color, converted from OpenCV's BGR
    channel order to RGB, and resized to ``img_size`` x ``img_size`` pixels
    with bilinear interpolation.

    Args:
        path: Path of the image file to read.

    Returns:
        The resized RGB image as returned by ``cv2.resize``.

    Raises:
        IOError: If OpenCV could not read or decode the file.
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        raise IOError("Could not read image: {}".format(path))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # The third positional parameter of cv2.resize is `dst`, not the
    # interpolation flag, so the flag has to be passed by keyword.
    resized = cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_LINEAR)
    return resized

# Glob pattern matching the PNG files in the training image folder
path = os.path.join('input','train_img', '*.png')
print(path)
files = glob.glob(path)
#print(files)
# NOTE(review): index 1 needs at least two matching files, and glob does not
# guarantee any particular ordering — confirm this is the intended sample.
fl = files[1]
img = get_img_cv2(fl)
print(img.shape)
# Show the resized sample image as a visual sanity check
plt.imshow(img)
plt.show()

## Loading training data 


In [None]:
def load_train_data():
    """Load every training image listed in ``input/train.csv``.

    Returns:
        A tuple ``(train_images, train_labels, train_ids)``: the list of
        images produced by ``get_img_cv2`` (in CSV row order), the values of
        the ``label`` column, and the values of the ``image_id`` column.
    """
    table = pd.read_csv('input/train.csv')
    train_ids = table.image_id.values
    train_labels = table.label.values

    # One image per CSV row; tqdm only adds a progress bar around the ids.
    train_images = [
        get_img_cv2('input/train_img/{}.png'.format(image_id))
        for image_id in tqdm(train_ids)
    ]

    return train_images, train_labels, train_ids

In [None]:
#train_images, train_labels, train_ids = load_train_data()    

In [None]:
#Y_train = {k:v for v,k in enumerate(set(train_labels))}
#y_train = [Y_train[k] for k in train_labels]  
#len(y_train)

## Load test data

In [None]:
#plt.imshow(train_images[0])
#plt.show()

In [None]:
def load_test_data():
    """Load every test image listed in ``input/test.csv``.

    Returns:
        A tuple ``(test_images, test_ids)``: the list of images produced by
        ``get_img_cv2`` (in CSV row order) and the values of the ``image_id``
        column.
    """
    test_ids = pd.read_csv('input/test.csv').image_id.values

    # One image per CSV row; tqdm only adds a progress bar around the ids.
    test_images = [
        get_img_cv2('input/test_img/{}.png'.format(image_id))
        for image_id in tqdm(test_ids)
    ]

    return test_images, test_ids

In [None]:
#test_images, test_id = load_test_data()

In [None]:
#plt.imshow(test_images[0])
#plt.show()

## read and normalize train data set

In [None]:
def read_and_normalize_train_data():
    """Load the training set and convert it to model-ready arrays.

    Images are stacked into one uint8 array, moved to channels-first layout,
    cast to float32 and scaled by 1/255. Labels are mapped to integer class
    indices and one-hot encoded.

    Returns:
        A tuple ``(train_images, train_labels, Y_train, train_ids)``:
        the float32 image array, the uint8 one-hot label array, the
        ``{label: class_index}`` dictionary used for the encoding, and the
        image ids returned by ``load_train_data``.
    """
    train_images, train_labels, train_ids = load_train_data()

    print("Conver to numpy array")
    train_images = np.array(train_images, dtype = np.uint8)

    print("Reshape train_images")
    # Move the channel axis from last to second position (channels-first),
    # matching the dim_ordering='th' layers used by the model.
    train_images = train_images.transpose((0, 3, 1, 2))

    print("Normalize and Convert to float")
    train_images = train_images.astype('float32')
    train_images = train_images / 255

    # Sort the distinct labels before numbering them: iterating a bare set of
    # strings yields a different order from one interpreter run to the next,
    # which would make the class indices (and any saved predictions)
    # irreproducible.
    Y_train = {label: index for index, label in enumerate(sorted(set(train_labels)))}
    y_train = [Y_train[label] for label in train_labels]
    train_labels = np_utils.to_categorical(y_train, len(Y_train))
    train_labels = np.array(train_labels, dtype = np.uint8)

    print("Train shape :", train_images.shape)
    print(train_images.shape[0], "train samples")
    return train_images, train_labels, Y_train, train_ids


## read and normalize test data

In [None]:
def read_and_normalize_test_data():
    """Load the test set and convert it to a model-ready array.

    Images are stacked into one uint8 array, moved to channels-first layout,
    cast to float32 and scaled by 1/255.

    Returns:
        A tuple ``(test_images, test_id)``: the float32 image array and the
        image ids returned by ``load_test_data``.
    """
    raw_images, test_id = load_test_data()

    print("Convert to numpy array")
    batch = np.array(raw_images, dtype = np.uint8)

    print("Reshape test images")
    # Channel axis goes from last to second position (channels-first).
    batch = batch.transpose((0,3,1,2))

    print("Convert to float and normalize")
    batch = batch.astype('float32') / 255

    sample_count = batch.shape[0]
    print("Test Shape: ", batch.shape)
    print(sample_count, "test samples")
    return batch, test_id

## Keras Model creation

In [None]:
def create_model(num_classes=25):
    """Build and compile the convolutional classifier.

    Architecture (all image layers use channels-first ``dim_ordering='th'``):
    two zero-padded 3x3 convolutions with ``num_filter1`` filters, 2x2 max
    pooling and dropout; the same block again with ``num_filter2`` filters;
    then two 128-unit dense layers with dropout and a softmax output layer.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults to
            25, the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, SGD with Nesterov momentum, accuracy metric).
    """
    model = Sequential()

    # Convolution block 1
    model.add(ZeroPadding2D(padding=(1,1), input_shape = (num_channels, img_size, img_size), dim_ordering='th'))
    model.add(Convolution2D(nb_filter=num_filter1, nb_row=f_row, nb_col=f_col, activation='relu', dim_ordering='th'))
    model.add(ZeroPadding2D(padding=(1,1), dim_ordering='th'))
    model.add(Convolution2D(nb_filter=num_filter1, nb_row=f_row, nb_col=f_col, activation='relu', dim_ordering='th'))
    model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), dim_ordering='th'))
    model.add(Dropout(0.25))

    # Convolution block 2
    model.add(ZeroPadding2D(padding=(1,1), dim_ordering='th'))
    model.add(Convolution2D(nb_filter=num_filter2, nb_row=f_row, nb_col=f_col, activation='relu', dim_ordering='th'))
    model.add(ZeroPadding2D(padding=(1,1), dim_ordering='th'))
    model.add(Convolution2D(nb_filter=num_filter2, nb_row=f_row, nb_col=f_col, activation='relu', dim_ordering='th'))
    model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), dim_ordering='th'))
    model.add(Dropout(0.25))

    # Classifier head
    model.add(Flatten())
    model.add(Dense(output_dim=128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_dim=128, activation='relu'))
    model.add(Dropout(0.5))
    # The output width must equal the number of one-hot label columns.
    model.add(Dense(output_dim=num_classes, activation='softmax'))

    sgd = SGD(lr=0.1, decay= 1e-6, momentum=0.9, nesterov=True)
    model.compile(loss = 'categorical_crossentropy', optimizer = sgd, metrics =['accuracy'])

    return model

## Running Kfold cross validation on created model

In [None]:
def run_kfold_on_created_model(nfolds = 10):
    """Train one freshly created model per cross-validation fold.

    The training data is split with a shuffled ``KFold``; each fold trains a
    new model with early stopping on validation accuracy and is scored with
    ``accuracy_score`` on its held-out part.

    Args:
        nfolds: Number of cross-validation folds.

    Returns:
        A tuple ``(info_string, models, Y_train)``: a string encoding the
        size-weighted mean validation accuracy, fold count and epoch count;
        the list of trained models (one per fold); and the label-to-index
        dictionary from ``read_and_normalize_train_data``.
    """
    # Settings passed to model.fit / KFold
    batch_size = 32
    nb_epoch = 10
    random_state = 51

    train_images, train_labels, Y_train, train_ids = read_and_normalize_train_data()
    n_samples = len(train_images)
    splitter = KFold(n_samples, n_folds=nfolds, shuffle=True, random_state=random_state)

    yfull_train = {}
    models = []
    weighted_score_total = 0
    for fold_number, (train_idx, valid_idx) in enumerate(splitter, 1):
        model = create_model()
        x_train, y_train = train_images[train_idx], train_labels[train_idx]
        x_valid, y_valid = train_images[valid_idx], train_labels[valid_idx]

        print("start Kfold number {0} from {1}".format(fold_number, nfolds))
        print("slpit train: ", len(x_train), len(y_train))
        print("split valid: ", len(x_valid), len(y_valid))

        stop_early = EarlyStopping(monitor='val_acc', patience=3, verbose=0)
        model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, shuffle=True,
                  verbose=2, validation_data=(x_valid, y_valid), callbacks=[stop_early])

        prediction_valid = model.predict(x_valid.astype('float32'), batch_size= batch_size, verbose=2)

        # Compare class indices: argmax of the one-hot truth vs. of the scores.
        true_classes = np.argmax(y_valid, axis =1)
        predicted_classes = np.argmax(prediction_valid, axis = 1)
        score = accuracy_score(true_classes, predicted_classes)
        print("accuracy_score = ", score)
        # Weight each fold's accuracy by its validation size.
        weighted_score_total += score*len(valid_idx)

        # Keep the out-of-fold prediction for every validation sample.
        yfull_train.update(zip(valid_idx, prediction_valid))

        models.append(model)

    score = weighted_score_total / n_samples
    print("accuracy_score_train indepent avg", score)

    info_string = 'acc' +str(score) + '_folds_' + str(nfolds) + '_ep_' + str(nb_epoch)
    return info_string, models, Y_train
        

In [None]:
def merge_several_folds_mean(data, nfolds):
    """Average the first ``nfolds`` prediction arrays element-wise.

    Args:
        data: Sequence of equally shaped arrays (or nested lists), one per
            fold.
        nfolds: Number of leading entries of ``data`` to average.

    Returns:
        The element-wise mean as a (nested) Python list.

    Raises:
        ValueError: If ``nfolds`` is smaller than 1.
        IndexError: If ``nfolds`` exceeds the number of entries in ``data``.
    """
    if nfolds < 1:
        raise ValueError("nfolds must be at least 1, got {}".format(nfolds))
    if nfolds > len(data):
        raise IndexError(
            "nfolds={} but only {} fold predictions were given".format(nfolds, len(data)))

    stacked = np.asarray(data[:nfolds])
    # Integer input cannot hold a fractional mean (the former in-place
    # division raised for it), so promote anything non-float to float64.
    if not np.issubdtype(stacked.dtype, np.inexact):
        stacked = stacked.astype(np.float64)
    return stacked.mean(axis=0).tolist()

## Create submission 

In [None]:
def create_submission(predictions, test_id, info, Y_train):
    """Write the predicted labels to ``submission_<info>.csv``.

    Args:
        predictions: Per-sample class scores; the highest-scoring column of
            each row is taken as the predicted class index.
        test_id: Identifiers written to the ``test_id`` column.
        info: Text inserted into the output file name.
        Y_train: ``{label: class_index}`` dictionary; it is inverted to turn
            predicted indices back into labels.
    """
    index_to_label = {index: label for label, index in Y_train.items()}
    winners = np.argmax(predictions, axis= 1)
    pred_labels = [index_to_label[winner] for winner in winners]

    submission = pd.DataFrame({'test_id':test_id, 'label':pred_labels})
    submission.to_csv('submission_' + info + '.csv', index = False)

## Run the k-fold models on the test data
Take the mean of the per-fold predictions.

In [None]:
def run_kfold_validatin_on_test(info_string, models, Y_train):
    """Predict the test set with every fold model and write a submission.

    Each model predicts the full test set; the per-model predictions are
    averaged with ``merge_several_folds_mean`` and handed to
    ``create_submission``.

    Args:
        info_string: Text passed through to the submission file name.
        models: Trained models, one per fold.
        Y_train: ``{label: class_index}`` dictionary used to decode the
            predictions.
    """
    batch_size = 16
    nfolds = len(models)

    test_data, test_id = read_and_normalize_test_data()

    yfull_test = []
    for fold_number, fold_model in enumerate(models, 1):
        print("starts Kfold number {0} from {1}".format(fold_number, nfolds))
        fold_prediction = fold_model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(fold_prediction)

    test_res = merge_several_folds_mean(data=yfull_test, nfolds=nfolds)

    create_submission(predictions=test_res, test_id=test_id, info=info_string, Y_train =Y_train)

## run the model

In [None]:
# Entry point: train one model per fold, predict the test set with all of
# them, write the submission file and report the total wall-clock time.
if  __name__ == '__main__':
    st = time.time()
    print("Keras verstion {}".format(keras_version))
    # Overrides the default of 10 folds in run_kfold_on_created_model
    nfolds = 5
    info_string, models, Y_train = run_kfold_on_created_model(nfolds)
    run_kfold_validatin_on_test(info_string, models, Y_train)
    print("completly run model: {} seconds".format(round(time.time() - st, 2))) 
    

In [None]:
# Load a submission file and preview its first rows.
# NOTE(review): create_submission writes 'submission_' + info + '.csv', which
# will not be named 'submission_1.csv' — confirm this file exists or point
# this at the file that was just written.
sub = pd.read_csv('submission_1.csv')
sub.head(5)

In [None]:
# Count how many rows of the loaded submission carry each label
sub.label.value_counts()