In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix
from scipy import misc
import os
import imgaug.augmenters as iaa
import matplotlib.pyplot as plt
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Convolution2D, MaxPooling2D, Conv2D, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
import keras
import keras.backend as K
from keras import applications
%matplotlib inline

Using TensorFlow backend.


Смотрим на распределение классов.

In [3]:
# Load the training label table and preview its first rows
# (the preview below shows an ID column and a Class column).
labels = pd.read_csv('trainLabels.csv')
labels.head()

Unnamed: 0,ID,Class
0,1,n
1,2,8
2,3,T
3,4,I
4,5,R


In [4]:
# Number of samples per class. Computed once and displayed from the stored
# Series instead of calling value_counts() a second time.
counts = labels.Class.value_counts()
counts

A    459
E    357
R    309
O    291
N    279
I    270
S    270
T    257
e    196
C    170
L    168
D    165
a    156
H    152
P    131
o    129
i    128
M    127
n    125
r    123
G    111
t    107
s     95
B     90
U     83
0     78
l     74
F     69
K     67
1     65
    ... 
W     57
d     54
Y     54
J     52
c     52
h     45
m     44
X     43
6     42
5     41
u     39
y     37
p     37
4     37
3     36
9     36
g     32
Z     31
7     31
f     28
q     28
k     26
b     25
w     24
v     23
x     23
8     21
Q     21
j     20
z     20
Name: Class, Length: 62, dtype: int64

In [5]:
cl_number = np.unique(labels.Class.values).size  # number of distinct classes

Виден сильный дисбаланс классов, поэтому пробуем сгладить его с помощью аугментации.

In [6]:
# Build a (cl_number, 2) object table whose rows are [class label, multiplier].
# multiplier = floor(largest class size / this class size / 4), so rarer
# classes receive a larger value.
# NOTE(review): the divisor 4 is an undocumented tuning constant — confirm its intent.
class_counts = counts.to_dict()  # converted once instead of twice per iteration
largest_count = counts.max()
augm_index = np.empty((cl_number, 2), dtype=object)
for i, (key, class_count) in enumerate(class_counts.items()):
    mult_ind = largest_count / class_count / 4
    augm_index[i][0] = key
    augm_index[i][1] = np.floor(mult_ind)

In [7]:
def data_augmentation(data, labels, augm_values):
    """Extend an image set with randomly augmented copies, label by label.

    For every ``(label, mult_value)`` pair, the images carrying that label are
    passed through a random imgaug pipeline ``int(mult_value) + 1`` times.
    Each pass also re-augments the copies produced by earlier passes for the
    same label, so the number of images of that label doubles on every pass.

    Args:
        data: array of images stacked along the first axis.
        labels: 1-D array holding one label per image in ``data``.
        augm_values: iterable of ``(label, mult_value)`` pairs.

    Returns:
        Tuple ``(data, labels)``: the input images followed by every generated
        image (values rounded with ``np.round``), and the label array extended
        to match. The inputs are returned unchanged when nothing is generated.
    """
    sometimes = lambda aug: iaa.Sometimes(0.3, aug)
    # NOTE(review): flips and rotations of up to +/-90 degrees can turn one
    # character into another (e.g. b/d, p/q, 6/9) — confirm these transforms
    # are wanted when the labels are characters.
    seq = iaa.Sequential([
        iaa.Fliplr(p=0.5),
        sometimes(iaa.Flipud(p=0.5)),
        iaa.Crop(percent=(0, 0.2)),
        iaa.Affine(rotate = (-90,90), 
                   scale = {"x": (0.8, 1.2), "y": (0.8, 1.2)},
                   shear = (-20, 20)),
        iaa.Dropout(p = 0.1)
    ])

    # Generated chunks are collected and stacked once at the end, instead of
    # re-copying the whole, ever-growing dataset (and rescanning its label
    # mask) on every single pass.
    generated = []  # (label, rounded images) in creation order
    for label, mult_value in augm_values:
        pool = data[labels == label]
        # If this label already appeared earlier in augm_values, the copies
        # made back then take part in the augmentation as well.
        earlier = [images for made_for, images in generated if made_for == label]
        if earlier:
            pool = np.vstack([pool] + earlier)
        if pool.shape[0] == 0:
            # No samples for this label: nothing to augment.
            continue
        for _ in range(int(mult_value) + 1):
            new_data = np.round(seq.augment_images(pool))
            generated.append((label, new_data))
            # The next pass augments originals and fresh copies together.
            pool = np.vstack((pool, new_data))

    if not generated:
        return data, labels

    new_labels = [np.full(images.shape[0], label) for label, images in generated]
    data = np.vstack([data] + [images for _, images in generated])
    labels = np.concatenate([labels] + new_labels)
    return data, labels

Загрузка тренировочной выборки.

In [8]:
# Side length, in pixels, of the square images used throughout the notebook:
# loaded images are resized to (im_size, im_size) and the model input has this size.
im_size = 32

In [9]:
# Names of all entries in the 'train' directory; they are sorted numerically
# when the images are loaded.
train_files = os.listdir('train')

In [10]:
# Read every training image as RGB, resize it to im_size x im_size and store it
# in one float array. Files are ordered by the integer before the first dot of
# the file name — presumably so that row i matches row i of trainLabels.csv
# (TODO confirm).
# NOTE(review): scipy.misc.imread / imresize are deprecated since SciPy 1.0 and
# absent from current releases; this cell needs an old SciPy or a port to
# another image reader.
train_images = np.zeros((len(train_files), im_size, im_size, 3))
# The key parameter is called `name` so that it does not shadow the builtin `str`.
sorted_train_files = sorted(train_files, key=lambda name: int(name.split('.')[0]))
for i, file in enumerate(sorted_train_files):
    real_name = os.path.join('train', file)
    image = misc.imread(real_name, mode='RGB')
    image = misc.imresize(image, (im_size, im_size))
    train_images[i] = image

Augmentation

In [11]:
# Extend the training images and their labels with augmented copies, driven by
# the per-class [label, multiplier] rows of augm_index.
aug_data, aug_labels = data_augmentation(train_images, labels.Class.values, augm_index)

Бинаризация меток.

In [12]:
# Fit a label binarizer on the original class labels. It is used later to
# encode the training targets and to map model predictions back to class labels.
binar_machine = LabelBinarizer()
binar_machine.fit(labels.Class.values)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

Обучение моделей

In [13]:
def create_model():
    """Build a fresh, uncompiled convolutional classifier.

    Architecture: three stages of two 3x3, 'same'-padded ReLU convolutions
    (128, 256 and 512 filters), each stage followed by 2x2 max pooling; then a
    flatten, two 4096-unit ReLU dense layers with 0.5 dropout each, and a
    softmax layer with ``cl_number`` outputs. The input shape is
    ``(im_size, im_size, 3)``.

    Returns:
        The assembled ``Sequential`` model (not compiled).
    """
    def conv(filters, **extra):
        # Every convolution shares kernel size, padding and activation.
        return Conv2D(filters, kernel_size=(3, 3), padding='same',
                      activation='relu', **extra)

    stack = [
        conv(128, input_shape=(im_size, im_size, 3)),
        conv(128),
        MaxPooling2D(pool_size=(2, 2)),

        conv(256),
        conv(256),
        MaxPooling2D(pool_size=(2, 2)),

        conv(512),
        conv(512),
        MaxPooling2D(pool_size=(2, 2), data_format='channels_last'),

        Flatten(),
        Dense(4096, activation='relu'),
        Dropout(0.5),
        Dense(4096, activation='relu'),
        Dropout(0.5),

        Dense(cl_number, activation='softmax'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [14]:
# 3-fold stratified cross-validation of the CNN on the augmented data set.
# NOTE(review): augmentation happens before the split, so augmented copies of
# one source image can end up in both the training and the held-out fold; the
# reported score is therefore likely optimistic — consider augmenting inside
# each fold instead.
batch_size = 32
nb_epochs = 50
cv = StratifiedKFold(n_splits=3)
scores = []
for train_indices, test_indices in cv.split(aug_data, aug_labels):
    model = create_model()  # fresh weights for every fold
    model.compile(optimizer = keras.optimizers.Adadelta(),
                  loss = keras.losses.categorical_crossentropy,
                  metrics = ['accuracy'])
    model.fit(aug_data[train_indices], binar_machine.transform(aug_labels[train_indices]),
              batch_size = batch_size,
              epochs=nb_epochs)
    # The fold indices refer to aug_data / aug_labels, so the held-out images and
    # targets are both taken from there, and the targets are encoded with
    # transform() exactly like the training targets.
    fold_metrics = model.evaluate(aug_data[test_indices],
                                  binar_machine.transform(aug_labels[test_indices]))
    # evaluate() returns [loss, accuracy] for this compile() call; keep the accuracy.
    scores.append(fold_metrics[1])
print('CV score: {}'.format(np.mean(scores)))

Epoch 1/50
 2208/21249 [==>...........................] - ETA: 124s - loss: 15.8400 - acc: 0.0140

KeyboardInterrupt: 

После выбора лучшей модели обучаем её на всех данных.

In [15]:
# Settings for the final training run and a fresh, untrained model instance;
# summary() prints the layer-by-layer architecture with parameter counts.
batch_size = 32
nb_epochs = 100
final_model = create_model()
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 32, 32, 128)       3584      
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 32, 32, 128)       147584    
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 16, 16, 128)       0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 16, 16, 256)       295168    
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 16, 16, 256)       590080    
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 8, 8, 256)         0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 8, 8, 512)         1180160   
__________

In [20]:
# Train the final model on the complete augmented set (no hold-out data).
final_targets = binar_machine.transform(aug_labels)
final_model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)
final_model.fit(aug_data, final_targets, epochs=nb_epochs, batch_size=batch_size)

Epoch 1/100
 1152/31900 [>.............................] - ETA: 128s - loss: 5.2386 - acc: 0.0330

KeyboardInterrupt: 

Загрузка тестовой выборки.

In [21]:
# Read every test image as RGB, resize it to im_size x im_size and store it in
# one float array. Files are ordered by the integer before the first dot of the
# file name — presumably matching the row order of sampleSubmission.csv
# (TODO confirm).
# NOTE(review): scipy.misc.imread / imresize are deprecated since SciPy 1.0 and
# absent from current releases; this cell needs an old SciPy or a port to
# another image reader.
f_tests = os.listdir('test')
test_images = np.zeros((len(f_tests), im_size, im_size, 3))
# The key parameter is called `name` so that it does not shadow the builtin `str`.
sorted_test_files = sorted(f_tests, key=lambda name: int(name.split('.')[0]))
for i, file in enumerate(sorted_test_files):
    real_name = os.path.join('test', file)
    image = misc.imread(real_name, mode='RGB')
    image = misc.imresize(image, (im_size, im_size))
    test_images[i] = image

Получение предсказания на тестовой выборке.

In [22]:
# Model outputs for every test image: one row per image, one softmax score per class.
predictions = final_model.predict(test_images)

In [23]:
# Start from the sample submission and discard its placeholder 'Class' column;
# the predicted classes are written into a new 'Class' column later.
test_csv = pd.read_csv('sampleSubmission.csv').drop('Class', axis=1)

In [25]:
# Map the model outputs back to class labels and write the submission file.
predicted_classes = binar_machine.inverse_transform(predictions)
test_csv['Class'] = pd.Series(predicted_classes)
test_csv.to_csv('result.csv', index=False)