In [None]:
!mkdir cifar10
!curl -o cifar-10-python.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
!tar -xvzf cifar-10-python.tar.gz -C cifar10

Для данного задания взял нейронную сеть:
https://github.com/Lasagne/Recipes/blob/master/papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py
и попытался ее обучить

<h1 align="center">First of all -- Checking Questions</h1> 

**Вопрос 1**: Чем отличаются современные сверточные сети от сетей 5 летней давности?

<Ответ>

**Вопрос 2**: Какие неприятности могут возникнуть во время обучения современных нейросетей?

<Ответ>

**Вопрос 3**: У вас есть очень маленький датасет из 100 картинок, но вы очень хотите использовать нейросеть, какие неприятности вас ждут и как их решить?

<Ответ>

**Вопрос 4**: У вас есть очень маленький датасет из 100 картинок, классификация, но вы очень хотите использовать нейросеть, какие неприятности вас ждут и как их решить? что делать если первый вариант  решения не заработает?

<Ответ>

**Вопрос 5**: Как сделать стайл трансфер для музыки? oO

<Ответ>

### Соберите нейронку: 
- Many times x (Conv+Pool)
- Many small convolutions like 3x3
- Batch Norm 
- Residual Connection
- Data Augmentation 
- Learning rate Schedule 
- ...

### Для вдохновения 
- http://torch.ch/blog/2015/07/30/cifar.html
- http://www.robots.ox.ac.uk/~vgg/research/very_deep/
- https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf
- https://github.com/szagoruyko/wide-residual-networks 

### Самое интересное
- Для сдачи задания нужно набрать на точность тесте > **92.5**% (это займет много времени, торопитесь :) )
- Для получения бонусных баллов > **95.0**%
- Будет очень хорошо если вы придумаете свою архитектуру или сможете обучить что-то из вышеперечисленного :)
- А для обучения всего этого добра вам будет куда удобнее использовать GPU на Amazon 
    - Инструкция https://github.com/persiyanov/ml-mipt/tree/master/amazon-howto 
    - Вам помогут tmux, CuDNN, ssh tunnel, nvidia-smi, ... 
    - Wish you get fun :)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pygpu

%matplotlib inline

In [2]:
import sys
import os
import time
import string
import random
import pickle

import numpy as np
import theano
import theano.tensor as T
import lasagne

# for the larger networks (n>=9), we need to adjust pythons recursion limit
sys.setrecursionlimit(10000)

# ##################### Load data from CIFAR-10 dataset #######################
# this code assumes the cifar dataset from 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
# has been extracted in current working directory

def unpickle(file):
    import pickle
    fo = open(file, 'rb')
    dict = pickle.load(fo, encoding='bytes')
    fo.close()
    return dict

def load_data():
    xs = []
    ys = []
    for j in range(5):
      d = unpickle('cifar10/cifar-10-batches-py/data_batch_' + str(j+1))
      x = d[b'data']
      y = d[b'labels']
      xs.append(x)
      ys.append(y)

    d = unpickle('cifar10/cifar-10-batches-py/test_batch')
    xs.append(d[b'data'])
    ys.append(d[b'labels'])

    x = np.concatenate(xs) / np.float32(255)
    y = np.concatenate(ys)
    x = np.dstack((x[:, :1024], x[:, 1024:2048], x[:, 2048:]))
    x = x.reshape((x.shape[0], 32, 32, 3)).transpose(0,3,1,2)

    # subtract per-pixel mean
    pixel_mean = np.mean(x[0:50000],axis=0)
    #pickle.dump(pixel_mean, open("cifar10-pixel_mean.pkl","wb"))
    x -= pixel_mean

    # create mirrored images
    X_train = x[0:50000,:,:,:]
    Y_train = y[0:50000]
    X_train_flip = X_train[:,:,:,::-1]
    Y_train_flip = Y_train
    X_train = np.concatenate((X_train,X_train_flip),axis=0)
    Y_train = np.concatenate((Y_train,Y_train_flip),axis=0)

    X_test = x[50000:,:,:,:]
    Y_test = y[50000:]

    return (
        lasagne.utils.floatX(X_train),
        Y_train.astype('int32'),
        lasagne.utils.floatX(X_test),
        Y_test.astype('int32'),
    )

Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GRID K520 (0000:00:03.0)


In [3]:
X_train, y_train, X_test, y_test = load_data()

In [4]:
# ##################### Build the neural network model #######################

from lasagne.layers import Conv2DLayer as ConvLayer
#from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
from lasagne.layers import ElemwiseSumLayer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import GlobalPoolLayer
from lasagne.layers import PadLayer
from lasagne.layers import ExpressionLayer
from lasagne.layers import NonlinearityLayer
from lasagne.nonlinearities import softmax, rectify
from lasagne.layers import batch_norm

def build_cnn(input_var=None, n=9):
    # create a residual learning building block with two stacked 3x3 convlayers as in paper
    def residual_block(l, increase_dim=False, projection=False):
        input_num_filters = l.output_shape[1]
        if increase_dim:
            first_stride = (2,2)
            out_num_filters = input_num_filters * 2
        else:
            first_stride = (1,1)
            out_num_filters = input_num_filters

        stack_1 = batch_norm(ConvLayer(l, num_filters=out_num_filters, filter_size=(3,3), stride=first_stride, nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
        stack_2 = batch_norm(ConvLayer(stack_1, num_filters=out_num_filters, filter_size=(3,3), stride=(1,1), nonlinearity=None, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
        
        # add shortcut connections
        if increase_dim:
            if projection:
                # projection shortcut, as option B in paper
                projection = batch_norm(ConvLayer(l, num_filters=out_num_filters, filter_size=(1,1), stride=(2,2), nonlinearity=None, pad='same', b=None, flip_filters=False))
                block = NonlinearityLayer(ElemwiseSumLayer([stack_2, projection]),nonlinearity=rectify)
            else:
                # identity shortcut, as option A in paper
                identity = ExpressionLayer(
                    l, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], int(s[2]/2), int(s[3]/2))
                )
                padding = PadLayer(identity, [int(out_num_filters/4),0,0], batch_ndim=1)
                block = NonlinearityLayer(ElemwiseSumLayer([stack_2, padding]),nonlinearity=rectify)
        else:
            block = NonlinearityLayer(ElemwiseSumLayer([stack_2, l]),nonlinearity=rectify)
        
        return block

    # Building the network
    l_in = InputLayer(shape=(None, 3, 32, 32), input_var=input_var)

    # first layer, output is 16 x 32 x 32
    l = batch_norm(ConvLayer(l_in, num_filters=16, filter_size=(3,3), stride=(1,1), nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
    
    # first stack of residual blocks, output is 16 x 32 x 32
    for _ in range(n):
        l = residual_block(l)

    # second stack of residual blocks, output is 32 x 16 x 16
    l = residual_block(l, increase_dim=True)
    for _ in range(1,n):
        l = residual_block(l)

    # third stack of residual blocks, output is 64 x 8 x 8
    l = residual_block(l, increase_dim=True)
    for _ in range(1,n):
        l = residual_block(l)
    
    # average pooling
    l = GlobalPoolLayer(l)

    # fully connected layer
    network = DenseLayer(
            l, num_units=10,
            W=lasagne.init.HeNormal(),
            nonlinearity=softmax)

    return network

### Вот и всё, пошли её учить

In [5]:
# ############################# Batch iterator ###############################

def iterate_minibatches(inputs, targets, batchsize, shuffle=False, augment=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        if augment:
            padded = np.pad(inputs[excerpt],((0,0),(0,0),(4,4),(4,4)),mode='constant')
            random_cropped = np.zeros(inputs[excerpt].shape, dtype=np.float32)
            crops = np.random.random_integers(0,high=8,size=(batchsize,2))
            for r in range(batchsize):
                random_cropped[r,:,:,:] = padded[r,:,crops[r,0]:(crops[r,0]+32),crops[r,1]:(crops[r,1]+32)]
            inp_exc = random_cropped
        else:
            inp_exc = inputs[excerpt]

        yield inp_exc, targets[excerpt]

In [6]:
import lasagne
from theano import tensor as T
from lasagne.nonlinearities import *

input_X = T.tensor4("X")
target_y = T.vector("target Y integer",dtype='int32')

In [7]:
net = build_cnn(input_X)

In [8]:
y_predicted = lasagne.layers.get_output(net)
all_weights = lasagne.layers.get_all_params(net, trainable=True)
print(all_weights)

[W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, b]


In [9]:
from lasagne.layers import get_all_layers
curr_lr = 0.1
loss = lasagne.objectives.categorical_crossentropy(y_predicted, target_y).mean()
loss += lasagne.regularization.regularize_layer_params(get_all_layers(net), lasagne.regularization.l2) * 0.0001
accuracy = lasagne.objectives.categorical_accuracy(y_predicted, target_y).mean()

In [10]:
updates = lasagne.updates.momentum(loss, all_weights, learning_rate=curr_lr, momentum=0.9)

In [11]:
train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates, allow_input_downcast=True)
accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)

# Процесс обучения

In [12]:
import time

num_epochs = 100 #количество проходов по данным

batch_size = 128 #размер мини-батча


for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True, augment=True):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_test, y_test, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(epoch+1, num_epochs+1, time.time() - start_time))
    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))

    if (epoch + 1) == 41 or (epoch + 1) == 61:
        curr_lr = curr_lr * 0.1
        print("New LR: {}".format(curr_lr))
        updates = lasagne.updates.momentum(loss, all_weights, learning_rate=curr_lr, momentum=0.9)
        train_fun = theano.function(
            [input_X, target_y], [loss, accuracy], updates=updates, allow_input_downcast=True)
        accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)



Epoch 1 of 101 took 378.639s
  training loss (in-iteration):		2.983430
  train accuracy:		24.83 %
  validation accuracy:		43.26 %
Epoch 2 of 101 took 378.553s
  training loss (in-iteration):		2.056971
  train accuracy:		50.94 %
  validation accuracy:		60.61 %
Epoch 3 of 101 took 378.584s
  training loss (in-iteration):		1.551922
  train accuracy:		65.46 %
  validation accuracy:		69.97 %
Epoch 4 of 101 took 378.566s
  training loss (in-iteration):		1.252576
  train accuracy:		73.19 %
  validation accuracy:		76.62 %
Epoch 5 of 101 took 378.594s
  training loss (in-iteration):		1.069197
  train accuracy:		77.51 %
  validation accuracy:		78.84 %
Epoch 6 of 101 took 378.600s
  training loss (in-iteration):		0.954111
  train accuracy:		80.00 %
  validation accuracy:		80.04 %
Epoch 7 of 101 took 378.585s
  training loss (in-iteration):		0.874004
  train accuracy:		81.80 %
  validation accuracy:		80.34 %
Epoch 8 of 101 took 378.685s
  training loss (in-iteration):		0.814204
  train accuracy:		

KeyboardInterrupt: 

Так как осталось немного до 92.5 и видно, что accuracy скачет, остановим обучение и уменьшим learning rate раньше запланированного. 

In [14]:
print(curr_lr)

0.010000000000000002


In [15]:
curr_lr = curr_lr * 0.1
print("New LR: {}".format(curr_lr))
updates = lasagne.updates.momentum(loss, all_weights, learning_rate=curr_lr, momentum=0.9)
train_fun = theano.function(
            [input_X, target_y], [loss, accuracy], updates=updates, allow_input_downcast=True)
accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)

New LR: 0.0010000000000000002


In [17]:
import time

num_epochs = 100 #количество проходов по данным

batch_size = 128 #размер мини-батча


for epoch in range(55, num_epochs):
    # In each epoch, we do a full pass over the training data:
    
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True, augment=True):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_test, y_test, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(epoch+1, num_epochs+1, time.time() - start_time))
    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))

    if (epoch + 1) == 41 or (epoch + 1) == 71:
        curr_lr = curr_lr * 0.1
        print("New LR: {}".format(curr_lr))
        updates = lasagne.updates.momentum(loss, all_weights, learning_rate=curr_lr, momentum=0.9)
        train_fun = theano.function(
            [input_X, target_y], [loss, accuracy], updates=updates, allow_input_downcast=True)
        accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)



Epoch 56 of 101 took 379.266s
  training loss (in-iteration):		0.256893
  train accuracy:		99.43 %
  validation accuracy:		91.65 %
Epoch 57 of 101 took 379.259s
  training loss (in-iteration):		0.252459
  train accuracy:		99.57 %
  validation accuracy:		91.63 %
Epoch 58 of 101 took 379.306s
  training loss (in-iteration):		0.251532
  train accuracy:		99.59 %
  validation accuracy:		91.67 %
Epoch 59 of 101 took 379.331s
  training loss (in-iteration):		0.250250
  train accuracy:		99.59 %
  validation accuracy:		91.71 %
Epoch 60 of 101 took 379.263s
  training loss (in-iteration):		0.248426
  train accuracy:		99.65 %
  validation accuracy:		91.66 %
Epoch 61 of 101 took 379.249s
  training loss (in-iteration):		0.247316
  train accuracy:		99.65 %
  validation accuracy:		91.80 %


KeyboardInterrupt: 

Остановил и попробовал стать колдуном 80 уровня, но нехватило совсем немного. Поэтому попробуем дообучить сеть.

In [19]:
import time

num_epochs = 100 #количество проходов по данным

batch_size = 128 #размер мини-батча


for epoch in range(62, num_epochs):
    # In each epoch, we do a full pass over the training data:
    
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True, augment=True):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_test, y_test, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(epoch+1, num_epochs+1, time.time() - start_time))
    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))

    if (epoch + 1) == 41 or (epoch + 1) == 71:
        curr_lr = curr_lr * 0.1
        print("New LR: {}".format(curr_lr))
        updates = lasagne.updates.momentum(loss, all_weights, learning_rate=curr_lr, momentum=0.9)
        train_fun = theano.function(
            [input_X, target_y], [loss, accuracy], updates=updates, allow_input_downcast=True)
        accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)



Epoch 63 of 101 took 379.246s
  training loss (in-iteration):		0.246679
  train accuracy:		99.66 %
  validation accuracy:		91.75 %
Epoch 64 of 101 took 379.250s
  training loss (in-iteration):		0.245784
  train accuracy:		99.66 %
  validation accuracy:		91.83 %


KeyboardInterrupt: 

In [25]:
print(curr_lr)

0.0010000000000000002


In [26]:
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 500):
    inputs, targets = batch
    acc = accuracy_fun(inputs, targets)
    test_acc += acc
    test_batches += 1
print("Final results:")
print("  test accuracy:\t\t{:.2f} %".format(
    test_acc / test_batches * 100))

if test_acc / test_batches * 100 > 92.5:
    print("Achievement unlocked: колдун 80 уровня")
else:
    print("Нужно больше магии!")

Final results:
  test accuracy:		92.53 %
Achievement unlocked: колдун 80 уровня


## Заполните форму

https://goo.gl/forms/EeadABISlVmdJqgr2 