In [1]:
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
import sys
import os
sys.path.append("../lib")
from load import mnist
from load import faces
from load import getValData
import pickle
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d
from six.moves import cPickle


Using gpu device 0: GeForce GTX 980


In [2]:
# All output locations are anchored at the kernel's working directory.
currentDir = os.getcwd()
# Append-only file that receives one pickled log line per epoch.
logPickle = "".join([currentDir, "/ErrosCNN8Layers.pickle"])
# Directory prefix (with trailing slash) for parameter snapshots.
modelDir = "".join([currentDir, "/Model8Layer/"])

In [3]:
# Face data with one-hot labels. (The earlier MNIST variant used
# mnist(onehot=True) and reshaped to (-1, 1, 28, 28) instead.)
trX, teX, trY, teY = faces(onehot=True)

# Validation split; note that ValX is NOT reshaped below and stays in
# whatever layout getValData() returns.
ValX, ValY = getValData()

# Reshape the train/test inputs to 4-D (N, 1, 48, 48) for the conv layers.
trX, teX = (split.reshape(-1, 1, 48, 48) for split in (trX, teX))


In [4]:
# Shared random stream; dropout() draws its binomial keep-masks from it.
srng = RandomStreams()

def floatX(X):
    """Return `X` as a NumPy array whose dtype is Theano's configured floatX."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared variable of the given shape.

    Values are drawn from a standard normal via np.random.randn and scaled
    by 0.01, then cast with floatX().
    """
    initial_values = np.random.randn(*shape) * 0.01
    return theano.shared(floatX(initial_values))

def init_bias(shape):
    """Create a zero-initialised 1-D shared bias vector.

    Only the first entry of `shape` is used: the result has length
    ``shape[0]``; any further entries are ignored.
    """
    n_units = shape[0]
    zeros = np.zeros((n_units,), dtype=theano.config.floatX)
    return theano.shared(value=zeros, borrow=True)

def rectify(X):
    """Rectified linear unit: element-wise maximum of `X` and 0."""
    clipped = T.maximum(X, 0.)
    return clipped

def softmax(X):
    """Softmax along axis 1 of `X`.

    The per-row maximum is subtracted before exponentiating so the largest
    exponent is exp(0); the result is unchanged mathematically.
    """
    row_max = X.max(axis=1).dimshuffle(0, 'x')
    exps = T.exp(X - row_max)
    row_total = exps.sum(axis=1).dimshuffle(0, 'x')
    return exps / row_total

def dropout(X, p=0.):
    """Apply dropout with drop probability `p` to `X`.

    When `p` is not greater than 0 the input is returned untouched.
    Otherwise each element is kept with probability ``1 - p`` (mask sampled
    from the shared `srng` stream) and the result is divided by ``1 - p``.
    """
    if not (p > 0):
        return X
    keep_prob = 1 - p
    keep_mask = srng.binomial(X.shape, p=keep_prob, dtype=theano.config.floatX)
    return (X * keep_mask) / keep_prob

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    """Build the RMSprop update list for `params` with respect to `cost`.

    For every parameter a shared accumulator (initialised to zeros of the
    parameter's shape) tracks a running average of the squared gradient with
    decay `rho`. The gradient is divided by ``sqrt(accumulator + epsilon)``
    before the step of size `lr` is taken.

    Returns a list of ``(shared_variable, new_expression)`` pairs: for each
    parameter, first the accumulator update, then the parameter update.
    """
    updates = []
    gradients = T.grad(cost=cost, wrt=params)
    for param, grad in zip(params, gradients):
        running_sq = theano.shared(param.get_value() * 0.)
        running_sq_next = rho * running_sq + (1 - rho) * grad ** 2
        scaled_grad = grad / T.sqrt(running_sq_next + epsilon)
        updates.extend([
            (running_sq, running_sq_next),
            (param, param - lr * scaled_grad),
        ])
    return updates

def negative_log_likelihood(p_y_given_x, y):
    r"""Return the mean negative log-likelihood of the targets under the
    predicted class probabilities.

    .. math::

        \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
        \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
            \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
        \ell (\theta=\{W,b\}, \mathcal{D})

    :type p_y_given_x: theano.tensor.TensorType
    :param p_y_given_x: matrix with one row per example and one column per
                        class. T.log is applied to it here, so it must hold
                        probabilities, not log-probabilities (model() in
                        this notebook returns log-probabilities).

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label. It is used as an index into the columns, so
              it must hold class indices rather than one-hot rows.

    Note: we use the mean instead of the sum so that
          the learning rate is less dependent on the batch size
    """
    # The docstring is a raw string: without the r-prefix "\f" in "\frac"
    # is read as a form-feed character and corrupts the formula.
    #
    # y.shape[0] is (symbolically) the number of rows in y, i.e. the number
    # of examples n in the minibatch, so T.arange(y.shape[0]) is the
    # symbolic vector [0, 1, ..., n-1].
    example_idx = T.arange(y.shape[0])
    # LP is the matrix of log-probabilities, one row per example and one
    # column per class.
    log_probs = T.log(p_y_given_x)
    # LP[arange(n), y] picks [LP[0, y[0]], LP[1, y[1]], ..., LP[n-1, y[n-1]]];
    # the negated mean of that vector is the minibatch cost.
    return -T.mean(log_probs[example_idx, y])
    
def saveModel(i):
    """Pickle the module-level `params` list to a numbered snapshot file.

    The file is written to ``modelDir + "ModelSnapshot<i>.pkl"`` using the
    highest pickle protocol; an existing snapshot with the same index is
    overwritten.

    :param i: value appended to the snapshot file name (the training loop
              passes the epoch index).
    """
    # open() does not create intermediate directories, so make sure the
    # snapshot directory exists before the first save.
    if not os.path.isdir(modelDir):
        os.makedirs(modelDir)
    snapshot = modelDir + "ModelSnapshot" + str(i) + ".pkl"
    # The context manager closes the file even if pickling raises.
    with open(snapshot, 'wb') as mfile:
        cPickle.dump(params, mfile, protocol=cPickle.HIGHEST_PROTOCOL)


In [5]:
def model(X, w, b, w2, b2, w3, b3, w4, b4, w5, b5, w6, b6, w7, b7, w_o, b_o, p_drop_conv, p_drop_hidden):
    """Build the symbolic forward pass: four conv layers, three dense layers
    and a log-softmax output.

    Layers 1 and 3 are followed by 2x2 max-pooling; layer 2 is not pooled
    and layer 4 is flattened to 2-D. Dropout with `p_drop_conv` follows
    layers 1-4 (including the flattened layer 4) and dropout with
    `p_drop_hidden` follows the dense layers 5-7.

    Returns ``(l1, l2, l3, l4, l5, l6, l7, pyx)`` where `pyx` holds
    LOG-probabilities (the logits minus their row-wise log-sum-exp), not
    probabilities.
    """
    def conv_relu(inputs, filters, bias):
        # Convolution + per-output-channel bias + ReLU.
        return rectify(conv2d(inputs, filters) + bias.dimshuffle('x', 0, 'x', 'x'))

    def dense_relu(inputs, weights, bias):
        # Fully connected layer + bias + ReLU.
        return rectify(T.dot(inputs, weights) + bias.dimshuffle('x', 0))

    # Convolutional stack.
    l1 = dropout(max_pool_2d(conv_relu(X, w, b), (2, 2)), p_drop_conv)
    l2 = dropout(conv_relu(l1, w2, b2), p_drop_conv)  # no pooling on this layer
    l3 = dropout(max_pool_2d(conv_relu(l2, w3, b3), (2, 2)), p_drop_conv)
    l4 = dropout(T.flatten(conv_relu(l3, w4, b4), outdim=2), p_drop_conv)

    # Fully connected stack.
    l5 = dropout(dense_relu(l4, w5, b5), p_drop_hidden)
    l6 = dropout(dense_relu(l5, w6, b6), p_drop_hidden)
    l7 = dropout(dense_relu(l6, w7, b7), p_drop_hidden)

    # Log-softmax: shift by the row maximum first so exp() cannot overflow.
    logits = T.dot(l7, w_o) + b_o.dimshuffle('x', 0)
    shifted = logits - logits.max(1, keepdims=True)
    pyx = shifted - T.log(T.sum(T.exp(shifted), axis=1, keepdims=True))
    return l1, l2, l3, l4, l5, l6, l7, pyx


In [6]:

# One (weights, bias) pair per layer. init_bias() only reads the first entry
# of the shape it is given, so every bias is a 1-D vector.

# Convolutional filters: (out_channels, in_channels, height, width).
w, b = init_weights((32, 1, 7, 7)), init_bias((32, 1, 1, 1))
w2, b2 = init_weights((64, 32, 5, 5)), init_bias((64, 1, 1, 1))
w3, b3 = init_weights((128, 64, 3, 3)), init_bias((128, 1, 1, 1))
w4, b4 = init_weights((256, 128, 3, 3)), init_bias((256, 1, 1, 1))

# Dense layers: (inputs, outputs). 256 * 6 * 6 must equal the flattened size
# of the fourth conv layer's output (presumably 6x6 maps for 48x48 inputs;
# verify against the pooling border mode in use).
w5, b5 = init_weights((256 * 6 * 6, 10000)), init_bias((10000, 1))
w6, b6 = init_weights((10000, 10000)), init_bias((10000, 1))
w7, b7 = init_weights((10000, 1000)), init_bias((1000, 1))

# Output layer: 7 classes.
w_o, b_o = init_weights((1000, 7)), init_bias((7, 1))



In [7]:
# learning_rate is only referenced by the commented-out plain-SGD
# alternative below; RMSprop is given its own step size explicitly.
learning_rate = 0.005
batch_size = 120

# Symbolic inputs: a float32 4-D image batch and a float32 target matrix.
X = T.ftensor4()
Y = T.fmatrix()

# Every trainable shared variable, in the order model() takes them after X.
params = [w, b, w2, b2, w3, b3, w4, b4, w5, b5, w6, b6, w7, b7, w_o, b_o]

# --- Training graph: dropout 0.2 on the conv stack, 0.5 on the dense stack.
(noise_l1, noise_l2, noise_l3, noise_l4,
 noise_l5, noise_l6, noise_l7, noise_py_x) = model(X, *(params + [0.2, 0.5]))

# model() returns log-probabilities, so the cross-entropy is simply the
# negated sum of Y * log p over classes, averaged over the batch.
cost = -T.sum(Y * noise_py_x, axis=1).mean()

updates = RMSprop(cost, params, lr=0.0009)
# Plain-SGD alternative:
# grads = T.grad(cost, params)
# updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]

train = theano.function(
    inputs=[X, Y],
    outputs=cost,
    updates=updates,
    allow_input_downcast=True,
)

# --- Prediction graph: same parameters, dropout disabled.
l1, l2, l3, l4, l5, l6, l7, py_x = model(X, *(params + [0., 0.]))
y_x = T.argmax(py_x, axis=1)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

In [None]:
# Train for iterEnd - iterStart epochs. After every epoch the test-set error
# is measured, snapshots are written when it improves, and one log line is
# appended to logPickle.
# NOTE(review): checkpoint selection uses the TEST split (teX/teY); the
# validation split loaded earlier is not used here -- confirm this is intended.
oldError = 0.
startError = 0.
iterStart = 0
iterEnd = 1001

# teY is one-hot, so the true class indices can be decoded once up front.
teLabels = np.argmax(teY, axis=1)

for i in range(iterStart, iterEnd):
    # Kept under its own name so the symbolic `cost` expression built for
    # theano.function is not overwritten by a number.
    batchCost = None
    # Step through every minibatch, including the final (possibly shorter)
    # one; zipping two ranges silently dropped the last batch.
    for start in range(0, len(trX), batch_size):
        end = start + batch_size
        batchCost = train(trX[start:end], trY[start:end])

    # Fraction of MISclassified test examples. (Comparing with == yields the
    # accuracy, which contradicts the variable names and the log label.)
    error = np.mean(teLabels != predict(teX))

    if i == iterStart:
        oldError = error
        startError = error

    # Relative changes in percent; a zero reference would divide by zero.
    delta = ((error - oldError) / oldError) * 100 if oldError else 0.
    improvement = ((startError - error) / startError) * 100 if startError else 0.

    periodic = (i % 100 == 0)
    droppedSinceLastEpoch = delta < -0.2   # error fell by >0.2% vs. previous epoch
    beatBaseline = improvement > 1.0       # error fell by >1% vs. current baseline

    # A single save per epoch, even when several conditions hold at once.
    if periodic or droppedSinceLastEpoch or beatBaseline:
        saveModel(i)
    if beatBaseline:
        startError = error

    logline = "Epoch: " + str(i) + "  Error: " + str(error) + " Cost: " + str(batchCost)
    print(logline)
    # Pickle streams are binary data; the context manager closes the file
    # even if dumping fails.
    with open(logPickle, 'ab') as f:
        cPickle.dump(logline, f)
    oldError = error