In [2]:
import tensorflow as tf
import numpy as np
import math
import logging

In [2]:
logging.basicConfig(filename='mlp.log', level=logging.DEBUG, format="%(levelname)s:%(message)s")
logger = logging.getLogger(__name__)

In [3]:
import mnist_loader
reload(mnist_loader)

<module 'mnist_loader' from 'mnist_loader.pyc'>

In [4]:
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [5]:
training_data[1][1]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [6]:
validation_data[1][1]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.])

In [7]:
test_data[1][1]

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [8]:
# 以字典的形式保存data，
# 键包括 X_train, y_train, X_val, y_val
num_train = len(training_data)
num_val = len(validation_data)
num_test = len(test_data)

num_dim = training_data[0][0].shape[0]
num_classes = training_data[0][1].shape[0]

X_train = np.zeros((num_train, num_dim))
y_train = np.zeros((num_train, num_classes))
for i in range(num_train):
    X_train[i, :] = training_data[i][0].ravel()
    y_train[i, :] = training_data[i][1]
    
X_val = np.zeros((num_val, num_dim))
y_val = np.zeros((num_val, num_classes))
for i in range(num_val):
    X_val[i, :] = validation_data[i][0].ravel()
    y_val[i, :] = validation_data[i][1]

X_test = np.zeros((num_test, num_dim))
y_test = np.zeros((num_test, num_classes))
for i in range(num_test):
    X_test[i, :] = test_data[i][0].ravel()
    y_test[i, :] = test_data[i][1]

X_train = X_train.astype(np.float32)
y_train = y_train.astype(np.float32)
X_val = X_val.astype(np.float32)
y_val = y_val.astype(np.float32)
X_test = X_test.astype(np.float32)
y_test = y_test.astype(np.float32)

data = {}
data['X_train'] = X_train
data['y_train'] = y_train
data['X_val'] = X_val
data['y_val'] = y_val
data['X_test'] = X_test
data['y_test'] = y_test

In [9]:
print X_train.shape, y_train.shape, X_val.shape, y_val.shape
print X_train.dtype, y_train.dtype, X_val.dtype, y_val.dtype

(50000, 784) (50000, 10) (10000, 784) (10000, 10)
float32 float32 float32 float32


In [10]:
print (X_train[1].reshape((28,-1))*255).astype(np.uint8)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  50 158 252
  158  49   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  47 237 251 251
  251 236   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  53 226 252 251 238
  232 251  56   5   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   9  59 223 251 252 251 201
   83 251 252 121   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0 162 251 251 251 25

In [10]:
class MLP(object):
    """
    multi layer perceptron
    -------------------------
    input_dim : 
    hidden_layers : 
    num_classes : 
    reg :           regularization penalty
    keep_prob :     if False, do not use dropout, otherwise it means keep probability
    use_batchnorm : if False, do not use bn, otherwise it means decay, usually larger than 0.9
    """
    def __init__(self, input_dim, hidden_layers, num_classes, reg=0.0, keep_prob=False, use_batchnorm=False):    
        self.input_dim = input_dim
        self.hidden_layers = hidden_layers
        self.num_hidden = len(hidden_layers)
        self.num_classes = num_classes
        
        self.X = tf.placeholder(dtype=tf.float32, shape=[None, input_dim], name='input_x')
        self.y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='output_y')
        
        self.reg = reg
        self.l2_penalty = tf.constant(0.0)
        
        self.drop_keep_rate = keep_prob
        self.use_batchnorm = use_batchnorm
        
        self.is_train = tf.placeholder(tf.bool)
        
        self.build("MLP")
        pass
    
    def build(self, prefix):
        if self.num_hidden != 0:
            layer_nodes = [self.input_dim] + self.hidden_layers + [self.num_classes]
        else:
            layer_nodes = [self.input_dim, self.num_classes]
        
        self.W = []
        self.b = []
        self.all_l2_loss = []
        
        # hidden layer
        incoming = self.X
        for i in range(self.num_hidden):
            if self.use_batchnorm:
                incoming, l2_loss = self.affine_bn_relu(incoming, layer_nodes[i], layer_nodes[i+1], \
                                                 layer_name=prefix+"_hid_"+str(i+1), act_fn=tf.nn.relu) 
                self.all_l2_loss.append(l2_loss)
            else :
                incoming, l2_loss = self.affine_relu(incoming, layer_nodes[i], layer_nodes[i+1], \
                                                       layer_name=prefix+"_hid_"+str(i+1), act_fn=tf.nn.relu) 
                self.all_l2_loss.append(l2_loss)
            
            if self.drop_keep_rate:
                incoming = self.drop_layer(incoming, layer_name=prefix+'_drop_'+str(i+1))
        
        # output layer
        self.output, l2_loss = self.affine_relu(incoming, layer_nodes[-2], layer_nodes[-1], \
                                                layer_name=prefix+"_output", act_fn=tf.nn.relu)
        self.all_l2_loss.append(l2_loss)
        
        # loss
        total_entropy_loss = tf.losses.softmax_cross_entropy(logits=self.output, onehot_labels=self.y)
        self.mean_entropy_loss = tf.reduce_mean(total_entropy_loss, name='mean_entropy_loss')
        
        for l2 in self.all_l2_loss:
            self.l2_penalty += l2
        self.loss = self.mean_entropy_loss + self.l2_penalty * self.reg
        
        # accuracy
        self.predict_score = tf.argmax(self.output, 1, name='predict')
        self.ground_truth = tf.argmax(self.y, 1, name='ground_truth')
        corrent_prediction = tf.equal(self.predict_score, self.ground_truth)
        self.accuracy = tf.reduce_mean( tf.cast(corrent_prediction, tf.float32), name='accuracy')
    
    def affine_relu(self, in_tensor, in_dim, out_dim, layer_name, act_fn=tf.nn.relu):
        with tf.name_scope(layer_name):
            init_w = tf.truncated_normal(mean=0, stddev=1./np.sqrt(in_dim), shape=[in_dim, out_dim])
            w = tf.Variable(init_w, name='weights')
            b = tf.Variable(tf.zeros([out_dim]), name='bias')
            
            out_affine = tf.nn.bias_add(tf.matmul(in_tensor, w), b, name="out_affine")
            out_act = act_fn(out_affine, name="out_act")
            
            print w.name, b.name, out_affine.name, out_act.name
            
            l2_loss = tf.nn.l2_loss(w)
            self.W.append(w)
            self.b.append(b)
            
        return out_act, l2_loss
    
    def affine_bn_relu(self, in_tensor, in_dim, out_dim, layer_name, act_fn=tf.nn.relu):
        with tf.name_scope(layer_name):
            init_w = tf.truncated_normal(mean=0, stddev=1./np.sqrt(in_dim), shape=[in_dim, out_dim])
            w = tf.Variable(init_w, name='weights')
            b = tf.Variable(tf.zeros([out_dim]), name='bias')
            
            out_affine = tf.nn.bias_add(tf.matmul(in_tensor, w), b, name='out_affine')
            out_bn = self.batch_norm(out_affine, in_dim, out_dim, layer_name)
            out_act = act_fn(out_bn, name='out_act')
            
            print w.name, b.name, out_affine.name, out_bn.name, out_act.name
            
            l2_loss = tf.nn.l2_loss(w)
            self.W.append(w)
            self.b.append(b)

        return out_act, l2_loss
    
    def batch_norm(self, in_tensor, in_dim, out_dim, layer_name):
        epsilon = 1e-4
        running_mean = tf.Variable(tf.zeros([out_dim]), trainable=False, name='running_mean')
        running_var = tf.Variable(tf.ones([out_dim]), trainable=False, name='running_var')
            
        beta = tf.Variable(tf.zeros([out_dim]), name='beta')
        gamma = tf.Variable(tf.ones([out_dim]), name='gamma')
        
        print running_mean.name, running_var.name, beta.name, gamma.name
        mean, var = tf.nn.moments(in_tensor, [0])
        def train_fn():
            train_mean = tf.assign(running_mean, running_mean * self.use_batchnorm + mean * (1.0 - self.use_batchnorm))
            train_var = tf.assign(running_var, running_var * self.use_batchnorm + var * (1.0 - self.use_batchnorm))
            
            with tf.control_dependencies([train_mean, train_var]):
                return tf.nn.batch_normalization(in_tensor, mean, var, beta, gamma, epsilon)
        
        def test_fn():
            return tf.nn.batch_normalization(in_tensor, running_mean, running_var, beta, gamma, epsilon)
        
        return tf.cond(self.is_train, train_fn, test_fn, name='bn')


    def drop_layer(self, in_tensor, layer_name):
        with tf.name_scope(layer_name):
            out_drop = tf.nn.dropout(in_tensor, self.drop_keep_rate, name='out_drop')
            print out_drop.name
        return out_drop

In [11]:
class Sovler(object):
    def __init__(self, session, model, data, **kwargs):
        self.sess = session
        self.model = model
        
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val = data['X_val']
        self.y_val = data['y_val']
        
        self.num_train = self.X_train.shape[0]
        self.num_val = self.X_val.shape[0]
        
        self.num_epochs = kwargs.pop('num_epochs', 10)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.verbose = kwargs.pop('verbose', True)
        self.print_every = kwargs.pop('print_every', 10)
        
        # train Operation
        # early stopping
        self.best_validation_acc = -np.inf
        self.max_epochs_no_best = 2
        
        # sgd
#         self.optimizer = tf.train.GradientDescentOptimizer(0.5)
        # sgd + momentum
        self.optimizer = tf.train.MomentumOptimizer(0.5, momentum=0.6)
        # rmsprop
#         self.optimizer = tf.train.RMSPropOptimizer(0.5, decay=0.9)
        # adam
#         self.optimizer = tf.train.AdamOptimizer(0.5)
        
        self.train_step = self.optimizer.minimize(self.model.loss)        
        pass
    
    def run(self):
        mode = 'train'
        
        num_iter_per_epoch = math.ceil(self.num_train * 1. / self.batch_size)
        num_iters = self.num_epochs * num_iter_per_epoch
        print ("num_epochs: {0}, num_iter_per_epoch: {1}, num_iters: {2}" \
              .format(self.num_epochs, num_iter_per_epoch, num_iters))        
        iter_cnt = 0
        train_indices = np.arange(self.num_train)
        
        #early stopping
        no_better_validation_step = 0
        losses = []
        for i in range(self.num_epochs):
            np.random.shuffle(train_indices)
            for j in range(int(num_iter_per_epoch)):
                start_idx = j * self.batch_size
                end_idx = (j+1) * self.batch_size
                idx = train_indices[start_idx : end_idx]
                actual_batch_size = idx.shape[0]
                
                xt = self.X_train[idx]
                yt = self.y_train[idx]
                
                feed_dict = {self.model.X : xt, self.model.y : yt, self.model.is_train : mode == 'train'}
                fetchs = [self.model.loss, self.model.accuracy, self.model.predict_score, self.train_step]
                
                loss, acc, _, _ = self.sess.run(fetchs, feed_dict=feed_dict)
                
                losses.append(losses)
                
                if self.verbose and iter_cnt % self.print_every == 0:
                    print("Iteration({0} / {1}), actual_batch_size {2}, batch_loss {3}, acc {4}" \
                           .format(iter_cnt, num_iters, actual_batch_size, loss, acc))
                iter_cnt += 1
        
            # test阶段是不做validation的
            # 在一轮 epoch 完成以后验证, 并判断early stopping
            # 注意验证时，应该使得 is_train 为 False
            if mode == 'train':
                no_better_validation_step = self.validation(no_better_validation_step, i+1, mode)
                if no_better_validation_step > self.max_epochs_no_best:
                    break

    def validation(self, no_better_validation_step, th_epoch, mode):
        feed_dict = {self.model.X : self.X_val, self.model.y : self.y_val, self.model.is_train : mode == 'train'}
        fetchs = [self.model.loss, self.model.accuracy]
        val_loss, val_acc = self.sess.run(fetchs, feed_dict=feed_dict)
        print("validation, Epochs({0} / {1}), val_loss {2}, val_acc {3}"\
                  .format(th_epoch, self.num_epochs, val_loss, val_acc))
        if val_acc > self.best_validation_acc:
            self.best_validation_acc = val_acc
            no_better_validation_step = 0
        else:
            no_better_validation_step+=1
        return no_better_validation_step

    def test(self, X, y):
        mode = 'test'
        
        feed_dict = {self.model.X : X, self.model.y : y, self.model.is_train : False}
        fetchs = [self.model.predict_score, self.model.accuracy]
        predict_score, acc = self.sess.run(fetchs, feed_dict=feed_dict)
        
        print "test acc {0}".format(acc)

In [12]:
tf.reset_default_graph()
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.4

num_dim = 784
hidden_layer = [30]
num_classes = 10
model = MLP(784, [30, 30], 10, reg=1e-2, keep_prob=False, use_batchnorm=0.9)
# logger.info("Data Dim:" + str([num_dim] + hidden_layer + [num_classes]))

with tf.Session() as sess:
    solver = Sovler(sess, model, data, num_epochs=2, batch_size=100, verbose=True, print_every=100)
    
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    solver.run()
    
    solver.test(data['X_test'], data['y_test'])

MLP_hid_1/running_mean:0 MLP_hid_1/running_var:0 MLP_hid_1/beta:0 MLP_hid_1/gamma:0
MLP_hid_1/weights:0 MLP_hid_1/bias:0 MLP_hid_1/out_affine:0 MLP_hid_1/bn/Merge:0 MLP_hid_1/out_act:0
MLP_hid_2/running_mean:0 MLP_hid_2/running_var:0 MLP_hid_2/beta:0 MLP_hid_2/gamma:0
MLP_hid_2/weights:0 MLP_hid_2/bias:0 MLP_hid_2/out_affine:0 MLP_hid_2/bn/Merge:0 MLP_hid_2/out_act:0
MLP_output/weights:0 MLP_output/bias:0 MLP_output/out_affine:0 MLP_output/out_act:0
num_epochs: 2, num_iter_per_epoch: 500.0, num_iters: 1000.0
Iteration(0 / 1000.0), actual_batch_size 100, batch_loss 2.54713273048, acc 0.159999996424
Iteration(100 / 1000.0), actual_batch_size 100, batch_loss 0.669258236885, acc 0.870000004768
Iteration(200 / 1000.0), actual_batch_size 100, batch_loss 0.537691473961, acc 0.930000007153
Iteration(300 / 1000.0), actual_batch_size 100, batch_loss 0.731281638145, acc 0.850000023842
Iteration(400 / 1000.0), actual_batch_size 100, batch_loss 0.657475948334, acc 0.860000014305
validation, Epochs(