# This is a one layer NN

In [2]:
# do not forget to swapoff -a
import numpy as np       # linear algebra
import pylab as pl       # plots
import tensorflow as tf  # now we are actually using it 

  from ._conv import register_converters as _register_converters


In [3]:
# Pick the data set: the digits variant is active; to use the fashion variant
# instead, swap which of the two load_data lines is commented out.
#(train, label_train), (test, label_test) = tf.keras.datasets.fashion_mnist.load_data()
(train, label_train), (test, label_test) = tf.keras.datasets.mnist.load_data()

# the images are cast to float32 so they match the float32 graph inputs
train, test = train.astype(np.float32), test.astype(np.float32)

# report (images shape, labels shape) for the training and the test split
for images, labels in ((train, label_train), (test, label_test)):
    print(images.shape, labels.shape)

((60000, 28, 28), (60000,))
((10000, 28, 28), (10000,))


In [4]:
# Optionally keep only the leading samples of each split (lower these counts
# when RAM is tight; the values below keep everything).
num_train = 60000
num_test = 10000

train = train[:num_train]
label_train = label_train[:num_train]
test = test[:num_test]
label_test = label_test[:num_test]

# print shapes after subsampling
print(train.shape, label_train.shape)
print(test.shape, label_test.shape)

((60000, 28, 28), (60000,))
((10000, 28, 28), (10000,))


# Reshaping is a cheap operation!

In [5]:
# Drop the 2-D image layout: every sample becomes one row of 784 values.
flat_shape = (-1, 784)
train = np.reshape(train, flat_shape)
test = np.reshape(test, flat_shape)

# A placeholder declares the shape and dtype of an input without holding any data; the values are supplied at run time

In [6]:
# Graph inputs; their values are supplied through feed_dict when the session runs.
X_tf = tf.placeholder(tf.float32, shape=[None, 784])  # one row of 784 pixels per image
Y_tf = tf.placeholder(tf.int64, shape=[None])         # one scalar class label per image

# Trainable parameters, both initialised to zero.
A_tf = tf.Variable(np.zeros((784, 10)), dtype=tf.float32)  # weight matrix (the scaling)
a_tf = tf.Variable(np.zeros(10), dtype=tf.float32)         # bias vector (the translation)

# Affine map F = X*A + a: ten scores (logits) per image.
XA_tf = tf.matmul(X_tf, A_tf)
F_tf = tf.add(XA_tf, a_tf)



For each image $x$, the score vector $F = xA + a$ belongs to $\mathbb{R}^{10}$.

The softmax maps the scores to probabilities: $\mathrm{softmax}(y)_j = \exp(y_j)/Q$, where $Q = \sum_k \exp(y_k)$ is the partition function.

The distribution is unchanged if we shift every $y_j$ by the same constant $\mu$ (for example the largest $y_j$, or their mean); therefore

$\mathrm{softmax}(y)_j = \exp(y_j-\mu)/Q_\mu$, with $Q_\mu = \sum_k \exp(y_k-\mu)$.


then we take the logarithm


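`log_softmax_F_tf = (F_tf - M_tf) - log_norm_tf`

Here `F_tf - M_tf` is the shifted scores, and `log_norm_tf` is the logarithm of the normalisation (the sum of the exponentials of the shifted scores).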





In [7]:
# map scalar labels onto one-hot encoded vectors (10 = number of class labels expected)
L_tf = tf.one_hot(Y_tf, 10, dtype=tf.float32)

# Softmax cross-entropy computed by hand from the logits F_tf.
# Each row is shifted by ITS OWN maximum (M_tf has shape [batch, 1]). A single
# batch-wide maximum is mathematically also a valid shift, but a row whose logits
# all lie far below that global maximum would underflow to a zero sum and produce
# log(0) = -inf. With the per-row maximum the largest shifted entry is exactly 0,
# so every row sum is >= 1 and the logarithm is always finite -- no guard needed.
M_tf = tf.expand_dims(tf.reduce_max(F_tf, axis=1), 1)
norm_tf = tf.expand_dims(tf.reduce_sum(tf.exp(F_tf - M_tf), axis=1), 1)  # row-wise sum, kept as a column
log_norm_tf = tf.log(norm_tf)
log_softmax_F_tf = (F_tf - M_tf) - log_norm_tf

# cross-entropy per sample (sum over the classes), averaged over the batch
loss_tf = tf.reduce_mean(tf.reduce_sum(-L_tf * log_softmax_F_tf, axis=1))

# Non-differentiable accuracy used as the metric.
# The softmax is monotonic, so the predicted class is simply the argmax of the
# logits, and the argmax of a one-hot vector is the original label Y_tf.
correctly_predicted_tf = tf.equal(Y_tf, tf.argmax(F_tf, axis=1))
metric_tf = tf.reduce_mean(tf.cast(correctly_predicted_tf, tf.float32))

# define the optimizer: plain gradient descent on the loss
optimizer = tf.train.GradientDescentOptimizer(1E-6)
step_tf = optimizer.minimize(loss_tf)

In [9]:
# Loop settings, tuned to give results in a reasonable time.
num_iterations = 2**14
print_every = 2**10
batch_size = 2**5

with tf.Session() as sess:

    # set the variables to their initial values before the first step
    sess.run(tf.global_variables_initializer())

    for iteration in range(num_iterations):

        # draw a mini-batch of distinct training samples
        indices = np.random.choice(train.shape[0], batch_size, replace=False)
        X = train[indices]
        Y = label_train[indices]

        # one gradient-descent step; feed_dict binds the placeholders to this
        # mini-batch (the point where the data is handed over to TensorFlow)
        sess.run(step_tf, feed_dict={X_tf: X, Y_tf: Y})

        # every print_every-th iteration, evaluate on the whole training set
        if (iteration + 1) % print_every == 0:
            train_feed = {X_tf: train, Y_tf: label_train}
            loss, metric = sess.run([loss_tf, metric_tf], feed_dict=train_feed)
            print("train loss and metric:",loss, metric)

    # final evaluation on the held-out test set; the metric is the accuracy
    test_feed = {X_tf: test, Y_tf: label_test}
    loss, metric = sess.run([loss_tf, metric_tf], feed_dict=test_feed)
    print("test loss and metric (accuracy):", loss, metric)

('train loss and metric:', 0.38121295, 0.89566666)
('train loss and metric:', 0.3419817, 0.90498334)
('train loss and metric:', 0.32299325, 0.90955)
('train loss and metric:', 0.3157634, 0.9107)
('train loss and metric:', 0.30905077, 0.91463333)
('train loss and metric:', 0.30273232, 0.91605)
('train loss and metric:', 0.29724538, 0.91651666)
('train loss and metric:', 0.29339275, 0.9188833)
('train loss and metric:', 0.28978094, 0.92001665)
('train loss and metric:', 0.29017457, 0.9201667)
('train loss and metric:', 0.28775722, 0.91941667)
('train loss and metric:', 0.28470683, 0.92183334)
('train loss and metric:', 0.28373432, 0.9217)
('train loss and metric:', 0.28138384, 0.92245)
('train loss and metric:', 0.27971983, 0.92205)
('train loss and metric:', 0.28191596, 0.9212833)
('test loss and metric (accuracy):', 0.28628686, 0.9195)
