In [1]:
# Import modules
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
import matplotlib.pyplot as plt
from ecbm4040.cifar_utils import load_data

# Plot configurations
% matplotlib inline

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
% load_ext autoreload
% autoreload 2

In [2]:
# Load the train/test features and labels through the course helper
# `load_data` (imported from ecbm4040.cifar_utils in the first cell).
X_train, y_train, X_test, y_test = load_data()

In [3]:
# Peek at the first ten training rows as loaded (before any preprocessing).
X_train[:10]

array([[ 26,  17,  13, ...,  27,  26,  27],
       [ 94, 101,  95, ..., 182, 184, 155],
       [183, 158, 166, ..., 250, 250, 250],
       ..., 
       [225, 214, 190, ..., 144, 167, 171],
       [ 82,  69,  63, ...,  67,  57,  68],
       [198, 173, 144, ...,  40,  31,  26]], dtype=uint8)

In [4]:
# Sizes of the data subsets used in the rest of the notebook.
num_training = 49000
num_validation = 1000
num_test = 1000
num_dev = 100

# Guard against hidden state: the validation rows are taken from the END of
# X_train and the training rows from the START, so the two only stay disjoint
# while X_train still holds at least num_training + num_validation rows.
# Re-running this cell after X_train has been truncated below would otherwise
# silently draw the validation set from the training rows.
assert X_train.shape[0] >= num_training + num_validation, (
    'X_train has {} rows; need at least {} for disjoint train/validation '
    'splits (was this cell already run? reload the data first).'.format(
        X_train.shape[0], num_training + num_validation))

# Validation set: the last `num_validation` rows of the loaded training data.
X_val = X_train[-num_validation:, :]
y_val = y_train[-num_validation:]

# Development set: a small sample, drawn without replacement, from the first
# `num_training` rows. A dedicated seeded generator makes the sample identical
# on every run and leaves NumPy's global random state untouched.
dev_rng = np.random.RandomState(2321)
mask = dev_rng.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

# Training set: the first `num_training` rows.
X_train = X_train[:num_training, :]
y_train = y_train[:num_training]

# Test set: the first `num_test` rows of the loaded test data.
X_test = X_test[:num_test, :]
y_test = y_test[:num_test]

In [5]:
# Per-feature mean of the training split; every split is centered with it.
mean_image = np.mean(X_train, axis=0)

# The training split is centered with a float32 copy of the mean; the other
# splits subtract the mean at its original precision.
X_train = X_train.astype(np.float32) - mean_image.astype(np.float32)
X_val, X_test, X_dev = (
    split.astype(np.float32) - mean_image
    for split in (X_val, X_test, X_dev)
)

# Bias trick: append a constant column of ones to every split so the
# classifiers only need a single weight matrix W (no separate bias vector).
X_train, X_val, X_test, X_dev = (
    np.hstack([split, np.ones((split.shape[0], 1))])
    for split in (X_train, X_val, X_test, X_dev)
)

print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)

(49000, 3073) (1000, 3073) (1000, 3073) (100, 3073)


In [6]:
# Seed the global RNG so the initial weights are reproducible, then draw a
# (3073, 10) matrix of small Gaussian weights.
np.random.seed(2321)
W = 0.0001 * np.random.standard_normal((3073, 10))

In [7]:
def svm_loss_naive(W, X, y, reg):
    """
    Multi-class linear SVM (hinge) loss and gradient, computed with explicit loops.

    The margin delta is fixed at 1. With D input dimensions, C classes and a
    minibatch of N examples:

    Inputs:
    - W: numpy array of shape (D, C) holding the weights.
    - X: numpy array of shape (N, D) holding the N samples.
    - y: numpy array of shape (N,) holding the labels; y[i] = c means that
         X[i] belongs to class c, with 0 <= c < C.
    - reg: (float) L2 regularization strength.

    Returns:
    - loss: scalar, mean hinge loss over the batch plus reg * sum(W * W).
    - gradient: array with the same shape as W, the derivative of the loss
      with respect to W.
    """
    n_samples = X.shape[0]
    n_classes = W.shape[1]
    grad = np.zeros(W.shape).astype('float')
    total = 0.0

    for idx in range(n_samples):
        sample, label = X[idx], y[idx]
        class_scores = sample.dot(W)
        target_score = class_scores[label]
        for cls in range(n_classes):
            if cls != label:
                violation = class_scores[cls] - target_score + 1  # delta = 1
                if violation > 0:
                    # Each violated margin pushes the offending class column
                    # up by the sample and the true class column down by it.
                    total += violation
                    grad[:, cls] += sample
                    grad[:, label] -= sample

    # Turn the batch sums into batch means.
    total /= n_samples
    grad /= n_samples

    # L2 penalty on the weights and its derivative.
    total += reg * np.sum(W * W)
    grad += reg * 2 * W

    return total, grad

In [8]:
# Time the loop-based SVM loss/gradient on the development split.
# The last argument (0.000005) is the L2 regularization strength `reg`.
tic = time.time()
loss_naive, grad_naive = svm_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('naive numpy loss: {}, takes {} seconds.'.format(loss_naive, toc-tic))

naive numpy loss: 9.012755325014963, takes 0.08477616310119629 seconds.


In [9]:
def svm_loss_vectorized(W, X, y, reg):
    """
    Multi-class linear SVM (hinge) loss, fully vectorized; returns the loss only.

    The margin delta is fixed at 1.

    Inputs:
    - W: numpy array of shape (D, C) holding the weights.
    - X: numpy array of shape (N, D) holding the N samples.
    - y: numpy integer array of shape (N,) holding the labels, 0 <= y[i] < C.
    - reg: (float) L2 regularization strength.

    Returns:
    - loss: scalar, mean hinge loss over the batch plus reg * sum(W * W).
    """
    num_train = X.shape[0]
    rows = np.arange(num_train)

    scores = X.dot(W)
    # Score of the true class of every sample, as a column for broadcasting.
    correct_class_score = scores[rows, y][:, np.newaxis]

    margin = np.maximum(0, scores - correct_class_score + 1)
    # Remove the true-class entries explicitly. Subtracting `num_train` from
    # the total instead assumes each of those entries is exactly 1, which
    # floating-point rounding does not guarantee (for large scores the entry
    # can round to 0, yielding a negative loss).
    margin[rows, y] = 0

    loss = np.sum(margin) / num_train
    loss += reg * np.sum(W * W)
    return loss

def svm_loss_vec(W, X, y, reg):
    """
    Multi-class linear SVM (hinge) loss and gradient, vectorized (delta = 1).

    Inputs:
    - W: numpy array of shape (D, C) holding the weights.
    - X: numpy array of shape (N, D) holding the N samples.
    - y: numpy integer array of shape (N,) holding the labels, 0 <= y[i] < C.
    - reg: (float) L2 regularization strength.

    Returns:
    - loss: scalar, mean hinge loss over the batch plus reg * sum(W * W).
    - dw: array with the same shape as W, gradient of the loss w.r.t. W.
    """
    n_samples = X.shape[0]
    rows = np.arange(n_samples)

    class_scores = X.dot(W)
    true_scores = class_scores[rows, y][:, np.newaxis]

    # Hinge terms for every (sample, class) pair; the true class never counts.
    hinge = np.maximum(0, class_scores - true_scores + 1)
    hinge[rows, y] = 0

    loss = np.sum(hinge)
    loss /= n_samples
    loss += reg * np.sum(W * W)

    # 1 where a margin is violated, 0 elsewhere; the true-class slot of each
    # row then receives minus the number of violations in that row.
    indicator = (hinge > 0).astype(hinge.dtype)
    violations_per_row = np.sum(indicator, axis=1)
    indicator[rows, y] = -1 * violations_per_row

    dw = X.T.dot(indicator)
    dw /= n_samples
    dw += reg * 2 * W

    return loss, dw

In [10]:
# Time the loss-only vectorized SVM implementation on the same inputs as the
# loop-based run. Note: `loss_vec` is reassigned by the svm_loss_vec cell below.
tic = time.time()
loss_vec = svm_loss_vectorized(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('vectorized numpy loss: {}, takes {} seconds.'.format(loss_vec, toc-tic))

vectorized numpy loss: 9.012755325014968, takes 0.005579948425292969 seconds.


In [11]:
# Time the vectorized SVM loss + gradient on the development split.
# This overwrites `loss_vec` and defines `grad_vec`.
tic = time.time()
loss_vec, grad_vec = svm_loss_vec(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('vectorized numpy loss: {}, takes {} seconds.'.format(loss_vec, toc-tic))

vectorized numpy loss: 9.012755325014968, takes 0.005389213562011719 seconds.


In [12]:
# Compare the loop-based results with the most recent vectorized results
# (`loss_vec` / `grad_vec`), using np.allclose with its default tolerances.
print('Is vectorized loss correct? {}'.format(np.allclose(loss_naive, loss_vec)))
print('Is vectorized gradient correct? {}'.format(np.allclose(grad_naive, grad_vec)))

Is vectorized loss correct? True
Is vectorized gradient correct? True


In [13]:
# Re-seed and re-draw W with the same seed, shape and scale as the earlier
# initialization cell, so this cell starts from known weights.
np.random.seed(2321)
W = np.random.randn(3073, 10) * 0.0001 

# ground truth of loss and gradient
# Graph inputs (TF1-style placeholders): weights, samples and integer labels.
W_tf = tf.placeholder(tf.float32, shape=(3073,10))
X = tf.placeholder(tf.float32, shape=(None, 3073))
y = tf.placeholder(tf.int32, shape=(None,))
reg = tf.constant(0.000005)
# NOTE(review): no tf.Variable is created in the visible cells, so this
# initializer presumably has nothing to initialize — confirm it is needed.
init_op = tf.global_variables_initializer()

# Softmax cross-entropy on the linear scores X @ W with one-hot labels
# (10 classes), averaged over the batch, plus the L2 penalty reg * sum(W * W).
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits= tf.matmul(X, W_tf), labels=tf.one_hot(y,10))
loss0 = tf.reduce_mean(cross_entropy) + reg*tf.reduce_sum(W_tf*W_tf)
# tf.gradients returns a list, so `grad_gt` below is a one-element list
# (visible in the printed output).
grad0 = tf.gradients(loss0, W_tf)
out0 = (loss0, grad0)
with tf.Session() as sess:
    sess.run(init_op)
    # Only the sess.run call is timed; graph construction above is excluded.
    tic = time.time()
    loss_gt, grad_gt = sess.run(out0, feed_dict={W_tf: W, X: X_dev, y: y_dev})
    toc = time.time()
    print("tensorflow loss: {}, and tesnsorflow gradient: {}, takes {} seconds".format(loss_gt, grad_gt, toc-tic))

tensorflow loss: 2.356956720352173, and tesnsorflow gradient: [array([[-0.91497672, -5.06715679, -1.63596535, ...,  1.62781715,
        -1.5216608 ,  1.39993322],
       [-0.89814317, -4.64034128, -1.64831865, ...,  1.119488  ,
        -1.64277577,  1.39054656],
       [-0.58115494, -5.12523413, -1.36927342, ...,  0.8880381 ,
        -1.87494397,  1.08563268],
       ..., 
       [-1.4052726 , -1.3936727 ,  2.4050858 , ...,  0.24068165,
        -2.01775455,  0.46139181],
       [-1.76994145, -0.95524085,  1.54262614, ..., -0.24725348,
        -1.54199946,  0.30317867],
       [-0.02036226, -0.02305621,  0.02698388, ...,  0.02119692,
        -0.0108738 ,  0.01071431]], dtype=float32)], takes 0.33017420768737793 seconds


In [52]:
def softmax_loss_vec(W, X, y, reg):
    """
    Softmax (cross-entropy) loss and gradient, vectorized.

    Inputs:
    - W: numpy array of shape (D, C) holding the weights.
    - X: numpy array of shape (N, D) holding the N samples.
    - y: numpy integer array of shape (N,) holding the labels, 0 <= y[i] < C.
    - reg: (float) L2 regularization strength.

    Returns:
    - loss: scalar, mean cross-entropy over the batch plus reg * sum(W * W).
    - dW: array with the same shape as W, gradient of the loss w.r.t. W.
    """
    n_samples = X.shape[0]
    rows = np.arange(n_samples)

    logits = X.dot(W)
    # Shift each row by its maximum so the exponentials cannot overflow;
    # the softmax output is unchanged by this shift.
    logits -= np.amax(logits, axis=1, keepdims=True)
    unnormalized = np.exp(logits)
    probs = unnormalized / np.sum(unnormalized, axis=1, keepdims=True)

    loss = np.sum(-np.log(probs[rows, y]))
    loss /= n_samples
    loss += reg * np.sum(W * W)

    # Gradient w.r.t. the scores is (probabilities - one-hot labels); `probs`
    # is no longer needed for the loss, so it is updated in place.
    probs[rows, y] -= 1
    dW = X.T.dot(probs)
    dW /= n_samples
    dW += reg * 2 * W
    return loss, dW

In [53]:
# Time the vectorized softmax loss + gradient on the development split.
tic = time.time()
soft_loss_vec, soft_grad_vec = softmax_loss_vec(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('vectorized numpy loss: {}, takes {} seconds.'.format(soft_loss_vec, toc-tic))

vectorized numpy loss: 2.356956723546791, takes 0.004266262054443359 seconds.


In [70]:
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax (cross-entropy) loss and gradient, computed with explicit loops.

    Inputs:
    - W: numpy array of shape (D, C) holding the weights.
    - X: numpy array of shape (N, D) holding the N samples.
    - y: numpy integer array of shape (N,) holding the labels, 0 <= y[i] < C.
    - reg: (float) L2 regularization strength.

    Returns:
    - loss: scalar, mean cross-entropy over the batch plus reg * sum(W * W).
    - dW: float64 array with the same shape as W, gradient of the loss
      w.r.t. W.
    """
    loss = 0.0
    # Accumulate the gradient in float64 regardless of W's dtype. Inheriting
    # the dtype (np.zeros_like) makes the in-place additions below fail with a
    # casting error when W holds integers.
    dW = np.zeros(W.shape, dtype=np.float64)
    num_train = X.shape[0]
    num_classes = W.shape[1]

    for i in range(num_train):
        scores = X[i].dot(W)
        # Shift by the maximum so the exponentials cannot overflow; the
        # softmax output is unchanged by this shift.
        shifted = scores - np.max(scores)
        exp_scores = np.exp(shifted)
        sigma_score = exp_scores / np.sum(exp_scores)
        loss -= np.log(sigma_score[y[i]])

        # Gradient w.r.t. the scores: probabilities minus the one-hot label.
        sigma_score[y[i]] -= 1
        for j in range(num_classes):
            dW[:, j] += X[i] * sigma_score[j]

    # Batch mean plus L2 penalty, for both the loss and its gradient.
    loss /= num_train
    loss += reg * np.sum(W * W)

    dW /= num_train
    dW += reg * 2 * W

    return loss, dW

In [71]:
# Time the loop-based softmax loss + gradient on the development split.
tic = time.time()
soft_loss_naive, soft_grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('naive numpy loss: {}, takes {} seconds.'.format(soft_loss_naive, toc-tic))

naive numpy loss: 2.3569567235467903, takes 0.08016157150268555 seconds.


In [72]:
# Compare the loop-based softmax gradient with the TensorFlow result.
# NOTE(review): `rel_err` is not defined or imported in any visible cell, and
# the execution counts jump from In [13] to In [52]; it presumably came from a
# cell that is no longer shown — confirm it exists before Restart & Run All.
# `grad_gt` is the one-element list produced by the TensorFlow cell.
print('Gradient error of naive softmax is {}'.format(rel_err(grad_gt,soft_grad_naive)))

Gradient error of naive softmax is 3.211899978540529e-07


In [1]:
import numpy as np

In [2]:
# Scratch example: a random 3-D array of shape (3, 2, 2) (unseeded).
a = np.random.randn(3,2,2)

In [3]:
# Display the sampled array.
a

array([[[  9.08963726e-04,   1.44710292e+00],
        [ -9.15731138e-01,  -4.24139151e-01]],

       [[  8.13202508e-02,   7.68532526e-02],
        [  2.26363056e-01,  -2.21916469e+00]],

       [[ -2.17271120e+00,  -3.27054119e-01],
        [  2.88456776e-01,  -1.55368269e+00]]])

In [7]:
# Number of elements per leading-axis slice: product of all but the first dimension.
np.prod(a.shape[1:])

4

In [9]:
# Flatten every leading-axis slice of `a` into one row: (3, 2, 2) -> (3, 4).
b = a.reshape(a.shape[0], -1)

In [10]:
# Shape of the flattened array.
b.shape

(3, 4)

In [11]:
# Display the flattened array.
b

array([[  9.08963726e-04,   1.44710292e+00,  -9.15731138e-01,
         -4.24139151e-01],
       [  8.13202508e-02,   7.68532526e-02,   2.26363056e-01,
         -2.21916469e+00],
       [ -2.17271120e+00,  -3.27054119e-01,   2.88456776e-01,
         -1.55368269e+00]])