In [1]:
import numpy as np
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing

In [16]:
scaler = StandardScaler()
housing = fetch_california_housing()
housing_data = scaler.fit_transform(housing.data)
housing_target = housing.target
m,n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing_data]

In [3]:
n_epochs = int(1e3)
learning_rate = 0.01
batch_size = 64
n_batches = int(np.ceil(m/batch_size))

## With normal equation

In [4]:
def r1():
    X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
    y = tf.constant(housing_target.reshape(-1, 1), dtype=tf.float32, name='y')
    XT = tf.transpose(X)
    theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)
    with tf.Session() as sess:
        theta_value = theta.eval()

In [5]:
%timeit -n 1 -r 1 r1()

34.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## With batch gradient descent

In [6]:

def r2():
    X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
    y = tf.constant(housing_target.reshape(-1, 1), dtype=tf.float32, name='y')
    theta = tf.Variable(tf.random_uniform([n+1, 1], -1.0, 1.0), name='theta')
    y_pred = tf.matmul(X, theta, name='predictions')
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name='mse')
    gradients = 2/m * tf.matmul(tf.transpose(X), error)
    training_op = tf.assign(theta, theta-learning_rate*gradients)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs+1):
            if epoch % 100 == 0:
                print('Epoch', epoch, 'MSE =', mse.eval())
            sess.run(training_op)
        best_theta = theta.eval()

In [7]:
%timeit -n 1 -r 1 r2()

Epoch 0 MSE = 10.833167
Epoch 100 MSE = 0.9559737
Epoch 200 MSE = 0.7172014
Epoch 300 MSE = 0.6610752
Epoch 400 MSE = 0.6232295
Epoch 500 MSE = 0.59596765
Epoch 600 MSE = 0.5762571
Epoch 700 MSE = 0.5619971
Epoch 800 MSE = 0.5516755
Epoch 900 MSE = 0.5442006
Epoch 1000 MSE = 0.5387841
228 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## With batch gradient descent using autodiff

In [8]:
def r3():
    X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
    y = tf.constant(housing_target.reshape(-1, 1), dtype=tf.float32, name='y')
    theta = tf.Variable(tf.random_uniform([n+1, 1], -1.0, 1.0), name='theta')
    y_pred = tf.matmul(X, theta, name='predictions')
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name='mse')
    gradients = tf.gradients(mse, [theta])[0]
    training_op = tf.assign(theta, theta-learning_rate*gradients)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs+1):
            if epoch % 100 == 0:
                print('Epoch', epoch, 'MSE =', mse.eval())
            sess.run(training_op)
        best_theta = theta.eval()

In [9]:
%timeit -n 1 -r 1 r3()

Epoch 0 MSE = 9.108579
Epoch 100 MSE = 0.9831704
Epoch 200 MSE = 0.753373
Epoch 300 MSE = 0.6861746
Epoch 400 MSE = 0.6410712
Epoch 500 MSE = 0.60868454
Epoch 600 MSE = 0.5853082
Epoch 700 MSE = 0.5684246
Epoch 800 MSE = 0.5562275
Epoch 900 MSE = 0.54741377
Epoch 1000 MSE = 0.5410433
288 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## With batch gradient descent using autodiff and optimizer

In [10]:
def r4():
    X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
    y = tf.constant(housing_target.reshape(-1, 1), dtype=tf.float32, name='y')
    theta = tf.Variable(tf.random_uniform([n+1, 1], -1.0, 1.0), name='theta')
    y_pred = tf.matmul(X, theta, name='predictions')
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name='mse')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(mse)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs+1):
            if epoch % 100 == 0:
                print('Epoch', epoch, 'MSE =', mse.eval())
            sess.run(training_op)
        best_theta = theta.eval()

In [11]:
%timeit -n 1 -r 1 r4()

Epoch 0 MSE = 17.148422
Epoch 100 MSE = 0.85283506
Epoch 200 MSE = 0.6274514
Epoch 300 MSE = 0.59657615
Epoch 400 MSE = 0.57652324
Epoch 500 MSE = 0.5620782
Epoch 600 MSE = 0.5516423
Epoch 700 MSE = 0.5441003
Epoch 800 MSE = 0.53864795
Epoch 900 MSE = 0.5347053
Epoch 1000 MSE = 0.53185296
279 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## With minibatch gradient descent using autodiff and optimizer

In [12]:
def fetch_batch(epoch, batch_index, batch_size):
    rand_index = np.random.choice(m, size=batch_size)
    X_batch = housing_data_plus_bias[rand_index]
    y_batch = housing_target[rand_index].reshape(-1, 1)
    return X_batch, y_batch
    
def r5():
    X = tf.placeholder(tf.float32, shape=(None, n+1), name='X')
    y = tf.placeholder(tf.float32, shape=(None, 1), name='y')
    theta = tf.Variable(tf.random_uniform([n+1, 1], -1.0, 1.0), name='theta')
    y_pred = tf.matmul(X, theta, name='predictions')
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name='mse')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(mse)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs+1):
            for batch_index in range(n_batches):
                X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
                sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
                
            if epoch % 100 == 0:
                print('Epoch', epoch, 'MSE =', mse.eval(feed_dict={X: X_batch, y: y_batch}))
        best_theta = theta.eval()

In [13]:
%timeit -n 1 -r 1 r5()

Epoch 0 MSE = 1.3822281
Epoch 100 MSE = 0.71288264
Epoch 200 MSE = 0.42527574
Epoch 300 MSE = 0.5109476
Epoch 400 MSE = 0.513522
Epoch 500 MSE = 0.4315561
Epoch 600 MSE = 0.7776046
Epoch 700 MSE = 2.1978917
Epoch 800 MSE = 0.39121222
Epoch 900 MSE = 0.571352
Epoch 1000 MSE = 0.43982285
1min 15s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
