In [1]:
from sklearn.datasets import load_boston
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [3]:
type(boston)

sklearn.datasets.base.Bunch

In [4]:
print(boston.DESCR)
print(boston.keys())
print(boston.feature_names)
print(boston.data.shape)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [5]:
X = boston.data

In [6]:
X[0:2, :]

array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00,
          0.00000000e+00,   5.38000000e-01,   6.57500000e+00,
          6.52000000e+01,   4.09000000e+00,   1.00000000e+00,
          2.96000000e+02,   1.53000000e+01,   3.96900000e+02,
          4.98000000e+00],
       [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00,
          0.00000000e+00,   4.69000000e-01,   6.42100000e+00,
          7.89000000e+01,   4.96710000e+00,   2.00000000e+00,
          2.42000000e+02,   1.78000000e+01,   3.96900000e+02,
          9.14000000e+00]])

In [7]:
y = boston.target

In [8]:
y[0:2]

array([ 24. ,  21.6])

## Prepare train/test sets

In [9]:
# For now pick first 400 for training and remainder for test
# Later, we will do something more randomised
X_train = X[0:400, :]
y_train = y[0:400]
X_test = X[400:,:]
y_test = y[400:]
print("X_train shape: " + str(X_train.shape))
print("y_train shape: " + str(y_train.shape))
print("X_test shape: " + str(X_test.shape))
print("y_test shape: " + str(y_test.shape))

X_train shape: (400, 13)
y_train shape: (400,)
X_test shape: (106, 13)
y_test shape: (106,)


## Define Functions

In [68]:
def sum_squared_error(X, y, theta):
    hypothesis = X.dot(theta.T)
    square_of_errors = np.square(hypothesis - y)
    return np.sum(square_of_errors)

def mean_squared_error(X, y, theta):
    m = X.shape[0]
    return (1 / (2 * m)) * sum_squared_error(X, y, theta)
    
def gradient(X, y, theta):
    m = X.shape[0]
    hypothesis = X.dot(theta.T)
    print("hypothesis: " + str(hypothesis))
    error = hypothesis - y
    print("error: " + str(error))
    gradient = (1/m) * X.T.dot(error)
    print("gradient: " + str(gradient))
    return gradient
    
def gradient_descent_batch(X, y, theta, alpha, iterations):
    m = X.shape[0]
    for i in range(iterations):
        # Move in opposite direction to gradient, hence minus gradient
        print("theta: " + str(theta))
        g = alpha * gradient(X, y, theta)
        theta = theta - g
    return theta

def gradient_descent_stochastic(X, y, theta, alpha, iterations):
    print("To Do")


test_X = np.array([[1, 2, 3],[4, 5, 6]])
test_y = [6, 15]
test_theta = np.array([2, 1, 1])
print("Theta: " + str(test_theta))

# Test cost function
print("Cost: " + str(mean_squared_error(test_X, test_y, test_theta)))
    
# Test gradient function
print("Gradient: " + str(gradient(test_X, test_y, test_theta)))

# Test gradient descent
test_initial_theta = np.array([2,2,2])
print("Testing gradient descent")
gradient_descent_batch(test_X, test_y, test_initial_theta, 1, 1000)


Theta: [2 1 1]
Cost: 4.25
hypothesis: [ 7 19]
error: [1 4]
gradient: [  8.5  11.   13.5]
Gradient: [  8.5  11.   13.5]
Testing gradient descent
theta: [2 2 2]
hypothesis: [12 30]
error: [ 6 15]
gradient: [ 33.   43.5  54. ]
theta: [-31.  -41.5 -52. ]
hypothesis: [-270.  -643.5]
error: [-276.  -658.5]
gradient: [-1455.   -1922.25 -2389.5 ]
theta: [ 1424.    1880.75  2337.5 ]
hypothesis: [ 12198.    29124.75]
error: [ 12192.    29109.75]
gradient: [  64315.5     84966.375  105617.25 ]
theta: [ -62891.5    -83085.625 -103279.75 ]
hypothesis: [ -538902.    -1286672.625]
error: [ -538908.    -1286687.625]
gradient: [-2842829.25   -3755627.0625 -4668424.875 ]
theta: [ 2779937.75    3672541.4375  4565145.125 ]
hypothesis: [ 23820456.      56873328.9375]
error: [ 23820450.      56873313.9375]
gradient: [  1.25656853e+08   1.66003735e+08   2.06350617e+08]
theta: [ -1.22876915e+08  -1.62331193e+08  -2.01785472e+08]
hypothesis: [ -1.05289572e+09  -2.51387646e+09]
error: [ -1.05289572e+09  -2.5138




theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ nan  nan]
gradient: [ nan  nan  nan]
theta: [ nan  nan  nan]
hypothesis: [ nan  nan]
error: [ na

array([ nan,  nan,  nan])

## Run Linear Regression With Full Batch Gradient Descent

In [36]:
num_of_iterations = 100
initial_theta = np.ones((1, X.shape[1]))
alpha = 0.5 # learning rate
gradient_descent_batch(X_train, y_train, initial_theta, alpha, num_of_iterations)

[[ -1.50455676e+08  -1.50455676e+08  -1.50455676e+08  -1.50455676e+08
   -1.50455676e+08  -1.50455676e+08  -1.50455676e+08  -1.50455676e+08
   -1.50455676e+08  -1.50455676e+08  -1.50455676e+08  -1.50455676e+08
   -1.50455676e+08]]
[[  2.32712497e+16   2.32712497e+16   2.32712497e+16   2.32712497e+16
    2.32712497e+16   2.32712497e+16   2.32712497e+16   2.32712497e+16
    2.32712497e+16   2.32712497e+16   2.32712497e+16   2.32712497e+16
    2.32712497e+16]]
[[ -3.59940599e+24  -3.59940599e+24  -3.59940599e+24  -3.59940599e+24
   -3.59940599e+24  -3.59940599e+24  -3.59940599e+24  -3.59940599e+24
   -3.59940599e+24  -3.59940599e+24  -3.59940599e+24  -3.59940599e+24
   -3.59940599e+24]]
[[  5.56726590e+32   5.56726590e+32   5.56726590e+32   5.56726590e+32
    5.56726590e+32   5.56726590e+32   5.56726590e+32   5.56726590e+32
    5.56726590e+32   5.56726590e+32   5.56726590e+32   5.56726590e+32
    5.56726590e+32]]
[[ -8.61099017e+40  -8.61099017e+40  -8.61099017e+40  -8.61099017e+40
   -8.

array([[ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
         nan,  nan]])

## Run Linear Regression With Stochastic Gradient Descent