In [1]:
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
from sklearn import datasets, linear_model
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; this import
# can be dropped once nothing else in the notebook relies on it.
from sklearn import metrics, cross_validation
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



In [2]:
# Load the Boston house-prices dataset that ships with scikit-learn and
# report the (samples, features) shape of its feature matrix.
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [3]:
# Inspect the dataset: the free-text description, the keys available on the
# loaded object, the feature (column) names, and the feature-matrix shape
# (the shape was also printed by the loading cell).
print(boston.DESCR)
print(boston.keys())
print(boston.feature_names)
print(boston.data.shape)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [18]:
# Feature matrix used for training; show the first two rows as a sanity check.
X = boston.data
X[0:2, :]

array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00,
          0.00000000e+00,   5.38000000e-01,   6.57500000e+00,
          6.52000000e+01,   4.09000000e+00,   1.00000000e+00,
          2.96000000e+02,   1.53000000e+01,   3.96900000e+02,
          4.98000000e+00],
       [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00,
          0.00000000e+00,   4.69000000e-01,   6.42100000e+00,
          7.89000000e+01,   4.96710000e+00,   2.00000000e+00,
          2.42000000e+02,   1.78000000e+01,   3.96900000e+02,
          9.14000000e+00]])

## Convert prices to classes

To create a classification problem, we map the continuous price labels to class labels (price ranges).

In [22]:
# Start from the raw numeric targets; they are converted to class labels
# further down in this cell.
y = boston.target

def map_to_class(price):
    """Bucket a continuous price into its string class label.

    The buckets are closed on the upper bound: [0, 10] -> "0-10",
    (10, 20] -> "11-20", (20, 30] -> "21-30", (30, 40] -> "31-40",
    (40, 45] -> "41-45" and anything above 45 -> "46+".

    Returns None when the price is below zero or does not compare as
    ``>= 0`` (e.g. NaN), because no bucket covers those values.
    """
    # (inclusive upper bound, label), ordered from the lowest bucket up.
    buckets = (
        (10, "0-10"),
        (20, "11-20"),
        (30, "21-30"),
        (40, "31-40"),
        (45, "41-45"),
    )

    # Written as "not >=" so that NaN falls through to None as well.
    if not price >= 0:
        return None

    for upper_bound, label in buckets:
        if price <= upper_bound:
            return label
    return "46+"

# Wrap the scalar mapper so it can be applied element-wise to the target array.
vfunc = np.vectorize(map_to_class)

# Replace the numeric prices with their class labels.
y = vfunc(y)

# Peek at the first two labels.
y[0:2]

array(['21-30', '21-30'], 
      dtype='<U5')

## Prepare train/test sets

In [23]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
# train_test_split is taken from sklearn.model_selection because the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5)

print("X_train shape: " + str(X_train.shape))
print("y_train shape: " + str(y_train.shape))
print("X_test shape: " + str(X_test.shape))
print("y_test shape: " + str(y_test.shape))

X_train shape: (339, 13)
y_train shape: (339,)
X_test shape: (167, 13)
y_test shape: (167,)


## Define logistic regression functions

In [24]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise by NumPy.

    Accepts whatever ``np.exp(-z)`` accepts (a scalar or an array) and
    returns a result of the matching shape.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# theta holds the model coefficients.
def predict(X, theta):
    """Return the logistic-regression hypothesis ``sigmoid(X.dot(theta.T))``.

    Parameters
    ----------
    X : ndarray
        Feature values; a single sample or a batch, i.e. anything for which
        ``X.dot(theta.T)`` is defined.
    theta : ndarray
        Model coefficients.

    Returns
    -------
    The sigmoid of the linear combination, with the shape produced by
    ``X.dot(theta.T)``.
    """
    # The result must be returned: without the return statement every caller
    # received None and the downstream gradient computation failed.
    return sigmoid(X.dot(theta.T))

# The error (cost) function is specific to logistic regression.
# It differs from linear regression because the hypothesis passes through
# the sigmoid: a convex cost (cross-entropy) is needed for gradient descent
# to work reliably.
def error_function(y, y_pred):
    """Return the mean binary cross-entropy (log-loss).

    J = -(1/m) * sum(y * log(y_pred) + (1 - y) * log(1 - y_pred))

    Parameters
    ----------
    y : ndarray
        True binary labels (0 or 1); ``y.shape[0]`` is used as the sample
        count m.
    y_pred : array-like
        Predicted probabilities, element-wise aligned with ``y``.

    Returns
    -------
    float
        The averaged cost; lower is better.
    """
    m = y.shape[0]
    # Keep probabilities strictly inside (0, 1) so log() never receives 0,
    # which would otherwise yield -inf / NaN for saturated predictions.
    eps = np.finfo(float).eps
    p = np.clip(y_pred, eps, 1 - eps)
    # The minus sign and the 1/m factor apply to the whole summed expression.
    return -(1 / m) * np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
    
# The gradient (and the SGD update built on it) has the same form as in
# linear regression.
def gradient(X, y, y_pred):
    """Compute the cost gradient together with the raw prediction error.

    Returns a tuple ``(grad, residual)`` where ``residual = y_pred - y`` and
    ``grad = (1 / X.shape[0]) * X.T.dot(residual)``.
    """
    sample_count = X.shape[0]
    residual = y_pred - y
    scale = 1 / sample_count
    return (scale * X.T.dot(residual), residual)

def gradient_descent_stochastic(X_y_gen, theta, alpha, iterations):
    """Fit ``theta`` by stochastic gradient descent, one update per sample.

    Parameters
    ----------
    X_y_gen : callable
        Zero-argument callable returning an iterable of ``(X, y)`` pairs.
        It is called afresh at the start of every iteration; ``X`` and ``y``
        are indexed by position.
    theta : ndarray
        Initial coefficients. Not modified in place; a new array is produced
        on every update.
    alpha : float
        Learning rate multiplying each gradient step.
    iterations : int
        Number of full passes over the data produced by ``X_y_gen``.

    Returns
    -------
    tuple
        ``(theta, thetas)``: the final coefficients and a list holding the
        coefficients as they stood at the end of each iteration.

    Raises
    ------
    ValueError
        If a pass processes no samples, so there is no error to record.

    Notes
    -----
    ``log_gradient_descent_iterations`` is called with the per-iteration
    coefficients and errors; it is not defined in the visible part of this
    notebook and must exist before this function is called.
    """
    thetas = []
    errors = []
    for _ in range(iterations):
        saw_sample = False
        for X, y in X_y_gen():
            for i in range(len(X)):
                X_i = X[i]
                y_i = y[i]
                y_i_pred = predict(X_i, theta)
                # NOTE(review): gradient() divides by X_i.shape[0]; if X_i is
                # a 1-D row this is presumably the feature count rather than
                # a sample count, which only rescales the step - confirm that
                # this is intended.
                grad, error = gradient(X_i, y_i, y_i_pred)
                theta = theta - (alpha * grad)
                saw_sample = True

        # Without this guard an empty pass either hit an unbound `error` or
        # silently re-recorded the previous iteration's stale value.
        if not saw_sample:
            raise ValueError("X_y_gen() yielded no samples for this iteration")

        # Only the error of the last sample seen in the pass is recorded.
        thetas.append(theta)
        errors.append(error)

    log_gradient_descent_iterations(thetas, errors)

    return (theta, thetas)

## One vs. rest classification

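One-vs-rest combines multiple binary classifiers: one classifier is trained per class to separate that class from all the others, and the predicted class is the one whose classifier reports the highest probability.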

## Next steps

- Regularisation to prevent over-fitting