*     importing essential libraries

In [55]:
import numpy as np
import pandas as pd
import math
import sys

* importing the BOSTON Dataset from sklearn

In [56]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older scikit-learn releases.
from sklearn.datasets import load_boston
boston = load_boston()
# Show which fields the returned dataset object exposes.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [57]:
# Keep the raw feature matrix and the target vector under their own names
# while `boston` still refers to the loaded dataset object.
X = boston.data
y = boston.target  # passed to train_test_split as the labels further down

* Now let's convert the data into pandas DataFrame and explore it a little bit

In [58]:
# Wrap the feature matrix in a DataFrame with named columns.
# Note: this rebinds `boston` from the loaded dataset object to a DataFrame, so
# the cell cannot be re-run on its own without re-running the loading cell first.
boston = pd.DataFrame(boston.data, columns = boston.feature_names)

In [59]:
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [60]:
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


* **The features are on very different scales (compare, for example, NOX and TAX in the summary above). Gradient descent takes a long time to converge to the optimal parameter values when the features are poorly scaled, so we need to scale the data first.**

In [61]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-set statistics leak into the training features.
Scale = StandardScaler()
# `boston` is rebound to the scaled values here; the next cell rebuilds the
# DataFrame with its column names.
boston = Scale.fit_transform(boston)

In [62]:
# Rebuild a DataFrame from the scaled values, restoring the 13 feature names.
boston = pd.DataFrame(boston, columns = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT'])

In [63]:
# Constant column of ones: its coefficient in theta plays the role of the intercept.
boston['bias']=1

In [64]:
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,bias
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562,1
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439,1
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727,1
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517,1
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501,1


Now we will split the data into training data and test data

In [65]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Fixing random_state makes the split, and
# therefore the cost trace and the final score, reproducible across re-runs.
X_train, X_test , y_train, y_test = train_test_split(boston, y, test_size = 0.2, random_state = 42)

* Now we're talking! We've done a little bit of the required preprocessing; let's write the code that uses Gradient Descent to optimize the model parameters in order to minimise the cost function.

* We will first write the code for one step of gradient descent: given the current parameters, calculate the gradients of the cost with respect to them, and then update the parameters by subtracting the learning rate times the gradient. (The parameters are initialized to zeros in the gradient descent function below.)

In [66]:
# Convert the feature splits to NumPy arrays: the functions defined below index
# rows positionally (X[i, :], X[i, j]).
X_train = np.array(X_train)
X_test=np.array(X_test)

In [67]:
def step_gradient(X, y, learning_rate, theta):
    """Perform one gradient-descent update for mean-squared-error linear regression.

    The cost being minimised is ``(1/k) * sum((y - X @ theta) ** 2)``, whose
    gradient with respect to ``theta`` is ``(-2/k) * X.T @ (y - X @ theta)``.

    Parameters
    ----------
    X : array-like of shape (k, n)
        Feature matrix, one sample per row.
    y : array-like of shape (k,)
        Target values, aligned positionally with the rows of ``X``.
    learning_rate : float
        Step size applied to the gradient.
    theta : array-like of shape (n,)
        Current parameter vector. It is not modified.

    Returns
    -------
    numpy.ndarray of shape (n,)
        The updated parameter vector ``theta - learning_rate * gradient``.
    """
    # Converting up front makes the alignment between X and y positional even
    # when a pandas object with a non-default index is passed in.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)

    k = X.shape[0]
    if k == 0:
        # No samples, hence a zero gradient: hand back an unchanged copy.
        return theta.copy()

    # Residuals are computed once per step rather than once per (row, feature).
    residuals = y - X.dot(theta)
    gradients = (-2 / k) * X.T.dot(residuals)
    return theta - learning_rate * gradients

* Now we define the Gradient Descent function, which calls the step gradient function for the specified number of iterations.

In [68]:
def gradient_descent(X, y, learning_rate, iterations):
    """Run batch gradient descent for a fixed number of iterations.

    Starts from an all-zero parameter vector with one entry per column of
    ``X``, applies ``step_gradient`` once per iteration, and prints the
    iteration index together with the current ``cost`` after each update.

    Returns the parameter vector obtained after the final iteration.
    """
    n_features = X.shape[1]
    theta = np.zeros(n_features)  # deterministic start: every parameter is 0
    for iteration in range(iterations):
        theta = step_gradient(X, y, learning_rate, theta)
        print(iteration, 'cost:', cost(X, y, theta))
    return theta

* It's pretty much done, let's now write down the main business - the cost function.

In [69]:
def cost(X, y, theta):
    """Return the mean squared error of the linear model ``X @ theta`` against ``y``.

    Parameters
    ----------
    X : array-like of shape (k, n)
        Feature matrix, one sample per row.
    y : array-like of shape (k,)
        Target values, aligned positionally with the rows of ``X``.
    theta : array-like of shape (n,)
        Parameter vector.

    Returns
    -------
    float
        ``(1/k) * sum((y - X @ theta) ** 2)``; ``0.0`` when there are no samples.
    """
    # Positional alignment: avoids label-based lookups if y is a pandas Series.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    k = X.shape[0]
    if k == 0:
        # Mirrors the loop-based definition, where an empty sum is zero.
        return 0.0

    residuals = y - X.dot(theta)
    return np.mean(residuals ** 2)

* We're done now, except for the main driver function `run`, which takes just the training data and its labels as input and trains the model on them.

In [70]:
def run(X, y, learning_rate=0.04, iterations=300):
    """Train the linear model on ``X`` and ``y`` with gradient descent.

    Parameters
    ----------
    X : feature matrix passed straight through to ``gradient_descent``.
    y : target values passed straight through to ``gradient_descent``.
    learning_rate : float, optional
        Step size for each update. Defaults to 0.04, the value this notebook
        was originally run with.
    iterations : int, optional
        Number of gradient-descent steps. Defaults to 300.

    Returns
    -------
    The parameter vector returned by ``gradient_descent``.
    """
    return gradient_descent(X, y, learning_rate, iterations)

Let's try this on our boston training data 

In [71]:
# Fit the model on the training split; the cost is printed after every iteration.
theta = run(X_train, y_train)


0 cost: 496.7183418700804
1 cost: 417.727998422731
2 cost: 354.89470562401516
3 cost: 302.99013858431954
4 cost: 259.54613779707796
5 cost: 223.0167637696991
6 cost: 192.24843097631629
7 cost: 166.31217845183073
8 cost: 144.43872750336786
9 cost: 125.98479144830647
10 cost: 110.41055213673926
11 cost: 97.26237312522896
12 cost: 86.15876392334638
13 cost: 76.77875006555017
14 cost: 68.85216674516168
15 cost: 62.151535358901995
16 cost: 56.485256101690844
17 cost: 51.691898985516524
18 cost: 47.63541321585998
19 cost: 44.20110513244426
20 cost: 41.29225981893488
21 cost: 38.82730211587408
22 cost: 36.737409916613764
23 cost: 34.96450689823811
24 cost: 33.45957373259702
25 cost: 32.1812267418552
26 cost: 31.094521242530895
27 cost: 30.169943738026255
28 cost: 29.382562900763123
29 cost: 28.71131412071709
30 cost: 28.138396444586373
31 cost: 27.648764119594052
32 cost: 27.22969779651822
33 cost: 26.870442828259375
34 cost: 26.561904098261493
35 cost: 26.296388490063343
36 cost: 26.06738751

294 cost: 23.467856393147915
295 cost: 23.467248614572334
296 cost: 23.466647141547266
297 cost: 23.4660519045548
298 cost: 23.465462834954433
299 cost: 23.464879864969184


In [72]:
theta

array([-1.14412931,  0.95760734,  0.19832997,  0.92967289, -2.26076775,
        2.72607617,  0.0602581 , -2.96523504,  2.55655277, -1.64377654,
       -2.13322166,  0.95435653, -3.65686694, 22.6581299 ])

In [73]:
def predict(X, m):
    """Predict a target value for every row of ``X`` using coefficients ``m``.

    Parameters
    ----------
    X : array-like of shape (N, n)
        Feature matrix, one sample per row.
    m : array-like of shape (n,)
        Coefficient vector; each prediction is the dot product of a row with it.

    Returns
    -------
    numpy.ndarray of shape (N,)
        Floating-point predictions, one per row of ``X``.
    """
    # Use the coefficients that were passed in. The previous body ignored `m`
    # and read a global named `theta` instead.
    X = np.asarray(X, dtype=float)
    m = np.asarray(m, dtype=float)
    return X.dot(m)

In [74]:
# Predict targets for the held-out test rows with the learned coefficients.
y_pred = predict(X_test, theta)

In [75]:
def score(y_pred, y_test):
    """Coefficient of determination (R^2) of ``y_pred`` against ``y_test``.

    Computed as one minus the ratio of the residual sum of squares to the
    total sum of squares of ``y_test`` around its own mean. Note the argument
    order: predictions first, true values second.
    """
    residuals = y_test - y_pred
    deviations = y_test - y_test.mean()
    residual_ss = (residuals ** 2).sum()
    total_ss = (deviations ** 2).sum()
    return 1 - residual_ss / total_ss

In [76]:
score(y_pred, y_test)

0.7830207062531909