In [1]:
import xlwings as xw 
from sklearn import datasets
import pandas as pd
import numpy as np
import utils as ut
import matplotlib.pyplot as plt 

% matplotlib inline

## Visualise univariate data

### Extract Data 
Read the data from the Excel sheet and display the first 5 rows.

In [6]:
# Read cells B6:C107 of the 'univariate_data' sheet of the open workbook
# and convert them to a DataFrame (index=False: no index column is read).
data_range = xw.sheets('univariate_data').range('b6:c107')
input_data = data_range.options(pd.DataFrame, index=False).value
input_data.head()

Unnamed: 0,X,y
0,0.433377,2.041898
1,0.378328,1.408507
2,0.179826,0.555435
3,0.433074,2.002243
4,0.043973,0.961575


## Visualisation

Visualise how different choices of slope and intercept affect the prediction errors.

In [7]:
# Re-import the local `utils` module so that edits made to it since the first
# import are picked up, then launch the interactive slope/intercept widget.
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated and
# no longer exists in Python 3.12+.
import importlib

importlib.reload(ut)
ut.get_linreg_interactive(input_data)

VBox(children=(VBox(children=(FloatSlider(value=0.0, description='m', max=5.0, step=0.25), FloatSlider(value=0…

## Find slope m and intercept c


scikit-learn provides `LinearRegression`, which finds the $m$ and $c$ that minimize the sum of squared prediction errors.

In [8]:
from sklearn.linear_model import LinearRegression

# scikit-learn expects a 2-D feature array, so the single 'X' column is
# reshaped to (n_samples, 1) before fitting.
features = input_data['X'].values.reshape(-1, 1)
target = input_data['y']

# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(features, target)

# coef_ holds one slope per feature (here a length-1 array); intercept_ is a scalar.
print('m : {} and c : {}'.format(lr.coef_, lr.intercept_))


m : [2.52597752] and c : 0.5501250398842417


# Gradient Descent with one variable

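Suppose that we want to find the minimizer of the quadratic function $f(x) = x^2 - 6x + 5$ using the gradient descent algorithm. We know that the first derivative of this function is $Df(x) = 2x - 6$, so each iteration updates $x \leftarrow x - \nu \, Df(x)$, where $\nu$ is the learning rate, until two successive iterates differ by less than a small tolerance. Using this information, we can write a little script.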

In [14]:
# Gradient descent on f(x) = x^2 - 6x + 5, whose derivative is Df(x) = 2x - 6.
f = lambda x: x**2 - 6*x + 5
Df = lambda x: 2*x - 6

x0 = -5  # initial value of the algorithm
nu = 0.1  # learning rate
max_iter = 10000  # safety cap: a diverging run (e.g. too large nu) must not loop forever

for _ in range(max_iter):
    # Step against the gradient from the current iterate.
    x1 = x0 - nu * Df(x0)

    # Stop once two successive iterates are closer than the tolerance.
    if abs(x1 - x0) < 0.001:
        break
    x0 = x1
else:
    # Only reached when the loop ran out of iterations without hitting `break`.
    print('warning: gradient descent did not converge in {} iterations'.format(max_iter))

# x0 is the last accepted iterate (x1 is the candidate that triggered the stop).
print(x0)

2.9959435180792697


# Gradient descent with two variables (original problem)

In [26]:
x = input_data['X']
y = input_data['y']


def Da(a, b):
    """Partial derivative w.r.t. the slope `a` of 0.5 * sum((y - (a*x + b))**2)."""
    return np.sum((y - (a*x + b)) * (-x))


def Db(a, b):
    """Partial derivative w.r.t. the intercept `b` of 0.5 * sum((y - (a*x + b))**2)."""
    return np.sum((y - (a*x + b)) * (-1))


nu = 0.001  # learning rate

# Starting point for slope and intercept.
a0, b0 = -2, -2

for i in range(1000):
    # Evaluate both partial derivatives at the current point before moving
    # either coordinate, so the update is simultaneous.
    grad_a, grad_b = Da(a0, b0), Db(a0, b0)

    a = a0 - nu * grad_a
    b = b0 - nu * grad_b

    a0, b0 = a, b

print('a : {} and b : {}'.format(a0, b0))


a : 2.5235004406682395 and b : 0.5513630710769419


## Can we check the numerical results analytically?

In [34]:
# Closed-form least squares: the parameters satisfy the normal equations
#   (X^T X) params = X^T y,   i.e.   params = (X^T X)^{-1} X^T y.

# Design matrix: the feature column next to a column of ones, so the second
# parameter plays the role of the intercept.
v1 = input_data['X'].values.reshape(-1,1)
v2 = np.ones(len(v1))

# With np.matrix operands, `*` is matrix multiplication and `.T` the transpose.
X = np.matrix(np.c_[v1,v2])
y = np.matrix(input_data['y']).T

# Solve the linear system directly instead of forming the inverse explicitly:
# same solution, but cheaper and numerically more accurate.
params = np.linalg.solve(X.T*X, X.T*y)
params


matrix([[2.52597752],
        [0.55012504]])

## Exercises

## Estimating regression coefficients from real dataset

Can you calculate the regression coefficients for a real dataset manually, without relying on a ready-made regression routine such as `LinearRegression`?

In [93]:
# Load the exercise dataset through the project helper and preview its first rows.
# NOTE(review): presumably the Boston housing data (feature columns plus a
# `target` column, as the preview suggests) — confirm in utils.get_boston_data.
raw_data = ut.get_boston_data()
raw_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
