# Under the hood

In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    SGDClassifier,
    SGDRegressor,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR

In [2]:
# Toy dataset: three observations of weight with the matching height.
data = pd.DataFrame(
    {
        'weight': [0.7, 2.4, 2.8],
        'height': [1.5, 1.8, 3.2],
    }
)

In [3]:
# Display the frame: three rows with the 'weight' and 'height' columns.
data

Unnamed: 0,weight,height
0,0.7,1.5
1,2.4,1.8
2,2.8,3.2


In [4]:
from sklearn.linear_model import LinearRegression

In [6]:
# Fit a linear regression predicting height from weight.
# The feature is passed as a one-column frame (2-D), the target as a Series;
# fit() returns the estimator itself, so it can be bound in a single statement.
model = LinearRegression().fit(data[['weight']], data['height'])

In [7]:
# Report the fitted parameters: the intercept, then the single slope coefficient.
for label, value in (
    ('beta_0 intercept =', model.intercept_),
    ('beta_1 slope =', model.coef_[0]),
):
    print(label, value)

beta_0 intercept = 0.9434316353887398
beta_1 slope = 0.6219839142091154


### 1D Descent

In [10]:
# Feature and target columns used by the descent walk-through.
X, y = data['weight'], data['height']

# Slope held constant at 0.64 so that only the intercept is learned.
b1 = 0.64
# Learning rate (step size), also held constant.
eta = 0.1

def h(x, b0):
    """Hypothesis function: the line with intercept ``b0`` evaluated at ``x``.

    The slope is not a parameter; it is read from the notebook-level ``b1``.
    """
    return b1*x + b0

# Start the search from an intercept of 0 for this example.
b0_epoch0 = 0

# L(b0_epoch0): sum of squared residuals at the starting intercept.
((y - h(X, b0_epoch0)) ** 2).sum()

3.1588640000000012

In [11]:
# Step 1: derivative of the loss with respect to the intercept, at b0_epoch0.
# Each data point contributes -2 * (y - h(X, b0)); the terms are summed.
derivative = (-2 * (y - h(X, b0_epoch0))).sum()
derivative

-5.448

In [12]:
# Step 2: move the intercept against the derivative, scaled by the learning rate.
b0_epoch1 = b0_epoch0 - eta*derivative
b0_epoch1

0.5448000000000001

In [13]:
# Step 1 again: evaluate the derivative at the intercept from the first update.
derivative = (-2 * (y - h(X, b0_epoch1))).sum()

# Step 2 again: apply one more update to the previously updated intercept.
b0_epoch2 = b0_epoch1 - (eta * derivative)
b0_epoch2

0.7627200000000002

### SGD

In [14]:
b0 = 0       # Starting intercept
eta = 0.1    # Learning rate
n_epoch = 5  # We have to choose when to stop

for epoch in range(n_epoch):
    # Visit every data point once per epoch, in a fresh random order.
    # len(X) (rather than a literal 3) keeps the loop in step with the data.
    for i in np.random.permutation(len(X)):

        # Select a minibatch (of size 1). `i` is a position, so use .iloc:
        # it stays correct even if the Series index is not 0..n-1.
        X_mini = X.iloc[i]

        # Derivative of this single point's squared error with respect to b0
        y_pred = h(X_mini, b0)
        y_true = y.iloc[i]
        derivative = -2 * (y_true - y_pred)

        # Update b0 and log it (one line per minibatch, tagged with its epoch)
        b0 = b0 - eta * derivative
        print(f'b0 epoch {epoch}:', b0)

b0 epoch 0: 0.05280000000000001
b0 epoch 0: 0.32384000000000013
b0 epoch 0: 0.4694720000000001
b0 epoch 1: 0.4283776000000001
b0 epoch 1: 0.55310208
b0 epoch 1: 0.7240816640000002
b0 epoch 2: 0.7896653312000002
b0 epoch 2: 0.6845322649600001
b0 epoch 2: 0.8292258119680002
b0 epoch 3: 0.9449806495744002
b0 epoch 3: 0.9663845196595202
b0 epoch 3: 0.8259076157276162
b0 epoch 4: 0.871126092582093
b0 epoch 4: 0.9785008740656744
b0 epoch 4: 0.8356006992525395


In [20]:
from sklearn.linear_model import SGDRegressor, LinearRegression

# Two estimators for the same least-squares problem:
#   lin_reg     -> OLS solved by matrix inversion (SVD method)
#   lin_reg_sgd -> OLS solved by SGD (squared-error loss)
lin_reg, lin_reg_sgd = LinearRegression(), SGDRegressor(loss='squared_error')

In [21]:
from sklearn.datasets import make_regression

# Create a "fake problem" to solve. random_state pins the generated data so a
# Restart & Run All fits the same dataset every time.
# NOTE: this rebinds X and y, which the gradient-descent cells earlier in the
# notebook defined as the weight/height columns.
X, y = make_regression(n_samples=10000, n_features=1000, random_state=42)

In [22]:
%%time
# Time fitting the LinearRegression estimator on the make_regression data.
lin_reg.fit(X,y);

CPU times: user 6.53 s, sys: 5.75 s, total: 12.3 s
Wall time: 1.86 s


In [23]:
%%time
# Time fitting the SGDRegressor estimator on the same data, for comparison.
lin_reg_sgd.fit(X,y);

CPU times: user 322 ms, sys: 0 ns, total: 322 ms
Wall time: 317 ms


### Regressors

In [None]:
# Regression estimators, each built with its default settings
LinearRegression() # OLS regression
KNeighborsRegressor() # KNN
SVR() # Support Vector Regressor

In [None]:
# SGD-trained regressors: the loss argument selects which model is fitted
SGDRegressor(loss='squared_error') # eq. to OLS regression
SGDRegressor(loss='huber') # non-OLS linear regression

### Classifiers

In [None]:
# Classification estimators, each built with its default settings
LogisticRegression() # Logit regression
KNeighborsClassifier() # KNN
SVC() # Support Vector Classifier

In [None]:
# SGD-trained classifiers: the loss argument selects which model is fitted
SGDClassifier(loss='log_loss') # eq. to Logit
SGDClassifier(loss='hinge') # eq. to a linear SVC

### Select **hyper-parameters**

In [None]:
# learning_rate names the step-size schedule (a string); eta0 is the initial step size
SGDRegressor(loss='squared_error', learning_rate='constant', eta0=0.01)
KNeighborsRegressor(n_neighbors=5)
# Valid Newton-type solver names include 'newton-cg'
LogisticRegression(solver='newton-cg')