# Stochastic Gradient Descent - Linear Regression

## Data for the linear regression model

In [5]:
import numpy as np

In [24]:
# Data points
data_amount = 15
max_num = 10
X = np.random.randint(max_num, size=(data_amount, 3))

# We generate them by "knowing" the output weights for this example (this is not the case for real data!)
final_weights = np.random.rand(X.shape[1])
final_weights = final_weights / np.sum(final_weights)

print (final_weights)

final_bias = 0.2

# Corresponding labels
random_noise = np.random.rand(X.shape[0]) / 7.5 # ranges from 0-1. We divide that by 7.5 to not get to much noise in here
y = np.dot(final_weights, X.T) + final_bias + random_noise

# print('data set X\n', X)
# print('labels y\n', y)

[0.28775241 0.42190513 0.29034247]


# Training and test data

In [28]:
train_len = int(data_amount * 0.75)

# We train with the following data
X_train = X[:train_len]
y_train = y[:train_len]

# We test / evaluate with the following data
X_test = X[train_len:]
y_test = y[train_len:]

## Information about the model

In [20]:
# We set the inital weights randomly
weights = np.random.rand(X.shape[1])

# The bias value is set to 1 initially
bias = np.array([1])

[1]


### Some more information

We know the regression equation:

$y_{pred}= w_1x_1 + w_2x_2 + \ldots + w_nx_n + b$

In [10]:
# What are the current results of the untrained model?
y_untrained = np.dot(weights, X_test.T) + bias
print('Outputs for our untrained model:', y_untrained)

# What are the results of the final model (that we want to achieve by updating the weights by the Stochastic gradient descent method)
y_final = np.dot(final_weights, X_test.T) + final_bias
print('Outputs for the final model:', y_final)

Outputs for our untrained model: [9.33558385 9.51887893 7.25460812 6.93375236]
Outputs for the final model: [5.1272694  6.84051433 7.11154746 4.6422118 ]


### Loss function

We want to use the mean squarred error to calculate the loss for the model outputs which is defined as follows:

$$MSE = \frac{1}{n}\sum_{i=1}^n (y_i-y_{i_{pred}})^2$$

In [11]:
mse = lambda y, y_pred: np.mean(np.sum((y-y_pred)**2))

In [12]:
# In our example the loss for our untrained model is:
loss_untrained = mse(y_test, y_untrained)
print('The loss of the untrained model is:', loss_untrained)

# Loss for the final model
loss_final = mse(y_test, y_final)
print('The loss of the final model is:', loss_final)

The loss of the untrained model is: 29.10222559645061
The loss of the final model is: 0.01919505721180261


## Your stochastic gradient descent implementation to optimize the weights of your model

In [None]:
# Summary on what we know so far:

# We know the loss function: Variable 'mse' (Mean squared error)
# We know the initial weights that we want to optimize: variable 'weights'
# We know the initial bias value: variable 'bias'

In [40]:
# Use the training data to optimize the weights of the linear regression model

# use these variables for your sgd implementation
learning_rate = 0.005
iterations = 1000

# YOUR CODE FOR THE STOCHASTIC GRADIENT DESCENT IMPLEMENTATION

w_1 = weights[0]
w_2 = weights[1]
w_3 = weights[2]
b = bias[0]

for iteration in range(iterations):
    sum_of_squared_errors = 0

    for row in range (len(X)):
        x_1 = X[row][0]
        x_2 = X[row][1]
        x_3 = X[row][2]
        y_i = y[row]
        y_i_hat = w_1 * (x_1) + w_2 * (x_2) + w_3 * (x_3) + b

        squared_loss = (y_i_hat - y_i) ** 2

        sum_of_squared_errors += squared_loss

        m_L_wrt_yhat = 2 * (y_i_hat - y_i)

        m_yhat_wrt_w1 = x_1
        m_yhat_wrt_w2 = x_2
        m_yhat_wrt_w3 = x_3
        m_yhat_wrt_b = 1

        m_L_wrt_w1 = m_L_wrt_yhat * m_yhat_wrt_w1
        m_L_wrt_w2 = m_L_wrt_yhat * m_yhat_wrt_w2
        m_L_wrt_w3 = m_L_wrt_yhat * m_yhat_wrt_w3
        m_L_wrt_b = m_L_wrt_yhat * m_yhat_wrt_b

        w_1 = w_1 - learning_rate * m_L_wrt_w1
        w_2 = w_2 - learning_rate * m_L_wrt_w2
        w_3 = w_3 - learning_rate * m_L_wrt_w3
        b = b - learning_rate * m_L_wrt_b

print(f'final: w_1={w_1}, w_2={w_2}, w_3={w_3}, b={b}, loss={squared_loss}')
    
mean_sum_of_squared_errors = sum_of_squared_errors / (len(X))

final: w_1=0.39277362991338244, w_2=0.37345856409997974, w_3=0.23882845718936865, b=0.276592183895827, loss=0.0016721257521136118


## Compare the results with the Test data

In [23]:
# Assuming you have your test data in variables X_test and y_test
sum_of_squared_errors_test = 0

for row in range(len(X_test)):
    x_1_test = X_test[row][0]
    x_2_test = X_test[row][1]
    x_3_test = X_test[row][2]
    y_i_test = y_test[row]
    y_i_hat_test = w_1 * x_1_test + w_2 * x_2_test + w_3 * x_3_test + b

    squared_loss_test = (y_i_hat_test - y_i_test) ** 2

    sum_of_squared_errors_test += squared_loss_test

mean_sum_of_squared_errors_test = sum_of_squared_errors_test / len(X_test)

print(f'Test Loss: {mean_sum_of_squared_errors_test}')

# Compare with training loss
print(f'Training Loss: {mean_sum_of_squared_errors}')

Test Loss: 0.006285252550242331
Training Loss: 0.0034051217229112358
