# Notebook for training methods. (Theoretical)

## Linear Regression : Normal Equation Method and Gradient Descent

### Normal Equation

In [1]:
import numpy as np

In [2]:
# Synthetic linear data set: y = 4 + 3 * x + Gaussian noise.
# np.random.rand samples uniformly from [0, 1), so the feature lies in [0, 2),
# the same range that X_new and the plot window cover further down.
X = 2 * np.random.rand(100, 1)
# np.random.randn samples from the standard normal distribution; it supplies the Gaussian noise.
y = 4 + 3 * X + np.random.randn(100, 1)

In [3]:
# Design matrix: prepend a bias column of ones (x0 = 1) to every instance.
# The row count is read from X so it cannot drift from the data-set size.
X_b = np.c_[np.ones((len(X), 1)), X]

In [4]:
# Normal equation: theta = (X^T X)^-1 X^T y  (np.linalg is NumPy's linear-algebra module).
# Solve the system (X^T X) theta = X^T y directly instead of forming the inverse:
# it yields the same theta but is cheaper and less sensitive to round-off.
theta_best = np.linalg.solve(X_b.T.dot(X_b), X_b.T.dot(y))

In [5]:
# Fitted parameters as a column [intercept, slope]; the data were generated with 4 and 3.
theta_best

array([[ 4.23330168],
       [ 3.02152859]])

In [6]:
# Predict on two new inputs, x = 0 and x = 2.
X_new = np.array([[0], [2]])
# Prepend the bias column of ones (np.c_ concatenates column-wise).
# The column is sized from X_new rather than hard-coding the number of rows.
X_new_b = np.c_[np.ones((len(X_new), 1)), X_new]
y_predict = X_new_b.dot(theta_best)
y_predict  # the same linear prediction (intercept + X . coef) that sklearn's LinearRegression.predict evaluates

array([[  4.23330168],
       [ 10.27635886]])

In [7]:
import matplotlib.pyplot as plt

# Draw the fitted line over the training points on an explicit Axes so the
# figure carries its own labels and legend.
fig, ax = plt.subplots()
ax.plot(X_new, y_predict, "r-", label="Normal-equation prediction")
ax.plot(X, y, "b.", label="Training data")
ax.set_xlabel("X")
ax.set_ylabel("y")
ax.set_title("Linear regression fit")
ax.axis([0, 2, 0, 15])  # x window matches the X_new range [0, 2]
ax.legend(loc="upper left")
plt.show()

<matplotlib.figure.Figure at 0x7a1e978>

In [8]:
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the raw feature X (no manual bias
# column is passed) and predict on the same two new inputs.
lin_reg = LinearRegression().fit(X, y)
lin_reg.predict(X_new)  # matches the prediction computed by hand from theta_best

array([[  4.23330168],
       [ 10.27635886]])

### Gradient Descent - Batch, Stochastic, Mini-Batch

#### Batch Gradient Descent

In [9]:
eta = 0.1  # learning rate
n_iterations = 1000
m = len(X_b)  # number of training instances, read from the data instead of hard-coded

# Random initialisation: one parameter per column of the design matrix.
theta = np.random.randn(X_b.shape[1], 1)

for iteration in range(n_iterations):
    # Gradient of the mean squared error over the whole training set:
    # (2 / m) * X^T (X theta - y)
    gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients

In [10]:
theta # parameters [intercept, slope] after the batch gradient descent loop

array([[ 4.23330168],
       [ 3.02152859]])

#### Stochastic Gradient Descent (SGD)
##### Picks one random training instance per step and computes the gradient from that instance alone

In [11]:
# Settings for the stochastic gradient descent run.
n_epochs = 50  # number of outer-loop epochs
t0 = 5         # learning-schedule hyperparameter: numerator
t1 = 50        # learning-schedule hyperparameter: offset added to the step count


def learning_schedule(t):
    """Return the learning rate for step ``t``, computed as ``t0 / (t + t1)``.

    Reads the module-level hyperparameters ``t0`` and ``t1``. The rate is
    ``t0 / t1`` at ``t = 0`` and gets smaller as ``t`` grows.
    """
    denominator = t + t1
    return t0 / denominator

# Start from a random (2, 1) parameter vector.
theta = np.random.randn(2, 1)

for epoch in range(n_epochs):
    for i in range(m):
        # Draw one training row at random (with replacement); the one-row
        # slices keep xi and yi two-dimensional.
        pick = np.random.randint(m)
        xi, yi = X_b[pick:pick + 1], y[pick:pick + 1]
        # Gradient of the squared error on that single instance.
        residual = xi.dot(theta) - yi
        gradients = 2 * xi.T.dot(residual)
        # Step size comes from the schedule, indexed by the global step count.
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients

In [12]:
theta # SGD estimate of [intercept, slope]; close to the batch GD result but not identical

array([[ 4.21472835],
       [ 2.9948032 ]])

##### SGD using SciKit Learn's SGDRegressor class

In [13]:
from sklearn.linear_model import SGDRegressor

# Linear model trained by SGD: at most 50 iterations, no regularisation
# penalty, initial learning rate 0.1.
sgd_reg = SGDRegressor(
    eta0=0.1,
    max_iter=50,
    penalty=None,
)
# The target is passed as a flat 1-D array, hence the ravel() on the column vector y.
sgd_reg.fit(X, y.ravel())

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.1,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=50, n_iter=None, penalty=None,
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [14]:
sgd_reg.intercept_, sgd_reg.coef_ # fitted intercept and slope; similar to the values found above

(array([ 4.19612562]), array([ 3.04420695]))

#### Mini-Batch Gradient Descent: computes each gradient step on a small random subset (mini-batch) of the training set
##### TO-DO - code