# Linear Regression from Scratch
The steps we will follow are:
1. Generate a dataset with some random noise
2. Use sklearn linear regression as benchmark
3. Implement linear regression from scratch and compare

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
seed = 7
np.random.seed(seed)

### Generate a dataset with random noise

In [2]:
from sklearn.datasets.samples_generator import make_regression
x,y = make_regression(n_samples = 200, n_features= 3, n_informative=1, random_state=0, noise=30)
print('x shape = ', x.shape, 'y shape = ', y.shape)

x shape =  (200, 3) y shape =  (200,)


### Benchmark with sklearn Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x,y)
print(lin_reg.coef_)
print(lin_reg.intercept_)
pred_y = lin_reg.predict(x)
mean_squared_error(y, pred_y)

[56.03321382  3.66237626 -2.01575327]
-0.862435693597918


866.5524414508043

### Linear Regression from scratch

The hypothesis is given by, 
${H_\theta }(X) = X{\theta}$

where, $\theta _0$ and $\theta _1$ are model values to be found

The cost function to be minimized is given by,
$J(\theta ) = \frac{1}{{2m}}{\left( {X\theta  - y} \right)^2}$

In [4]:
class CustomLinReg():
    def __init__(self, X, y, learning_rate, max_iter = 1000):
        np.random.seed(42)
        self.X, self.y, self.lr, self.max_iter = X, y, learning_rate, max_iter
        # Add column of random constants to X
        self.X = np.column_stack((np.ones(len(self.X)), self.X))
        # Pre-processing of Data
        scaler = StandardScaler()
        scaler.fit(self.X)
        self.X = scaler.transform(self.X)
        # Initialization of Theta in Hypothesis
        self.theta = np.random.rand(self.X.shape[1])
        self.gradient_descent()
    
    def gradient_descent(self):
        # Find number of rows
        m = len(y)
        
        # Calculate Initial Cost
        J = self.compute_cost(self.X, self.y, self.theta)
        
        # Minimize cost by gradient descent
        for i in range(self.max_iter):
            grad = (1/m) * np.matmul(self.X.T, np.matmul(self.X, self.theta) - y)
            self.theta = self.theta - self.lr * grad
        
    def compute_cost(self, X, y, theta):
        return np.sum(np.square(np.matmul(X, theta) - y) / (2*len(y)))        

In [5]:
lin_reg = CustomLinReg(x,y, 0.01)
lin_reg.theta

array([ 0.37454012, 57.13387403,  3.54547456, -2.04015725])

In [6]:
x = np.column_stack((np.ones(len(x)), x))

In [7]:
y_pred = np.matmul(x, lin_reg.theta)

In [8]:
mean_squared_error(y, y_pred)

869.476164076133

This mean squared error is in close agreement with that of sklearn's Linear Regression