In [101]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score

#                                                   Introduction
```js
        The Gradient Descent used here is for Multiple Linear Regression, where we have m input columns and exactly 1 output column.

        The equations and intuitions are handwritten from scratch in "Gradient Descent for Multiple LR.pdf".

        NOTE: This is also called "Batch Gradient Descent" or "Vanilla Gradient Descent".
```

In [102]:
# Load the diabetes data as NumPy arrays: X.shape = (442, 10), Y.shape = (442,).
X, Y = load_diabetes(return_X_y=True)
# Turn the target into a column vector: Y.shape = (442, 1).
Y = Y.reshape(-1, 1)

# 80/20 split with a fixed seed -> (353, 10), (89, 10), (353, 1), (89, 1).
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)

# Reference model: scikit-learn's LinearRegression fitted on the training split.
# Its coefficients and intercept are the values the gradient-descent version is compared against.
lr = LinearRegression()
lr.fit(x_train, y_train)
m = lr.coef_
b = lr.intercept_
print(f"Coefficients = \n{m.reshape(m.shape[::-1])}, \n\nIntercept = {b}\n")

print(f"Predictions for the First 5 values = \n{lr.predict(x_test[:5])}\n")

print(f"r2 score = {r2_score(y_true=y_test, y_pred=lr.predict(x_test))}")

Coefficients = 
[[  -9.15865318]
 [-205.45432163]
 [ 516.69374454]
 [ 340.61999905]
 [-895.5520019 ]
 [ 561.22067904]
 [ 153.89310954]
 [ 126.73139688]
 [ 861.12700152]
 [  52.42112238]], 

Intercept = [151.88331005]

Predictions for the First 5 values = 
[[154.1213881 ]
 [204.81835118]
 [124.93755353]
 [106.08950893]
 [258.5348576 ]]

r2 score = 0.4399338661568968


In [None]:
#   Here B0 (intercept) and B (coefficients) are computed with Gradient Descent on the error/loss function, running for a fixed number of EPOCHS (i.e. up to a specified iteration limit).

class GDRegressor:
    """Multiple linear regression fitted with batch gradient descent.

    Every epoch computes predictions for the whole training set and takes one
    gradient step on the intercept and on all coefficients at once.

    Attributes:
        coef_: ``(n_features, 1)`` array of coefficients; ``None`` until ``fit`` runs.
        intercept_: intercept value; ``None`` until ``fit`` runs.
        learning_rate: step size applied to both gradients.
        epochs: number of full passes over the training data performed by ``fit``.
    """

    def __init__(self, learning_rate: float, epochs: int):
        self.coef_ = None
        self.intercept_ = None
        self.learning_rate = learning_rate
        self.epochs = epochs

    def fit(self, x_train: np.ndarray, y_train: np.ndarray):
        """Estimate the intercept and coefficients by running ``epochs`` gradient steps.

        Args:
            x_train: ``(n_samples, n_features)`` input matrix.
            y_train: targets, either ``(n_samples, 1)`` or ``(n_samples,)``.

        Side effects:
            Stores the result in ``self.intercept_`` / ``self.coef_`` and prints both.
        """
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        # A 1-D target would broadcast against the (n_samples, 1) predictions into an
        # (n_samples, n_samples) matrix and silently corrupt the gradients, so force
        # it into a column vector first.
        if y_train.ndim == 1:
            y_train = y_train.reshape(-1, 1)

        B0 = 0  # Intercept.
        B = np.ones(shape=(x_train.shape[1], 1))  # Coefficients, one row per feature: (n_features, 1).
        # NOTE(review): n is the number of feature columns, not the number of training
        # rows, so the slopes below are the summed-error gradients scaled by
        # 2 / n_features rather than the mean-squared-error gradients. This only
        # rescales the effective step size; it is kept unchanged so learning rates
        # already tuned for this class behave the same.
        n = x_train.shape[1]

        for _ in range(self.epochs):
            y_hat = B0 + np.dot(x_train, B)  # (n_samples, 1).
            residual = y_train - y_hat       # (n_samples, 1); shared by both slopes.

            slope_B0 = (-2 / n) * np.sum(residual)  # Scalar.
            # (1, n_features) from the dot product, transposed so it lines up with B.
            slope_Bs = ((-2 / n) * np.dot(residual.T, x_train)).T  # (n_features, 1).

            B0 = B0 - self.learning_rate * slope_B0
            B = B - self.learning_rate * slope_Bs

        self.intercept_, self.coef_ = B0, B
        print(f"Coefficients = \n{self.coef_}, \n\nIntercept = {self.intercept_}\n")

    def predict(self, x_test: np.ndarray):
        """Return predictions for ``x_test`` as an ``(n_rows, 1)`` column.

        ``self.coef_`` is ``(n_features, 1)`` and ``x_test`` is ``(n_rows, n_features)``.
        The shared dimension (``n_features``) has to sit in the middle of the product,
        so the computation is ``(1, n_features) @ (n_features, n_rows) = (1, n_rows)``,
        and the result is transposed to ``(n_rows, 1)`` for display.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if self.coef_ is None or self.intercept_ is None:
            # Same exception type the unfitted attributes used to trigger, with a clear message.
            raise AttributeError("GDRegressor is not fitted yet; call fit() before predict().")
        x_test = np.asarray(x_test)
        return (np.dot(self.coef_.T, x_test.T) + self.intercept_).T

def main():
    """Fit GDRegressor on the training split, then print test predictions and the R^2 score.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test`` arrays.
    """
    regressor = GDRegressor(epochs = 300000, learning_rate = 0.02) # 0.02, 300000
    regressor.fit(x_train, y_train)

    # Predictions for the first five test rows.
    first_five = regressor.predict(x_test[:5])
    print(f"Predictions for the First 5 values = \n{first_five}\n")

    # R^2 over the whole test split.
    score = r2_score(y_true=y_test, y_pred=regressor.predict(x_test))
    print(f"r2 score = {score}")

if __name__ == "__main__":
    main()

# Free-text note kept in a string: it records the scikit-learn results printed by the
# earlier reference cell so they can be compared with the GDRegressor output of this cell.
# The name is not referenced by any code visible in this notebook section.
explanation = """
At above codeblock, sklearn returned us :

Coefficients = 
[[  -9.15865318]
 [-205.45432163]
 [ 516.69374454]
 [ 340.61999905]
 [-895.5520019 ]
 [ 561.22067904]
 [ 153.89310954]
 [ 126.73139688]
 [ 861.12700152]
 [  52.42112238]], 

Intercept = [151.88331005]

Predictions for the First 5 values = 
[[154.1213881 ]
 [204.81835118]
 [124.93755353]
 [106.08950893]
 [258.5348576 ]]

r2 score = 0.4399338661568968

This output and our output are same except some difference in decimals.
"""

Coefficients = 
[[  -9.15830466]
 [-205.45292726]
 [ 516.69716303]
 [ 340.61931932]
 [-895.3830603 ]
 [ 561.08657372]
 [ 153.81793572]
 [ 126.70993046]
 [ 861.06383   ]
 [  52.4214399 ]], 

Intercept = 151.8833119378528

Predictions for the First 5 values = 
[[154.121059  ]
 [204.8186888 ]
 [124.93735432]
 [106.08877848]
 [258.53753532]]

r2 score = 0.43993516054291604


#                                               Cons of Batch Gradient Descent
```js
        for i in range(self.epochs):
            y_hat = B0 + np.dot(x_train, B) // (353, 1) shape.
            slope_B0 = (-2 / n) * np.sum(y_train - y_hat) // Scalar value.
            slope_Bs = (-2 / n) * np.dot((y_train - y_hat).T, x_train) // (1, 10) shape; reshaped to (10, 1) before the update below.

            B0 = B0 - self.learning_rate * slope_B0 // Scalar value.
            B  = B  - self.learning_rate * slope_Bs // (10, 1) shape.
        
    Cons of Batch Gradient Descent :
    --------------------------------
    All of these operations are vectorized, which means that, e.g., to calculate y_hat we load the whole x_train into memory and take the dot product between x_train and B. If x_train is large, e.g. 10_000 rows with 50 input columns, then holding such a dataset in memory all at once may give us an "out-of-memory error". We then do another dot product with x_train to compute 'slope_Bs', so we are doing a lot of computation in each epoch/iteration. Batch Gradient Descent therefore needs significant memory when the dataset is large.

    Stochastic Gradient Descent exists to address these issues.
```