In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score

#                                              Introduction (Why is Mini Batch Gradient Descent needed?)
```js
        Mini Batch Gradient Descent is used here for Multiple Linear Regression, where we have m Input Columns and 1 Output Column. To understand it, first get a firm grasp of 'Batch Gradient Descent', i.e. the Vanilla Gradient Descent.
        
        Inside an epoch or each epoch :
                i)  'Batch Gradient Descent' : Update ALL THE COEFFICIENTS(B1, B2 ... Bm) and INTERCEPT(B0) after traversing the whole Input Dataset(x_train, y_train).
                ii) 'Stochastic Gradient Descent' : Update ALL THE COEFFICIENTS(B1, B2 ... Bm) and INTERCEPT(B0) for "A RANDOM ROW of the input dataset(x_train, y_train)" by manually traversing the input dataset using loop.

                iii) 'Mini Batch Gradient Descent' : Update ALL THE COEFFICIENTS(B1, B2 ... Bm) and INTERCEPT(B0) for "A SPECIFIC NUMBER OF RANDOM ROWS of the input dataset(x_train, y_train)" by manually traversing the input dataset using a loop.
                      For example, batch_size = 10 and total number of rows = 353. int(353 / 10) = 35, so we get 35 batches per epoch from 353 rows. We run the nested loop 35 times and each time randomly pick 10 row indexes (since batch_size = 10). Now for those 10 random indexes :

                                Assume x_train_subset = x_train[row_indexes] and y_train_subset = y_train[row_indexes], where row_indexes holds the 10 randomly picked indexes.
                                Now we update all the COEFFICIENTS(B1, B2 ... Bm) and the INTERCEPT(B0) using only this subset, i.e. "x_train_subset and y_train_subset".
                                
                                Code :
                                ------
                                for _ in range(self.epochs):
                                    batch_numbers = x_train.shape[0] // self.batch_size
                                    for _ in  range(batch_numbers):
                                        row_indexes = np.random.randint(low = 0, high = x_train.shape[0], size = self.batch_size)
                                        // Now update COEFFICIENTS and INTERCEPT for x_train[row_indexes] and y_train[row_indexes].
        
        Purpose :
        ---------
        1) Minimize the Loss Function by finding its 'minima'. In our case, that means minimizing the Loss Function (MSE) of Linear Regression.

        2) Watch [https://youtu.be/_scscQ4HVTY?si=WHZmMD5Eoq_TmEze&t=1001] from 16:41 to 19:00.

        3) From Medium.com (https://medium.com/data-science/batch-mini-batch-stochastic-gradient-descent-7a62ecba642a) :
           Batch Gradient Descent can be used for smoother curves. SGD can be used when the dataset is large. Batch Gradient Descent converges directly to minima. SGD converges faster for larger datasets. But, since in SGD we use only one example at a time, we cannot implement the vectorized implementation on it. This can slow down the computations. To tackle this problem, a mixture of Batch Gradient Descent and SGD is used.
           Neither we use all the dataset all at once nor we use the single example at a time. We use a batch of a fixed number of training examples which is less than the actual dataset and call it a mini-batch. Doing this helps us achieve the advantages of both the former variants we saw.
        
```

#                                                     Code Implementation

In [2]:
# Load the diabetes dataset as plain numpy arrays: X.shape = (442, 10), Y.shape = (442,).
X, Y = load_diabetes(return_X_y=True)

# Hold out 20% of the rows for evaluation.
# Resulting shapes: (353, 10), (89, 10), (353,), (89,).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Ordinary-least-squares baseline; its coefficients and intercept are the
# reference values the gradient-descent models below are compared against.
lr = LinearRegression().fit(x_train, y_train)
m = lr.coef_
b = lr.intercept_

print(f"Coefficients = \n{m}, \n\nIntercept = {b}\n")

print(f"Predictions for the First 5 values = \n{lr.predict(x_test[:5])}\n")

print(f"r2 score = {r2_score(y_true=y_test, y_pred=lr.predict(x_test))}")

Coefficients = 
[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238], 

Intercept = 151.88331005254167

Predictions for the First 5 values = 
[154.1213881  204.81835118 124.93755353 106.08950893 258.5348576 ]

r2 score = 0.4399338661568968


In [None]:
class MBGDRegressor:
    """Multiple linear regression trained with mini-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    epochs : int
        Number of outer passes; each pass performs ``n_rows // batch_size``
        updates (at least one).
    batch_size : int
        Number of rows drawn (with replacement) for each update.
    random_state : int or None, default None
        Seed for the row sampling. ``None`` keeps using numpy's global random
        state, so an earlier ``np.random.seed(...)`` call is still honoured.

    Attributes
    ----------
    coef_ : np.ndarray of shape (n_features,) or None
        Learned coefficients B1..Bm; ``None`` until ``fit`` has run.
    intercept_ : float or None
        Learned intercept B0; ``None`` until ``fit`` has run.
    """

    def __init__(self, learning_rate: float, epochs: int, batch_size: int, random_state: "int | None" = None):
        self.coef_ = None
        self.intercept_ = None
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state

    def fit(self, x_train: np.ndarray, y_train: np.ndarray):
        """Learn the intercept and coefficients from the training data.

        Parameters
        ----------
        x_train : array-like of shape (n_rows, n_features)
        y_train : array-like of shape (n_rows,)

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1, the inputs are empty, or the
            shapes of ``x_train`` and ``y_train`` do not line up.
        """
        x_train = np.asarray(x_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)

        if self.batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {self.batch_size}")
        if x_train.ndim != 2:
            raise ValueError(f"x_train must be 2-D (rows, features), got shape {x_train.shape}")

        n = x_train.shape[0]  # Row numbers.
        if n == 0:
            raise ValueError("x_train must contain at least one row")
        # A 2-D y would silently broadcast (b, 1) - (b,) into a (b, b) matrix below.
        if y_train.ndim != 1 or y_train.shape[0] != n:
            raise ValueError(
                f"y_train must have shape ({n},) to match x_train, got {y_train.shape}"
            )

        B0 = 0.0  # -> Intercept.
        B = np.ones(shape=(x_train.shape[1]))  # Coefficients B1..Bm, shape (n_features,).

        # Loop-invariant. max(1, ...) keeps training alive when batch_size > n;
        # otherwise the inner loop would run zero times and return the initial weights.
        batch_numbers = max(1, n // self.batch_size)

        # None -> module-level global RNG (legacy behaviour); int -> private, reproducible stream.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        for _ in range(self.epochs):
            for _ in range(batch_numbers):
                row_indexes = rng.randint(low=0, high=n, size=self.batch_size)  # (batch_size,).
                x_batch = x_train[row_indexes]
                residual = y_train[row_indexes] - (B0 + np.dot(x_batch, B))  # (batch_size,).

                # NOTE: the sum over the batch is divided by n (all training rows), not by
                # batch_size, so the effective step is learning_rate * batch_size / n.
                # Kept as-is so existing hyper-parameters behave exactly as before.
                slope_B0 = (-2 / n) * np.sum(residual)  # Scalar.
                slope_Bs = (-2 / n) * np.dot(residual, x_batch)  # (n_features,).

                B0 = B0 - self.learning_rate * slope_B0
                B = B - self.learning_rate * slope_Bs

        self.intercept_, self.coef_ = B0, B  # Scalar, (n_features,) shape.
        print(f"Coefficients = \n{self.coef_}, \n\nIntercept = {self.intercept_}\n")

    def predict(self, x_test: np.ndarray):
        """Return predictions of shape (k,) for ``x_test`` of shape (k, n_features)."""
        return np.dot(x_test, self.coef_) + self.intercept_

def main():
    """Fit the from-scratch mini-batch regressor on the training split and report test-set results."""
    hyper_params = {
        "learning_rate": 0.9,
        "epochs": 100_000,
        "batch_size": 10,
    }
    mbgdr = MBGDRegressor(**hyper_params)
    mbgdr.fit(x_train, y_train)  # fit() itself prints the learned coefficients and intercept.

    print(f"Predictions for the First 5 values = \n{mbgdr.predict(x_test[:5])}\n")

    print(f"r2 score = {r2_score(y_true=y_test, y_pred=mbgdr.predict(x_test))}")


if __name__ == "__main__":
    main()

# Reference note only: a copy of the scikit-learn LinearRegression output printed
# earlier in this notebook, kept as a string so it can be compared side by side
# with the output of this cell. Nothing in this cell reads the variable.
explanation = """
Sklearn's Linear Regression Model returned in above codeblock :   (Compare sklearn's output with our output)
----------------------------------------------------------------------------------
Coefficients = 
[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238],

Intercept = 151.88331005254167

Predictions for the First 5 values = 
[154.1213881  204.81835118 124.93755353 106.08950893 258.5348576 ]

r2 score = 0.4399338661568968

-----------------------------------------------------------------------------------
Sklearn's output and our output are almost close in most of the values. Anyway its not like we will implement this from scratch
since we will use sklearn(below).
"""

Coefficients = 
[  -8.81335946 -202.79510928  522.19250671  335.51992698 -873.33140848
  537.04111143  139.30591763  122.78211934  852.1468558    51.7089542 ], 

Intercept = 158.30531159558737

Predictions for the First 5 values = 
[159.74171465 210.95121661 131.50046921 111.98095187 265.69711083]

r2 score = 0.43558425148402025


#                                       Sklearn's Mini Batch Gradient Descent
```js
    Sklearn does not have a dedicated algorithm for this, but we can do it with SGDRegressor by feeding it one mini batch at a time through partial_fit. How? Watch [https://youtu.be/_scscQ4HVTY?si=cdvJ8TJj-q2Mf78E&t=1142] from the timestamp at which the link starts.
```

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score

from sklearn.linear_model import SGDRegressor

In [34]:
# One seed for the split, the mini-batch row sampling and SGDRegressor's internal
# shuffling, so that Restart & Run All reproduces the numbers printed below.
RANDOM_STATE = 2

X, Y = load_diabetes(return_X_y=True) # returns np.ndarray. X.shape = (442, 10), Y.shape = (442,).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_STATE) # (353, 10), (89, 10), (353,), (89,).

# NOTE: each loop iteration below feeds ONE random mini batch to partial_fit, so
# `epochs` here counts mini-batch updates, not full passes over x_train.
epochs, batch_size = 500, 20
batch_rng = np.random.default_rng(RANDOM_STATE)

# 'max_iter' does not apply when training through partial_fit: every partial_fit call
# performs a single pass of SGD over just the rows it is given.
mbgd = SGDRegressor(loss = 'squared_error', learning_rate = 'constant', eta0 = 0.2, random_state = RANDOM_STATE)

for _ in range(epochs):
    row_indexes = batch_rng.integers(low = 0, high = x_train.shape[0], size = batch_size) # (batch_size,), sampled with replacement.
    mbgd.partial_fit(X = x_train[row_indexes], y = y_train[row_indexes])

print(f"Coefficients = \n{mbgd.coef_}, \n\nIntercept = {mbgd.intercept_}\n")

print(f"Predictions for the First 5 values = \n{mbgd.predict(x_test[:5])}\n")

print(f"r2 score = {r2_score(y_true=y_test, y_pred=mbgd.predict(x_test))}")

# Reference note only: a copy of the scikit-learn LinearRegression output printed
# earlier in this notebook, kept as a string so it can be compared side by side
# with the SGDRegressor output of this cell. Nothing in this cell reads the variable.
explanation = """
Sklearn's Linear Regression Model returned in above codeblock :   (Compare sklearn's output with our output)
----------------------------------------------------------------------------------
Coefficients = 
[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238],

Intercept = 151.88331005254167

Predictions for the First 5 values = 
[154.1213881  204.81835118 124.93755353 106.08950893 258.5348576 ]

r2 score = 0.4399338661568968
"""

Coefficients = 
[   5.9427603  -186.60169712  481.94172852  347.38067392  -47.22740426
 -120.78600566 -210.89844745  112.66268496  524.1436238    80.7130849 ], 

Intercept = [149.69788186]

Predictions for the First 5 values = 
[148.28557345 198.85592977 123.38942604  99.69895936 270.25731618]

r2 score = 0.4422891772020714
