In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

In [25]:
df = pd.read_csv("https://raw.githubusercontent.com/campusx-official/linear-regression-assumptions/refs/heads/main/data.csv")
df.head()

Unnamed: 0,feature1,feature2,feature3,target
0,-0.570563,1.420342,0.49558,-9.763182
1,-0.990563,0.556965,1.045064,-24.029355
2,-0.674728,0.150617,1.774645,45.616421
3,0.38825,-0.387127,-0.110229,34.135737
4,1.167882,-0.024104,0.145063,86.663647


In [26]:
X = df.drop("target",axis=1).values
y = df["target"].values

# Batch Gradient Descent

In [35]:

class MultipleLinearRegressionGD:

    def __init__(self, epochs=1000, learning_rate=0.01):
        self.intercept = 0  # Bias term
        self.coeffs = None  # Coefficients for features
        self.epochs = epochs
        self.lr = learning_rate

    def fit(self, X, y):
        # Initialize coefficients to zeros (or small random values)
        self.coeffs = np.ones(X.shape[1])
        n = len(X)  # Number of samples

        for i in range(self.epochs):
            # Predictions
            y_pred = self.intercept + np.dot(X, self.coeffs)
            
            # Errors
            error = y - y_pred

            # Gradients
            dm = (-2/n) * np.dot(X.T, error)  # Gradient for coefficients
            db = (-2/n) * np.sum(error)       # Gradient for intercept

            # Update parameters
            self.intercept -= self.lr * db
            self.coeffs -= self.lr * dm

        print("Weights are:")
        print(self.coeffs)

        print("-----------")

        print("Bias is:")
        print(self.intercept)

    def predict(self, X):
        # Include the intercept in predictions
        return self.intercept + np.dot(X, self.coeffs)

In [36]:
mlr = MultipleLinearRegressionGD(epochs=100,learning_rate=0.1)
mlr.fit(X,y)

Weights are:
[ 7.39943877e+01 -4.15645712e-02  5.38326058e+01]
-----------
Bias is:
0.2502409375138604


In [37]:
pred = mlr.predict(X)
r2_score(y,pred)

0.9605512034455976

# Stochastic Gradient Descent

N Rows:

Updates

=> 1 Epoch : N Updates

=> 2 Epoch : N Updates

and so on

We dont have to load all data at once on machine as we did in batch GD, so, memory required will be low. And faster because it requires less number of epochs. Since we are updating whole data in each epoch. It means the time complexity is O(epoch*rows).

Stochastic means having a random probability distribution or pattern that may be analysed statistically but may not be predicted precisely. So we pick randomly rows in each epochs. So we will have noises while convergence. So we use this when we have big data. But if the dataset is small we should use Batch GD as it is not noisy.

In [78]:
X,y = make_regression(n_samples=10000,n_features=20)


In [81]:

class MultipleLinearRegressionSGD:

    def __init__(self, epochs=1000, learning_rate=0.01):
        self.intercept = 0  # Bias term
        self.coeffs = None  # Coefficients for features
        self.epochs = epochs
        self.lr = learning_rate

    def fit(self, X, y):
        # Initialize coefficients to zeros (or small random values)
        self.coeffs = np.ones(X.shape[1])
        n = len(X)  # Number of samples

        for i in range(self.epochs):
            for j in range(n):
                random_idx = np.random.randint(0,n)
                # Prediction
                y_pred = self.intercept + np.dot(X[random_idx], self.coeffs)
                
                # Errors
                error = y[random_idx] - y_pred

                # Gradients
                dm = np.dot(X[random_idx].T, error)  # Gradient for coefficients
                db = error      # Gradient for intercept

                # Update parameters
                self.intercept -= self.lr * db
                self.coeffs -= self.lr * dm

        print("Weights are:")
        print(self.coeffs)

        print("-----------")

        print("Bias is:")
        print(self.intercept)

    def predict(self, X):
        # Include the intercept in predictions
        return self.intercept + np.dot(X, self.coeffs)

In [82]:
mlr = MultipleLinearRegressionSGD()
mlr.fit(X,y)

Weights are:
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan]
-----------
Bias is:
nan


In [74]:
pred = mlr.predict(X)
r2_score(y,pred)

  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


-inf

The issue we're encountering, where the weights and bias become NaN, is likely due to exploding gradients caused by a combination of:

High Learning Rate:

If the learning rate is too large, the updates to the weights and bias can become extremely large, leading to numerical instability and NaN values.

Unscaled Data:

The make_regression dataset may have features with very different scales. If the features are not normalized, gradients can become very large, causing updates to diverge.

Large Dataset Size:

With n_samples=10000 and n_features=20, the gradients can accumulate to very large values if the learning rate is not appropriately scaled.

In [99]:

class MultipleLinearRegressionSGD:
    def __init__(self, epochs=1000, learning_rate=0.001):
        self.intercept = 0  # Bias term
        self.coeffs = None  # Coefficients for features
        self.epochs = epochs
        self.lr = learning_rate

    def fit(self, X, y):
        self.coeffs = np.ones(X.shape[1])
        n = len(X)  # Number of samples

        for epoch in range(self.epochs):
            for j in range(n):
                random_idx = np.random.randint(0, n)
                # Prediction
                y_pred = self.intercept + np.dot(X[random_idx], self.coeffs)
                
                # Errors
                error = y[random_idx] - y_pred

                # Gradients
                dm = -2*np.dot(X[random_idx].T, error)  # Gradient for coefficients
                db = -2*error  # Gradient for intercept

                # Update parameters
                self.intercept -= self.lr * db
                self.coeffs -= self.lr * dm

        print("Weights are:")
        print(self.coeffs)

        print("-----------")

        print("Bias is:")
        print(self.intercept)

    def predict(self, X):
        return self.intercept + np.dot(X, self.coeffs)


In [100]:
X,y = make_regression(n_samples=1000,n_features=20)

# Normalize data
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Train model
mlr = MultipleLinearRegressionSGD(learning_rate=0.01)
mlr.fit(X_normalized, y)

Weights are:
[ 9.27834005e-16  8.33551620e-15  9.32293671e+01  9.27498527e+01
 -4.84832148e-16 -5.03139797e-15  8.35958806e+01  4.45411762e+01
  8.34933559e+01 -7.47850383e-16  9.38078963e+01  1.31446443e-15
  4.15273702e+01 -5.48335925e-15  4.22029328e-15  4.97857528e+01
 -6.61007125e-15  3.15802527e-16  9.62359244e+01  2.58791695e+01]
-----------
Bias is:
9.76716249329481


In [101]:
pred = mlr.predict(X)
r2_score(y,pred)

0.9976797624493334

# Mini Batch Gradient Descent

In [93]:
random_idx = np.random.randint(0, 100,10)

array([30, 47, 98,  2, 77, 83, 72,  4, 10, 14], dtype=int32)

In [95]:
X[random_idx].shape

(10, 20)

In [106]:


class MultipleLinearRegressionSGD:
    def __init__(self, epochs=1000, learning_rate=0.01,batch_size=32):
        self.intercept = 0  # Bias term
        self.coeffs = None  # Coefficients for features
        self.epochs = epochs
        self.lr = learning_rate
        self.batch_size = batch_size

    def fit(self, X, y):
        self.coeffs = np.ones(X.shape[1])
        n = len(X)  # Number of samples


        for epoch in range(self.epochs):
            for j in range(self.batch_size):
                random_idx = np.random.randint(0, n,self.batch_size)
                # Prediction
                y_pred = self.intercept + np.dot(X[random_idx], self.coeffs)
                
                # Errors
                error = y[random_idx] - y_pred

                # Gradients and dividing by batch size
                dm = (-2/self.batch_size) * np.dot(X[random_idx].T, error)  # Gradient for coefficients
                db = (-2/self.batch_size) * np.sum(error)  # Gradient for intercept

                # Update parameters
                self.intercept -= self.lr * db
                self.coeffs -= self.lr * dm

        print("Weights are:")
        print(self.coeffs)

        print("-----------")

        print("Bias is:")
        print(self.intercept)

    def predict(self, X):
        return self.intercept + np.dot(X, self.coeffs)

In [107]:
X,y = make_regression(n_samples=1000,n_features=20)

# Normalize data
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Train model
mlr = MultipleLinearRegressionSGD(learning_rate=0.01)
mlr.fit(X_normalized, y)

Weights are:
[-2.36537569e-15  7.18634488e+01 -9.14247880e-15  8.30346930e-15
 -8.45705060e-15  4.64663413e+01  1.04968913e-14 -2.05800740e-18
  3.68358306e+01 -2.15138824e-14  9.27134037e+01  2.85590424e+01
  3.33374419e+00  8.59081724e+01  6.34505521e+01  2.30477572e-15
  5.78385515e+01  1.08658237e-14 -7.84949272e-15  2.67546373e+01]
-----------
Bias is:
9.009016433945385


In [108]:
pred = mlr.predict(X)
r2_score(y,pred)

0.9971572633209864