In [None]:
'''
 * Copyright (c) 2004 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''


1. Defining the Model

Before we can begin optimizing our model's parameters by minibatch SGD, we need to have some parameters in the first place. In the following, we initialize weights by drawing random numbers from a normal distribution with mean 0 and a standard deviation of 0.01. The magic number 0.01 often works well in practice, but you can specify a different value through the argument `sigma`. Moreover, we set the bias to 0. Note that for object-oriented design, we add the code to the `__init__` method of a subclass of `d2l.Module` (introduced in Section 3.2.2).

```python
class LinearRegressionScratch(d.Module):
    #@save
    """The linear regression model implemented from scratch."""
    def __init__(self, num_inputs, lr, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        self.w = torch.normal(0, sigma, (num_inputs, 1), requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)
```

Next, we must define our model, relating its input and parameters to its output. Using the same notation in (3.1.4), for our linear model, we simply take the matrix-vector product of the input features `X` and the model weights `w`, and add the offset `b` to each example. `Xw` is a vector and `b` is a scalar. Due to the broadcasting mechanism (see Section 2.1.4), when we add a vector and a scalar, the scalar is added to each component of the vector. The resulting `forward` method is registered in the `LinearRegressionScratch` class via `add_to_class` (introduced in Section 3.2.1).

```python
@d.add_to_class(LinearRegressionScratch)
#@save
def forward(self, X):
    return torch.matmul(X, self.w) + self.b
```

2. Defining the Loss Function

Since updating our model requires taking the gradient of our loss function, we ought to define the loss function first. Here we use the squared loss function in (3.1.5). In the implementation, we need to transform the true value `y` into the predicted value's shape `y_hat`. The result returned by the following method will also have the same shape as `y_hat`. We also return the averaged loss value among all examples in the minibatch.

```python
@d.add_to_class(LinearRegressionScratch)
#@save
def squared_loss(self, y_hat, y):
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2


```


# Implementing Linear Regression from Scratch

## Defining the Loss Function

```python
@d.add_to_class(LinearRegressionScratch)  # @save
def loss(self, y_hat, y):
    l = (y_hat - y) ** 2 / 2
    return l.mean()
```

## Defining the Optimization Algorithm

As discussed in Section 3.1, linear regression has a closed-form solution. However, our goal here is to illustrate how to train more general neural networks, which requires us to use minibatch stochastic gradient descent (SGD).

At each step, using a minibatch randomly drawn from our dataset, we estimate the gradient of the loss with respect to the parameters. Then, we update the parameters in the direction that may reduce the loss.

### Stochastic Gradient Descent (SGD)
The following code defines the SGD optimizer:

```python
class SGD(d.HyperParameters):  # @save
    """Minibatch stochastic gradient descent."""
    def __init__(self, params, lr):
        self.save_hyperparameters()

    def step(self):
        for param in self.params:
            param -= self.lr * param.grad

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()
```

We next define the `configure_optimizers` method, which returns an instance of the `SGD` class.

```python
@d.add_to_class(LinearRegressionScratch)  # @save
def configure_optimizers(self):
    return SGD([self.w, self.b], self.lr)
```

## Training the Model

Now that we have all of the components (parameters, loss function, model, and optimizer), we are ready to implement the main training loop.

### Training Loop

At each epoch:

- Iterate through the entire dataset.
- Compute the loss on a minibatch.
- Compute the gradient.
- Update the parameters using the optimizer.

Mathematically, this can be represented as:

$$
\begin{aligned}
    &\text{Initialize parameters } (w, b) \\
    &\text{Repeat until convergence:} \\
    &\quad \text{Compute gradient: } g \leftarrow \frac{\partial}{\partial (w,b)} \sum_{i \in B} l(x^{(i)}, y^{(i)}, w, b) \\
    &\quad \text{Update parameters: } (w, b) \leftarrow (w, b) - \eta g
\end{aligned}
$$

The training function is implemented as follows:

```python
@d.add_to_class(d.Trainer)  # @save
def prepare_batch(self, batch):
    return batch

@d.add_to_class(d.Trainer)  # @save
def fit_epoch(self):
    self.model.train()
    for batch in self.train_dataloader:
        loss = self.model.training_step(self.prepare_batch(batch))
        self.optim.zero_grad()
        with torch.no_grad():
            loss.backward()
        if self.gradient_clip_val > 0:  # To be discussed later
            self.clip_gradients(self.gradient_clip_val, self.model)
        self.optim.step()
        self.train_batch_idx += 1

    if self.val_dataloader is None:
        return
    self.model.eval()
    for batch in self.val_dataloader:
        with torch.no_grad():
            self.model.validation_step(self.prepare_batch(batch))
        self.val_batch_idx += 1
```

## Generating Synthetic Data

We now generate synthetic data for training. Here, we use the `SyntheticRegressionData` class with ground-truth parameters.

```python
data = d.SyntheticRegressionData(w=[2, -3.4], b=4.2)
```

## Training the Model

We set the learning rate to \(\eta = 0.03\) and train the model for 3 epochs:

```python
model = LinearRegressionScratch(lr=0.03)
trainer = d.Trainer(max_epochs=3)
trainer.fit(model, data)
```

This concludes the implementation of linear regression using stochastic gradient descent.
```



![image.png](attachment:image.png)

## Implementing Linear Regression from Scratch

### Defining the Loss Function

```python
@d.add_to_class(LinearRegressionScratch)  # @save
def loss(self, y_hat, y):
    l = (y_hat - y) ** 2 / 2
    return l.mean()
```

## Defining the Optimization Algorithm

As discussed in Section 3.1, linear regression has a closed-form solution. However, our goal here is to illustrate how to train more general neural networks, which requires us to use minibatch stochastic gradient descent (SGD). 

At each step, using a minibatch randomly drawn from our dataset, we estimate the gradient of the loss with respect to the parameters. Then, we update the parameters in the direction that may reduce the loss.

### Stochastic Gradient Descent (SGD)
The following code defines the SGD optimizer:

```python
class SGD(d.HyperParameters):  # @save
    """Minibatch stochastic gradient descent."""
    def __init__(self, params, lr):
        self.save_hyperparameters()

    def step(self):
        for param in self.params:
            param -= self.lr * param.grad

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()
```

We next define the `configure_optimizers` method, which returns an instance of the `SGD` class.

```python
@d.add_to_class(LinearRegressionScratch)  # @save
def configure_optimizers(self):
    return SGD([self.w, self.b], self.lr)
```

## Training the Model

Now that we have all of the components (parameters, loss function, model, and optimizer), we are ready to implement the main training loop.

### Training Loop

At each epoch:

- Iterate through the entire dataset.
- Compute the loss on a minibatch.
- Compute the gradient.
- Update the parameters using the optimizer.

Mathematically, this can be represented as:

$$
\begin{aligned}
    &\text{Initialize parameters } (w, b) \\
    &\text{Repeat until convergence:} \\
    &\quad \text{Compute gradient: } g \leftarrow \frac{\partial}{\partial (w,b)} \sum_{i \in B} l(x^{(i)}, y^{(i)}, w, b) \\
    &\quad \text{Update parameters: } (w, b) \leftarrow (w, b) - \eta g
\end{aligned}
$$

The training function is implemented as follows:

```python
@d.add_to_class(d.Trainer)  # @save
def prepare_batch(self, batch):
    return batch

@d.add_to_class(d.Trainer)  # @save
def fit_epoch(self):
    self.model.train()
    for batch in self.train_dataloader:
        loss = self.model.training_step(self.prepare_batch(batch))
        self.optim.zero_grad()
        with torch.no_grad():
            loss.backward()
        if self.gradient_clip_val > 0:  # To be discussed later
            self.clip_gradients(self.gradient_clip_val, self.model)
        self.optim.step()
        self.train_batch_idx += 1

    if self.val_dataloader is None:
        return
    self.model.eval()
    for batch in self.val_dataloader:
        with torch.no_grad():
            self.model.validation_step(self.prepare_batch(batch))
        self.val_batch_idx += 1
```

## Hyperparameter Selection and Training

Both the number of epochs and the learning rate are hyperparameters. Setting hyperparameters is tricky, and we will usually want to use a 3-way split: one set for training, a second for hyperparameter selection, and the third reserved for final evaluation. We elide these details for now but will revise them later.

```python
model = LinearRegressionScratch(2, lr=0.03)
data = d.SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
trainer = d.Trainer(max_epochs=3)
trainer.fit(model, data)
```

Because we synthesized the dataset ourselves, we know precisely what the true parameters are. Thus, we can evaluate our success in training by comparing the true parameters with those that we learned through our training loop. Indeed, they turn out to be very close to each other.

```python
print(f'error in estimating w: {data.w - model.w.reshape(data.w.shape)}')
print(f'error in estimating b: {data.b - model.b}')
```

Example output:
```python
error in estimating w: tensor([ 0.1006, -0.1535], grad_fn=<SubBackward0>)
error in estimating b: tensor([0.2132], grad_fn=<RsubBackward1>)


We should not take the ability to exactly recover the ground-truth parameters for granted. In general, for deep models, unique solutions for the parameters do not exist, and even for linear models, exactly recovering the parameters is only possible when no feature is linearly dependent on the others. However, in machine learning, we are often less concerned with recovering true underlying parameters and more concerned with parameters that lead to highly accurate predictions (Vapnik, 1992). Fortunately, even on difficult optimization problems, stochastic gradient descent can often find remarkably good solutions, partly because, for deep networks, there exist many configurations of the parameters that lead to highly accurate predictions.
```



In [3]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

# Generate Synthetic Data
def generate_data(n=1000):
    np.random.seed(42)
    X = np.random.rand(n, 2) * 10  # Two features
    w_true = np.array([2, -3.4])  # True weights
    b_true = 4.2  # True bias
    y = X @ w_true + b_true + np.random.randn(n) * 0.5  # Adding noise
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

X_train, y_train = generate_data(1000)

# Define the Linear Regression Model
class LinearRegressionScratch(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.w = nn.Parameter(torch.randn(num_features, 1))  # Random weights
        self.b = nn.Parameter(torch.randn(1))  # Random bias
    
    def forward(self, X):
        return X @ self.w + self.b  # Linear transformation

# Define the Loss Function & Optimizer
def loss(y_hat, y):
    return ((y_hat - y) ** 2 / 2).mean()  # MSE / 2

class SGD:
    def __init__(self, params, lr=0.03):
        self.params = params
        self.lr = lr
    
    def step(self):
        for param in self.params:
            param.data -= self.lr * param.grad  # Update rule
    
    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

# Training the Model
def train(model, X, y, lr=0.03, epochs=3):
    optimizer = SGD([model.w, model.b], lr=lr)
    
    for epoch in range(epochs):
        y_hat = model(X)  # Forward pass
        l = loss(y_hat, y)  # Compute loss
        l.backward()  # Compute gradients
        optimizer.step()  # Update parameters
        optimizer.zero_grad()  # Reset gradients
        print(f"Epoch {epoch+1}, Loss: {l.item():.4f}")

# Train the model
model = LinearRegressionScratch(2)
train(model, X_train, y_train)

# Evaluate the Model
print(f"Estimated Weights: {model.w.data.flatten().numpy()}")
print(f"Estimated Bias: {model.b.item():.4f}")




Epoch 1, Loss: 148.1430
Epoch 2, Loss: 83.4955
Epoch 3, Loss: 47.4871
Estimated Weights: [ 0.8811886 -1.1149743]
Estimated Bias: -1.0005


In [7]:

import random

# Generate Synthetic Data
def generate_data(n=1000):
    random.seed(42)
    X = [[random.uniform(0, 10), random.uniform(0, 10)] for _ in range(n)]  # Two features
    w_true = [2, -3.4]  # True weights
    b_true = 4.2  # True bias
    y = [x[0] * w_true[0] + x[1] * w_true[1] + b_true + random.gauss(0, 0.5) for x in X]  # Adding noise
    return X, y

X_train, y_train = generate_data(1000)

# Define the Linear Regression Model
class LinearRegressionScratch:
    def __init__(self, num_features):
        self.w = [random.uniform(-1, 1) for _ in range(num_features)]  # Random weights
        self.b = random.uniform(-1, 1)  # Random bias
    
    def predict(self, X):
        return [sum(x[i] * self.w[i] for i in range(len(self.w))) + self.b for x in X]

# Define the Loss Function & Optimizer
def loss(y_hat, y):
    return sum((yh - yt) ** 2 / 2 for yh, yt in zip(y_hat, y)) / len(y)

class SGD:
    def __init__(self, params, lr=0.03):
        self.params = params
        self.lr = lr
    
    def step(self, gradients):
        for i in range(len(self.params['w'])):
            self.params['w'][i] -= self.lr * gradients['w'][i]
        self.params['b'] -= self.lr * gradients['b']

# Compute Gradients
def compute_gradients(model, X, y):
    y_hat = model.predict(X)
    n = len(y)
    gradients = {'w': [0] * len(model.w), 'b': 0}
    
    for i in range(n):
        error = y_hat[i] - y[i]
        for j in range(len(model.w)):
            gradients['w'][j] += error * X[i][j] / n
        gradients['b'] += error / n
    
    return gradients

# Training the Model
def train(model, X, y, lr=0.03, epochs=3):
    optimizer = SGD({'w': model.w, 'b': model.b}, lr=lr)
    
    for epoch in range(epochs):
        gradients = compute_gradients(model, X, y)
        optimizer.step(gradients)
        print(f"Epoch {epoch+1}, Loss: {loss(model.predict(X), y):.4f}")

# Train the model
model = LinearRegressionScratch(2)
train(model, X_train, y_train)

# Evaluate the Model
print(f"Estimated Weights: {model.w}")
print(f"Estimated Bias: {model.b:.4f}")




Epoch 1, Loss: 58.1083
Epoch 2, Loss: 34.8290
Epoch 3, Loss: 21.0880
Estimated Weights: [0.826513471323397, -2.410363336209654]
Estimated Bias: 0.3743


## 5. Concise Implementation of Linear Regression

Deep learning has witnessed a Cambrian explosion of sorts over the past decade. The sheer number of techniques, applications, and algorithms by far surpasses the progress of previous decades. This is due to a fortuitous combination of multiple factors, one of which is the powerful free tools offered by a number of open-source deep learning frameworks. Theano (Bergstra et al., 2010), DistBelief (Dean et al., 2012), and Caffe (Jia et al., 2014) arguably represent the first generation of such models that found widespread adoption. In contrast to earlier (seminal) works like SN2 (Simulateur Neuristique) (Bottou and Le Cun, 1988), which provided a Lisp-like programming experience, modern frameworks offer automatic differentiation and the convenience of Python. These frameworks allow us to automate and modularize the repetitive work of implementing gradient-based learning algorithms.

In Section 3.4, we relied only on (i) tensors for data storage and linear algebra; and (ii) automatic differentiation for calculating gradients. In practice, because data iterators, loss functions, optimizers, and neural network layers are so common, modern libraries implement these components for us as well. In this section, we will show you how to implement the linear regression model from Section 3.4 concisely by using high-level APIs of deep learning frameworks.

### 5.1 Defining the Model

When we implemented linear regression from scratch in Section 3.4, we defined our model parameters explicitly and coded up the calculations to produce output using basic linear algebra operations. You should know how to do this. But once your models get more complex, and once you have to do this nearly every day, you will be glad for the assistance. The situation is similar to coding up your own blog from scratch. Doing it once or twice is rewarding and instructive, but you would be a lousy web developer if you spent a month reinventing the wheel. For standard operations, we can use a framework’s predefined layers, which allow us to focus on the layers used to construct the model rather than worrying about their implementation.

```python
class LinearRegression:
    def __init__(self, input_dim, lr=0.01):
        self.weights = [0.0] * input_dim
        self.bias = 0.0
        self.lr = lr
    
    def predict(self, X):
        return [sum(x * w for x, w in zip(sample, self.weights)) + self.bias for sample in X]
    
    def update_weights(self, gradients):
        self.weights = [w - self.lr * g for w, g in zip(self.weights, gradients)]
        self.bias -= self.lr * sum(gradients) / len(gradients)
```

Recall the architecture of a single-layer network as described in Figure 3.1.2. The layer is called fully connected since each of its inputs is connected to each of its outputs by means of a matrix-vector multiplication. In modern deep learning frameworks, the fully connected layer is defined in `Linear` and `LazyLinear` classes. The latter allows users to only specify the output dimension, while the former additionally asks for how many inputs go into this layer. Specifying input shapes is inconvenient, which may require nontrivial calculations (such as in convolutional layers). Thus, for simplicity, we will use such “lazy” layers whenever we can.

### 5.2 Defining the Loss Function

The `MSELoss` class computes the mean squared error (without the 1/2 factor). By default, `MSELoss` returns the average loss over examples. It is faster (and easier to use) than implementing our own loss function.

```python
def mean_squared_error(y_true, y_pred):
    return sum((yt - yp) ** 2 for yt, yp in zip(y_true, y_pred)) / len(y_true)
```

### 5.3 Defining the Optimization Algorithm

Minibatch SGD is a standard tool for optimizing neural networks, and modern deep learning frameworks support it alongside a number of variations on this algorithm in their optimization modules. When we instantiate an SGD instance, we specify the parameters to optimize over, obtainable from our model via `self.parameters()`, and the learning rate (`self.lr`) required by our optimization algorithm.

```python
def stochastic_gradient_descent(model, X, y, epochs=10):
    for epoch in range(epochs):
        y_pred = model.predict(X)
        loss = mean_squared_error(y, y_pred)
        gradients = [(yp - yt) for yp, yt in zip(y_pred, y)]
        model.update_weights(gradients)
        print(f'Epoch {epoch+1}, Loss: {loss:.4f}')
```




## Concise Implementation of Linear Regression

Deep learning has witnessed significant advancements over the past decade. Modern frameworks allow us to automate and modularize gradient-based learning algorithms. In this section, we will implement the linear regression model concisely using high-level APIs.

## Importing Libraries

```python
import numpy as np
import torch
from torch import nn
```

## Defining the Model

```python
class LinearRegression(nn.Module):
    def __init__(self, lr):
        super().__init__()
        self.lr = lr
        self.net = nn.Linear(2, 1)  # Fully connected layer
        self.net.weight.data.normal_(0, 0.01)
        self.net.bias.data.fill_(0)

    def forward(self, X):
        return self.net(X)

    def loss(self, y_hat, y):
        fn = nn.MSELoss()
        return fn(y_hat, y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), self.lr)
```

## Training the Model

```python
# Generating synthetic data
true_w = torch.tensor([2, -3.4])
true_b = 4.2
data = torch.randn(1000, 2)
labels = data @ true_w + true_b + torch.randn(1000) * 0.01

def train_model():
    model = LinearRegression(lr=0.03)
    optimizer = model.configure_optimizers()
    loss_fn = model.loss
    
    for epoch in range(3):
        optimizer.zero_grad()
        y_hat = model(data)
        loss = loss_fn(y_hat, labels.view(-1, 1))
        loss.backward()
        optimizer.step()
    
    return model

model = train_model()
```

## Evaluating the Model

```python
def get_w_b(model):
    return model.net.weight.data, model.net.bias.data

w, b = get_w_b(model)
print(f'error in estimating w: {true_w - w.reshape(true_w.shape)}')
print(f'error in estimating b: {true_b - b}')
```

### Output Example:
```
error in estimating w: tensor([ 0.0022, -0.0069])
error in estimating b: tensor([0.0080])
```
```



In [8]:
import numpy as np

# Generate synthetic data
def generate_data(n_samples=1000):
    np.random.seed(42)
    true_w = np.array([2, -3.4])
    true_b = 4.2
    X = np.random.randn(n_samples, 2)
    y = X @ true_w + true_b + np.random.randn(n_samples) * 0.01
    return X, y, true_w, true_b

X, y, true_w, true_b = generate_data()

# Linear Regression Model
class LinearRegression:
    def __init__(self, lr=0.03):
        self.lr = lr
        self.w = np.random.randn(2)
        self.b = np.random.randn()
    
    def predict(self, X):
        return X @ self.w + self.b
    
    def loss(self, y_hat, y):
        return ((y_hat - y) ** 2).mean()
    
    def train(self, X, y, epochs=3):
        n = len(y)
        for epoch in range(epochs):
            y_hat = self.predict(X)
            grad_w = (X.T @ (y_hat - y)) / n
            grad_b = (y_hat - y).mean()
            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b
            print(f'Epoch {epoch+1}, Loss: {self.loss(y_hat, y):.4f}')

# Train the model
model = LinearRegression(lr=0.03)
model.train(X, y, epochs=3)

# Evaluate the model
print(f'Error in estimating w: {true_w - model.w}')
print(f'Error in estimating b: {true_b - model.b}')


Epoch 1, Loss: 41.8427
Epoch 2, Loss: 39.4340
Epoch 3, Loss: 37.1641
Error in estimating w: [ 3.57940776 -2.33460353]
Error in estimating b: 4.211997049641287


In [9]:
import torch

# Generate synthetic data
def generate_data(n_samples=1000):
    torch.manual_seed(42)
    true_w = torch.tensor([2.0, -3.4])
    true_b = 4.2
    X = torch.randn(n_samples, 2)
    y = X @ true_w + true_b + torch.randn(n_samples) * 0.01
    return X, y, true_w, true_b

X, y, true_w, true_b = generate_data()

# Linear Regression Model
class LinearRegression(torch.nn.Module):
    def __init__(self, lr=0.03):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(2, requires_grad=True))
        self.b = torch.nn.Parameter(torch.randn(1, requires_grad=True))
        self.lr = lr
    
    def forward(self, X):
        return X @ self.w + self.b
    
    def loss(self, y_hat, y):
        return torch.mean((y_hat - y) ** 2)
    
    def train(self, X, y, epochs=3):
        optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)
        for epoch in range(epochs):
            optimizer.zero_grad()
            y_hat = self.forward(X)
            loss = self.loss(y_hat, y)
            loss.backward()
            optimizer.step()
            print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Train the model
model = LinearRegression(lr=0.03)
model.train(X, y, epochs=3)

# Evaluate the model
print(f'Error in estimating w: {true_w - model.w.detach()}')
print(f'Error in estimating b: {true_b - model.b.detach()}')


Epoch 1, Loss: 18.1382
Epoch 2, Loss: 16.0748
Epoch 3, Loss: 14.2468
Error in estimating w: tensor([ 0.7516, -2.7659])
Error in estimating b: tensor([2.1833])


In [10]:
import random

# Generate synthetic data
def generate_data(n_samples=1000):
    random.seed(42)
    true_w = [2.0, -3.4]
    true_b = 4.2
    X = [[random.gauss(0, 1) for _ in range(2)] for _ in range(n_samples)]
    y = [sum(x[i] * true_w[i] for i in range(2)) + true_b + random.gauss(0, 0.01) for x in X]
    return X, y, true_w, true_b

X, y, true_w, true_b = generate_data()

# Linear Regression Model
class LinearRegression:
    def __init__(self, lr=0.03):
        self.w = [random.random(), random.random()]
        self.b = random.random()
        self.lr = lr
    
    def forward(self, X):
        return [sum(X[i][j] * self.w[j] for j in range(2)) + self.b for i in range(len(X))]
    
    def loss(self, y_hat, y):
        return sum((y_hat[i] - y[i]) ** 2 for i in range(len(y))) / len(y)
    
    def train(self, X, y, epochs=3):
        for epoch in range(epochs):
            y_hat = self.forward(X)
            grad_w = [sum((y_hat[i] - y[i]) * X[i][j] for i in range(len(y))) / len(y) for j in range(2)]
            grad_b = sum(y_hat[i] - y[i] for i in range(len(y))) / len(y)
            
            self.w = [self.w[j] - self.lr * grad_w[j] for j in range(2)]
            self.b -= self.lr * grad_b
            
            print(f'Epoch {epoch+1}, Loss: {self.loss(y_hat, y):.4f}')

# Train the model
model = LinearRegression(lr=0.03)
model.train(X, y, epochs=3)

# Evaluate the model
print(f'Error in estimating w: {[true_w[i] - model.w[i] for i in range(2)]}')
print(f'Error in estimating b: {true_b - model.b}')


Epoch 1, Loss: 30.5174
Epoch 2, Loss: 28.6857
Epoch 3, Loss: 26.9640
Error in estimating w: [1.0355043396349708, -3.692129574485173]
Error in estimating b: 3.200872572734384
