# Imports

In [5]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import pickle

# Data Loading

In [6]:
# Read each split, discard rows that contain missing values, then separate
# the feature column (x) from the target column (y).
train_data = pd.read_csv('/kaggle/input/task2-a-data/train.csv').dropna()
X_train, y_train = train_data['x'], train_data['y']

test_data = pd.read_csv('/kaggle/input/task2-a-data/test.csv').dropna()
X_test, y_test = test_data['x'], test_data['y']

In [7]:
# Preview the first five rows of the cleaned training frame.
train_data.head(n=5)

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


Plot the training data to inspect the relationship between `x` and `y`.

In [8]:
# Scatter of the training pairs; marker colour follows the x value.
# NOTE: `plt` here is a plotly figure object, not the matplotlib.pyplot module.
plt = px.scatter(
    train_data,
    x='x',
    y='y',
    color='x',
)
plt.show()

# Data Preprocessing

## Standardize the data
    fill this
### Why Use Standardization in Machine Learning?
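We standardize data so that all features share the same scale and a comparable range. After scaling, extreme values are expressed in units of standard deviation, so they are less likely to dominate model predictions purely because of their scale. Standardization also makes data features easier to compare and interpret.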
### How to Standardize Data
The formula for data standardization is:

$$X_{\text{standardized}} = \frac{X - \mu}{\sigma}$$

where $\mu$ is the mean and $\sigma$ is the standard deviation of $X$.


In [9]:
# Scaling statistics come from the training split only; the same shift and
# scale are then applied to both splits so the test data is transformed
# consistently with the training data.
mean_X, std_X = X_train.mean(axis=0), X_train.std(axis=0)
mean_y, std_y = y_train.mean(axis=0), y_train.std(axis=0)

# z-score: subtract the training mean, divide by the training standard deviation.
X_train, X_test = (X_train - mean_X) / std_X, (X_test - mean_X) / std_X
y_train, y_test = (y_train - mean_y) / std_y, (y_test - mean_y) / std_y

## Reshaping data for the correct shape for the model

Why can't we build the model without reshaping?

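Reshaping is necessary because many machine learning models, especially those in scikit-learn, expect the input data to be a 2D array of shape `(n_samples, n_features)`, even if there is only one feature. The from-scratch `LinearRegression` class in this notebook multiplies a single feature element-wise by a scalar weight, so the 1-D columns are used as-is and no reshape is applied.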

# Model Implementation

# Linear Regression Model

Linear regression is a fundamental model in machine learning used for predicting a continuous output variable based on input features. The model function for linear regression is represented as:

$$f_{w,b}(x) = wx + b$$

In this equation, $f_{w,b}(x)$ represents the predicted output, $w$ is the weight parameter, $b$ is the bias parameter, and $x$ is the input feature.

## Model Training

To train a linear regression model, we aim to find the best values for the parameters $(w, b)$ that best fit our dataset.

### Forward Pass

The forward pass is a step where we compute the linear regression output for the input data $X$ using the current weights and biases. It's essentially applying our model to the input data.

### Cost Function

The cost function is used to measure how well our model is performing. It quantifies the difference between the predicted values and the actual values in our dataset. The cost function is defined as:

$$J(w,b) = \frac{1}{2m} \sum_{i=1}^{m}(f_{w,b}(x^{(i)}) - y^{(i)})^2$$

Here, $J(w, b)$ is the cost, $m$ is the number of training examples, $x^{(i)}$ is the input data for the $i$-th example, $y^{(i)}$ is the actual output for the $i$-th example, and $w$ and $b$ are the weight and bias parameters, respectively.

### Backward Pass (Gradient Computation)

The backward pass computes the gradients of the cost function with respect to the weights and biases. These gradients are crucial for updating the model parameters during training. The gradient formulas are as follows:

$$
\frac{\partial J(w,b)}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} (f_{w,b}(x^{(i)}) - y^{(i)})
$$

$$
\frac{\partial J(w,b)}{\partial w} = \frac{1}{m} \sum_{i=1}^{m} (f_{w,b}(x^{(i)}) - y^{(i)})x^{(i)}
$$

## Training Process

The training process involves iteratively updating the weights and biases to minimize the cost function. This is typically done through an optimization algorithm like gradient descent. The update equations for parameters are:

$$w \leftarrow w - \alpha \frac{\partial J}{\partial w}$$

$$b \leftarrow b - \alpha \frac{\partial J}{\partial b}$$

Here, $\alpha$ represents the learning rate, which controls the step size during parameter updates.

By iteratively performing the forward pass, computing the cost, performing the backward pass, and updating the parameters, the model learns to make better predictions and fit the data.


In [10]:
import numpy as np
import pickle

class LinearRegression:
    """Single-feature linear regression ``f(x) = w * x + b`` fitted by batch gradient descent.

    Typical use: ``fit`` on training data, ``predict`` on new inputs, and
    ``save_model`` / ``load_model`` to persist the learned ``w`` and ``b``.
    """

    def __init__(self, learning_rate=0.001):
        # Slope and intercept; they stay None until fit() or load_model() sets them.
        self.w = None
        self.b = None
        # Step size applied to both gradients on every update.
        self.learning_rate = learning_rate

    def initialize_parameters(self):
        """Reset the slope and the intercept to zero."""
        self.w = 0
        self.b = 0

    def forward(self, X):
        """Return the model output ``w * X + b`` for the inputs ``X``."""
        return self.w * X + self.b

    def compute_cost(self, predictions):
        """Return the halved mean squared error of ``predictions`` against ``self.y``.

        Reads ``self.X`` (for the sample count) and ``self.y``; both are set by ``fit``.
        """
        n = self.X.shape[0]
        return np.sum((self.y - predictions) ** 2) / (2 * n)

    def backward(self, predictions):
        """Compute the cost gradients and store them in ``self.w_grad`` / ``self.b_grad``."""
        n = self.X.shape[0]
        residuals = predictions - self.y
        self.w_grad = np.dot(self.X.T, residuals) / n
        self.b_grad = np.mean(residuals)

    def fit(self, X, y, iterations, plot_cost=True, log_every=1000):
        """Train ``w`` and ``b`` with ``iterations`` gradient-descent steps.

        Args:
            X: Feature values (array-like, e.g. list, ndarray or pandas Series).
            y: Target values with the same shape as ``X``.
            iterations: Number of parameter updates to perform.
            plot_cost: When true, show a plotly line chart of cost per iteration.
            log_every: Print the cost every ``log_every`` iterations; a falsy
                value (0 or None) disables the printing.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in shape.
        """
        # Work on plain float arrays so arithmetic is positional and list
        # inputs behave the same as arrays.
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y, dtype=float)
        if X_arr.ndim == 0 or X_arr.size == 0:
            raise ValueError("X must contain at least one sample")
        # Differing shapes (e.g. (n, 1) vs (n,)) would broadcast silently and
        # produce meaningless costs and gradients, so reject them up front.
        if X_arr.shape != y_arr.shape:
            raise ValueError(
                "X and y must have the same shape, got {} and {}".format(
                    X_arr.shape, y_arr.shape
                )
            )

        self.X = X_arr
        self.y = y_arr

        self.initialize_parameters()
        costs = []
        for i in range(iterations):
            predictions = self.forward(self.X)

            # The cost is recorded before the update, so costs[i] belongs to
            # the parameters used for iteration i.
            cost = self.compute_cost(predictions)
            costs.append(cost)

            self.backward(predictions)

            self.w = self.w - self.learning_rate * self.w_grad
            self.b = self.b - self.learning_rate * self.b_grad

            if log_every and i % log_every == 0:
                print("Cost after iteration {}: {}".format(i, cost))

        if plot_cost:
            fig = px.line(y=costs,title="Cost vs Iteration",template="plotly_dark")
            fig.update_layout(
                title_font_color="#41BEE9", 
                xaxis=dict(color="#41BEE9",title="Iterations"), 
                yaxis=dict(color="#41BEE9",title="cost")
            )
            fig.show()

    def predict(self, X):
        """Return predictions for ``X`` using the current ``w`` and ``b``."""
        return self.forward(X)

    def save_model(self, filename="task2a.pkl"):
        """Pickle the learned ``w`` and ``b`` into ``filename``.

        Only the two parameters are written; the learning rate is not saved.
        """
        with open(filename, "wb") as file:
            pickle.dump({"w": self.w, "b": self.b}, file)

    @classmethod
    def load_model(cls, filename):
        """Return a new model whose ``w`` and ``b`` are read from ``filename``.

        The instance is created with the default learning rate, because the
        file stores only ``w`` and ``b``.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that come from a trusted source.
        with open(filename, "rb") as file:
            model_para = pickle.load(file)
        model = cls()
        model.w = model_para["w"]
        model.b = model_para["b"]
        return model


In [11]:
# Train with the class's default learning rate for 10,000 gradient steps.
lr = LinearRegression()
lr.fit(X_train, y_train, iterations=10000, plot_cost=True)

Cost after iteration 0: 0.4992846924177398
Cost after iteration 1000: 0.07170997148606575
Cost after iteration 2000: 0.013736096133765259
Cost after iteration 3000: 0.005875551394261954
Cost after iteration 4000: 0.004809758168843606
Cost after iteration 5000: 0.004665249710843328
Cost after iteration 6000: 0.004645656140593881
Cost after iteration 7000: 0.0046429994933606875
Cost after iteration 8000: 0.004642639284657945
Cost after iteration 9000: 0.004642590444786429


In [12]:
# Persist the learned parameters so the evaluation cells can reload them.
lr.save_model(filename='model.pkl')

# Evaluation



### 1. Mean Squared Error (MSE)

**Formula:**
$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_{\text{true}_i} - y_{\text{pred}_i})^2
$$

**Description:**
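The average of the squared differences between the observed values and the values predicted by the model.

**Interpretation:**
A standard tool for assessing the performance of regression models: lower is better, and 0 means every prediction is exact. Because the errors are squared, large errors are penalized more heavily than small ones, and the result is expressed in squared units of the target.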
### 2. Root Mean Squared Error (RMSE)

**Formula:**
$$
\text{RMSE} = \sqrt{\text{MSE}}
$$

**Description:**
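The square root of the MSE.

**Interpretation:**
A standard tool for assessing the performance of regression models: lower is better. Unlike MSE, it is expressed in the same units as the target, so it can be read as a typical size of the prediction error.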


### 3. R-squared ($R^2$)

**Formula:**
$$
R^2 = 1 - \frac{\text{SSR}}{\text{SST}}
$$

**Description:**
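SSR (also written RSS) is the sum of squares of residuals, $\sum_{i=1}^{n} (y_{\text{true}_i} - y_{\text{pred}_i})^2$.
SST (also written TSS) is the total sum of squares, $\sum_{i=1}^{n} (y_{\text{true}_i} - \bar{y}_{\text{true}})^2$.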
**Interpretation:**
R-squared is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by an independent variable.

In [13]:
class RegressionMetrics:
    """Stateless regression metrics; every method takes ``(y_true, y_pred)``."""

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        """Return the mean of the squared differences between ``y_true`` and ``y_pred``."""
        # Use the y_true argument, not a notebook-level variable, so the
        # metric is correct for whichever targets the caller passes in.
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def root_mean_squared_error(y_true, y_pred):
        """Return the square root of the mean squared error."""
        return RegressionMetrics.mean_squared_error(y_true, y_pred) ** 0.5

    @staticmethod
    def r_squared(y_true, y_pred):
        """Return ``1 - SSR / SST`` (coefficient of determination).

        SSR is the sum of squared residuals and SST is the sum of squared
        deviations of ``y_true`` from its mean. No guard is applied when SST
        is zero (constant ``y_true``).
        """
        residual_sum = np.sum((y_true - y_pred) ** 2)
        total_sum = np.sum((y_true - np.mean(y_true)) ** 2)
        return 1 - residual_sum / total_sum
        

In [14]:
# Rebuild a model from the parameters saved in 'model.pkl'.
model = LinearRegression.load_model(filename='model.pkl')


In [16]:
# Score the reloaded model on the held-out split. X_test and y_test were
# standardized earlier in the notebook, so the errors are in standardized units.
y_pred = model.predict(X_test)

mse_value = RegressionMetrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_value = RegressionMetrics.root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r_squared_value = RegressionMetrics.r_squared(y_true=y_test, y_pred=y_pred)

# One print call with a newline separator emits the same three lines.
print(
    f"Mean Squared Error (MSE): {mse_value}",
    f"Root Mean Squared Error (RMSE): {rmse_value}",
    f"R-squared (Coefficient of Determination): {r_squared_value}",
    sep="\n",
)

Mean Squared Error (MSE): 0.011133542765124403
Root Mean Squared Error (RMSE): 0.1055156043679057
R-squared (Coefficient of Determination): 0.9888002020186158
