## Simple Linear Regression

In [93]:
import numpy as np

### Fetching the data

In [94]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset, then pull out the feature matrix
# and the target vector separately.
data = fetch_california_housing()
X = data.data
y = data.target



In [95]:
# List the feature (column) names shipped with the dataset so one can be
# picked as the single predictor.
print(data.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [96]:
# Convert to DataFrame to easily select one column.
# Build it from data.data (not X): X is rebound below, so reading X here would
# make this cell fail with a shape mismatch when it is re-run.
df = pd.DataFrame(data.data, columns=data.feature_names)

# Select one feature column, e.g., 'MedInc'
X = df['MedInc']  # single brackets return a 1-D Series of shape (n_samples,)

print(X.shape)  # (20640,) -- 1-D; use df[['MedInc']] if a (20640, 1) frame is needed
print(y.shape)

(20640,)
(20640,)


In [97]:
# Convert the selected pandas column into a plain NumPy array (still 1-D),
# then echo it as the cell output.
X = np.array(X)
X

array([8.3252, 8.3014, 7.2574, ..., 1.7   , 1.8672, 2.3886])

In [98]:
from sklearn.model_selection import train_test_split

In [99]:
# Hold out 20% of the samples for testing. Fixing random_state makes the split
# deterministic, so the fitted slope/intercept are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [100]:
# Show the shape of each split in the order: X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(16512,)
(4128,)
(16512,)
(4128,)


## The formula of Simple Linear Regression

### Formula for m (slope):
$$ m = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2} $$

### Formula for b (intercept):
$$ b = \bar{y} - m \cdot \bar{x} $$

where $\bar{x}$ and $\bar{y}$ are the means of the $x_i$ and $y_i$ values.


### Code from Scratch

In [101]:

class SimpleLinearRegression:
    """Least-squares fit of a straight line ``y = m * x + b`` to one feature.

    Attributes set by ``fit``:
        m: slope of the fitted line.
        b: intercept of the fitted line.
        X, y: float copies of the training inputs and targets.
        numerator, denominator: the two sums whose ratio is the slope.
    """

    def __init__(self):
        # Both stay None until fit() has run; predict() checks for this.
        self.m = None
        self.b = None

    def fit(self, X, y):
        """Estimate the slope ``m`` and intercept ``b`` from training data.

        Args:
            X: array-like of feature values, 1-D or a single (n, 1) column.
            y: array-like of target values with the same number of elements.

        Raises:
            ValueError: if X and y differ in size, are empty, or if every
                value in X is identical (the slope is then undefined).
        """
        self.X = np.array(X, dtype=float)
        self.y = np.array(y, dtype=float)

        # Flatten so a single (n, 1) column is handled like a 1-D array.
        x_flat = self.X.reshape(-1)
        y_flat = self.y.reshape(-1)

        if x_flat.size != y_flat.size:
            raise ValueError(
                f"X and y must have the same number of elements, "
                f"got {x_flat.size} and {y_flat.size}"
            )
        if x_flat.size == 0:
            raise ValueError("cannot fit on empty data")

        # Compute each mean once; recomputing them inside a per-sample loop
        # would make the fit quadratic in the number of samples.
        x_mean = x_flat.mean()
        y_mean = y_flat.mean()
        x_centered = x_flat - x_mean
        y_centered = y_flat - y_mean

        # m = sum((x_i - x_mean) * (y_i - y_mean)) / sum((x_i - x_mean) ** 2)
        self.numerator = np.sum(x_centered * y_centered)
        self.denominator = np.sum(x_centered ** 2)

        if self.denominator == 0:
            raise ValueError("all X values are identical; the slope is undefined")

        # Calculate the slope (m) and intercept (b)
        self.m = self.numerator / self.denominator
        self.b = y_mean - self.m * x_mean

    def predict(self, X):
        """Return ``m * X + b`` for a scalar or array-like ``X``.

        Plain lists and tuples are converted to float arrays; other inputs
        (NumPy arrays, pandas objects, scalars) are used as given.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.m is None or self.b is None:
            raise RuntimeError("SimpleLinearRegression is not fitted; call fit() first")
        if isinstance(X, (list, tuple)):
            X = np.asarray(X, dtype=float)
        return self.m * X + self.b


In [102]:
MyLr = SimpleLinearRegression()

In [103]:
# Fit the from-scratch model on the training split; this sets MyLr.m and MyLr.b.
MyLr.fit(X_train,y_train)

In [104]:
MyLr.m

np.float64(0.4210242513544825)

In [105]:
MyLr.b

np.float64(0.43731625550206954)

In [106]:
X_test[0]

np.float64(4.5156)

In [107]:
# Predict targets for the whole test split using the fitted slope and intercept.
MyLr.predict(X_test)

array([2.33849336, 1.61428955, 3.45067103, ..., 4.6584633 , 2.14398016,
       1.53256874])

## Comparing the results with scikit-learn's `LinearRegression`

In [108]:
from sklearn.linear_model import  LinearRegression

In [109]:
lr = LinearRegression()

In [110]:
# scikit-learn estimators expect a 2-D feature matrix, so reshape the 1-D
# training array into a single column of shape (n_samples, 1).
lr.fit(X_train.reshape(-1,1),y_train)

In [111]:
lr.coef_

array([0.42102425])

In [112]:
lr.intercept_

np.float64(0.4373162555020693)

In [113]:
# Same single-column reshape as for fitting; the output should match the
# from-scratch predictions computed earlier.
lr.predict(X_test.reshape(-1,1))

array([2.33849336, 1.61428955, 3.45067103, ..., 4.6584633 , 2.14398016,
       1.53256874])