In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [2]:
def computeMSE(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise squared differences.
    """
    # Convert up front so plain Python sequences are accepted and integer
    # inputs (unsigned ones in particular) cannot wrap around on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

In [3]:
def linearRegression(
    X,
    y,
    epochs = 1001,
    LR = 0.01, # Learning Rate
    verbose = True
):
    """Fit ``y ~ X @ weights + bias`` with full-batch gradient descent on MSE.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix.
    y : array-like, shape (n_samples,) or (n_samples, 1)
        Target values; flattened to one dimension before fitting.
    epochs : int
        Number of gradient-descent steps.
    LR : float
        Learning rate (step size) applied to both gradients.
    verbose : bool
        When True (the default) print the MSE every epoch. The reported value
        is computed from the predictions made *before* that epoch's update.

    Returns
    -------
    weights : numpy.ndarray, shape (n_features,)
    bias : float

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``y`` does not hold one value per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")

    # Flatten y so a column vector cannot broadcast the residual to (n, n).
    y = np.asarray(y, dtype=float).reshape(-1)

    n_samples, n_features = X.shape
    if y.shape[0] != n_samples:
        raise ValueError(
            f"y has {y.shape[0]} values but X has {n_samples} rows"
        )

    weights = np.zeros(n_features)
    bias = 0.0

    for epoch in range(epochs):
        y_pred = np.dot(X, weights) + bias
        residual = y - y_pred  # shared by both gradients below

        # Gradients of mean((y - y_pred)**2) w.r.t. weights and bias.
        dw = -(2 / n_samples) * np.dot(X.T, residual)
        db = -(2 / n_samples) * np.sum(residual)

        weights -= LR * dw
        bias -= LR * db

        if verbose:
            mse = computeMSE(y, y_pred)
            print(f"Epoch: {epoch}: MSE = {mse}")

    return weights, bias

In [4]:
# Load the California housing dataset through scikit-learn's fetcher.
# NOTE(review): printing the whole object dumps the arrays plus the long DESCR
# text (see the output below); inspecting ch.keys() or ch.DESCR separately
# would be easier to read.
ch = fetch_california_housing()
print(ch)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

In [5]:
# Pull out the feature matrix and target vector, then wrap the features in a
# DataFrame labelled with the dataset's feature names for inspection.
data, target = ch.data, ch.target

df = pd.DataFrame(data=data, columns=ch.feature_names)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [6]:
# Attach the regression target as an extra column so features and label can
# be viewed side by side. This mutates df in place; re-running the cell simply
# overwrites the same column.
df['Target'] = target
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [7]:
# Count missing values per column (summing the boolean mask down the rows).
df.isna().sum(axis=0)

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# TODO: hold out a test set, e.g. train_test_split(data, target, test_size=0.2)
# Sanity check: show both raw arrays, then confirm they have the same length.
print(data, target, sep="\n")
print(*map(len, (data, target)))

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
20640 20640


In [10]:
# Display the feature matrix and the target vector together as one tuple.
(data, target)

(array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]))

In [11]:
# Run 20 epochs of gradient descent on the raw feature matrix.
# NOTE(review): the 1e-8 step size is presumably needed because the columns are
# unscaled (Population is in the hundreds to thousands while most other
# features are single or double digits) -- consider standardising X so a
# conventional learning rate can be used.
w, b = linearRegression(
    np.array(data),
    np.array(target),
    epochs=20,
    LR=1e-8,
)

Epoch: 0: MSE = 5.610483198987253
Epoch: 1: MSE = 5.278817774533281
Epoch: 2: MSE = 4.989732106868037
Epoch: 3: MSE = 4.7377524126978425
Epoch: 4: MSE = 4.518108585475579
Epoch: 5: MSE = 4.326643734938561
Epoch: 6: MSE = 4.159735355700862
Epoch: 7: MSE = 4.01422662993861
Epoch: 8: MSE = 3.8873665613900332
Epoch: 9: MSE = 3.776757805369425
Epoch: 10: MSE = 3.680311205441585
Epoch: 11: MSE = 3.5962061745887532
Epoch: 12: MSE = 3.522856169537078
Epoch: 13: MSE = 3.458878603496618
Epoch: 14: MSE = 3.403068626739029
Epoch: 15: MSE = 3.354376277786927
Epoch: 16: MSE = 3.311886571909313
Epoch: 17: MSE = 3.2748021493206014
Epoch: 18: MSE = 3.242428154023109
Epoch: 19: MSE = 3.2141590565348648


In [12]:
# Display the fitted weight vector and bias term.
(w, b)

(array([ 2.94911254e-06,  1.86293941e-05,  3.49697344e-06,  6.60311215e-07,
         6.54701959e-04,  1.62217837e-06,  2.15068815e-05, -7.25893632e-05]),
 6.064428179810929e-07)