In [41]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes

In [42]:
dt = load_diabetes()
print(dt['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [43]:
x = dt.data
y = dt.target
print('features: ', dt.feature_names)
print('First five: ', y[:5])

features:  ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
First five:  [151.  75. 141. 206. 135.]


In [44]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

feature_scaler = StandardScaler()
target_scaler = StandardScaler()

x = feature_scaler.fit_transform(x)

y = y.reshape(-1, 1)
y = target_scaler.fit_transform(y)
y = y.ravel()

x, x_test , y, y_test = train_test_split(x, y ,test_size=0.2, random_state=42)

In [45]:
m, n = x.shape
w = np.zeros(n)
b =0

In [46]:
def predict(x, w, b):
    return np.dot(x, w) + b

In [47]:
def comp_cost(x, y, w, b):
    m = len(y)
    y_pred = predict(x , w , b)
    cost = (1 / (2 * m)) * np.sum((y_pred - y) ** 2)

    return cost

In [48]:
def comp_grad(X, y, w, b):

    m = len(y)
    y_pred = predict(X, w, b)
    error = y_pred - y

    dw = (1 / m) * np.dot(X.T, error)
    db = (1 / m) * np.sum(error)

    return dw, db

In [49]:
def update_para(w, b, dw, db, learning_rate):

    w = w - learning_rate * dw
    b = b - learning_rate * db

    return w, b

In [50]:
w = np.zeros(n)
b = 0

learning_rate = .001
num_iterations = 10000
cost_history = []

parameters = {}

for i in range(num_iterations):

    y_pred = predict(x, w, b)
    cost = comp_cost(x, y, w, b)
    dw, db = comp_grad(x, y, w, b)
    w, b = update_para(w, b, dw, db, learning_rate)

    if i % 1000 == 0:
      cost_history.append(cost)
      print(f'Iteration {i}: Cost = {cost:.4f}')

      parameters = {'weights': w.tolist(), 'bias': b}

Iteration 0: Cost = 0.5126
Iteration 1000: Cost = 0.2580
Iteration 2000: Cost = 0.2465
Iteration 3000: Cost = 0.2448
Iteration 4000: Cost = 0.2444
Iteration 5000: Cost = 0.2443
Iteration 6000: Cost = 0.2442
Iteration 7000: Cost = 0.2442
Iteration 8000: Cost = 0.2442
Iteration 9000: Cost = 0.2441


In [51]:
y_pred = predict(x , w, b)
final_cost = comp_cost(x, y, w, b)
mse = 2 * final_cost

print(f'Final Cost: {final_cost:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print('Final Weights:', w)
print('Final Bias:', b)

Final Cost: 0.2441
Mean Squared Error: 0.4881
Final Weights: [ 0.0259686  -0.14838151  0.34424672  0.21164906 -0.08594714 -0.06166533
 -0.12128665  0.09817543  0.26812243  0.0340318 ]
Final Bias: -0.010765538732703505


In [52]:
y_pred_test = predict(x_test, w, b)

print(f"Residual Analysis : {(np.abs(y_test - y_pred_test)).mean()}")

mae = np.abs(y_test - y_pred_test).mean()
mse = ((y_test - y_pred_test)**2).mean()
rmse = np.sqrt(((y_test - y_pred_test)**2).sum()/len(y_test))


print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


SS_res = np.sum((y_test - y_pred_test)**2)        
SS_tot = np.sum((y_test - np.mean(y_test))**2)    

r2_ = 1 - (SS_res / SS_tot)

print(f"R-squared: {r2_:.4f}")

Residual Analysis : 0.5570976012136875
Mean Absolute Error (MAE): 0.5571
Mean Squared Error (MSE): 0.4865
Root Mean Squared Error (RMSE): 0.6975
R-squared: 0.4554


In [59]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = SGDRegressor(
    loss='squared_error',
    alpha=0.0,
    learning_rate='constant', 
    eta0=0.001,                 
    max_iter=1000,
    random_state=42
  )


model.fit(x, y)  

y_pred_test = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

Mean Squared Error: 0.4869
R-squared: 0.4551


In [60]:
print('scratch model weights : ' , model.coef_)
print('sklearn model weights : ' , w)

scratch model weights :  [ 0.02570603 -0.14462578  0.33891888  0.20913889 -0.0605443  -0.08162086
 -0.13815758  0.08992119  0.25121163  0.04993703]
sklearn model weights :  [ 0.0259686  -0.14838151  0.34424672  0.21164906 -0.08594714 -0.06166533
 -0.12128665  0.09817543  0.26812243  0.0340318 ]
