In [2]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_diabetes  # only for the dataset


In [3]:
SEED = 42
RUN_SKLEARN_BASELINES = False  # flip to True if you want quick scikit-learn baselines for comparison
SHOW_PLOTS = True     
rng = np.random.default_rng(SEED)

# Discussion! We're going to make a model that predicts things from input data. What do we need? 

## Your job: pull the model, examine the data. Tell me what you think!
What do we need to get a working model? How do we know our model is working? 

In [23]:
import numpy as np
from sklearn.datasets import load_diabetes, fetch_california_housing

In [25]:
X,y = load_diabetes(return_X_y=True)

In [26]:
# What should we do with our data? The numbers are all weird! Having some numbers big and some numbers small is usually bad for machine learning.
# How do we deal with that?

mu = X.mean(axis=0)
std = X.std(axis=0)
std[std == 0] = 1.0

X = (X - mu) / std


X_train = X[:int(0.9*N)]
y_train = y[:int(0.9*N)]

X_test = X[int(0.9*N):]
y_test = y[int(0.9*N):]


## Your job: implement linear regression!

In [27]:
# now let's fit our data

Xb = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=-1)
c = (np.linalg.inv(Xb.T @ Xb) @ Xb.T) @ y_train

# Let's predict and see how we did!
Xb = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=-1)
pred = Xb @ c

# Okay, here are some numbers. But how do we know how well we're doing?
print(pred)

[181.95177307 167.50597779 187.32749901 186.48982097  90.53889486
 152.29290681 249.68514648 198.16516254 280.17000011  50.22865624
 176.55241981 201.52703178 172.46995153 154.24646795 151.5408002
 234.64265801 123.10723898 165.12038061 174.58177662 226.79967638
 152.27612963 100.83566119  84.39873487 144.19264954 191.85739585
 195.09067236 153.4649039  172.68980388 111.18932973 164.07442157
 132.26666768 258.73725864 100.30622703 117.14136621 123.18224018
 219.40774991  63.59865959 133.17750337 120.60336132  53.78776693
 190.71934981 104.81970426 123.76995157 209.04586296  55.47592774]


  c = (np.linalg.inv(Xb.T @ Xb) @ Xb.T) @ y_train
  c = (np.linalg.inv(Xb.T @ Xb) @ Xb.T) @ y_train
  c = (np.linalg.inv(Xb.T @ Xb) @ Xb.T) @ y_train


# Your job: Implement R2 -- how do we know how well our model is doing?

In [29]:
mse = ((y_test - pred)**2).sum()
std = ((y_test - y_test.mean())**2).sum()
r2 = 1.0 - mse / (std + 1.e-8)
r2

np.float64(0.687298951549071)

# Your job: Implement ridge regression! It shouldn't take too many changes from above

In [10]:
lam = 10
Xb = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=-1)

reg = lam * np.eye(Xb.shape[1])
reg[-1,-1] = 0
A = Xb.T @ Xb + reg
c = (np.linalg.inv(A) @ Xb.T) @ y_train

# Let's predict and see how we did!
Xb = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=-1)
pred = Xb @ c

# Okay, here are some numbers. But how do we know how well we're doing?
print(pred)
mse = ((y_test - pred)**2).sum()
std = ((y_test - y_test.mean())**2).sum()
r2 = 1.0 - mse / (std + 1.e-8)
r2

[134.72899285  54.63676056 174.51018216  82.13952989 204.87677874
 105.43083279 150.50759784  69.96848838 142.22822348 210.26839222
 218.4714877  142.90037255 208.10461505 201.29847611 247.78614823
 216.78476254  77.70270435  97.21517407 248.00187449  94.96664279
 152.24761686 218.71058689 149.17143062 137.46418987 184.15241299
 137.00963333  88.18320401  96.48169195 116.06097024 259.51857631
 134.74399245 187.83929836 140.80935189 142.86002186 174.8058601
  67.71293423  90.88436565 183.22716498 194.53489088  70.55534178
 156.71635819 237.19976306 127.40959575 142.44089547 141.50463094]


  A = Xb.T @ Xb + reg
  A = Xb.T @ Xb + reg
  A = Xb.T @ Xb + reg
  c = (np.linalg.inv(A) @ Xb.T) @ y_train
  c = (np.linalg.inv(A) @ Xb.T) @ y_train
  c = (np.linalg.inv(A) @ Xb.T) @ y_train


np.float64(0.47536917375078813)