In [95]:
import polars as pl
from sklearn.model_selection import train_test_split
import numpy as np

f_name = 'data.csv'
df = pl.read_csv(f_name, has_header=True, sep=',') # comma separated data
X, y = df.drop('CRIM'), df.get_column('CRIM')
X = X.with_columns(pl.col('RM').fill_null(pl.median('RM')),)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2023)
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

In [103]:
l = [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4] # todo: choose lambda values

def linear_regression(X, y, lambdas):
    weights = []
    phi = X
    I = np.eye(X.shape[1])
    for l in lambdas:
        inside_part = np.linalg.inv(l*I + np.matmul(phi.T, phi))
        w = np.matmul(np.matmul(inside_part, phi.T), y)
        weights.append(w)
    return weights

weights = linear_regression(X_train, y_train, l)

In [104]:
def score(X, y, w):
    preds = []
    for x in X:
        preds.append(np.dot(x, w))
    preds = np.array(preds)
    u = ((y - preds)**2).sum()
    v = ((y - y.mean())**2).sum()
    r2 = 1 - (u/v)
    return r2

scores = [score(X_test, y_test, w) for w in weights]
scores

[SeriesView(0.6458918),
 SeriesView(0.64606729),
 SeriesView(0.64712548),
 SeriesView(0.64963495),
 SeriesView(0.65362169),
 SeriesView(0.6515999),
 SeriesView(0.60433599)]

In [102]:
from sklearn import linear_model

reg = linear_model.Ridge(alpha = 0.5)
reg.fit(X_train, y_train)

print(reg.score(X_test, y_test))
print('coef', reg.coef_)
print('intercept', reg.intercept_)

print(y.mean())
print(y.std())

0.655139480215041
coef [ 4.89336777e-02 -7.69160128e-02 -1.11339495e+00 -7.74148758e+00
  3.36686359e-01  9.15322510e-04 -9.65475754e-01  6.06510642e-01
 -4.43596486e-03 -1.99097686e-01 -8.64132516e-03  1.36950240e-01
 -1.85077068e-01]
intercept 15.102402999255764
3.584138577299413
8.564433333509855
