In [2]:
import numpy as np

## Regression - Polynomial features

In [3]:
import sklearn

# Report which scikit-learn release this notebook was executed with
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

The scikit-learn version is 1.4.1.post1.


# 3a 

In [4]:
# import the dataset
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset object; later cells read
# `.data`, `.target` and `.feature_names` from it.
california = fetch_california_housing()

# Optional: show the dataset's own description text (not needed for the
# assignment, but useful context for the features and the target).
dataset_description = california.DESCR
print(dataset_description)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

Creating the data matrix

In [None]:
# Data matrix D (features) and target vector y, taken from the dataset object
D, y = california.data, california.target

# D.shape unpacks into (number of samples, number of features)
n, d = D.shape
print(
    f"Number of samples: {n}",
    f"Dimension of the features: {d}",
    sep="\n",
)

Number of samples: 20640
Dimension of the features: 8


Preprocessing

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise the data matrix: first learn the scaling statistics from D,
# then apply them to D to obtain the scaled copy used downstream.
scaler = StandardScaler().fit(D)
D_scaled = scaler.transform(D)

Creating a design matrix with polynomial features

In [11]:
# using the scaled data matrix
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the scaled features, constant column included
aff = PolynomialFeatures(degree=2, include_bias=True)
aff.fit(D_scaled)
X = aff.transform(D_scaled)

print("Shape of the design matrix:", X.shape)

Shape of the design matrix: (20640, 45)


# 3b

Training the model

In [12]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the polynomial design matrix.
# NOTE(review): X was built with include_bias=True, so it carries a constant
# column in addition to the intercept LinearRegression fits by default.
reg_model = LinearRegression().fit(X, y)

# Names of the expanded features, in the same order as the fitted coefficients
feature_names = aff.get_feature_names_out(california.feature_names)
coefficients = reg_model.coef_

Print the results

In [15]:
# Pick out the coefficients of the three requested features by matching
# their names; each selection is an array whose first entry is printed.
beta_MedInc = coefficients[feature_names == "MedInc"]
beta_MedIncAveBedrms = coefficients[feature_names == "MedInc AveBedrms"]
beta_HouseAgeAveBedrms = coefficients[feature_names == "HouseAge AveBedrms"]

# Report the selected regression parameters on one line
ols_summary = f"Beta_MedInc = {beta_MedInc[0]}, Beta_MedIncAveBedrms = {beta_MedIncAveBedrms[0]}, Beta_HouseAgeAveBedrms = {beta_HouseAgeAveBedrms[0]}"
print(ols_summary)

Beta_MedInc = 0.9224368884326505, Beta_MedIncAveBedrms = -0.1675843580431766, Beta_HouseAgeAveBedrms = 0.06328854538476117


# 3c

Scale the input

In [16]:
# The built-in Ridge estimator minimises a different objective than the one
# used here, so the design matrix and the target are both divided by
# sqrt(number of samples) before fitting.
n_samples = X.shape[0]
sqrt_n_samples = np.sqrt(n_samples)
X_scaled = X / sqrt_n_samples
y_scaled = y / sqrt_n_samples

Train the model

In [17]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha = 0.1 on the rescaled design matrix and target
ridge_model = Ridge(alpha=0.1, fit_intercept=True).fit(X_scaled, y_scaled)

# Expanded feature names, aligned with the entries of the ridge coefficients
ridge_feature_names = aff.get_feature_names_out(california.feature_names)
ridge_coefficients = ridge_model.coef_

Print the results

In [18]:
# Pick out the ridge coefficients of the three requested features by
# matching their names; each selection is an array whose first entry is printed.
ridge_beta_MedInc = ridge_coefficients[ridge_feature_names == "MedInc"]
ridge_beta_MedIncAveBedrms = ridge_coefficients[ridge_feature_names == "MedInc AveBedrms"]
ridge_beta_HouseAgeAveBedrms = ridge_coefficients[ridge_feature_names == "HouseAge AveBedrms"]

# Report the selected ridge parameters on one line
ridge_summary = f"Ridge_Beta_MedInc = {ridge_beta_MedInc[0]}, Ridge_Beta_MedIncAveBedrms = {ridge_beta_MedIncAveBedrms[0]}, Ridge_Beta_HouseAgeAveBedrms = {ridge_beta_HouseAgeAveBedrms[0]}"
print(ridge_summary)

Ridge_Beta_MedInc = 0.7515683813318791, Ridge_Beta_MedIncAveBedrms = -0.0716395704911665, Ridge_Beta_HouseAgeAveBedrms = 0.01625976841452806
