In [1]:
import numpy as np

## Regression - Polynomial features

In [2]:
import sklearn

# Report the installed scikit-learn release alongside the results below.
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

The scikit-learn version is 1.3.2.


# 3a 

In [3]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset and show its bundled description text.
california = fetch_california_housing()
dataset_description = california.DESCR
print(dataset_description)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

Creating the data matrix

In [4]:
# Feature matrix and regression target (median house value) from the dataset.
D, y = california.data, california.target

# n: number of samples (rows), d: number of raw features (columns).
n, d = D.shape
print(n, d)

20640 8


Preprocessing: standardize every feature to zero mean and unit variance

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the raw features, then apply it to the same data.
scaler = StandardScaler().fit(D)
D_scaled = scaler.transform(D)

Creating a design matrix with polynomial features

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: bias column, linear terms, squares and pairwise products.
aff = PolynomialFeatures(degree=2, include_bias=True)
aff.fit(D_scaled)
X = aff.transform(D_scaled)

print("Shape of the design matrix:", X.shape)

Shape of the design matrix: (20640, 45)


# 3b

Training the model

In [7]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the polynomial design matrix; fit() returns the
# estimator, so construction and training can be chained.
reg_model = LinearRegression().fit(X, y)

# Labels of the columns of X (e.g. "MedInc", "MedInc AveBedrms"), in the same
# order as the fitted coefficients.
feature_names = aff.get_feature_names_out(california.feature_names)
coefficients = reg_model.coef_

Print the results

In [16]:
# Boolean masks over the feature labels pick out the matching coefficient
# entries; each result is a one-element array.
beta_MedInc = coefficients[feature_names == "MedInc"]
beta_MedIncAveBedrms = coefficients[feature_names == "MedInc AveBedrms"]
beta_HouseAgeAveBedrms = coefficients[feature_names == "HouseAge AveBedrms"]

print(
    "Results: ",
    beta_MedInc[0],
    ", ",
    beta_MedIncAveBedrms[0],
    ", ",
    beta_HouseAgeAveBedrms[0],
)

Results:  0.9224368884326493 ,  -0.16758435804317226 ,  0.06328854538476211


# 3c

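Scale both the design matrix and the target by $1/\sqrt{n}$, so that scikit-learn's ridge objective $\|y - X\beta\|^2 + \alpha\|\beta\|^2$ becomes $\frac{1}{n}\|y - X\beta\|^2 + \alpha\|\beta\|^2$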

In [9]:
# Dividing both X and y by sqrt(n) puts a 1/n factor on the squared-error term.
n_samples = X.shape[0]
root_n = np.sqrt(n_samples)
X_scaled = X / root_n
y_scaled = y / root_n

Train the model

In [10]:
from sklearn.linear_model import Ridge

# L2-regularised fit on the rescaled data; fit() returns the estimator itself.
ridge_model = Ridge(alpha=0.1, fit_intercept=True).fit(X_scaled, y_scaled)

# Coefficients and the labels of the design-matrix columns they belong to.
ridge_coefficients = ridge_model.coef_
ridge_feature_names = aff.get_feature_names_out(california.feature_names)

Print the results

In [17]:
# Boolean masks over the feature labels pick out the matching ridge
# coefficients; each result is a one-element array.
ridge_beta_MedInc = ridge_coefficients[ridge_feature_names == "MedInc"]
ridge_beta_MedIncAveBedrms = ridge_coefficients[ridge_feature_names == "MedInc AveBedrms"]
ridge_beta_HouseAgeAveBedrms = ridge_coefficients[ridge_feature_names == "HouseAge AveBedrms"]

print(
    "Results: ",
    ridge_beta_MedInc[0],
    ", ",
    ridge_beta_MedIncAveBedrms[0],
    ", ",
    ridge_beta_HouseAgeAveBedrms[0],
)

Results:  0.7515683813318952 ,  -0.07163957049124096 ,  0.016259768414560047
