In [114]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Return the housing dataset as a DataFrame, downloading it on first use.

    The CSV is read from ``datasets/housing/housing.csv``. When that file is
    missing, ``housing.tgz`` is fetched from GitHub, unpacked under
    ``datasets/`` and the downloaded archive is removed again.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.

    Raises:
        urllib.error.URLError: if the archive cannot be downloaded.
        tarfile.TarError: if the downloaded archive cannot be read.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"
    # Check for the CSV itself rather than its directory: an empty or
    # half-extracted directory must trigger a fresh download, not a
    # FileNotFoundError from read_csv.
    if not csv_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        try:
            urllib.request.urlretrieve(url, tarball_path)
            with tarfile.open(tarball_path) as housing_tarball:
                # The archive comes from the network; use the "data"
                # extraction filter when this Python version provides it so
                # members cannot escape the target directory.
                if hasattr(tarfile, "data_filter"):
                    housing_tarball.extractall(path=datasets_dir, filter="data")
                else:
                    housing_tarball.extractall(path=datasets_dir)
        finally:
            # Remove the archive even when download/extraction failed, so a
            # truncated file is never left behind.
            tarball_path.unlink(missing_ok=True)
    return pd.read_csv(csv_path)

housing = load_housing_data()

import numpy as np
from sklearn.model_selection import train_test_split

# Bucket median income into five categories so the split can be stratified on it.
housing["income_cat"] = pd.cut(x=housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1,2,3,4,5])

# NOTE: train_test_split returns (train, test). With test_size=0.8 the *second*
# returned frame holds 80% of the rows, and it is deliberately bound to
# strat_train_set here; the first (20%) frame becomes strat_test_set.
# The sizes match the conventional `train, test = ...(test_size=0.2)` call,
# but the selected rows differ, so the call is kept unchanged.
strat_test_set, strat_train_set = train_test_split(housing, test_size=0.8, random_state=42, stratify=housing["income_cat"])

# income_cat was only needed for stratification. Drop it by rebinding instead
# of an in-place loop: the previous loop variable was named `set`, which
# shadowed the builtin for every later cell.
strat_test_set = strat_test_set.drop(columns="income_cat")
strat_train_set = strat_train_set.drop(columns="income_cat")

# From here on `housing` holds the training features and `housing_labels`
# the training target.
housing = strat_train_set.copy()
housing_labels = housing["median_house_value"].copy()
housing = housing.drop("median_house_value", axis=1)

from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities with K-Means centers.

    ``fit`` runs K-Means on the input; ``transform`` returns, for every sample,
    its RBF-kernel similarity to each learned cluster center.

    Parameters:
        n_clusters: number of K-Means clusters (and of output features).
        gamma: ``gamma`` passed to ``rbf_kernel``.
        random_state: seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit K-Means on X (optionally weighted) and return self."""
        self.k_means = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
        )
        self.k_means.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of X to every cluster center."""
        centers = self.k_means.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        """Return one output feature name per cluster."""
        names = []
        for cluster_index in range(self.n_clusters):
            names.append(f"Cluster {cluster_index} similarity")
        return names

from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder

def column_ratio(X):
    """Divide column 0 of X by column 1, returning a single-column 2-D result."""
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature name for the ratio transformer.

    Both arguments are ignored; they exist because this function is passed as
    ``feature_names_out`` to a ``FunctionTransformer``, which calls it with
    the transformer and the input feature names.

    Returns:
        list[str]: always ``["ratio"]``.
    """
    # Plain string literal: the previous f-string had no placeholders.
    return ["ratio"]

def ratio_pipeline():
    """Build a pipeline: median imputation, column ratio, then standard scaling."""
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

def log_pipeline():
    """Build a pipeline: median imputation, natural log, then standard scaling."""
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(np.log, feature_names_out="one-to-one"),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Instantiate the building blocks used by the column transformer.
# NOTE: the first two assignments rebind `ratio_pipeline` / `log_pipeline`
# from the factory functions to the pipeline objects they return, so the
# factories are no longer callable under these names afterwards.
ratio_pipeline = ratio_pipeline()
log_pipeline = log_pipeline()
cluster_similarity_pipeline = ClusterSimilarity(random_state=42)

# Fallback for numeric columns not named explicitly: impute, then scale.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: fill gaps with the mode, then one-hot encode.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

from sklearn.compose import ColumnTransformer, make_column_selector

# Route each group of columns to its pipeline; every column not listed here
# goes through default_num_pipeline via `remainder`.
preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline, ["total_bedrooms", "total_rooms"]),
        ("rooms_per_household", ratio_pipeline, ["total_rooms", "households"]),
        ("people_per_household", ratio_pipeline, ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("geo", cluster_similarity_pipeline, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)

preprocessing

housing_prepared = preprocessing.fit_transform(housing)

# Wrap the transformed array in a DataFrame labelled with the generated
# feature names so the result can be inspected.
df_housing_prepared = pd.DataFrame(
    data=housing_prepared,
    columns=preprocessing.get_feature_names_out(),
)
df_housing_prepared

Unnamed: 0,bedrooms__ratio,rooms_per_household__ratio,people_per_household__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 1 similarity,...,geo__Cluster 6 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,geo__Cluster 9 similarity,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN,remainder__housing_median_age
0,-0.115463,0.175901,-0.081611,-0.233330,-0.224629,-0.763468,-0.390884,-0.652966,4.330079e-10,8.720414e-01,...,2.072444e-01,3.062673e-18,1.917707e-01,5.922961e-15,0.0,0.0,0.0,1.0,0.0,1.852006
1,3.392673,-1.269818,-0.000687,0.948421,-0.068932,1.084226,0.983747,-1.753042,4.220179e-01,7.523119e-13,...,1.151436e-09,4.001500e-01,1.027699e-14,9.771878e-01,1.0,0.0,0.0,0.0,0.0,1.058308
2,0.113894,-0.514412,0.037248,-1.068173,-1.127598,-0.568310,-0.864053,-0.559418,2.728091e-09,9.883300e-01,...,2.457096e-01,3.188926e-17,1.204039e-01,5.819218e-14,0.0,0.0,0.0,1.0,0.0,1.613897
3,-0.574187,-0.041776,0.028806,0.636091,0.826702,1.055236,0.812218,0.202467,2.006308e-03,4.316807e-21,...,1.115871e-16,4.555236e-01,5.278432e-23,1.336819e-01,1.0,0.0,0.0,0.0,0.0,-0.132240
4,1.555326,-0.949710,-0.027059,1.096122,0.502605,1.150751,1.192014,-0.929259,4.357781e-01,1.048803e-12,...,1.233398e-09,3.244096e-01,9.886507e-15,9.501603e-01,1.0,0.0,0.0,0.0,0.0,0.423349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-1.079713,0.568447,-0.016369,-0.144165,0.343426,0.015881,-0.012034,1.179642,1.788325e-01,1.140715e-14,...,4.178290e-11,7.217128e-01,2.125380e-16,9.294250e-01,1.0,0.0,0.0,0.0,0.0,0.105870
16508,0.253309,-0.384569,0.068304,-0.133415,-0.273921,0.332242,-0.087195,-1.108663,1.470529e-01,7.987878e-06,...,2.192926e-03,4.182783e-04,3.932690e-06,4.281375e-03,0.0,1.0,0.0,0.0,0.0,-0.052870
16509,0.365687,-0.488483,0.080688,0.015287,-0.171176,0.561659,0.095085,-0.946379,3.567930e-01,2.949598e-13,...,5.830456e-10,4.883137e-01,4.788814e-15,9.918576e-01,1.0,0.0,0.0,0.0,0.0,0.978938
16510,-0.493076,0.720625,-0.047573,0.062513,0.230949,-0.346967,-0.198301,0.042774,7.432146e-09,1.521295e-03,...,7.534887e-02,1.675363e-14,1.812703e-01,7.741209e-13,0.0,1.0,0.0,0.0,0.0,-0.290980


In [116]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
# cross_val_score is called further down in this cell, so it is imported here;
# without this import the cell raises NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# Linear regression on top of the shared preprocessing transformer.
lin_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("linear_regression", LinearRegression())
])

lin_reg.fit(housing, housing_labels)

# RMSE on the training data itself. A bare expression in the middle of a cell
# is not displayed, so the value is stored and printed.
train_predictions = lin_reg.predict(housing)
lin_train_rmse = root_mean_squared_error(housing_labels, train_predictions)
print(f"Linear regression training RMSE: {lin_train_rmse:,.2f}")

# 10-fold cross-validation; the scorer returns negated RMSE, hence the sign flip.
cross_val_lin_reg = cross_val_score(lin_reg, housing, housing_labels, cv=10, scoring="neg_root_mean_squared_error")
cv_result = pd.Series(-cross_val_lin_reg)
cv_result.describe()

count       10.000000
mean     70142.751198
std       3113.888756
min      65771.372741
25%      69153.743520
50%      69513.154904
75%      70103.989545
max      78044.916910
dtype: float64

In [117]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Decision tree on top of the shared preprocessing transformer.
# random_state is fixed so repeated runs produce the same tree and scores.
tree_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("tree_regressor", DecisionTreeRegressor(random_state=42))
])

tree_reg.fit(housing, housing_labels)

# RMSE on the training data itself. A bare expression in the middle of a cell
# is not displayed, so the value is stored and printed.
train_predictions = tree_reg.predict(housing)
tree_train_rmse = root_mean_squared_error(housing_labels, train_predictions)
print(f"Decision tree training RMSE: {tree_train_rmse:,.2f}")

# 10-fold cross-validation; the scorer returns negated RMSE, hence the sign flip.
cross_val_tree_reg = cross_val_score(tree_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

cv_result = pd.Series(-cross_val_tree_reg)
cv_result.describe()

count       10.000000
mean     65491.999411
std       2507.167240
min      63344.942215
25%      63870.724479
50%      64534.752842
75%      66295.234628
max      71475.841311
dtype: float64

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on top of the shared preprocessing transformer.
# random_state is fixed so repeated runs produce the same forest and scores.
rf_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest_regressor", RandomForestRegressor(random_state=42))
])

rf_reg.fit(housing, housing_labels)

# RMSE on the training data itself. A bare expression in the middle of a cell
# is not displayed, so the value is stored and printed.
train_predictions = rf_reg.predict(housing)
rf_train_rmse = root_mean_squared_error(housing_labels, train_predictions)
print(f"Random forest training RMSE: {rf_train_rmse:,.2f}")

# 10-fold cross-validation; the scorer returns negated RMSE, hence the sign flip.
cross_val_rf_reg = cross_val_score(rf_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

cv_result = pd.Series(-cross_val_rf_reg)
cv_result.describe()