# Make a preprocessing pipeline

In [37]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    dataset_path = Path("datasets/housing")
    if not dataset_path.is_dir():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
        tarball_path.unlink()
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()

import numpy as np
from sklearn.model_selection import train_test_split

housing["income_cat"] = pd.cut(x=housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1,2,3,4,5])
strat_test_set, strat_train_set = train_test_split(housing, test_size=0.8, random_state=42, stratify=housing["income_cat"])

for set in (strat_test_set, strat_train_set):
    set.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.copy()
housing_labels = housing["median_house_value"].copy()
housing = housing.drop("median_house_value", axis=1)

from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.k_means = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.k_means.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        return rbf_kernel(X, self.k_means.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder

def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return [f"ratio"]

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler()
    )

def log_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(np.log, feature_names_out="one-to-one"),
        StandardScaler()
    )

ratio_pipeline = ratio_pipeline()
log_pipeline = log_pipeline()
cluster_similarity_pipeline = ClusterSimilarity(random_state=42)
default_num_pipeline = make_pipeline( SimpleImputer(strategy="median"), StandardScaler())
cat_pipeline = make_pipeline( SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))

from sklearn.compose import ColumnTransformer, make_column_selector

preprocessing = ColumnTransformer([
    ("bedrooms", ratio_pipeline, ["total_bedrooms", "total_rooms"]),
    ("rooms_per_household", ratio_pipeline, ["total_rooms", "households"]),
    ("people_per_household", ratio_pipeline, ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    ("geo", cluster_similarity_pipeline, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object))
], remainder=default_num_pipeline)

preprocessing

housing_prepared = preprocessing.fit_transform(housing)

df_housing_prepared = pd.DataFrame(housing_prepared, columns=preprocessing.get_feature_names_out())
df_housing_prepared

Unnamed: 0,bedrooms__ratio,rooms_per_household__ratio,people_per_household__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 1 similarity,...,geo__Cluster 6 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,geo__Cluster 9 similarity,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN,remainder__housing_median_age
0,-0.115463,0.175901,-0.081611,-0.233330,-0.224629,-0.763468,-0.390884,-0.652966,4.330079e-10,8.720414e-01,...,2.072444e-01,3.062673e-18,1.917707e-01,5.922961e-15,0.0,0.0,0.0,1.0,0.0,1.852006
1,3.392673,-1.269818,-0.000687,0.948421,-0.068932,1.084226,0.983747,-1.753042,4.220179e-01,7.523119e-13,...,1.151436e-09,4.001500e-01,1.027699e-14,9.771878e-01,1.0,0.0,0.0,0.0,0.0,1.058308
2,0.113894,-0.514412,0.037248,-1.068173,-1.127598,-0.568310,-0.864053,-0.559418,2.728091e-09,9.883300e-01,...,2.457096e-01,3.188926e-17,1.204039e-01,5.819218e-14,0.0,0.0,0.0,1.0,0.0,1.613897
3,-0.574187,-0.041776,0.028806,0.636091,0.826702,1.055236,0.812218,0.202467,2.006308e-03,4.316807e-21,...,1.115871e-16,4.555236e-01,5.278432e-23,1.336819e-01,1.0,0.0,0.0,0.0,0.0,-0.132240
4,1.555326,-0.949710,-0.027059,1.096122,0.502605,1.150751,1.192014,-0.929259,4.357781e-01,1.048803e-12,...,1.233398e-09,3.244096e-01,9.886507e-15,9.501603e-01,1.0,0.0,0.0,0.0,0.0,0.423349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-1.079713,0.568447,-0.016369,-0.144165,0.343426,0.015881,-0.012034,1.179642,1.788325e-01,1.140715e-14,...,4.178290e-11,7.217128e-01,2.125380e-16,9.294250e-01,1.0,0.0,0.0,0.0,0.0,0.105870
16508,0.253309,-0.384569,0.068304,-0.133415,-0.273921,0.332242,-0.087195,-1.108663,1.470529e-01,7.987878e-06,...,2.192926e-03,4.182783e-04,3.932690e-06,4.281375e-03,0.0,1.0,0.0,0.0,0.0,-0.052870
16509,0.365687,-0.488483,0.080688,0.015287,-0.171176,0.561659,0.095085,-0.946379,3.567930e-01,2.949598e-13,...,5.830456e-10,4.883137e-01,4.788814e-15,9.918576e-01,1.0,0.0,0.0,0.0,0.0,0.978938
16510,-0.493076,0.720625,-0.047573,0.062513,0.230949,-0.346967,-0.198301,0.042774,7.432146e-09,1.521295e-03,...,7.534887e-02,1.675363e-14,1.812703e-01,7.741209e-13,0.0,1.0,0.0,0.0,0.0,-0.290980


# Train a linear regression model

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lin_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("linear_regression", LinearRegression())
])

lin_reg.fit(housing, housing_labels)

from sklearn.metrics import root_mean_squared_error

train_predictions = lin_reg.predict(housing)

root_mean_squared_error(housing_labels, train_predictions)

cross_val_lin_reg = cross_val_score(lin_reg, housing, housing_labels, cv=10, scoring="neg_root_mean_squared_error")
cv_result = pd.Series(-cross_val_lin_reg)
cv_result.describe()

count       10.000000
mean     70135.820724
std       3082.130114
min      65789.881531
25%      69178.249323
50%      69500.358780
75%      70103.989545
max      77943.870341
dtype: float64

# Train a decision tree model

In [39]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("tree_regressor", DecisionTreeRegressor())
])

tree_reg.fit(housing, housing_labels)

train_predictions = tree_reg.predict(housing)

root_mean_squared_error(housing_labels, train_predictions)

cross_val_tree_reg = cross_val_score(tree_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

cv_result = pd.Series(-cross_val_tree_reg)
cv_result.describe()

count       10.000000
mean     65819.423787
std       2594.779987
min      61901.216356
25%      64173.231362
50%      65820.421453
75%      66607.943633
max      70347.372844
dtype: float64

# Train a random forest model

In [40]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest_regressor", RandomForestRegressor())
])

rf_reg.fit(housing, housing_labels)
train_predictions = rf_reg.predict(housing)

root_mean_squared_error(housing_labels, train_predictions)

cross_val_rf_reg = cross_val_score(rf_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

cv_result = pd.Series(-cross_val_rf_reg)
cv_result.describe()

count       10.000000
mean     46918.587670
std       1963.944960
min      43743.022420
25%      45422.351427
50%      47422.356499
75%      48364.621269
max      49664.975683
dtype: float64

# Use a grid search to pick the best hyperparameters for random forest model

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'random_forest_regressor__n_estimators': [5,8,10],
        'preprocessing__geo__n_clusters': [4,6,8]
    },
    {
        'random_forest_regressor__n_estimators': [10, 15],
        'preprocessing__geo__n_clusters': [6, 8, 10]
    }
]

grid_search = GridSearchCV(rf_reg, param_grid, scoring="neg_root_mean_squared_error", cv=5)

grid_search.fit(housing, housing_labels)

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                SimpleImputer(strategy='median')),
                                               ('standardscaler',
                                                StandardScaler())]),
                     transformers=[('bedrooms',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('functiontransformer',
                                                     FunctionTransformer(feature_names_out=<function ratio_name at 0x7b6844095b20>,
                                                                         func=<function column_ratio...
                                    ['total_bedrooms', 'total_rooms', 'population',
                                     'households', 'median_income']),

In [20]:
pd.DataFrame(grid_search.cv_results_).sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_random_forest_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,2.23536,0.034106,0.012278,8.5e-05,10,15,"{'preprocessing__geo__n_clusters': 10, 'random...",-50861.81073,-48837.962555,-48767.886243,-48369.333155,-46866.775762,-48740.753689,1277.404806,1
12,1.959425,0.013851,0.0123,0.00012,8,15,"{'preprocessing__geo__n_clusters': 8, 'random_...",-50382.080798,-49892.334545,-48266.988268,-49155.341988,-47316.877346,-49002.724589,1104.600413,2
13,1.52282,0.018469,0.010059,0.0001,10,10,"{'preprocessing__geo__n_clusters': 10, 'random...",-51633.701395,-49502.158693,-48717.661168,-49568.756564,-47674.271881,-49419.30994,1301.552654,3
10,1.777652,0.021691,0.012175,7.2e-05,6,15,"{'preprocessing__geo__n_clusters': 6, 'random_...",-51626.179987,-50346.922544,-48493.4506,-50173.258846,-48700.776202,-49868.117636,1154.60331,4
8,1.354608,0.014889,0.009868,0.000143,8,10,"{'preprocessing__geo__n_clusters': 8, 'random_...",-51516.834135,-50596.75903,-49022.245989,-50443.191493,-48218.585459,-49959.523221,1181.065168,5
11,1.342147,0.007077,0.009873,0.000115,8,10,"{'preprocessing__geo__n_clusters': 8, 'random_...",-51001.898311,-50419.619359,-49455.06078,-50534.525059,-48737.462999,-50029.713302,818.796886,6
5,1.238253,0.007402,0.009625,9.3e-05,6,10,"{'preprocessing__geo__n_clusters': 6, 'random_...",-52529.207627,-50962.066142,-49880.759516,-50281.3194,-48401.035181,-50410.877573,1351.177854,7
7,1.110964,0.021576,0.008778,9.5e-05,8,8,"{'preprocessing__geo__n_clusters': 8, 'random_...",-51289.204826,-51364.477594,-50163.167034,-50500.685036,-49040.091498,-50471.525198,849.811914,8
9,1.224621,0.011524,0.009805,0.000156,6,10,"{'preprocessing__geo__n_clusters': 6, 'random_...",-52629.902104,-50778.878139,-49259.384851,-51234.819951,-49425.455283,-50665.688066,1241.831387,9
4,1.028748,0.011415,0.009055,9.8e-05,6,8,"{'preprocessing__geo__n_clusters': 6, 'random_...",-53459.951479,-51136.046066,-50583.672164,-51417.604943,-49272.105275,-51173.875985,1360.297648,10


# Evaluate the best model found by grid search on a test dataset

In [30]:
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set[["median_house_value"]]

grid_search_best_model = grid_search.best_estimator_
y_predicted = grid_search_best_model.predict(X_test)

final_rsme = root_mean_squared_error(y_test, y_predicted)
final_rsme

48906.15129156975

# Use a randomized search to pick the best hyperparameters for a random forest regressor

In [100]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'random_forest_regressor__n_estimators': randint(low=2, high=50),
    'preprocessing__geo__n_clusters': randint(low=2, high=20)
}

rnd_search = RandomizedSearchCV(rf_reg, param_distribs, scoring="neg_root_mean_squared_error", n_iter=20, cv=10, n_jobs=-1)
rnd_search.fit(housing, housing_labels)

# Evaluate the best model found by randomized search on a test dataset

In [101]:
best_overall_model = rnd_search.best_estimator_
y_predicted = rnd_search_best_model.predict(X_test)

final_rsme = root_mean_squared_error(y_test, y_predicted)
final_rsme

47663.91755290739

# Exercise 1
Try a support vector machine regressor (sklearn.svm.SVR) with various
hyperparameters, such as kernel="linear" (with various values for the
C hyperparameter) or kernel="rbf" (with various values for the C and gamma
hyperparameters). Note that support vector machines don’t scale well to large
datasets, so you should probably train your model on just the first 5,000 instances
of the training set and use only 3-fold cross-validation, or else it will take hours.
Don’t worry about what the hyperparameters mean for now; we’ll discuss them
in Chapter 5. How does the best SVR predictor perform?

In [70]:
# housing_small_dataset = housing[:5000]
# housing_small_labels = housing_labels[:5000]

# len(housing_small_dataset), len(housing_small_labels)

from sklearn.svm import SVR

svr = Pipeline ([
    ("preprocessing", preprocessing),
    ("svr", SVR(kernel="linear"))
])

svr.fit(housing, housing_labels)
svr.predict(X_test)

def describe_cross_val_score(cross_val_score):
    print(pd.Series(cross_val_score).describe())

cross_val_svr = cross_val_score(svr, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
describe_cross_val_score(cross_val_svr)

svr = Pipeline([
    ("preprocessing", preprocessing),
    ("svr", SVR(kernel="rbf"))
])

cross_val_svr = -cross_val_score(svr, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
describe_cross_val_score(cross_val_svr)

count        10.000000
mean    -111609.820263
std        3240.423460
min     -116496.253975
25%     -113473.600528
50%     -111322.953146
75%     -110136.717250
max     -106655.686986
dtype: float64
count        10.000000
mean     118074.042081
std        3278.332875
min      113127.237618
25%      116846.502097
50%      117821.887487
75%      119902.255172
max      122935.620633
dtype: float64


## Explore different kernels, C param, and gamma param using grid search cv

In [89]:
svr = Pipeline ([
    ("preprocessing", preprocessing),
    ("svr", SVR())
])

param_grid = [
    {
        "svr__kernel": ["linear", "rbf"],
        "svr__C": [0.001, 0.01, 0.1, 1., 10., 50., 100.],
        "svr__gamma": [0.001, 0.01, 0.1, 1., 10., 50., 100.]
    }
]

grid_search = GridSearchCV(svr, param_grid, scoring="neg_root_mean_squared_error", cv=3, n_jobs=-1)

In [90]:
grid_search.fit(housing, housing_labels)

In [91]:
grid_search.best_params_

{'svr__C': 100.0, 'svr__gamma': 0.001, 'svr__kernel': 'linear'}

In [80]:
best_svr_model = grid_search.best_estimator_

In [81]:
cross_val_svr = -cross_val_score(best_svr_model, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
describe_cross_val_score(cross_val_svr)

count       10.000000
mean     74892.891869
std       2378.700548
min      70761.935434
25%      73915.368623
50%      75338.680338
75%      76767.589127
max      77396.284836
dtype: float64


In [82]:
svr_preds = best_svr_model.predict(X_test)
svr_test_score = root_mean_squared_error(y_test, svr_preds)
svr_test_score

75323.9563864252

# Exercise 2
Try replacing the GridSearchCV with a RandomizedSearchCV.

In [87]:
from scipy.stats import uniform

param_dist = [
    {
        "svr__kernel": ["linear", "rbf"],
        "svr__C": uniform(0.001, 100.0 - 0.001),
        "svr__gamma": uniform(0.001, 100.0 - 0.001)
    }
]

rnd_search = RandomizedSearchCV(svr, param_dist, cv=3, n_iter=10, scoring="neg_root_mean_squared_error", n_jobs=-1)

In [88]:
rnd_search.fit(housing, housing_labels)

In [92]:
rnd_search.best_params_

{'svr__C': 96.51957893450366,
 'svr__gamma': 39.08809697929379,
 'svr__kernel': 'linear'}

In [93]:
best_svr_model = rnd_search.best_estimator_

In [97]:
cross_val_svr = -cross_val_score(best_svr_model, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10, n_jobs=-1)
describe_cross_val_score(cross_val_svr)

count       10.000000
mean     74986.790880
std       2387.651084
min      70848.724524
25%      74001.015362
50%      75429.661011
75%      76853.817975
max      77496.149474
dtype: float64


In [98]:
svr_preds = best_svr_model.predict(X_test)
svr_test_score = root_mean_squared_error(y_test, svr_preds)

svr_test_score

75409.70983322247

# Exercise 3
Try adding a SelectFromModel transformer in the preparation pipeline to select
only the most important attributes.

In [135]:
rf_with_selection = Pipeline([
    ("preprocessing", preprocessing),
    ("select_features", SelectFromModel(RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=47, max_features=1.0))),
    ("random_forest_regressor", (RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=47, max_features=1.0)))
])

In [136]:
rf_with_selection.fit(housing, housing_labels)

In [141]:
final_preds = rf_with_selection.predict(X_test)
final_test_score = root_mean_squared_error(y_test, final_preds)

rf_with_selection[:-2].get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_household__ratio',
       'people_per_household__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)

# Exercise 4
Try creating a custom transformer that trains a k-nearest neighbors regressor
(sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs
the model’s predictions in its transform() method. Then add this feature to
the preprocessing pipeline, using latitude and longitude as the inputs to this
transformer. This will add a feature in the model that corresponds to the housing
median price of the nearest districts.


In [158]:
from sklearn.neighbors import KNeighborsRegressor

class KNearestNeighbors(TransformerMixin, BaseEstimator):
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y=None):
        self.k_neigh_reg = KNeighborsRegressor(n_neighbors=self.n_neighbors)
        self.k_neigh_reg.fit(X, y)
        return self

    def transform(self, X):
        return self.k_neigh_reg.predict(X)

knn = KNearestNeighbors()
knn_predictions = knn.fit_transform(housing[["latitude", "longitude"]], housing_labels)
pd.Series(knn_predictions)

0        163840.0
1        186700.0
2        136600.0
3        145020.0
4        371120.0
           ...   
16507    231920.0
16508     84200.0
16509    145160.0
16510    109180.0
16511    338900.2
Length: 16512, dtype: float64

# Exercise 5
Automatically explore some preparation options using GridSearchCV.

In [164]:
from sklearn.dummy import DummyRegressor

# Create a pipeline with the custom transformer
pipeline = Pipeline([
    ("k_nearest_neighbors", KNearestNeighbors()),
    ("dummy_regressor", DummyRegressor())  # Add a dummy regressor after the transformer
])

# Define the parameter grid
param_grid = [{
    "k_nearest_neighbors__n_neighbors": [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 50]
}]

# Instantiate the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, scoring="neg_root_mean_squared_error", n_jobs=-1)

# Fit the grid search on the data
grid_search.fit(housing[["latitude", "longitude"]], housing_labels)

grid_search.best_score_, grid_search.best_params_

(-115345.29086921542, {'k_nearest_neighbors__n_neighbors': 2})

# Exercise 6
Try to implement the StandardScalerClone class again from scratch,
then add support for the inverse_transform() method: executing scaler.
inverse_transform(scaler.fit_transform(X)) should return an array very
close to X. 

In [181]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):
        X = check_array(X)
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]
        if isinstance(X, pd.DataFrame):
            self.feature_names_in = X.columns
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        assert self.n_features_in_ == X.shape[1]
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_

    def inverse_transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        assert self.n_features_in_ == X.shape[1]
        X = X * self.scale_
        if self.with_mean:
            X = X + self.mean_
        return X

In [191]:
scaler_clone = StandardScalerClone()
scaler_clone.fit(housing[["total_rooms"]])
housing[["total_rooms"]], pd.DataFrame(scaler_clone.transform(housing[["total_rooms"]]), index=housing.index), pd.DataFrame(scaler_clone.inverse_transform(scaler_clone.transform(housing[["total_rooms"]])), index=housing.index)


(       total_rooms
 824         1736.0
 8578        1951.0
 9759         882.0
 15409       3819.0
 9475        2995.0
 ...            ...
 2661        2658.0
 6954        1673.0
 18626       1807.0
 19303       2443.0
 459         2937.0
 
 [16512 rows x 1 columns],
               0
 824   -0.413494
 8578  -0.313968
 9759  -0.808823
 15409  0.550757
 9475   0.169315
 ...         ...
 2661   0.013313
 6954  -0.442658
 18626 -0.380627
 19303 -0.086214
 459    0.142466
 
 [16512 rows x 1 columns],
             0
 824    1736.0
 8578   1951.0
 9759    882.0
 15409  3819.0
 9475   2995.0
 ...       ...
 2661   2658.0
 6954   1673.0
 18626  1807.0
 19303  2443.0
 459    2937.0
 
 [16512 rows x 1 columns])