## Ch2 Exercises : End-to-End Machine Learning Project


I'll use previous **Housing** dataset to compare exercises results with what we perform in the whole chapter process


### Import Libraries


In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer,\
    OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.svm import SVR

from sklearn import set_config
set_config(display='diagram')
warnings.filterwarnings('ignore')

### Data Preparation


I'll prepare the data then make the preprocessing steps in full pipeline.


In [2]:
data = pd.read_csv('datasets/housing/housing.csv')
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
quantiles = [data['median_income'].quantile(
    i) for i in [0, .20, .40, .60, .80, 1]]

data['median_income_cat'] = pd.cut(data['median_income'], bins=quantiles,
                                   labels=[1, 2, 3, 4, 5],
                                   include_lowest=True
                                   )

In [4]:
data, true_labels = data.drop(
    columns=['median_house_value']).copy(), data['median_house_value'].copy()

In [5]:
x_train, x_test, y_train, y_test = \
    train_test_split(data, true_labels, random_state=42, test_size=.2,
                     stratify=data['median_income_cat'])

In [6]:
x_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_income_cat
1333,-119.81,34.44,23.0,3172.0,588.0,1467.0,559.0,4.6806,NEAR OCEAN,4
16948,-117.66,34.0,5.0,1387.0,236.0,855.0,270.0,5.411,INLAND,5
20029,-118.52,34.26,21.0,8850.0,2139.0,4717.0,1979.0,3.7816,<1H OCEAN,3
14469,-117.2,33.16,13.0,4503.0,1137.0,3094.0,1091.0,2.3159,<1H OCEAN,1
18062,-121.6,37.9,5.0,14684.0,2252.0,4276.0,1722.0,6.9051,INLAND,5


In [7]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=.1, random_state=None):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.gamma = gamma

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(n_clusters=self.n_clusters,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f'Cluster {i} similarity' for i in range(self.n_clusters)]


def col_ratio(X):
    return X[:, [0]]/X[:, [1]]


def ration_name(function_transformer, feature_names_out):
    return ['ratio']


clusterSimil = ClusterSimilarity(10, 1., random_state=32)
default_pipeline = make_pipeline(
    SimpleImputer(strategy='median'), StandardScaler())
ratio_pipeline = make_pipeline(SimpleImputer(strategy='median'), FunctionTransformer(
    col_ratio, feature_names_out=ration_name), StandardScaler())
log_pipeline = make_pipeline(SimpleImputer(strategy='median'), FunctionTransformer(
    func=np.log, inverse_func=np.exp, feature_names_out='one-to-one'), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(
    strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

In [8]:
preprocessing = ColumnTransformer([
    ('bedrooms', ratio_pipeline, ['total_bedrooms', 'total_rooms']),
    ('rooms_per_house', ratio_pipeline, ['total_rooms', 'households']),
    ('people_per_house', ratio_pipeline, ['population', 'households']),
    ('log', log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ('geo', clusterSimil, ['longitude', 'latitude']),
    ('cat', cat_pipeline, ['ocean_proximity']),
],
    remainder=default_pipeline)

In [9]:
preprocessing

In [10]:
x_train_prepared = pd.DataFrame(preprocessing.fit_transform(
    x_train), columns=preprocessing.get_feature_names_out())
x_test_prepared = pd.DataFrame(preprocessing.fit_transform(
    x_test), columns=preprocessing.get_feature_names_out())

1. Try a support vector machine regressor (sklearn.svm.SVR) with various
   hyperparameters, such as kernel="linear" (with various values for the
   C hyperparameter) or kernel="rbf" (with various values for the C and gamma
   hyperparameters). Note that support vector machines don’t scale well to large
   datasets, so you should probably train your model on just the first 5,000 instances
   of the training set and use only 3-fold cross-validation, or else it will take hours.
   Don’t worry about what the hyperparameters mean for now; we’ll discuss them
   in Chapter 5. How does the best SVR predictor perform?


In [11]:
def model_inference(model):
    model.fit(x_train_prepared, y_train)

    predictions = model.predict(x_train_prepared)
    print('Mean Squared Error Train error: ',
          mean_squared_error(y_train, predictions, squared=False))

    predictions = model.predict(x_test_prepared)
    print('Mean Squared Error Test error: ',
          mean_squared_error(y_test, predictions, squared=False))

    return model

In [12]:
model = SVR(kernel='linear')
m = model_inference(model)

Mean Squared Error Train error:  107777.33362711844
Mean Squared Error Test error:  105947.52123218017


In [13]:
model = SVR(kernel='rbf')
m2 = model_inference(model)

Mean Squared Error Train error:  118341.05465844313
Mean Squared Error Test error:  116416.51766824679


In [14]:
scores = -cross_val_score(SVR(kernel='rbf', C=.6, gamma=.2),
                          x_train_prepared[:10000], y_train[:10000], cv=3, scoring='neg_mean_squared_error')
print(scores)
print(scores.mean())

[1.41684642e+10 1.49283664e+10 1.39530038e+10]
14349944803.936953


In [15]:
scores = -cross_val_score(SVR(kernel='linear', C=.4),
                          x_train_prepared[:10000], y_train[:10000], cv=3, scoring='neg_mean_squared_error')
print(scores)
print(scores.mean())

[1.36720003e+10 1.43448336e+10 1.34366926e+10]
13817842181.777212


In [16]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear']}


search = GridSearchCV(SVR(), param_grid=param_grid,
                      scoring='neg_mean_squared_error', cv=3, n_jobs=-1, return_train_score=True, refit=True, verbose=2)

In [17]:
trained_mode = model_inference(search)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Mean Squared Error Train error:  71471.73823713412
Mean Squared Error Test error:  77509.28074175381


In [18]:
trained_mode.best_estimator_

In [19]:
trained_mode.best_params_

{'C': 1000, 'gamma': 1, 'kernel': 'linear'}

2. Try replacing the GridSearchCV with a RandomizedSearchCV.


In [20]:
from scipy.stats import uniform

param_grid = {'C': uniform(loc=1e-1, scale=1000),
              'gamma': uniform(loc=1e-5, scale=1),
              'kernel': ['rbf', 'linear']
              }

In [21]:
rnsearch = RandomizedSearchCV(SVR(), param_distributions=param_grid,
                              scoring='neg_mean_squared_error', cv=3, n_jobs=-1, return_train_score=True, refit=True, verbose=3, random_state=42)

In [22]:
trained_model_rand = model_inference(rnsearch)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Mean Squared Error Train error:  71504.73652572054
Mean Squared Error Test error:  77382.67893683462


In [23]:
trained_model_rand.best_params_

{'C': 973.8555188414592, 'gamma': 0.23278134043030424, 'kernel': 'linear'}

In [24]:
trained_model_rand.best_estimator_

3. Try adding a SelectFromModel transformer in the preparation pipeline to select
   only the most important attributes.


In [25]:
selector = SelectFromModel(
    estimator=trained_model_rand.best_estimator_, prefit=True)
selector.fit(x_train_prepared, y_train)

In [26]:
selector.threshold_

27190.49119441623

In [28]:
selector.get_support()

array([False, False, False,  True,  True,  True,  True,  True,  True,
       False, False, False, False, False, False,  True,  True, False,
       False,  True, False, False,  True, False, False])

In [27]:
pd.DataFrame(selector.transform(x_train_prepared),
             columns=selector.get_feature_names_out())

Unnamed: 0,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,cat__ocean_proximity_INLAND,cat__ocean_proximity_NEAR OCEAN
0,0.446357,0.576770,0.362511,0.470269,0.636214,6.198221e-02,7.952813e-01,4.591996e-06,0.0,1.0
1,-0.803408,-0.522821,-0.366792,-0.520805,0.944082,7.452873e-01,8.795328e-03,3.685342e-13,1.0,0.0
2,2.214255,1.940666,1.940276,2.191950,0.183396,8.437081e-01,1.895552e-01,1.340835e-09,0.0,0.0
3,1.349114,1.042519,1.370599,1.380957,-0.857673,1.816514e-01,1.269483e-04,1.263599e-17,0.0,0.0
4,2.284732,2.613725,1.807680,2.002506,1.461755,2.353248e-12,2.540275e-06,5.876968e-01,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
16507,0.030612,0.723802,1.207443,1.193922,-0.652895,9.740416e-01,8.073959e-02,6.728963e-11,0.0,0.0
16508,0.369749,0.424181,0.525402,0.276882,-0.647075,4.171061e-14,8.094432e-08,1.530724e-01,1.0,0.0
16509,-1.228542,-1.118571,-0.856540,-1.224919,-0.747021,1.754672e-07,2.876151e-03,1.967363e-01,1.0,0.0
16510,-0.252194,0.404768,-0.080286,-0.113969,2.296631,9.807557e-01,5.313866e-02,3.017309e-11,0.0,0.0


In [29]:
pipeline_v2 = make_pipeline(preprocessing,
                            SelectFromModel(estimator=trained_model_rand.best_estimator_, prefit=True))
pipeline_v2

In [30]:
x_train_prepared_v2 = pd.DataFrame(pipeline_v2.fit_transform(
    x_train), columns=pipeline_v2.get_feature_names_out())
x_train_prepared_v2

Unnamed: 0,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,cat__ocean_proximity_INLAND,cat__ocean_proximity_NEAR OCEAN
0,0.446357,0.576770,0.362511,0.470269,0.636214,6.198221e-02,7.952813e-01,4.591996e-06,0.0,1.0
1,-0.803408,-0.522821,-0.366792,-0.520805,0.944082,7.452873e-01,8.795328e-03,3.685342e-13,1.0,0.0
2,2.214255,1.940666,1.940276,2.191950,0.183396,8.437081e-01,1.895552e-01,1.340835e-09,0.0,0.0
3,1.349114,1.042519,1.370599,1.380957,-0.857673,1.816514e-01,1.269483e-04,1.263599e-17,0.0,0.0
4,2.284732,2.613725,1.807680,2.002506,1.461755,2.353248e-12,2.540275e-06,5.876968e-01,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
16507,0.030612,0.723802,1.207443,1.193922,-0.652895,9.740416e-01,8.073959e-02,6.728963e-11,0.0,0.0
16508,0.369749,0.424181,0.525402,0.276882,-0.647075,4.171061e-14,8.094432e-08,1.530724e-01,1.0,0.0
16509,-1.228542,-1.118571,-0.856540,-1.224919,-0.747021,1.754672e-07,2.876151e-03,1.967363e-01,1.0,0.0
16510,-0.252194,0.404768,-0.080286,-0.113969,2.296631,9.807557e-01,5.313866e-02,3.017309e-11,0.0,0.0


---

4. Try creating a custom transformer that trains a k-nearest neighbors regressor
   (sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs
   the model’s predictions in its transform() method. Then add this feature to
   the preprocessing pipeline, using latitude and longitude as the inputs to this
   transformer. This will add a feature in the model that corresponds to the housing
   median price of the nearest districts


In [140]:
from sklearn.neighbors import KNeighborsRegressor


class RegressorKNN(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.yy = y_train

    def fit(self, X, y=None):
        self.regressor = KNeighborsRegressor()
        self.regressor.fit(X, self.yy)
        return self

    def transform(self, X):
        preds = np.array(self.regressor.predict(X))
        print(preds.shape)
        # preds.reshape
        return pd.DataFrame(self.regressor.predict(X))

    def get_feature_names_out(self, names=None):
        return 'nearest_dist'

In [141]:
preprocessing

In [142]:
preprocessing_v3 = ColumnTransformer([
    ('bedrooms', ratio_pipeline, ['total_bedrooms', 'total_rooms']),
    ('rooms_per_house', ratio_pipeline, ['total_rooms', 'households']),
    ('people_per_house', ratio_pipeline, ['population', 'households']),
    ('log', log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ('geo', clusterSimil, ['longitude', 'latitude']),
    ('cat', cat_pipeline, ['ocean_proximity']),

    ('long and lut', RegressorKNN(), ['longitude', 'latitude'])
],
    remainder=default_pipeline)

In [143]:
preprocessing_v3

In [144]:
x_train_prepared_v3 = pd.DataFrame(preprocessing_v3.fit_transform(
    x_train), columns=preprocessing_v3.get_feature_names_out())
x_train_prepared_v3

(16512,)


ValueError: Shape of passed values is (16512, 26), indices imply (16512, 37)