This notebook covers the second chapter. The exercises themselves can be found on page 127.

In [1]:
import pandas as pd
import os 
import tarfile 
import urllib
import numpy as np
 
    
# Raw string: the Windows path is full of backslashes, which a normal string
# literal would try to interpret as (invalid) escape sequences.
HOUSING_PATH = r"C:\margo\ML\Geron\git_example\handson-ml2\datasets\housing"
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [2]:
# Load the raw dataset and print a per-column summary (non-null counts, dtypes).
housing = load_housing_data(HOUSING_PATH)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
def split_train_set(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a train set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; must support ``len()`` and ``.iloc``.
    test_ratio : float
        Fraction of the rows, between 0 and 1 inclusive, put in the test set.
        The test set size is ``int(len(data) * test_ratio)``.
    random_state : int, optional
        Seed for the shuffle. When given, the split is reproducible; when
        omitted, NumPy's global random state is used, so every call yields
        a different split.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A private generator keeps the global NumPy random state untouched.
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [4]:
# Hold out 20% of the rows as a purely random test set.
train_set, test_set = split_train_set(housing, 0.2)

In [5]:
# Report how many rows ended up in each part of the random split.
print(f"train set lenght is:{len(train_set)};\ntest set lenght is: {len(test_set)}")

train set lenght is:16512;
test set lenght is: 4128


In [6]:
from sklearn.model_selection import StratifiedShuffleSplit 

# Bucket the median income into 7 categories so the split can be stratified
# on income.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6.0, 7.5, 9.0, np.inf],
                               labels=[1, 2, 3, 4, 5, 6, 7])

# A single stratified 80/20 split, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# The splitter yields positional indices, hence .iloc. The helper column is
# removed with a non-mutating drop, which returns new frames instead of
# modifying slices of `housing` in place (avoids SettingWithCopyWarning).
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

In [7]:
# Work on a copy so the stratified training set itself stays untouched.
housing = strat_train_set.copy()

Let’s create new attributes that will be a bit more informative for dataset analysis.

In [8]:
# Ratio features added to the exploration copy only: `housing` is rebuilt from
# strat_train_set in the next cell, so these columns do not carry over.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"] 
housing["population_per_household"]=housing["population"] / housing["households"]

In [9]:
# Separate the predictors from the target (median_house_value).
housing = strat_train_set.drop("median_house_value", axis=1) 
housing_labels = strat_train_set["median_house_value"].copy()

In [10]:
from sklearn.impute import SimpleImputer 

# Predictors without the ocean_proximity column (the only object-dtype column
# in the housing.info() listing).
housing_num = housing.drop("ocean_proximity", axis=1)
# Standalone imputer kept for reference; num_pipeline below creates its own.
#imputer = SimpleImputer(strategy="median")
#imputer.fit(housing_num)

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin 
 
# Positional indices of total_rooms, total_bedrooms, population and households
# in housing_num; CombinedAttributesAdder uses them to index plain arrays.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 

In [12]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    The transformer is stateless: ``fit`` learns nothing. ``transform`` appends
    rooms-per-household and population-per-household and, when
    ``add_bedrooms_per_room`` is true, bedrooms-per-room. Columns are located
    through the module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix``
    and ``households_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the extra ratio columns appended on the right."""
        # Accept DataFrames as well as arrays: the positional indexing below
        # only works on array-likes converted to ndarray.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]


We have just created a transformer that will help us prepare the data for training. As the last preparation step, we need to create a Pipeline.

In [13]:
#attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
#housing_extra_attribs = attr_adder.transform(housing.values)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
 
# Numeric preprocessing chain: fill missing values with the column median,
# append the ratio features, then standardize every column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [15]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder 
    
# Column names routed to each branch of the preprocessing.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline, the categorical one is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [16]:
from sklearn.linear_model import LinearRegression 
 
# Baseline model: plain linear regression on the fully prepared training data.
lin_reg = LinearRegression() 
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [17]:
# Sanity check: run the first five training rows through the already fitted
# pipeline (transform only, no refit) and print the model's predictions.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data) 
print("Predictions:", lin_reg.predict(some_data_prepared)) 

Predictions: [ 85567.3451932  471965.3878167  151916.14768674 186477.93483214
 244358.19381986]


In [None]:
The cells above were taken from the book itself; now let's take a look at the exercises.

### 1
Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform? 

In [18]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


def SvrBuildUp(housing_prepared, housing_labels, kernel_type, C_param, gamma_param='auto', epsilon_param=.1):
    """Fit an SVR with the given hyperparameters and return its RMSE.

    The error is measured on the same data the model was fitted on, so the
    returned value is a training RMSE, not a validation score.
    """
    model = SVR(kernel=kernel_type, C=C_param, gamma=gamma_param, epsilon=epsilon_param)
    model.fit(housing_prepared, housing_labels)
    squared_error = mean_squared_error(housing_labels, model.predict(housing_prepared))
    return np.sqrt(squared_error)

# Linear kernel, C=100: training RMSE.
svr_lin_rmse = SvrBuildUp(housing_prepared, housing_labels, 'linear', 100, 'auto')
print(svr_lin_rmse)

71237.10538307506


In [19]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Reference point: SVR with all default hyperparameters, scored on the
# training data it was fitted on.
svr = SVR()
svr.fit(housing_prepared, housing_labels)
svr_housing_predictions = svr.predict(housing_prepared)
svr_mse = mean_squared_error(housing_labels, svr_housing_predictions) 
svr_rmse = np.sqrt(svr_mse) 
svr_rmse

118357.29776637343

In [20]:
# RBF kernel with C=100, gamma=0.1, epsilon=0.3.
nsvr_rbf_rmse = SvrBuildUp(housing_prepared, housing_labels, 'rbf', 100, 0.1, .3)
nsvr_rbf_rmse

95429.8862063893

In [21]:
# Same configuration as svr_lin_rmse above (linear kernel, C=100).
nsvr_lin_rmse = SvrBuildUp(housing_prepared, housing_labels, 'linear', 100, 'auto')
nsvr_lin_rmse

71237.10538307506

In [22]:
# Polynomial kernel built by hand because SvrBuildUp exposes neither `degree`
# nor `coef0`. Scored on the training data, like the other experiments.
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)
svr_poly.fit(housing_prepared, housing_labels)
svr_poly_housing_predictions = svr_poly.predict(housing_prepared)
svr_poly_mse = mean_squared_error(housing_labels, svr_poly_housing_predictions) 
svr_poly_rmse = np.sqrt(svr_poly_mse) 
svr_poly_rmse

75031.18355631913

In [23]:
# Polynomial kernel with C=10; degree and coef0 are left at SVR's defaults.
nsvr_poly_rmse = SvrBuildUp(housing_prepared, housing_labels, 'poly', 10, 'auto')
nsvr_poly_rmse

116938.01912668717

In [24]:
# Linear kernel with a smaller C (10).
nnsvr_lin_rmse = SvrBuildUp(housing_prepared, housing_labels, 'linear', 10, 'auto')
nnsvr_lin_rmse

81913.70157811412

In [25]:
# Linear kernel with a larger C (900).
# NOTE: this rebinds nsvr_lin_rmse, replacing the C=100 result stored earlier.
nsvr_lin_rmse = SvrBuildUp(housing_prepared, housing_labels, 'linear', 900, 'auto')
nsvr_lin_rmse

70278.49017554823

 2. Try replacing GridSearchCV with RandomizedSearchCV.

In [26]:
# from sklearn.model_selection import GridSearchCV 
 
# param_grid = [     {'C': [10, 100, 500], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
#               { 'C': [80, 100, 130], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma':['scale', 'auto'], 
#                'degree':[1, 2, 4, 8, 10], 'epsilon':[0.01, 0.05 ,.1, .2, .5, .8,]}, ]
#                #'coef0' :[0, 0.5, 1, 2]},   ]


# svr_search = SVR()
 
# grid_search = GridSearchCV(svr_search, param_grid, cv=5,
#                            scoring='neg_mean_squared_error',
#                            return_train_score=True) 

# #SVR().get_params().keys()
# grid_search.fit(housing_prepared, housing_labels)

In [27]:
from sklearn.model_selection import GridSearchCV 
 
# Exhaustive search over 3 values of C x 3 kernels, each scored with 5-fold
# cross-validation (coef0 is intentionally left out of the grid).
param_grid = {
    'C': [100, 500, 1000],
    'kernel': ['linear', 'rbf', 'sigmoid'],
}

svr_search = SVR()

grid_search = GridSearchCV(
    estimator=svr_search,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    refit=True,
)

grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [100, 500, 1000],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             return_train_score=True, scoring='neg_mean_squared_error')

In [29]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Randomized counterpart of the grid search: C is sampled from
# randint(10, 200) and the kernel is picked from four candidates.
param_rand = {
    'C': randint(10, 200),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

svr_rand = SVR()

# 10 sampled candidates x 5 folds; the seed fixes which candidates get drawn.
rand_search = RandomizedSearchCV(
    estimator=svr_rand,
    param_distributions=param_rand,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
    random_state=28,
    return_train_score=True,
)

rand_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] C=11, kernel=poly ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................ C=11, kernel=poly, total=   6.6s
[CV] C=11, kernel=poly ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.3s remaining:    0.0s


[CV] ................................ C=11, kernel=poly, total=   6.2s
[CV] C=11, kernel=poly ...............................................
[CV] ................................ C=11, kernel=poly, total=   6.5s
[CV] C=11, kernel=poly ...............................................
[CV] ................................ C=11, kernel=poly, total=   6.4s
[CV] C=11, kernel=poly ...............................................
[CV] ................................ C=11, kernel=poly, total=   6.4s
[CV] C=15, kernel=rbf ................................................
[CV] ................................. C=15, kernel=rbf, total=   9.1s
[CV] C=15, kernel=rbf ................................................
[CV] ................................. C=15, kernel=rbf, total=   9.0s
[CV] C=15, kernel=rbf ................................................
[CV] ................................. C=15, kernel=rbf, total=   9.0s
[CV] C=15, kernel=rbf ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  9.9min finished


RandomizedSearchCV(cv=5, estimator=SVR(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016A84A6EBB0>,
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   random_state=28, return_train_score=True,
                   scoring='neg_mean_squared_error', verbose=2)

3. Try adding a transformer in the preparation pipeline to select only the
most important attributes.

In [30]:
from sklearn.ensemble import RandomForestRegressor

def GetFeatureImportancesRandomForest():
    """Estimate feature importances with a grid-searched random forest.

    Refits the global ``full_pipeline`` on the global ``housing`` frame, tunes
    a ``RandomForestRegressor`` against ``housing_labels`` with 5-fold
    cross-validation, prints the importances of the best forest and returns
    them.
    """
    prepared = full_pipeline.fit_transform(housing)

    search_space = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]
    search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        return_train_score=True,
    )
    search.fit(prepared, housing_labels)

    importances = search.best_estimator_.feature_importances_
    print(importances)
    return importances

If we don't already have the feature importances, we compute them with a Random Forest Regressor.

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return, in ascending order, the positions of the ``k`` largest values of ``arr``."""
    values = np.array(arr)
    # argpartition puts the k largest entries in the last k slots (unordered).
    top_k_unsorted = np.argpartition(values, -k)[-k:]
    return np.sort(top_k_unsorted)

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the highest feature importance.

    Parameters
    ----------
    k : int
        Number of columns to keep.
    feature_importances : array-like, optional
        One importance score per input column. When ``None``, the scores are
        computed during ``fit`` by ``GetFeatureImportancesRandomForest()``.

    Attributes
    ----------
    feature_importances_ : numpy.ndarray
        Importance scores actually used by the last ``fit``.
    feature_indices_ : numpy.ndarray
        Sorted column indices selected by the last ``fit``.
    """

    def __init__(self, k, feature_importances=None):
        # Only store the arguments: scikit-learn's clone()/get_params() expect
        # __init__ to do no work, and a truthiness test on a NumPy array of
        # importances raises ValueError.
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Resolve the importance scores and remember the top-``k`` column indices."""
        if self.feature_importances is None:
            importances = GetFeatureImportancesRandomForest()
        else:
            importances = self.feature_importances
        self.feature_importances_ = np.asarray(importances)
        self.feature_indices_ = indices_of_top_k(self.feature_importances_, self.k)
        return self

    def transform(self, X):
        """Return only the selected columns of ``X``."""
        return X[:, self.feature_indices_]

In [32]:
# k = 5
# top_k_feature_indices = indices_of_top_k(feature_importances, k)
# top_k_feature_indices
# prepare_select_and_predict_pipeline.fit(housing, housing_labels)

4. Try creating a single pipeline that does the full data preparation plus
the final prediction.

In [33]:
k = 3

# Preprocessing followed by selection of the k most important features; the
# importances are not supplied, so the selector computes them itself.
preparation_and_feature_selection_pipeline = Pipeline(steps=[
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(k, None)),
])

housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

# Show the first three prepared rows.
housing_prepared_top_k_features[:3]


[7.10605432e-02 6.44287813e-02 4.54178421e-02 1.77443793e-02
 1.64922497e-02 1.70322309e-02 1.57611437e-02 3.29082991e-01
 5.02566854e-02 1.07451736e-01 9.50849568e-02 1.19604716e-02
 1.48724641e-01 8.53589015e-05 5.14727060e-03 4.26871818e-03]


array([[-0.89522651,  0.00614618,  1.        ],
       [ 3.81608801,  0.02702276,  0.        ],
       [-0.52597384, -0.07544594,  1.        ]])

In [35]:
k = 5

# End-to-end pipeline: preprocessing, top-k feature selection, then an SVR
# configured with the best hyperparameters found by grid_search.
prepare_select_and_predict_pipeline = Pipeline(steps=[
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(k)),
    ('svm_reg', SVR(**grid_search.best_params_)),
])

prepare_select_and_predict_pipeline.fit(housing, housing_labels)



[7.10605432e-02 6.44287813e-02 4.54178421e-02 1.77443793e-02
 1.64922497e-02 1.70322309e-02 1.57611437e-02 3.29082991e-01
 5.02566854e-02 1.07451736e-01 9.50849568e-02 1.19604716e-02
 1.48724641e-01 8.53589015e-05 5.14727060e-03 4.26871818e-03]


Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('attribs_adder',
                                                                   CombinedAttributesAdder()),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                    