In [130]:
# hands on machine learning Chap 2. practice
# Robin Lee. 2020. 5. 20.


# load housing data
import os
import pandas as pd
HOUSING_PATH = os.path.join('datasets', 'housing')
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)
housing = load_housing_data()
housing.info()

# split train and test set
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"]<5, 5.0, inplace=True)

splits = StratifiedShuffleSplit(n_splits = 1, test_size=0.2, random_state=42)
for train_index, test_index in splits.split(housing, housing["income_cat"]):
    train_set = housing.loc[train_index]
    test_set = housing.loc[test_index]

for set_ in (train_set, test_set, housing):
    set_.drop("income_cat", axis=1, inplace=True)
housing_train = train_set.drop("median_house_value", axis=1)
housing_train_label = train_set["median_house_value"].copy()
housing_train.info()
housing_test = test_set.drop("median_house_value", axis=1)
housing_test_label = test_set["median_house_value"].copy()

# preprocess data - number
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
imputer = SimpleImputer(strategy = "median")
housing_train_num = housing_train.drop("ocean_proximity", axis=1)
imputer.fit(housing_train_num)

# preprocess data - category
from sklearn.preprocessing import OneHotEncoder
housing_train_cat = housing_train[["ocean_proximity"]]

# custom transformer
from sklearn.base import BaseEstimator, TransformerMixin
ROOM_IDX, BEDROOM_IDX, POPULATION_IDX, HOUSEHOLDS_IDX = 3,4,5,6
class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, ROOM_IDX] / X[:, HOUSEHOLDS_IDX]
        population_per_household = X[:, POPULATION_IDX] / X[:, HOUSEHOLDS_IDX]
        bedrooms_per_room = X[:, BEDROOM_IDX] / X[:, ROOM_IDX]
        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

# Pandas dataframe selector
class DataframeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributes):
        self.attribute_names = attributes
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values
num_attribs = list(housing_train_num)
cat_attribs = ["ocean_proximity"]

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.compose import ColumnTransformer
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")),
    ('attrib_adder', CombinedAttributeAdder()),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('selector', DataframeSelector(cat_attribs)),
    ('encoder', OneHotEncoder(sparse=False))
])

'''
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])
'''
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(categories='auto'), cat_attribs)
])

train_prepared = full_pipeline.fit_transform(housing_train)
print(train_prepared[:10])
print("train_prepared.shape:",train_prepared.shape)
print("preprocess done...")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  

In [66]:
### 1. use support vector machine
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
# svr = SVR(kernel="linear", C = 1000.0)      # 70256.10469475007
svr = SVR(kernel='rbf', C=1000.0, gamma='scale')  # 69578.23550110968
svr.fit(train_prepared, housing_train_label)
svr_predictions = svr.predict(train_prepared)
svr_mse = mean_squared_error(svr_predictions, housing_train_label)
svr_rmse = np.sqrt(svr_mse)
print("svr rmse in train set:",svr_rmse)

svr rmse in train set: 69578.23550110968


In [72]:
### 2. GridSearchCV to RadomizedSearchCV
# Grid search
from sklearn.model_selection import GridSearchCV
grid_params = [
    {'kernel':['linear'], 'C':[10000, 20000,30000]},
    #{'kernel':['rbf'], 'C':[1000, 10000, 30000], 'gamma':['scale', 'auto']}
]

svr_model = SVR()
grid_search_svr = GridSearchCV(svr_model, grid_params, cv=5, scoring="neg_mean_squared_error", return_train_score=True)
grid_search_svr.fit(train_prepared, housing_train_label)
print(grid_search_svr.best_params_) # {'C': 10000, 'gamma': 'scale', 'kernel': 'rbf'}

import joblib
joblib.dump(grid_search_svr, "my_grid_svr.pkl")


{'C': 30000, 'kernel': 'linear'}


['my_grid_svr.pkl']

In [37]:
# Randomized Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
random_params = {'C':uniform(loc=0, scale=40000)}
random_search_svr = RandomizedSearchCV(svr_model, random_params, scoring="neg_mean_squared_error")
random_search_svr.fit(train_prepared, housing_train_label)
random_search_svr.best_params_ # {'C': 9873.86118942861, 'gamma': 'scale'}

{'C': 9873.86118942861}

In [63]:
### 3. most important attribute selector

# use corr as importance 
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])
class MostImportantAttributeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, corr_matrix, k):
        self.corr_matrix = corr_matrix
        self.k = k
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # sort column by corr
        corrr = abs(self.corr_matrix["median_house_value"])
        corrr.drop('median_house_value', inplace = True)
        
        # select top k attribs
        attribs = indices_of_top_k(corrr, self.k)
        print(attribs)
        return X[:,attribs]

corr_mat = housing.corr()
cm = abs(corr_mat['median_house_value'])
cm.drop('median_house_value', inplace=True)
print(cm.sort_values(ascending=False))
#print(housing_train.columns[indices_of_top_k(cm, 5)])
selector = MostImportantAttributeSelector(corr_mat, 5)

full_pipe = Pipeline([
    ('full_pipeline', full_pipeline),
    ('attrib_selector', MostImportantAttributeSelector(corr_mat, 5))
])

housing_train_prep = full_pipe.fit_transform(housing_train)
print(housing_train_prep[:10])
print(train_prepared[:10,indices_of_top_k(cm, 5)])


median_income         0.688075
latitude              0.144160
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
longitude             0.045967
population            0.024650
Name: median_house_value, dtype: float64
Index(['latitude', 'housing_median_age', 'total_rooms', 'households',
       'median_income'],
      dtype='object')
[1 2 3 6 7]
[[ 0.77194962  0.74333089 -0.49323393 -0.42069842 -0.61493744]
 [ 0.6596948  -1.1653172  -0.90896655 -1.02222705  1.33645936]
 [-1.34218285  0.18664186 -0.31365989 -0.0933178  -0.5320456 ]
 [ 0.31357576 -0.29052016 -0.36276217 -0.38343559 -1.04556555]
 [-0.65929936 -0.92673619  1.85619316  2.57097492 -0.44143679]
 [ 0.94500913 -0.37004716  0.14369276  0.19413836 -0.17643487]
 [-0.74816776  1.85670895 -0.18225284 -0.58571955  2.36670154]
 [-0.69671763 -0.21099316 -0.27203986 -0.3248797   1.11523946]
 [-0.77155418  1.45907393 -0.35434463 -0.15719694 -1.07690591]
 [-0.75752232  

In [143]:
### 3. most important attribute selector (SOLUTION: assume feature importances are given...)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
forest_reg = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs, scoring="neg_mean_squared_error", n_iter=5)
random_search.fit(housing_train_prep, housing_train_label)

feature_importances = random_search.best_estimator_.feature_importances_


NameError: name 'grid_search' is not defined

In [144]:
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_important, k):
        self.feature_importances = feature_important
        self.k = k
    def fit(self, X, y=None):
        self.indices = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X, y=None):
        return X[:,self.indices]

preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, 5))
])

housing_train_feature = preparation_and_feature_selection_pipeline.fit_transform(housing_train)
housing_train_feature[:10]

array([[-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821],
       [-1.17602483,  0.6596948 , -1.1653172 , -0.90896655, -1.0369278 ],
       [ 1.18684903, -1.34218285,  0.18664186, -0.31365989, -0.15334458],
       [-0.01706767,  0.31357576, -0.29052016, -0.36276217, -0.39675594],
       [ 0.49247384, -0.65929936, -0.92673619,  1.85619316,  2.41221109],
       [-0.69645635,  0.94500913, -0.37004716,  0.14369276,  0.1314467 ],
       [ 0.53743338, -0.74816776,  1.85670895, -0.18225284, -0.52819807],
       [ 1.16686701, -0.69671763, -0.21099316, -0.27203986, -0.58661679],
       [ 0.6573255 , -0.77155418,  1.45907393, -0.35434463, -0.16794926],
       [ 0.64733449, -0.75752232,  0.02758786, -0.98565964, -0.74240006]])

In [134]:
### 4. data prepare to final predict pipeline

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_train_prep, housing_train_label)
housing_test_prep = full_pipeline.transform(housing_test)
final_predict = lin_reg.predict(housing_train_prep)

prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('svm_reg', SVR(C= 30000, kernel='linear'))
])

prepare_select_and_predict_pipeline.fit(housing_train, housing_train_label)
res = prepare_select_and_predict_pipeline.predict(housing_test)

print(res)

[418145.34175248 245692.55318833 206252.25750686 ... 278543.34923859
 181863.70281433 164635.42000976]


In [138]:
### 5. use GridSearchCV to find option in prepare stage

param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent']
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search_prep.fit(housing_train, housing_train_label)

grid_search_prep.best_params_  # {'preparation__num__imputer__strategy': 'most_frequent'}


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:  2.0min finished


{'preparation__num__imputer__strategy': 'most_frequent'}