# 0. Make DataSet

In [1]:
# 1. 서포트 벡터 머신 회귀(sklearn.svm.SVR)를 kernel="linear"나 kernel="rbf"등의 다양한 하이퍼파라미터 설정으로 시도해보세요. 최상의 SVR 모델은 ?
# 데이터셋 => num 데이터는 num끼리 text 데이터는 one hot (ocean_proximity)->corr가 낮은 데이터는 제거
# 추가 특성 조합
# rooms_per_houshold = rooms/household
# population_per_household = population/household
# bedrooms_per_room = bedrooms/rooms

"""
1. 데이터셋 다운로드 및 테스트 분리
2. 데이터셋 조합기 생성
3. 데이터셋 파이프라인 생성
4. num, cat 파이프라인 merge 생성
5. gridSearch 조합
6. 학습 및 비교
7. 그래프
"""

import urllib.request
import os
import tarfile

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/"
HOUSING_PATH=os.path.join('datasets','housing')
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

def fetch_housing_data(housing_url, housing_path):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
        
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [2]:
fetch_housing_data(HOUSING_URL, HOUSING_PATH)

In [3]:
import pandas as pd

def load_housing_data(housing_path):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

In [4]:
import numpy as np

housing = load_housing_data(HOUSING_PATH)

np.random.seed(42)

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [6]:
housing = strat_train_set.drop("median_house_value", axis=1) # 훈련 세트를 위해 레이블 삭제
housing_labels = strat_train_set["median_house_value"].copy()
housing.count() # bedrooms nan 값 발견

longitude             16512
latitude              16512
housing_median_age    16512
total_rooms           16512
total_bedrooms        16354
population            16512
households            16512
median_income         16512
ocean_proximity       16512
income_cat            16512
dtype: int64

In [7]:
housing_cat = housing[['ocean_proximity']].copy()
housing_num = housing.copy()
housing_num.drop('ocean_proximity', axis=1, inplace=True)
housing_cat

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
...,...
15174,<1H OCEAN
12661,INLAND
19263,<1H OCEAN
19140,<1H OCEAN


In [8]:
housing_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,2
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,5
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,2
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,2
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,3


In [9]:
# rooms_per_houshold = rooms/household
# population_per_household = population/household
# bedrooms_per_room = bedrooms/rooms
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MakeAttribute(BaseEstimator, TransformerMixin):
    def __init__(self, indexes):
        self.indexes=indexes
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:,self.indexes.index('total_rooms')] / X[:, self.indexes.index('households')]
        population_per_household = X[:,self.indexes.index('population')] / X[:, self.indexes.index('households')]
        bedrooms_per_room = X[:,self.indexes.index('total_bedrooms')] / X[:, self.indexes.index('total_rooms')]
        
        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', MakeAttribute(list(housing_num))),
    ('standard_scaler', StandardScaler()),
])

num_attr = list(housing_num)
cat_attr = list(housing_cat)

full_pipeline = ColumnTransformer([
    ('num_pipe', num_pipeline, num_attr),
    ('cat_pipe', OneHotEncoder(), cat_attr),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

# 1. Support Vector Machine Regression (Use GridSearchCV)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid =[
    {'kernel':['linear'], 'C':[10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    {'kernel':['rbf'], 'C':[1.0, 3.0, 10., 30., 100., 300., 1000.0],'gamma':[0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}
]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..............................C=10.0, kernel=linear; total time=   7.9s
[CV] END ..............................C=10.0, kernel=linear; total time=   7.7s
[CV] END ..............................C=10.0, kernel=linear; total time=   7.9s
[CV] END ..............................C=10.0, kernel=linear; total time=   8.0s
[CV] END ..............................C=10.0, kernel=linear; total time=   7.9s
[CV] END ..............................C=30.0, kernel=linear; total time=   7.7s
[CV] END ..............................C=30.0, kernel=linear; total time=   7.3s
[CV] END ..............................C=30.0, kernel=linear; total time=   7.3s
[CV] END ..............................C=30.0, kernel=linear; total time=   7.4s
[CV] END ..............................C=30.0, kernel=linear; total time=   7.3s
[CV] END .............................C=100.0, kernel=linear; total time=   7.4s
[CV] END .............................C=100.0, 

GridSearchCV(cv=5, estimator=SVR(),
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error', verbose=2)

In [18]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

70279.30954066859

In [20]:
grid_search.best_params_

{'C': 1000.0, 'kernel': 'linear'}

# 2. GridsearchCV to RandomizedSearchCV

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, expon

param_distribs = {
    'kernel':['linear', 'rbf'], 'C':randint(low=20, high=200000), 'gamma': expon(scale=1.0),
}

svm_reg2 = SVR()
rnd_search = RandomizedSearchCV(svm_reg2, param_distributions=param_distribs, n_iter=50, cv=5, scoring='neg_mean_squared_error', verbose=2, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..C=121978, gamma=1.5923005340557224, kernel=linear; total time=  34.0s
[CV] END ..C=121978, gamma=1.5923005340557224, kernel=linear; total time=  39.0s
[CV] END ..C=121978, gamma=1.5923005340557224, kernel=linear; total time=  31.7s
[CV] END ..C=121978, gamma=1.5923005340557224, kernel=linear; total time=  34.3s
[CV] END ..C=121978, gamma=1.5923005340557224, kernel=linear; total time=  32.8s
[CV] END ..C=119899, gamma=0.9129425537759532, kernel=linear; total time=  32.1s
[CV] END ..C=119899, gamma=0.9129425537759532, kernel=linear; total time=  36.7s
[CV] END ..C=119899, gamma=0.9129425537759532, kernel=linear; total time=  31.8s
[CV] END ..C=119899, gamma=0.9129425537759532, kernel=linear; total time=  35.2s
[CV] END ..C=119899, gamma=0.9129425537759532, kernel=linear; total time=  33.4s
[CV] END .C=137357, gamma=0.16959629191460518, kernel=linear; total time=  39.4s
[CV] END .C=137357, gamma=0.16959629191460518, 

RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=50,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002581DA9A390>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002581DA9AC88>,
                                        'kernel': ['linear', 'rbf']},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [27]:
negative_mse2 = rnd_search.best_score_
rmse2 = np.sqrt(-negative_mse2) 
rmse2

55065.35247199966

In [28]:
rnd_search.best_params_

{'C': 193684, 'gamma': 0.22153944050588595, 'kernel': 'rbf'}

# 3. Make Convertor to Select Best Attribute (Use Pipeline)

In [88]:
from sklearn.base import BaseEstimator, TransformerMixin

class SelectBestAttribute(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def indices_of_top_k(self):
        return np.sort(np.argpartition(np.array(self.feature_importances), -self.k)[-self.k:]) # 관련이 큰 k개의 변수를 배열의 오른쪽에 순서에 상관없이 몰아 놓고 [-k:] 로 k 이상만 슬라이스
    def fit(self, X, y=None):
        self.feature_indices_ = self.indices_of_top_k()
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [39]:
# feature_importances_ 산출을 위한 RandomForest Regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

param_distribs = {
    'n_estimators':randint(low=1, high=200),
    'max_features':randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search2 = RandomizedSearchCV(forest_reg, param_distributions=param_distribs, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search2.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002582EE2A710>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002582EE0FCC0>},
                   random_state=42, scoring='neg_mean_squared_error')

In [42]:
feature_importances = rnd_search2.best_estimator_.feature_importances_
feature_importances

array([6.56863167e-02, 5.97871721e-02, 4.46517157e-02, 1.68844069e-02,
       1.65147541e-02, 1.65530829e-02, 1.61759635e-02, 2.76212740e-01,
       1.41219128e-01, 3.87967033e-02, 1.07825748e-01, 4.94744854e-02,
       9.11577527e-03, 1.33517348e-01, 7.63049962e-05, 3.49438421e-03,
       4.01397065e-03])

In [50]:
k=5

top_k_features_indices = np.sort(np.argpartition(np.array(feature_importances), -k)[-k:])
top_k_features_indices

array([ 0,  7,  8, 10, 13], dtype=int64)

In [97]:
preparation_and_feature_selection_pipeline = Pipeline([
    ('preperation', full_pipeline),
    ('feature_selection', SelectBestAttribute(feature_importances, k))
])
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)
housing_prepared_top_k_features

array([[-0.94135046, -0.8936472 , -0.95468705,  0.00622264,  1.        ],
       [ 1.17178212,  1.292168  ,  1.89007821, -0.04081077,  0.        ],
       [ 0.26758118, -0.52543365, -0.95468705, -0.07537122,  1.        ],
       ...,
       [-1.5707942 , -0.36547546, -0.00643196, -0.03743619,  0.        ],
       [-1.56080303,  0.16826095, -0.00643196, -0.05915604,  0.        ],
       [-1.28105026, -0.390569  , -0.00643196,  0.00657083,  1.        ]])

# 4. Datasets make (Use One Pipeline)

In [98]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', SelectBestAttribute(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search.best_params_))
])

In [99]:
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('attribs_adder',
                                                                   MakeAttribute(indexes=['longitude',
                                                                                          'latitude',
                                                                                          'housing_median_age',
                                                                                          'total_rooms',
                                                                                          'total_bedrooms',
                                                                                          'population',
        

In [100]:
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

Predictions:	 [ 82978.26321069 297653.39290231 100891.99928164 152210.9357997 ]
Labels:		 [72100.0, 279600.0, 82700.0, 112500.0]


# 5. Search Options for Ready (Use GridSearchCV)

In [103]:
full_pipeline.named_transformers_["cat"].handle_unknown = 'ignore'

param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, housing_labels)

KeyError: 'cat'