In [1]:
# 1. 서포트 벡터 머신 회귀(sklearn.svm.SVR)를 kernel="linear"나 kernel="rbf"등의 다양한 하이퍼파라미터 설정으로 시도해보세요. 최상의 SVR 모델은 ?
# 데이터셋 => num 데이터는 num끼리 text 데이터는 one hot (ocean_proximity)->corr가 낮은 데이터는 제거
# 추가 특성 조합
# rooms_per_household = rooms/household
# population_per_household = population/household
# bedrooms_per_room = bedrooms/rooms

"""
1. 데이터셋 다운로드 및 테스트 분리
2. 데이터셋 조합기 생성
3. 데이터셋 파이프라인 생성
4. num, cat 파이프라인 merge 생성
5. gridSearch 조합
6. 학습 및 비교
7. 그래프
"""

import urllib.request
import os
import tarfile

# Where the dataset lives remotely, and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url, housing_path):
    """Download the housing archive and unpack it into ``housing_path``.

    The archive is saved as ``housing.tgz`` inside ``housing_path`` (the
    directory is created when missing) and then extracted in place. The
    download is repeated on every call, overwriting any earlier copy.

    Parameters
    ----------
    housing_url : str
        URL of the tar archive to download.
    housing_path : str
        Directory that receives both the archive and its extracted contents.

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be retrieved.
    tarfile.TarError
        If the downloaded file cannot be read or safely extracted.
    """
    # exist_ok replaces the separate isdir() check and its check-then-create race.
    os.makedirs(housing_path, exist_ok=True)

    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: reject members that would
            # land outside housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter='data')
        else:
            # Python builds without extraction filters: plain extraction.
            housing_tgz.extractall(path=housing_path)

In [2]:
# Download and unpack the dataset into HOUSING_PATH; the download is performed each time this cell runs.
fetch_housing_data(HOUSING_URL, HOUSING_PATH)

In [3]:
import pandas as pd

def load_housing_data(housing_path):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [4]:
import numpy as np

# Seed NumPy's global RNG so later NumPy-based randomness is repeatable.
np.random.seed(42)

# Load the raw table and bucket median_income into five ordered categories
# (labels 1-5); the stratified split in a later cell is keyed on this column.
housing = load_housing_data(HOUSING_PATH).assign(
    income_cat=lambda frame: pd.cut(
        frame["median_income"],
        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
        labels=list(range(1, 6)),
    )
)

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split keyed on income_cat.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# NOTE(review): the split yields row positions while .loc selects by label;
# this relies on `housing` still having its default 0..n-1 index - confirm.
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

In [6]:
# Separate the predictors from the target column for the training set.
housing = strat_train_set.drop(columns="median_house_value")
housing_labels = strat_train_set["median_house_value"].copy()

# Non-null count per column: total_bedrooms comes up short, i.e. it has missing values.
housing.count()

longitude             16512
latitude              16512
housing_median_age    16512
total_rooms           16512
total_bedrooms        16354
population            16512
households            16512
median_income         16512
ocean_proximity       16512
income_cat            16512
dtype: int64

In [7]:
# Split the predictors by type: the single text column vs. every other column.
housing_cat = housing.loc[:, ['ocean_proximity']].copy()
housing_num = housing.drop(columns='ocean_proximity')

# NOTE(review): housing_num still carries income_cat (the helper column added
# for the stratified split), so it is treated as a numeric feature downstream
# - confirm this is intended.
housing_cat

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
...,...
15174,<1H OCEAN
12661,INLAND
19263,<1H OCEAN
19140,<1H OCEAN


In [8]:
# Preview the first rows of the numeric feature frame.
housing_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,2
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,5
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,2
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,2
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,3


In [9]:
# rooms_per_household = rooms/household
# population_per_household = population/household
# bedrooms_per_room = bedrooms/rooms
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MakeAttribute(BaseEstimator, TransformerMixin):
    """Append three ratio features to a numeric feature matrix.

    The added columns, in order, are ``rooms_per_household``
    (total_rooms / households), ``population_per_household``
    (population / households) and ``bedrooms_per_room``
    (total_bedrooms / total_rooms).

    Parameters
    ----------
    indexes : sequence of str
        Column names of ``X`` in the order the columns appear. Used to find
        the positions of ``total_rooms``, ``households``, ``population`` and
        ``total_bedrooms``.
    """

    def __init__(self, indexes):
        # Stored as given (no copy or validation here) so that
        # scikit-learn's get_params()/clone() see the constructor argument.
        self.indexes = indexes

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the three ratio columns appended on the right.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix whose column order matches ``self.indexes``.
            DataFrames and nested sequences are converted to an ndarray.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features + 3)

        Raises
        ------
        ValueError
            If one of the four required column names is not in ``indexes``.

        Notes
        -----
        Zero denominators are not guarded against; NumPy's division rules
        apply.
        """
        # Positional slicing below needs an ndarray; a DataFrame (e.g. from an
        # upstream step producing pandas output) would not support X[:, i].
        X = np.asarray(X)
        # list() lets `indexes` be any sequence (tuple, pandas Index, ...).
        names = list(self.indexes)

        # Look each source column up once instead of once per use.
        rooms = X[:, names.index('total_rooms')]
        households = X[:, names.index('households')]
        population = X[:, names.index('population')]
        bedrooms = X[:, names.index('total_bedrooms')]

        rooms_per_household = rooms / households
        population_per_household = population / households
        bedrooms_per_room = bedrooms / rooms

        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups, taken from the numeric / categorical frames built earlier.
num_attr = list(housing_num)
cat_attr = list(housing_cat)

# Numeric branch: fill missing values with the column median, append the
# ratio features, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # MakeAttribute finds columns by name, so it gets the same ordered list
    # that the ColumnTransformer uses to select the numeric columns.
    ('attribs_adder', MakeAttribute(num_attr)),
    ('standard_scaler', StandardScaler()),
])

# Route numeric columns through num_pipeline and one-hot encode the text column.
full_pipeline = ColumnTransformer(transformers=[
    ('num_pipe', num_pipeline, num_attr),
    ('cat_pipe', OneHotEncoder(), cat_attr),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])