In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

In [2]:
# Prepare the data: read the California housing CSV and display the resulting frame
housing = pd.read_csv('data-files/california-housing.csv')
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [3]:
# Summarize column dtypes and non-null counts
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [10]:
# Split the full frame once (80/20, fixed seed) ...
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# ... and derive the feature/target views from that same split, so that
# train_X/train_y always describe exactly the rows of train_set (and likewise
# for the test side) without relying on a second shuffle sharing the seed.
train_X = train_set.drop("median_house_value", axis=1)
train_y = train_set["median_house_value"]
test_X = test_set.drop("median_house_value", axis=1)
test_y = test_set["median_house_value"]

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin

class HousingAttributesAdder(BaseEstimator, TransformerMixin):
    """Append three ratio features to a 2-D housing feature matrix.

    The added columns are, in order:

    1. rooms per household        (rooms / households)
    2. bedrooms per room          (bedrooms / rooms)
    3. population per household   (population / households)

    Parameters
    ----------
    rooms_ix, bedrooms_ix, population_ix, households_ix : int
        Positional column indices of the source columns inside ``X``.
        The defaults (3, 4, 5, 6) correspond to ``total_rooms``,
        ``total_bedrooms``, ``population`` and ``households`` in the housing
        frame loaded in this notebook.
    """

    def __init__(self, rooms_ix=3, bedrooms_ix=4, population_ix=5, households_ix=6):
        # Constructor arguments are stored unchanged under the same names.
        self.rooms_ix = rooms_ix
        self.bedrooms_ix = bedrooms_ix
        self.population_ix = population_ix
        self.households_ix = households_ix

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the three ratio columns appended on the right.

        ``X`` may be any 2-D array-like; it is converted with ``np.asarray``
        so that positional ``[:, i]`` indexing is valid. No zero-division
        guard is applied: a zero denominator follows NumPy's usual rules.
        """
        X = np.asarray(X)

        rooms = X[:, self.rooms_ix]
        households = X[:, self.households_ix]

        rooms_per_household = rooms / households
        bedrooms_per_rooms = X[:, self.bedrooms_ix] / rooms
        population_per_household = X[:, self.population_ix] / households

        return np.c_[X, rooms_per_household, bedrooms_per_rooms, population_per_household]
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pick a subset of columns from a DataFrame and pass them on as an array.

    ``column_names`` is used as-is to index the incoming frame
    (``X[column_names]``).
    """

    def __init__(self, column_names):
        # Keep the selection exactly as given; it is applied in transform().
        self.column_names = column_names

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the ``.values`` of the selected columns of ``X``."""
        selected = X[self.column_names]
        return selected.values

In [15]:
# Transformer smoke test: run the attribute adder on the raw training values
attr_adder = HousingAttributesAdder()
transformed = attr_adder.fit(train_set.values).transform(train_set.values)

# Last four columns of the first three rows: the original final column
# followed by the three appended ratio columns
transformed[:3, -4:]

array([['NEAR OCEAN', 5.017656500802568, 0.20057581573896352,
        3.691813804173355],
       ['NEAR OCEAN', 4.473544973544974, 0.23270254287403902,
        1.7380952380952381],
       ['NEAR OCEAN', 5.645833333333333, 0.17448603057459147,
        2.7232142857142856]], dtype=object)

In [37]:
# Transformer test 2: mean-impute -> add ratio features -> standardize,
# applied to every column except the categorical `ocean_proximity`
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("attr_adder", HousingAttributesAdder()),
        ("standard_scaler", StandardScaler()),
    ]
)

# Fit on the training split only; the test split reuses the fitted statistics.
transformed_train = pipeline.fit_transform(train_set.drop(columns="ocean_proximity").values)
transformed_test = pipeline.transform(test_set.drop(columns="ocean_proximity").values)

transform is called
transform is called


In [29]:
# Sanity checks on the transformed training matrix:
# number of NaNs left after imputation
print(np.isnan(transformed_train).sum())
# the three appended ratio columns for the first three rows
print(transformed_train[:3, -3:])
# per-column mean and standard deviation after standardization
print(np.mean(transformed_train, axis=0), np.std(transformed_train, axis=0))


0
[[-0.17491646 -0.2117846   0.05137609]
 [-0.40283542  0.34218528 -0.11736222]
 [ 0.08821601 -0.66165785 -0.03227969]]
[ 1.75333477e-15  6.40099515e-17 -9.25185854e-18  3.37800416e-17
 -4.59365534e-17 -2.15159501e-19 -5.42201942e-17 -6.51933288e-17
 -6.19659363e-17 -1.98108110e-16 -5.61566297e-17  4.93656580e-17] [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [32]:
# Choose the column groups by dtype rather than by position, so the split does
# not depend on the categorical column happening to be the last one.
# NOTE: numeric_columns still contains the target `median_house_value`; it
# travels through the numeric pipeline and is separated again after the
# FeatureUnion step further down.
numeric_columns = train_set.select_dtypes(include="number").columns
category_columns = train_set.select_dtypes(exclude="number").columns

# Numeric branch: select -> mean-impute -> add ratio features -> standardize
pipeline1 = Pipeline([('column_selector', ColumnSelector(numeric_columns)),
                      ('imputer', SimpleImputer(strategy="mean")),
                      ("attr_adder", HousingAttributesAdder()),
                      ("standard_scaler", StandardScaler())])

# Categorical branch: select -> one-hot encode
pipeline2 = make_pipeline(ColumnSelector(category_columns), OneHotEncoder())

In [42]:
# Fit both sub-pipelines on the training set. Only the last expression of a
# cell is displayed, so the numeric result is discarded and the one-hot
# encoded categories are shown as a dense array.
pipeline1.fit_transform(train_set)
pipeline2.fit_transform(train_set).toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [43]:
# Count training rows per ocean_proximity category
train_set["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     7341
INLAND        5227
NEAR OCEAN    2086
NEAR BAY      1854
ISLAND           4
Name: count, dtype: int64

In [45]:
# Place the numeric and categorical pipeline outputs side by side
full_pipeline = FeatureUnion(
    transformer_list=[
        ("numeric_pipeline", pipeline1),
        ("category_pipeline", pipeline2),
    ]
)

# Fit on the training split only, then apply the fitted union to both splits
train_transformed = full_pipeline.fit(train_set).transform(train_set)
test_transformed = full_pipeline.transform(test_set)

In [47]:
# Compare the training set's shape before and after the full pipeline
train_set.shape, train_transformed.shape

((16512, 10), (16512, 17))

In [51]:
# The FeatureUnion output is sparse; densify it into DataFrames whose columns
# are labelled 0..n-1.
train_df = pd.DataFrame(train_transformed.toarray())
test_df = pd.DataFrame(test_transformed.toarray())

# The numeric branch comes first in the union and keeps the order of
# numeric_columns, so the target's position in that list is also its column
# label here. Deriving it by name avoids a hard-coded magic index.
target_ix = list(numeric_columns).index("median_house_value")

# NOTE: the target went through the numeric pipeline too, so y is the
# imputed + standardized house value, not the raw dollar amount.
X_train = train_df.drop(columns=target_ix)
X_test = test_df.drop(columns=target_ix)
y_train = train_df[target_ix]
y_test = test_df[target_ix]

In [53]:
from sklearn.ensemble import GradientBoostingRegressor

# Train a gradient-boosting regressor with a fixed seed
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# R^2 on the training split and on the held-out test split, in that order
tuple(gbr.score(features, target)
      for features, target in ((X_train, y_train), (X_test, y_test)))

(0.810810666517977, 0.7814505541672989)