In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import tarfile
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

print('-------------------')
print('|     lab1         |')
print('-------------------')

# 데이터 다운로드 및 추출
url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
urllib.request.urlretrieve(url, "housing.tgz")

tar = tarfile.open("housing.tgz")
tar.extractall()
tar.close()

# 데이터 로드
housing = pd.read_csv("housing.csv")
print(housing.describe())
print(housing.info())


class CombinedAttributesAdder():

    def __init__(self):
        self.le = LabelEncoder()
        self.sk = StandardScaler()

    def fit_and_transform(self, train_df, test_df):
        train_df, test_df = self._DataHandler(train_df, test_df)
        return train_df, test_df

    def _Encoder(self, train_df, test_df):
        feature = 'ocean_proximity'
        train_df[feature] = self.le.fit_transform(train_df[feature])
        test_df[feature] = self.le.transform(test_df[feature])
        return train_df, test_df

    def _Norm(self, train_df, test_df):
        numeric_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                            'total_bedrooms', 'population', 'households']
        for feature in numeric_features:
            train_df[feature] = self.sk.fit_transform(train_df[[feature]])
            test_df[feature] = self.sk.transform(test_df[[feature]])
        return train_df, test_df

    def _DataHandler(self, train_df: pd.DataFrame, test_df: pd.DataFrame):
        train_df, test_df = self._Encoder(train_df, test_df)
        train_df, test_df = self._Norm(train_df, test_df)
        return train_df, test_df

In [6]:
housing = housing.sample(frac=1).reset_index(drop=True)

train_df = housing.iloc[:18000].copy()
test_df = housing.iloc[18000:].copy()

handler = CombinedAttributesAdder()
train_df, test_df = handler.fit_and_transform(train_df=train_df, test_df=test_df)

print(train_df.head())
print(test_df.head())


-------------------
|     lab1         |
-------------------
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      

In [None]:
train_x = train_df.drop(['median_house_value'], axis=1)
test_x = test_df.drop(['median_house_value'], axis=1)

train_y = train_df['median_house_value']
test_y = test_df['median_house_value']

train_x["income_cat"] = pd.cut(train_df["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
test_x["income_cat"] = pd.cut(test_df["median_income"],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])

train_x = pd.get_dummies(train_x, columns=['income_cat'])
test_x = pd.get_dummies(test_x, columns=['income_cat'])

In [11]:
xgb_reg = xgb.XGBRegressor(random_state=123)
param_grid = {
    'n_estimators': [50, 100, 200],  
    'learning_rate': [0.01, 0.1, 0.3], 
    'max_depth': [5, 7, 10], 
}
grid_search = GridSearchCV(xgb_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(train_x, train_y)
print("Best parameters: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

test_predictions = best_model.predict(test_x)
test_mse = mean_squared_error(test_y, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(test_y, test_predictions)

print("테스트 세트 성능:")
print("RMSE:", test_rmse)
print("R2:", test_r2)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.1s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.1s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.1s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.1s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=5, n_e

### 베이지안 최적화로 object function optimize

In [None]:
from hyperopt import hp
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK, fmin, tpe, Trials
import numpy as np
import xgboost as xgb

xgb_search_space = {
    'max_depth': hp.quniform('max_depth', 5, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
}

In [24]:
def objective_func(search_space):
    xgb_reg = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        random_state=123
    )
    mse = cross_val_score(xgb_reg, train_x, train_y, scoring='neg_mean_squared_error', cv=3)
    mse = -1.0* np.mean(mse)

    return {'loss': mse, 'status': STATUS_OK}


trial_val = Trials()
best = fmin(
    fn=objective_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=500,
    trials=trial_val,
    rstate=np.random.default_rng(seed=9)
)

print('best:', best)


100%|██████████| 500/500 [14:20<00:00,  1.72s/trial, best loss: 2232407742.25708]   
best: {'colsample_bytree': np.float64(0.7708404052529336), 'learning_rate': np.float64(0.09250674132536013), 'max_depth': np.float64(9.0), 'min_child_weight': np.float64(2.0)}


### best param

colsample_bytree = 0.77084
learning_rate = 0.0925
max_depth = 9
min_child_weight = 2


In [32]:
xgb_reg = xgb.XGBRegressor(
        n_estimators=200,
        max_depth=8,
        min_child_weight=2,
        learning_rate=0.09025,
        random_state=123
    )

In [33]:
model = xgb_reg.fit(train_x,train_y)

In [34]:
test_predictions = model.predict(test_x)
test_mse = mean_squared_error(test_y, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(test_y, test_predictions)

print("테스트 세트 성능:")
print("RMSE:", test_rmse)
print("R2:", test_r2)

테스트 세트 성능:
RMSE: 48130.51395498043
R2: 0.8316832763688611
