In [361]:
import pandas as pd
import numpy as np

# ML관련
from sklearn.model_selection import train_test_split

# Calling Datas

In [362]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')

# check data
- shape
- null count
- which feature is significant

In [363]:
train.shape

# train data has just too much Features

(1460, 81)

### column handling

In [364]:
train_id = train.pop('Id')
test_id = test.pop('Id')

y = np.log1p(train.pop('SalePrice'))
X = train

In [365]:
pd.set_option('display.max_rows', None)

X.isnull().sum()

# Alley, FireplaceQu, poolQC, Fence, MiscFeature has too mnay nulls
# LotArea, MasVnrType, has some nulls and doesn't look important

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea 

In [366]:
drop_list = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'LotArea', 'MasVnrType']
X = X.drop(drop_list, axis=1)
X.shape

(1460, 72)

In [367]:
# let's sum like 10~15 most important features.
# Street, Alley, Condition1 etc has too much same Values

important_features = [
    'MSSubClass', 'Neighborhood', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'ExterQual', 'ExterCond', 
    'BsmtQual', 'BsmtCond', 'HeatingQC', '1stFlrSF', '2ndFlrSF',
    'GrLivArea', 'FullBath', 'KitchenQual', 'TotRmsAbvGrd',
    'GarageQual', 'GarageCond'
]

X_sort = pd.DataFrame(X[important_features])
X_sort.shape

(1460, 19)

In [368]:
# some nulls exists

X_sort.isnull().sum()

MSSubClass       0
Neighborhood     0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
ExterQual        0
ExterCond        0
BsmtQual        37
BsmtCond        37
HeatingQC        0
1stFlrSF         0
2ndFlrSF         0
GrLivArea        0
FullBath         0
KitchenQual      0
TotRmsAbvGrd     0
GarageQual      81
GarageCond      81
dtype: int64

In [369]:
cat_cols = X_sort.select_dtypes(exclude = np.number).columns.tolist()
num_cols = X_sort.select_dtypes(include = np.number).columns.tolist()

print(num_cols, cat_cols)

['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd'] ['Neighborhood', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']


# Making PipeLine & RandomSearchCV
- let's gooooo00000ooo0oo0000OOO00OooOO0

In [370]:
# 훈련 - 테스트 데이터 분리

X_train, X_test, y_train, y_test = train_test_split(
    X_sort, y, test_size = 0.2, random_state = 42
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1168, 19) (1168,) (292, 19) (292,)


In [371]:
# 훈련 - 검증 데이터 분리

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size = 0.2, random_state = 42
)

print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

(934, 19) (234, 19) (934,) (234,)


In [372]:
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # 결측치 처리 관련
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


# StratifiedKFold : 분류 작업 할 때
# KFold : 수치 작업 할 때
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, KFold

from scipy.stats import uniform, randint
from sklearn.metrics import mean_squared_error


In [373]:
# 파이프라인 구축

# 연속형/범주형 피처 변환기 설정

categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer( # 최종 전처리
    transformers=[
        ('num', numerical_transformer, num_cols), # 이름, 처리 방법, 대상 순
        ('cat', categorical_transformer, cat_cols)
    ])

pipeline_1 = Pipeline(steps=[
    ('preprocessor', preprocessor), #전처리 라인 적용
    ('classifier', RandomForestRegressor(random_state=42)) #분류는 포레스트 적용
])

param_distributions = { #파라미터 설정
    'classifier__min_impurity_decrease': uniform(0.0001, 0.001),
    'classifier__max_depth': randint(20, 50),
    'classifier__min_samples_split': randint(2, 25),
    'classifier__min_samples_leaf': randint(1, 25),
}

split_number = 5  # fold숫자 정하기
kfold = KFold(n_splits=split_number, shuffle=True, random_state=42)

random_search = RandomizedSearchCV( #랜덤서치 실행
    estimator=pipeline_1, #원래는 Tree, REG같은 모델이 들어갔던 자리, 전처리와 모델처리가 한번에 진행됨.
    param_distributions=param_distributions,
    n_iter=50, #시도횟수
    cv= kfold,
    scoring='neg_mean_squared_error', # 평가 지표
    random_state=42,
    n_jobs=-1 #가용한 모든 코어 이용
)

# 모델 빌딩
def model_builing(model, X_train, y_train):
    model.fit(X_train, y_train)
    best_model = model.best_estimator_
    return best_model

# 점수 평가 지표 제작
def get_score(model, X_tr, X_val, y_tr, y_val):
    tr_pred = model.predict(X_tr)
    val_pred = model.predict(X_val)
    tr_score = np.sqrt(mean_squared_error(y_tr, tr_pred))
    val_score = np.sqrt(mean_squared_error(y_val, val_pred))
    return f"train: {tr_score}, validation: {val_score}"

In [374]:
# Set Random_Search
random_search.fit(X_train, y_train)

# 검증 세트에 대한 예측 값 계산
y_val_pred = random_search.predict(X_val)

# MSE 점수 계산 및 출력
mse = mean_squared_error(y_val, y_val_pred)
print(f'MSE 점수: {mse:.4f}')

MSE 점수: 0.0179


In [375]:
best_model = model_builing(random_search, X_tr, y_tr)
get_score(best_model, X_tr, X_val, y_tr, y_val)

'train: 0.12423324546146439, validation: 0.16736333399453884'

In [376]:
#LightGBM 적용
#!pip install lightgbm -qq
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# LightGBM이 적용된 파이프라인
pipeline_2 = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 전처리 라인 적용
    ('regressor', LGBMRegressor(random_state=42))  # 회귀는 LightGBM 적용
])

# LightGBM 파라미터 설정
# Define the parameter distributions
param_distributions = {
    'regressor__learning_rate': uniform(0.05, 0.15),
    'regressor__max_depth': randint(5, 10),
    'regressor__reg_alpha': uniform(0.1, 0.5),
    'regressor__reg_lambda': uniform(0.1, 0.5),
}

# RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
random_search_2 = RandomizedSearchCV(
    estimator=pipeline_2,
    param_distributions=param_distributions,
    n_iter=50,  # 시도 횟수
    cv=kfold,  # 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=-1  # 가용한 모든 코어 사용
)

random_search_2.fit(X_train, y_train)

# 검증 세트에 대한 예측 확률 계산
y_val_pred = random_search_2.predict(X_val)  # 클래스 1(양성 클래스)에 대한 확률 추출

# MSE 점수 계산 및 출력
mse = mean_squared_error(y_val, y_val_pred)
print(f'MSE 점수: {mse:.4f}')

MSE 점수: 0.0156


In [377]:
best_model_LGBM = model_builing(random_search_2, X_tr, y_tr)
get_score(best_model_LGBM, X_tr, X_val, y_tr, y_val)

'train: 0.11592714610303721, validation: 0.1495983010151579'

In [378]:
#XGBOOST 적용
#!pip install xgboost -qq
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# XGBOOST가 적용된 파이프라인
pipeline_3 = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 전처리 라인 적용
    ('regressor', XGBRegressor(random_state=42))  # 회귀는 XGBOOST 적용
])

# XGBOOST 파라미터 설정
param_distributions = {
    'regressor__learning_rate': uniform(0.01, 0.1),  # 학습률
    'regressor__n_estimators': randint(50, 500),  # 트리의 개수
    'regressor__max_depth': randint(3, 15),  # 트리의 최대 깊이
    'regressor__min_child_weight': randint(1, 10),  # 리프 노드의 최소 가중치
    'regressor__subsample': uniform(0.6, 0.4),  # 데이터 샘플링 비율
    'regressor__colsample_bytree': uniform(0.6, 0.4),  # 컬럼 샘플링 비율
    'regressor__reg_alpha': uniform(0.0, 0.1),  # L1 정규화
    'regressor__reg_lambda': uniform(0.0, 0.1),  # L2 정규화
    'regressor__gamma': uniform(0.0, 0.1),  # 감마 값
    #'regressor__verbose': randint(-1, 1),  # 로그 출력 수준, 안 쓰인다.
}

# RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
random_search_3 = RandomizedSearchCV(
    estimator=pipeline_3,
    param_distributions=param_distributions,
    n_iter=50,  # 시도 횟수
    cv=kfold,  # 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=-1  # 가용한 모든 코어 사용
)

random_search_3.fit(X_train, y_train)

# 검증 세트에 대한 예측 확률 계산
y_val_pred = random_search_3.predict(X_val)  # 클래스 1(양성 클래스)에 대한 확률 추출

# MSE 점수 계산 및 출력
mse = mean_squared_error(y_val, y_val_pred)
print(f'MSE 점수: {mse:.4f}')

MSE 점수: 0.0101


In [379]:
best_model_XGB = model_builing(random_search_3, X_tr, y_tr)
get_score(best_model_XGB, X_tr, X_val, y_tr, y_val)

'train: 0.1032740294533756, validation: 0.1484863785926545'

## Ensemble Modeling
- 더 좋아야겠지????

In [391]:
from sklearn.ensemble import VotingRegressor

# 최적의 모델 추출
best_lgbm = random_search_2.best_estimator_
best_xgb = random_search_3.best_estimator_

# VotingRegressor 구성
ensemble_model = VotingRegressor(estimators=[
    ('lightgbm', best_lgbm),
    ('xgboost', best_xgb)
])

# 최적화된 VotingRegressor로 학습
ensemble_model.fit(X_train, y_train)


In [392]:
# 테스트 데이터 평가
y_pred = ensemble_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"앙상블 모델의 MSE: {mse}")

앙상블 모델의 MSE: 0.020961709248917354


In [390]:
def model_builing_ensemble(model, X_train, y_train):
    model_best = model.fit(X_train, y_train)
    #best_model = model.best_estimator_
    return model_best

best_model = model_builing_ensemble(ensemble_model, X_tr, y_tr)
get_score(best_model, X_tr, X_val, y_tr, y_val)

'train: 0.10742405691289576, validation: 0.14720352850199617'

# Code Collection With Answer

- 다시 원본 데이터로 돌아가서 col_selection을 하지 말고 값을 더 줄인 데이터로 분석
- 그리고 칼럼을 10개 내외로 더 줄인 데이터로 분석
- 걸리는 시간과 성늘을 교차 검증 해보도록 하자!

## 더 줄인 데이터 먼저 검증

In [334]:
# 더 줄인 데이터 먼저

important_features_min = [
    'OverallQual', 'OverallCond',
    'ExterQual', 'ExterCond', 
    'BsmtQual', 'BsmtCond', 'HeatingQC',
    'GrLivArea', 'KitchenQual',
    'GarageQual', 'GarageCond'
]

X_sorted_min = pd.DataFrame(X[important_features_min])
X_sorted_min.shape

(1460, 11)

In [335]:
cat_cols = X_sorted_min.select_dtypes(exclude = np.number).columns.tolist()
num_cols = X_sorted_min.select_dtypes(include = np.number).columns.tolist()

print(num_cols, cat_cols)

['OverallQual', 'OverallCond', 'GrLivArea'] ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']


In [336]:
X_train, X_test, y_train, y_test = train_test_split(
    X_sorted_min, y, test_size = 0.2, random_state = 42
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1168, 11) (1168,) (292, 11) (292,)


In [337]:
# 훈련 - 검증 데이터 분리

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size = 0.2, random_state = 42
)

print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

(934, 11) (234, 11) (934,) (234,)


In [338]:
preprocessor = ColumnTransformer( # 최종 전처리
    transformers=[
        ('num', numerical_transformer, num_cols), # 이름, 처리 방법, 대상 순
        ('cat', categorical_transformer, cat_cols)
    ])

pipeline_1 = Pipeline(steps=[
    ('preprocessor', preprocessor), #전처리 라인 적용
    ('classifier', RandomForestRegressor(random_state=42)) #분류는 포레스트 적용
])

param_distributions = { #파라미터 설정
    'classifier__min_impurity_decrease': uniform(0.0001, 0.001),
    'classifier__max_depth': randint(20, 50),
    'classifier__min_samples_split': randint(2, 25),
    'classifier__min_samples_leaf': randint(1, 25),
}

split_number = 5  # fold숫자 정하기
kfold = KFold(n_splits=split_number, shuffle=True, random_state=42)

random_search = RandomizedSearchCV( #랜덤서치 실행
    estimator=pipeline_1, #원래는 Tree, REG같은 모델이 들어갔던 자리, 전처리와 모델처리가 한번에 진행됨.
    param_distributions=param_distributions,
    n_iter=50, #시도횟수
    cv= kfold,
    scoring='neg_mean_squared_error', # 평가 지표
    random_state=42,
    n_jobs=-1 #가용한 모든 코어 이용
)

In [339]:
best_model = model_builing(random_search, X_tr, y_tr)
get_score(best_model, X_tr, X_val, y_tr, y_val)

'train: 0.15351727973422447, validation: 0.18094025204352143'

In [340]:
# LightGBM이 적용된 파이프라인
pipeline_2 = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 전처리 라인 적용
    ('regressor', LGBMRegressor(random_state=42))  # 회귀는 LightGBM 적용
])

# LightGBM 파라미터 설정
# Define the parameter distributions
param_distributions = {
    'regressor__learning_rate': uniform(0.01, 0.2),
    'regressor__max_depth': randint(3, 15),
    'regressor__reg_alpha': uniform(0.0, 1.0),
    'regressor__reg_lambda': uniform(0.0, 1.0),
}

# RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
random_search_2 = RandomizedSearchCV(
    estimator=pipeline_2,
    param_distributions=param_distributions,
    n_iter=50,  # 시도 횟수
    cv=kfold,  # 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=-1  # 가용한 모든 코어 사용
)

In [341]:
best_model_LGBM = model_builing(random_search_2, X_tr, y_tr)
get_score(best_model_LGBM, X_tr, X_val, y_tr, y_val)

'train: 0.15546807744902008, validation: 0.16554808160334525'

In [342]:
pipeline_3 = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 전처리 라인 적용
    ('regressor', XGBRegressor(random_state=42))  # 회귀는 XGBOOST 적용
])

# XGBOOST 파라미터 설정
param_distributions = {
    'regressor__learning_rate': uniform(0.01, 0.1),  # 학습률
    'regressor__n_estimators': randint(50, 500),  # 트리의 개수
    'regressor__max_depth': randint(3, 15),  # 트리의 최대 깊이
    'regressor__min_child_weight': randint(1, 10),  # 리프 노드의 최소 가중치
    'regressor__subsample': uniform(0.6, 0.4),  # 데이터 샘플링 비율
    'regressor__colsample_bytree': uniform(0.6, 0.4),  # 컬럼 샘플링 비율
    'regressor__reg_alpha': uniform(0.0, 0.1),  # L1 정규화
    'regressor__reg_lambda': uniform(0.0, 0.1),  # L2 정규화
    'regressor__gamma': uniform(0.0, 0.1),  # 감마 값
    #'regressor__verbose': randint(-1, 1),  # 로그 출력 수준, 안 쓰인다.
}

# RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
random_search_3 = RandomizedSearchCV(
    estimator=pipeline_3,
    param_distributions=param_distributions,
    n_iter=50,  # 시도 횟수
    cv=kfold,  # 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=-1  # 가용한 모든 코어 사용
)

In [343]:
best_model_XGB = model_builing(random_search_3, X_tr, y_tr)
get_score(best_model_XGB, X_tr, X_val, y_tr, y_val)

'train: 0.14563019432686092, validation: 0.16708489742046898'

## 전체 활용 데이터
- 칼럼 축약 없이 전체 사용

In [393]:
X_full = train.copy()
X.shape

(1460, 72)

In [394]:
cat_cols = X_full.select_dtypes(exclude = np.number).columns.tolist()
num_cols = X_full.select_dtypes(include = np.number).columns.tolist()

print(len(num_cols), len(cat_cols))

36 43


In [395]:
'''
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size = 0.2, random_state = 42
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
'''

'\nX_train, X_test, y_train, y_test = train_test_split(\n    X_full, y, test_size = 0.2, random_state = 42\n)\n\nprint(X_train.shape, y_train.shape, X_test.shape, y_test.shape)\n'

In [396]:
# 훈련 - 검증 데이터 분리
'''
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size = 0.2, random_state = 42
)

print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)
'''

X_tr, X_val, y_tr, y_val = train_test_split(
    X_full, y, test_size = 0.2, random_state = 42
)

print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

(1168, 79) (292, 79) (1168,) (292,)


In [397]:
preprocessor = ColumnTransformer( # 최종 전처리
    transformers=[
        ('num', numerical_transformer, num_cols), # 이름, 처리 방법, 대상 순
        ('cat', categorical_transformer, cat_cols)
    ])

pipeline_1 = Pipeline(steps=[
    ('preprocessor', preprocessor), #전처리 라인 적용
    ('classifier', RandomForestRegressor(random_state=42)) #분류는 포레스트 적용
])

param_distributions = { #파라미터 설정
    'classifier__min_impurity_decrease': uniform(0.0001, 0.001),
    'classifier__max_depth': randint(20, 50),
    'classifier__min_samples_split': randint(2, 25),
    'classifier__min_samples_leaf': randint(1, 25),
}

split_number = 5  # fold숫자 정하기
kfold = KFold(n_splits=split_number, shuffle=True, random_state=42)

random_search = RandomizedSearchCV( #랜덤서치 실행
    estimator=pipeline_1, #원래는 Tree, REG같은 모델이 들어갔던 자리, 전처리와 모델처리가 한번에 진행됨.
    param_distributions=param_distributions,
    n_iter=50, #시도횟수
    cv= kfold,
    scoring='neg_mean_squared_error', # 평가 지표
    random_state=42,
    n_jobs=-1 #가용한 모든 코어 이용
)

In [398]:
best_model = model_builing(random_search, X_tr, y_tr)
get_score(best_model, X_tr, X_val, y_tr, y_val)

'train: 0.11576928586817183, validation: 0.15698417199074258'

In [399]:
# LightGBM이 적용된 파이프라인
pipeline_2 = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 전처리 라인 적용
    ('regressor', LGBMRegressor(random_state=42))  # 회귀는 LightGBM 적용
])

# LightGBM 파라미터 설정
# Define the parameter distributions
param_distributions = {
    'regressor__learning_rate': uniform(0.01, 0.2),
    'regressor__max_depth': randint(3, 15),
    'regressor__reg_alpha': uniform(0.0, 1.0),
    'regressor__reg_lambda': uniform(0.0, 1.0),
}

# RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
random_search_2 = RandomizedSearchCV(
    estimator=pipeline_2,
    param_distributions=param_distributions,
    n_iter=50,  # 시도 횟수
    cv=kfold,  # 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=-1  # 가용한 모든 코어 사용
)

In [400]:
best_model_LGBM = model_builing(random_search_2, X_tr, y_tr)
get_score(best_model_LGBM, X_tr, X_val, y_tr, y_val)

'train: 0.08920184287828858, validation: 0.13786427939835105'

In [401]:
def create_pipeline(model, param_distributions):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # 전처리 라인 적용
        ('regressor', model(random_state=42))  # 회귀 모델 적용
    ])
    
    # RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=50,  # 시도 횟수
        cv=kfold,  # 교차 검증
        scoring='neg_mean_squared_error',  # 평가 지표
        random_state=42,
        n_jobs=-1  # 가용한 모든 코어 사용
    )
    
    return pipeline, random_search

# XGBOOST 파라미터 설정
param_distributions = {
    'regressor__learning_rate': uniform(0.01, 0.1),  # 학습률
    'regressor__n_estimators': randint(50, 500),  # 트리의 개수
    'regressor__max_depth': randint(3, 15),  # 트리의 최대 깊이
    'regressor__min_child_weight': randint(1, 10),  # 리프 노드의 최소 가중치
    'regressor__subsample': uniform(0.6, 0.4),  # 데이터 샘플링 비율
    'regressor__colsample_bytree': uniform(0.6, 0.4),  # 컬럼 샘플링 비율
    'regressor__reg_alpha': uniform(0.0, 0.1),  # L1 정규화
    'regressor__reg_lambda': uniform(0.0, 0.1),  # L2 정규화
    'regressor__gamma': uniform(0.0, 0.1),  # 감마 값
    #'regressor__verbose': randint(-1, 1),  # 로그 출력 수준, 안 쓰인다.
}

# XGBOOST 모델과 하이퍼파라미터 튜닝을 위한 파이프라인 생성
pipeline_3, random_search_3 = create_pipeline(XGBRegressor, param_distributions)

In [402]:
best_model_XGB = model_builing(random_search_3, X_tr, y_tr)
get_score(best_model_XGB, X_tr, X_val, y_tr, y_val)

'train: 0.05260692818732749, validation: 0.13290717144184241'

# full_data_ensemble

In [404]:
X_train.shape

(1168, 19)

In [405]:
# 최적의 모델 추출
best_lgbm = random_search_2.best_estimator_
best_xgb = random_search_3.best_estimator_

# VotingRegressor 구성
ensemble_model = VotingRegressor(estimators=[
    ('lightgbm', best_lgbm),
    ('xgboost', best_xgb)
])

# 최적화된 VotingRegressor로 학습
ensemble_model.fit(X_tr, y_tr)

In [407]:
# 테스트 데이터 평가
y_pred = ensemble_model.predict(X_val)
mse = mean_squared_error(y_test, y_pred)

print(f"앙상블 모델의 MSE: {mse}")

앙상블 모델의 MSE: 0.01784246827398391


In [408]:
def model_builing_ensemble(model, X_train, y_train):
    model_best = model.fit(X_train, y_train)
    #best_model = model.best_estimator_
    return model_best

best_model = model_builing_ensemble(ensemble_model, X_tr, y_tr)
get_score(best_model, X_tr, X_val, y_tr, y_val)

'train: 0.06887085101404136, validation: 0.13357570240872368'

# Full_Data_Stacking

In [434]:
from sklearn.ensemble import GradientBoostingRegressor

param_distributions = {
    'regressor__n_estimators': randint(50, 200),  # 트리의 개수
    'regressor__learning_rate': uniform(0.01, 0.2),  # 학습률
    'regressor__max_depth': randint(3, 10),  # 각 트리의 최대 깊이
    'regressor__min_samples_split': randint(2, 10),  # 분할을 위한 최소 샘플 수
    'regressor__min_samples_leaf': randint(1, 10),  # 리프 노드의 최소 샘플 수
}



pipeline_4, random_search_4 = create_pipeline(GradientBoostingRegressor, param_distributions)

In [437]:
best_model_GBR = model_builing(random_search_4, X_tr, y_tr)
get_score(best_model_GBR, X_tr, X_val, y_tr, y_val)

'train: 0.06821509942007573, validation: 0.13519247072710383'

In [442]:
from sklearn.svm import SVR

def create_pipeline_SVR(model, param_distributions):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # 전처리 라인 적용
        ('regressor', model())  # 회귀 모델 적용 (random_state 제거)
    ])

    # RandomizedSearchCV를 사용한 하이퍼파라미터 튜닝
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=50,  # 시도 횟수
        cv=kfold,  # 교차 검증
        scoring='neg_mean_squared_error',  # 평가 지표
        random_state=42,
        n_jobs=-1  # 가용한 모든 코어 사용
    )

    return pipeline, random_search

param_distributions = {
    'regressor__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # 커널 함수
    'regressor__C': uniform(0.1, 10),  # 정규화 강도
    'regressor__epsilon': uniform(0.1, 1),  # 허용 오차
    'regressor__gamma': ['scale', 'auto'],  # 감마 값
    'regressor__degree': randint(1, 5),  # 다항식 커널의 차수
    'regressor__coef0': uniform(-1, 1),  # 다항식 커널의 상수항
}

pipeline_5, random_search_5 = create_pipeline_SVR(SVR, param_distributions)

In [443]:
best_model_SVR = model_builing(random_search_5, X_tr, y_tr)
get_score(best_model_SVR, X_tr, X_val, y_tr, y_val)

'train: 0.1022018284482251, validation: 0.1374963324902626'

In [444]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# 최적의 모델 추출
best_lgbm = random_search_2.best_estimator_
best_xgb = random_search_3.best_estimator_
best_gbr = random_search_4.best_estimator_
best_svr = random_search_5.best_estimator_

# 스태킹 모델 구성
stacking_model = StackingRegressor(estimators=[
    ('lightgbm', best_lgbm),
    ('xgboost', best_xgb),
    ('gbr', best_gbr),
    ('svr', best_svr)
], final_estimator=LinearRegression())

# 최적화된 스태킹 모델로 학습
stacking_model.fit(X_tr, y_tr)

In [445]:
# 테스트 데이터 평가
y_pred = stacking_model.predict(X_val)
mse = mean_squared_error(y_test, y_pred)

print(f"스태킹 모델의 MSE: {mse}")

스태킹 모델의 MSE: 0.016755087980439835


In [446]:
def model_builing_stacking(model, X_train, y_train):
    model_best = model.fit(X_train, y_train)
    #best_model = model.best_estimator_
    return model_best

best_model = model_builing_stacking(stacking_model, X_tr, y_tr)
get_score(best_model, X_tr, X_val, y_tr, y_val)

# 딱히 나아지진 않음

'train: 0.05644299119294937, validation: 0.12944144614627817'

# Conclusion

- XGBoost가 가장 성능이 좋았음

- 앙상블 모델은 모델 아키텍처 미흡
- 모델을 2개정도 할 거면 안하는게 낫다
    - 4~5개의 모델이 적합함