# 데이터 전처리 + 초기 학습 모델

- [데이터 전처리 + 초기 학습 모델](https://colab.research.google.com/drive/1FthLfmep1IeAQD6-e9JuKOpND3NGEVm_#scrollTo=Xyl49pwdq3O_)

# 데이터 준비

In [None]:
!pip install catboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.3-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m6.5 MB/s[0m 

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.linear_model import LogisticRegression, Ridge
import optuna
import lightgbm
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler

In [None]:
train      = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/train.csv' )
test       = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/test.csv' )
submission = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/sample_submission.csv' )
extra_data = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/train__dataset.csv' )

train.shape, test.shape, submission.shape, extra_data.shape

((42100, 19), (28068, 18), (28068, 2), (18137, 18))

In [None]:
train.drop( "id", axis = 1, inplace = True )
test.drop( "id", axis = 1, inplace = True )

# 학습 준비

In [None]:
train_targets = train[ "booking_status" ]
extra_targets = extra_data[ "booking_status" ]

train.drop(      "booking_status", axis = 1, inplace = True )
extra_data.drop( "booking_status", axis = 1, inplace = True )

In [None]:
def objective( trial ):
    params = {
        # 트리 개수 지정
        'n_estimators' : 2000,
        # 트리 최대 깊이
        'depth' : trial.suggest_int( 'depth', 2, 15 ),
        # 트리의 학습에 사용되는 하이퍼파라미터
        'subsample' : trial.suggest_float( "subsample", 0.1, 1 ),
        'l2_leaf_reg' : trial.suggest_float( "l2_leaf_reg", 0.0, 5.0 ),
        'learning_rate' : trial.suggest_float( "learning_rate", 0.0001, 0.5 ),
        # 조기 정지를 위한 라운드 수
        'early_stopping_rounds' : 100
    }

    # 모델 초기화
    model = CatBoostClassifier( **params )

    model.fit( X_train, y_train,
               eval_set = [ ( X_valid, y_valid ) ],
               verbose = 0 )

    # 검증 데이터 ROC AUC 점수
    return roc_auc_score( y_valid, model.predict_proba( X_valid )[ : , 1 ] )

In [None]:
# 테스트 데이터의 예측 결과를 저장할 빈 배열 생성
test_preds = np.zeros( len( test ) )
# 교차 검증 점수를 저장할 변수를 초기화
cv = 0
# 10-Fold 교차 검증을 수행하기 위한 StratifiedKFold 객체를 생성
kf = StratifiedKFold( n_splits = 10, shuffle = True, random_state = 42 )

# 1. 모델 학습
- DecisionTree
- MinMaxScaler

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

df_train = pd.read_csv( '/content/drive/MyDrive/멋사/파이널프로젝트/이진 분류 - 예약 취소/train.csv' )
df_test = pd.read_csv( '/content/drive/MyDrive/멋사/파이널프로젝트/이진 분류 - 예약 취소/test.csv' )

y = df_train[ 'booking_status' ]
X = df_train.drop( [ 'booking_status' ], axis = 1 )

X = pd.concat( [ X, df_test ], ignore_index = True )
X = X.drop( [ 'id' ], axis = 1 )

scaler = MinMaxScaler( )
X_scaled = scaler.fit_transform( X )

X_train = X_scaled[ : len( df_train ) ]
X_test = X_scaled[ len( df_train ) : ]
X_train, X_test, y_train, y_test = train_test_split( X_scaled[ : len( df_train ) ],
                                                     y,
                                                     random_state = 104,
                                                     train_size = 0.8,
                                                     shuffle = True )

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

model_dt_gs = DecisionTreeClassifier( random_state = 42 )
param_grid_dt = { 'max_depth' : [ None, 10, 20 ],
                  'min_samples_split' : [ 2, 5, 10 ],
                  'min_samples_leaf' : [ 1, 2, 4 ] }

grid_search_dt = GridSearchCV( estimator = model_dt_gs,
                               param_grid = param_grid_dt,
                               cv=5, scoring='roc_auc', n_jobs = -1 )

grid_search_dt.fit( X_train, y_train )

print( "Best Parameters (Decision Tree): ", grid_search_dt.best_params_ )
print( "Best Score (Decision Tree): ", grid_search_dt.best_score_ )

best_model_dt = grid_search_dt.best_estimator_

y_proba_dt_gs = [ ]
for sample in X_test:
    y_proba = best_model_dt.predict_proba( [sample] )[ :, 1 ]
    y_proba_dt_gs.append( y_proba[ 0 ] )

# ROC AUC 스코어 계산
roc_auc_dt_gs = roc_auc_score( y_test, y_proba_dt_gs )
print( "ROC AUC Score (Decision Tree - GridSearchCV): ", roc_auc_dt_gs )
print( "Accuracy: ", best_model_dt.score( X_test, y_test ) )

Best Parameters (Decision Tree): {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Score (Decision Tree): 0.8642528681718294
ROC AUC Score (Decision Tree - GridSearchCV): 0.8717931232903721
Accuracy: 0.809501187648456


# 2. 모델 학습
- 10 - Fold Cross Check
- DecisionTree
- HistGradientBoost
- CatBoost

### 모듈 다운

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer

### 모델 제작 및 학습

In [None]:
# 각 폴드에 대해 반복
for fold, ( train_idx, valid_idx ) in enumerate( kf.split( train, train_targets ) ):

    #  훈련 데이터와 검증 데이터를 나눔
    X_train, X_valid = train.iloc[ train_idx ], train.iloc[ valid_idx ]
    # 훈련 타겟과 검증 타겟을 나눔
    y_train, y_valid = train_targets.iloc[ train_idx ], train_targets.iloc[ valid_idx ]

    # 추가 데이터를 기존 훈련 데이터에 추가
    X_train = pd.concat( ( X_train, extra_data ) )
    y_train = pd.concat( ( y_train, extra_targets ) )

    # 결측 값 처리를 위한 SimpleImputer
    imputer = SimpleImputer( strategy = 'mean' )
    X_train_imputed = imputer.fit_transform( X_train )
    X_valid_imputed = imputer.transform( X_valid )

    # 의사결정나무 모델 정의
    dt_classifier = DecisionTreeClassifier( random_state = 42 )

    # HistGradientBoostingClassifier 모델 정의
    hgbc = HistGradientBoostingClassifier( learning_rate = 0.1,
                                           max_iter = 100,
                                           random_state = 42 )


    # CatBoost 모델 정의
    ctb = CatBoostClassifier( n_estimators = 10000,  # 트리 수 설정
                              early_stopping_rounds = 100,  # 조기 정지 설정
                              **{ # 모델의 하이퍼파라미터 설정
                                  'depth': 3,
                                  'subsample': 0.9,
                                  'l2_leaf_reg': 4.0,
                                  'learning_rate': 0.3 } )
    '''
      < subsample >
        - 각 트리가 학습에 사용하는 샘플의 비율을 결정
        - 값이 1보다 작으면 부트스트랩 샘플링 사용
        - 이 값이 클수록 더 많은 다양성을 가진 트리를 생성하며,
          작을수록 더 안정적인 트리를 생성

      < l2_leaf_reg >
        -  L2 정규화의 강도를 결정
        - 과적합을 방지
        - 값이 클수록 정규화의 강도 큼

      < learning_rate >
        - 가중치 업데이트에 대한 학습률
        - 작은 학습률은 모델을 안정적으로 만들지만 오래 걸림
    '''

    # 모델 학습

    # 의사결정나무 모델 학습
    dt_classifier.fit( X_train, y_train )
    # HistGradientBoostingClassifier 모델 학습
    hgbc.fit( X_train_imputed, y_train )
    # CatBoost 모델 학습
    ctb.fit( X_train, y_train, eval_set = ( X_valid, y_valid ), verbose = 100 )
    # eval_set: 검증 세트를 지정하여 모델의 성능 평가

    # 검증 데이터에 대한 클래스 확률 예측
    ctb_preds = ctb.predict_proba( X_valid )[ : , 1 ]
    ctb_test_preds = ctb.predict_proba( test )[ : , 1 ]

    #  ROC AUC 점수를 계산하여 모델의 성능 평가
    print (roc_auc_score ( y_valid, ctb_preds ) )

    # 메타 트레이닝 및 테스트 예측을 위해
    # CatBoost 모델의 검증 및 테스트 예측을 리스트에 저장
    meta_train = [ ctb_preds ]
    meta_test = [ ctb_test_preds ]

    # 최적의 가중치를 찾기 위한 목적 함수를 정의
    def roc_auc( weights ):
        fpred = np.zeros( len( meta_train[ 0 ] ) )
        for i, pred in enumerate( meta_train ):
            fpred += weights[ i ] * pred

        return -roc_auc_score( y_valid, fpred )

    starting_values = [ 0.33 ]*len( meta_train )
    cons = ( { 'type' : 'eq', 'fun' : lambda w : 1 - sum( w ) } )
    bounds = [ ( -1, 1 ) ] * len( meta_train )
    # 최적의 가중치 탐색
    res = minimize( roc_auc,
                    starting_values,
                    method = 'Nelder-Mead',
                    bounds = bounds,
                    constraints = cons )

    print( -res [ "fun" ] )

    # 교차 검증 점수에 최적 가중치에 대한 목적 함수 값의 음수를 더함
    # 최적화 과정에서 최대값 찾음
    cv -= res[ "fun" ]

    # 최적 가중치를 사용하여 테스트 데이터에 대한 예측을 업데이트
    for i, pred in enumerate( meta_test ):
        test_preds += res[ "x" ][ i ] * pred

# 모든 폴드에 대한 교차 검증 점수의 평균을 출력
print( f"CV: { cv / ( fold + 1) }" )

0:	learn: 0.5883211	test: 0.5890032	best: 0.5890032 (0)	total: 60.2ms	remaining: 10m 1s
100:	learn: 0.3839567	test: 0.4144221	best: 0.4144221 (100)	total: 2.13s	remaining: 3m 28s
200:	learn: 0.3681738	test: 0.4058301	best: 0.4058283 (199)	total: 4.84s	remaining: 3m 55s
300:	learn: 0.3587661	test: 0.4005886	best: 0.4005886 (300)	total: 7.57s	remaining: 4m 3s
400:	learn: 0.3516108	test: 0.3968937	best: 0.3968590 (398)	total: 12.8s	remaining: 5m 5s
500:	learn: 0.3458188	test: 0.3959951	best: 0.3958694 (480)	total: 15.4s	remaining: 4m 52s
600:	learn: 0.3406360	test: 0.3940525	best: 0.3940034 (576)	total: 19.3s	remaining: 5m 2s
700:	learn: 0.3361098	test: 0.3923377	best: 0.3922277 (692)	total: 23s	remaining: 5m 4s
800:	learn: 0.3323042	test: 0.3911076	best: 0.3908780 (765)	total: 27.1s	remaining: 5m 11s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3908780459
bestIteration = 765

Shrink model to first 766 iterations.
0.9000944602272727


  warn('Method %s cannot handle constraints.' % method,


0.9000944602272727
0:	learn: 0.5887196	test: 0.5841636	best: 0.5841636 (0)	total: 11.3ms	remaining: 1m 52s
100:	learn: 0.3831325	test: 0.4143425	best: 0.4143425 (100)	total: 1.13s	remaining: 1m 50s
200:	learn: 0.3678402	test: 0.4044687	best: 0.4044423 (199)	total: 2.28s	remaining: 1m 51s
300:	learn: 0.3588870	test: 0.4021246	best: 0.4020507 (281)	total: 4.28s	remaining: 2m 17s
400:	learn: 0.3517048	test: 0.3986182	best: 0.3986182 (400)	total: 6.7s	remaining: 2m 40s
500:	learn: 0.3455797	test: 0.3970090	best: 0.3970090 (500)	total: 7.85s	remaining: 2m 28s
600:	learn: 0.3409328	test: 0.3964314	best: 0.3961125 (548)	total: 8.99s	remaining: 2m 20s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3961124999
bestIteration = 548

Shrink model to first 549 iterations.
0.8971974431818182
0.8971974431818182


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5879546	test: 0.5908749	best: 0.5908749 (0)	total: 11.6ms	remaining: 1m 56s
100:	learn: 0.3830076	test: 0.4088442	best: 0.4088442 (100)	total: 1.15s	remaining: 1m 53s
200:	learn: 0.3677763	test: 0.4018608	best: 0.4018608 (200)	total: 2.29s	remaining: 1m 51s
300:	learn: 0.3583164	test: 0.3995305	best: 0.3992636 (290)	total: 3.45s	remaining: 1m 51s
400:	learn: 0.3506388	test: 0.3976897	best: 0.3975793 (396)	total: 4.59s	remaining: 1m 49s
500:	learn: 0.3449669	test: 0.3965711	best: 0.3965711 (500)	total: 6.78s	remaining: 2m 8s
600:	learn: 0.3402215	test: 0.3956738	best: 0.3955794 (598)	total: 9.04s	remaining: 2m 21s
700:	learn: 0.3356193	test: 0.3949351	best: 0.3948042 (683)	total: 10.2s	remaining: 2m 15s
800:	learn: 0.3316373	test: 0.3943566	best: 0.3938767 (770)	total: 11.4s	remaining: 2m 10s
900:	learn: 0.3281476	test: 0.3931451	best: 0.3930704 (896)	total: 12.5s	remaining: 2m 6s
1000:	learn: 0.3249610	test: 0.3934601	best: 0.3927897 (963)	total: 13.7s	remaining: 2m 3s
1100

  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5887412	test: 0.5839223	best: 0.5839223 (0)	total: 11.1ms	remaining: 1m 50s
100:	learn: 0.3837136	test: 0.4006753	best: 0.4006753 (100)	total: 1.16s	remaining: 1m 53s
200:	learn: 0.3684555	test: 0.3923712	best: 0.3923712 (200)	total: 2.3s	remaining: 1m 52s
300:	learn: 0.3592931	test: 0.3893431	best: 0.3892157 (297)	total: 3.47s	remaining: 1m 51s
400:	learn: 0.3522368	test: 0.3878470	best: 0.3877770 (399)	total: 4.61s	remaining: 1m 50s
500:	learn: 0.3460807	test: 0.3855737	best: 0.3855737 (500)	total: 5.79s	remaining: 1m 49s
600:	learn: 0.3411254	test: 0.3849073	best: 0.3845271 (584)	total: 6.96s	remaining: 1m 48s
700:	learn: 0.3365212	test: 0.3834406	best: 0.3832365 (678)	total: 8.52s	remaining: 1m 52s
800:	learn: 0.3326704	test: 0.3828956	best: 0.3828956 (800)	total: 10.9s	remaining: 2m 5s
900:	learn: 0.3293426	test: 0.3837283	best: 0.3824926 (822)	total: 12.6s	remaining: 2m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3824926115
bestIteration = 

  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5875837	test: 0.5916127	best: 0.5916127 (0)	total: 10.8ms	remaining: 1m 48s
100:	learn: 0.3822799	test: 0.4298303	best: 0.4298303 (100)	total: 1.17s	remaining: 1m 54s
200:	learn: 0.3671037	test: 0.4225030	best: 0.4225030 (200)	total: 2.33s	remaining: 1m 53s
300:	learn: 0.3573331	test: 0.4185942	best: 0.4182948 (288)	total: 3.5s	remaining: 1m 52s
400:	learn: 0.3501088	test: 0.4169611	best: 0.4169611 (400)	total: 4.65s	remaining: 1m 51s
500:	learn: 0.3439278	test: 0.4152031	best: 0.4150734 (471)	total: 5.78s	remaining: 1m 49s
600:	learn: 0.3391469	test: 0.4156221	best: 0.4149586 (519)	total: 7.05s	remaining: 1m 50s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.414958649
bestIteration = 519

Shrink model to first 520 iterations.
0.8860542140151515
0.8860542140151515


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5883710	test: 0.5875696	best: 0.5875696 (0)	total: 11.3ms	remaining: 1m 52s
100:	learn: 0.3846848	test: 0.4020208	best: 0.4020208 (100)	total: 1.14s	remaining: 1m 52s
200:	learn: 0.3694045	test: 0.3937912	best: 0.3937912 (200)	total: 2.3s	remaining: 1m 52s
300:	learn: 0.3597703	test: 0.3887750	best: 0.3887750 (300)	total: 3.42s	remaining: 1m 50s
400:	learn: 0.3529900	test: 0.3868541	best: 0.3867522 (396)	total: 4.55s	remaining: 1m 48s
500:	learn: 0.3474385	test: 0.3847700	best: 0.3846294 (493)	total: 5.72s	remaining: 1m 48s
600:	learn: 0.3421211	test: 0.3827126	best: 0.3826283 (593)	total: 6.85s	remaining: 1m 47s
700:	learn: 0.3372784	test: 0.3810517	best: 0.3810517 (700)	total: 8.02s	remaining: 1m 46s
800:	learn: 0.3330375	test: 0.3795432	best: 0.3795281 (799)	total: 11.5s	remaining: 2m 12s
900:	learn: 0.3296061	test: 0.3788819	best: 0.3786739 (883)	total: 13.7s	remaining: 2m 18s
1000:	learn: 0.3264781	test: 0.3782703	best: 0.3780721 (992)	total: 14.9s	remaining: 2m 13s
11

  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5883524	test: 0.5891780	best: 0.5891780 (0)	total: 11.6ms	remaining: 1m 56s
100:	learn: 0.3851995	test: 0.4031009	best: 0.4031009 (100)	total: 1.15s	remaining: 1m 52s
200:	learn: 0.3702074	test: 0.3915534	best: 0.3915534 (200)	total: 2.25s	remaining: 1m 49s
300:	learn: 0.3608140	test: 0.3853826	best: 0.3853255 (293)	total: 4.4s	remaining: 2m 21s
400:	learn: 0.3534884	test: 0.3816169	best: 0.3815827 (389)	total: 6.58s	remaining: 2m 37s
500:	learn: 0.3477387	test: 0.3797362	best: 0.3797362 (500)	total: 7.73s	remaining: 2m 26s
600:	learn: 0.3423062	test: 0.3782311	best: 0.3780831 (590)	total: 8.87s	remaining: 2m 18s
700:	learn: 0.3379110	test: 0.3774457	best: 0.3774457 (700)	total: 10s	remaining: 2m 12s
800:	learn: 0.3341877	test: 0.3759110	best: 0.3757787 (781)	total: 11.2s	remaining: 2m 8s
900:	learn: 0.3308493	test: 0.3753811	best: 0.3753811 (900)	total: 12.3s	remaining: 2m 4s
1000:	learn: 0.3277480	test: 0.3756710	best: 0.3752284 (901)	total: 13.5s	remaining: 2m
Stopped by

  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5882175	test: 0.5886826	best: 0.5886826 (0)	total: 10.3ms	remaining: 1m 43s
100:	learn: 0.3823687	test: 0.4211491	best: 0.4209913 (97)	total: 1.6s	remaining: 2m 36s
200:	learn: 0.3672025	test: 0.4140583	best: 0.4140583 (200)	total: 4.15s	remaining: 3m 22s
300:	learn: 0.3580396	test: 0.4111324	best: 0.4111324 (300)	total: 5.56s	remaining: 2m 59s
400:	learn: 0.3509849	test: 0.4095509	best: 0.4093778 (397)	total: 6.69s	remaining: 2m 40s
500:	learn: 0.3448200	test: 0.4084836	best: 0.4084681 (486)	total: 7.82s	remaining: 2m 28s
600:	learn: 0.3397803	test: 0.4075530	best: 0.4074939 (587)	total: 8.97s	remaining: 2m 20s
700:	learn: 0.3353122	test: 0.4074678	best: 0.4069489 (634)	total: 10.1s	remaining: 2m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.406948899
bestIteration = 634

Shrink model to first 635 iterations.
0.8908983601776985
0.8908983601776985


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5889018	test: 0.5840727	best: 0.5840727 (0)	total: 12.4ms	remaining: 2m 3s
100:	learn: 0.3831191	test: 0.4127169	best: 0.4127169 (100)	total: 1.16s	remaining: 1m 53s
200:	learn: 0.3676914	test: 0.4049816	best: 0.4048391 (197)	total: 2.38s	remaining: 1m 56s
300:	learn: 0.3585315	test: 0.4008807	best: 0.4007936 (291)	total: 4.93s	remaining: 2m 39s
400:	learn: 0.3514018	test: 0.3986942	best: 0.3986646 (398)	total: 6.64s	remaining: 2m 38s
500:	learn: 0.3458648	test: 0.3964472	best: 0.3964472 (500)	total: 7.78s	remaining: 2m 27s
600:	learn: 0.3404227	test: 0.3933625	best: 0.3930780 (592)	total: 8.9s	remaining: 2m 19s
700:	learn: 0.3358552	test: 0.3930583	best: 0.3930354 (614)	total: 10s	remaining: 2m 13s
800:	learn: 0.3322970	test: 0.3935714	best: 0.3928495 (712)	total: 11.2s	remaining: 2m 8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3928495408
bestIteration = 712

Shrink model to first 713 iterations.
0.8977652536421495
0.8977652536421495


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5884274	test: 0.5873365	best: 0.5873365 (0)	total: 11ms	remaining: 1m 49s
100:	learn: 0.3819514	test: 0.4076149	best: 0.4076149 (100)	total: 1.11s	remaining: 1m 48s
200:	learn: 0.3673256	test: 0.4005830	best: 0.4005830 (200)	total: 2.21s	remaining: 1m 47s
300:	learn: 0.3582657	test: 0.3946079	best: 0.3946079 (300)	total: 4.24s	remaining: 2m 16s
400:	learn: 0.3509656	test: 0.3933809	best: 0.3933809 (400)	total: 6.59s	remaining: 2m 37s
500:	learn: 0.3458520	test: 0.3929178	best: 0.3926872 (492)	total: 7.74s	remaining: 2m 26s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3926872
bestIteration = 492

Shrink model to first 493 iterations.
0.8979493996202048
0.8979493996202048
CV: 0.8989589277937148


  warn('Method %s cannot handle constraints.' % method,


### 데이터 프레임 구성 및 파일 생성

In [None]:
train = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/train.csv' )
test = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/test.csv' )

In [None]:
# test_preds는 이전 단계에서 각 폴드에서 얻은 예측 결과
# (fold+1)은 폴드의 수로 나누어 결과를 평균화
submission[ "booking_status" ] = test_preds / ( fold + 1 )
y = 'booking_status'

# 테스트 데이터에서 'id' 열을 제외한 중복된 특성 가져옴
dup_features = test.drop( columns = 'id').columns.tolist( )

# 이너 조인하여 중복된 샘플을 찾음
# 데이터프레임 생성
values_to_assign = test.merge( train.drop( columns = 'id' ), on = dup_features, how = 'inner' )[[ 'id', y ] ]

# 중복된 샘플의 예측 값을 수정하기 위해 사용할 매핑 딕셔너리를 생성
# booking_status 최대값 : 0, 최소값 : 1 매핑
map_di = { 0 : submission[ y ].max( ), 1 : submission[ y ].min( ) }
# 중복된 샘플의 booking_status 값 수정
# 매핑 딕셔너리를 사용해 수정된 값 할당
submission.loc[ submission.id.isin( values_to_assign.id ), y ] = values_to_assign[ y ].map( map_di ).values

In [None]:
submission_1 = submission
submission_1

Unnamed: 0,id,booking_status
0,42100,0.048162
1,42101,0.013577
2,42102,0.111172
3,42103,0.017520
4,42104,0.130589
...,...,...
28063,70163,0.320713
28064,70164,0.016052
28065,70165,0.023209
28066,70166,0.164267


In [None]:
submission_1.to_csv( "submission_1_10Fold+Decison+HgB+CatB.csv", index = False )

# 3. 모델 학습
- 10 - Fold Cross Check
- RandomForest
- HistGradientBoost
- CatBoost

### 모듈 다운

In [None]:
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from scipy.optimize import minimize
import numpy as np

### 모델 제작 및 학습

In [None]:
# 각 폴드에 대해 반복
for fold, ( train_idx, valid_idx ) in enumerate( kf.split( train, train_targets ) ):

    #  훈련 데이터와 검증 데이터를 나눔
    X_train, X_valid = train.iloc[ train_idx ], train.iloc[ valid_idx ]
    # 훈련 타겟과 검증 타겟을 나눔
    y_train, y_valid = train_targets.iloc[ train_idx ], train_targets.iloc[ valid_idx ]

    # 추가 데이터를 기존 훈련 데이터에 추가
    X_train = pd.concat( ( X_train, extra_data ) )
    y_train = pd.concat( ( y_train, extra_targets ) )

    # 결측 값 처리를 위한 SimpleImputer
    imputer = SimpleImputer( strategy = 'mean' )
    X_train_imputed = imputer.fit_transform( X_train )
    X_valid_imputed = imputer.transform( X_valid )

    # 랜덤포레스트 모델 정의
    rf_classifier = RandomForestClassifier( n_estimators = 100, random_state = 42 )

    # HistGradientBoostingClassifier 모델 정의
    hgbc = HistGradientBoostingClassifier( learning_rate = 0.1,
                                           max_iter = 100,
                                           random_state = 42 )


    # CatBoost 모델 정의
    ctb = CatBoostClassifier( n_estimators = 10000,  # 트리 수 설정
                              early_stopping_rounds = 100,  # 조기 정지 설정
                              **{ # 모델의 하이퍼파라미터 설정
                                  'depth': 3,
                                  'subsample': 0.9,
                                  'l2_leaf_reg': 4.0,
                                  'learning_rate': 0.3 } )
    '''
      < subsample >
        - 각 트리가 학습에 사용하는 샘플의 비율을 결정
        - 값이 1보다 작으면 부트스트랩 샘플링 사용
        - 이 값이 클수록 더 많은 다양성을 가진 트리를 생성하며,
          작을수록 더 안정적인 트리를 생성

      < l2_leaf_reg >
        -  L2 정규화의 강도를 결정
        - 과적합을 방지
        - 값이 클수록 정규화의 강도 큼

      < learning_rate >
        - 가중치 업데이트에 대한 학습률
        - 작은 학습률은 모델을 안정적으로 만들지만 오래 걸림
    '''

    # 모델 학습

    # 랜덤포레스트 모델 학습
    rf_classifier.fit( X_train, y_train )
    # HistGradientBoostingClassifier 모델 학습
    hgbc.fit( X_train_imputed, y_train )
    # CatBoost 모델 학습
    ctb.fit( X_train, y_train, eval_set = ( X_valid, y_valid ), verbose = 100 )
    # eval_set: 검증 세트를 지정하여 모델의 성능 평가

    # 검증 데이터에 대한 클래스 확률 예측
    ctb_preds = ctb.predict_proba( X_valid )[ : , 1 ]
    ctb_test_preds = ctb.predict_proba( test )[ : , 1 ]

    #  ROC AUC 점수를 계산하여 모델의 성능 평가
    print (roc_auc_score ( y_valid, ctb_preds ) )

    # 메타 트레이닝 및 테스트 예측을 위해
    # CatBoost 모델의 검증 및 테스트 예측을 리스트에 저장
    meta_train = [ ctb_preds ]
    meta_test = [ ctb_test_preds ]

    # 최적의 가중치를 찾기 위한 목적 함수를 정의
    def roc_auc( weights ):
        fpred = np.zeros( len( meta_train[ 0 ] ) )
        for i, pred in enumerate( meta_train ):
            fpred += weights[ i ] * pred

        return -roc_auc_score( y_valid, fpred )

    starting_values = [ 0.33 ]*len( meta_train )
    cons = ( { 'type' : 'eq', 'fun' : lambda w : 1 - sum( w ) } )
    bounds = [ ( -1, 1 ) ] * len( meta_train )
    # 최적의 가중치 탐색
    res = minimize( roc_auc,
                    starting_values,
                    method = 'Nelder-Mead',
                    bounds = bounds,
                    constraints = cons )

    print( -res [ "fun" ] )

    # 교차 검증 점수에 최적 가중치에 대한 목적 함수 값의 음수를 더함
    # 최적화 과정에서 최대값 찾음
    cv -= res[ "fun" ]

    # 최적 가중치를 사용하여 테스트 데이터에 대한 예측을 업데이트
    for i, pred in enumerate( meta_test ):
        test_preds += res[ "x" ][ i ] * pred

# 모든 폴드에 대한 교차 검증 점수의 평균을 출력
print( f"CV: { cv / ( fold + 1) }" )

0:	learn: 0.5883211	test: 0.5890032	best: 0.5890032 (0)	total: 46.4ms	remaining: 7m 44s
100:	learn: 0.3839567	test: 0.4144221	best: 0.4144221 (100)	total: 2.78s	remaining: 4m 32s
200:	learn: 0.3681738	test: 0.4058301	best: 0.4058283 (199)	total: 5.91s	remaining: 4m 47s
300:	learn: 0.3587661	test: 0.4005886	best: 0.4005886 (300)	total: 8.74s	remaining: 4m 41s
400:	learn: 0.3516108	test: 0.3968937	best: 0.3968590 (398)	total: 11.3s	remaining: 4m 29s
500:	learn: 0.3458188	test: 0.3959951	best: 0.3958694 (480)	total: 13.1s	remaining: 4m 9s
600:	learn: 0.3406360	test: 0.3940525	best: 0.3940034 (576)	total: 14.3s	remaining: 3m 43s
700:	learn: 0.3361098	test: 0.3923377	best: 0.3922277 (692)	total: 15.4s	remaining: 3m 24s
800:	learn: 0.3323042	test: 0.3911076	best: 0.3908780 (765)	total: 16.6s	remaining: 3m 10s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3908780459
bestIteration = 765

Shrink model to first 766 iterations.
0.9000944602272727
0.9000944602272727


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5887196	test: 0.5841636	best: 0.5841636 (0)	total: 10.7ms	remaining: 1m 47s
100:	learn: 0.3831325	test: 0.4143425	best: 0.4143425 (100)	total: 1.16s	remaining: 1m 53s
200:	learn: 0.3678402	test: 0.4044687	best: 0.4044423 (199)	total: 2.27s	remaining: 1m 50s
300:	learn: 0.3588870	test: 0.4021246	best: 0.4020507 (281)	total: 3.38s	remaining: 1m 48s
400:	learn: 0.3517048	test: 0.3986182	best: 0.3986182 (400)	total: 4.5s	remaining: 1m 47s
500:	learn: 0.3455797	test: 0.3970090	best: 0.3970090 (500)	total: 5.64s	remaining: 1m 47s
600:	learn: 0.3409328	test: 0.3964314	best: 0.3961125 (548)	total: 6.78s	remaining: 1m 46s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3961124999
bestIteration = 548

Shrink model to first 549 iterations.
0.8971974431818182
0.8971974431818182


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5879546	test: 0.5908749	best: 0.5908749 (0)	total: 10.9ms	remaining: 1m 49s
100:	learn: 0.3830076	test: 0.4088442	best: 0.4088442 (100)	total: 1.17s	remaining: 1m 54s
200:	learn: 0.3677763	test: 0.4018608	best: 0.4018608 (200)	total: 2.3s	remaining: 1m 52s
300:	learn: 0.3583164	test: 0.3995305	best: 0.3992636 (290)	total: 3.44s	remaining: 1m 51s
400:	learn: 0.3506388	test: 0.3976897	best: 0.3975793 (396)	total: 4.62s	remaining: 1m 50s
500:	learn: 0.3449669	test: 0.3965711	best: 0.3965711 (500)	total: 6.16s	remaining: 1m 56s
600:	learn: 0.3402215	test: 0.3956738	best: 0.3955794 (598)	total: 8.72s	remaining: 2m 16s
700:	learn: 0.3356193	test: 0.3949351	best: 0.3948042 (683)	total: 10.3s	remaining: 2m 17s
800:	learn: 0.3316373	test: 0.3943566	best: 0.3938767 (770)	total: 11.5s	remaining: 2m 11s
900:	learn: 0.3281476	test: 0.3931451	best: 0.3930704 (896)	total: 12.6s	remaining: 2m 7s
1000:	learn: 0.3249610	test: 0.3934601	best: 0.3927897 (963)	total: 13.7s	remaining: 2m 3s
1100

  warn('Method %s cannot handle constraints.' % method,


0.900833925189394
0:	learn: 0.5887412	test: 0.5839223	best: 0.5839223 (0)	total: 11.2ms	remaining: 1m 52s
100:	learn: 0.3837136	test: 0.4006753	best: 0.4006753 (100)	total: 1.16s	remaining: 1m 53s
200:	learn: 0.3684555	test: 0.3923712	best: 0.3923712 (200)	total: 2.26s	remaining: 1m 50s
300:	learn: 0.3592931	test: 0.3893431	best: 0.3892157 (297)	total: 4.31s	remaining: 2m 18s
400:	learn: 0.3522368	test: 0.3878470	best: 0.3877770 (399)	total: 6.68s	remaining: 2m 39s
500:	learn: 0.3460807	test: 0.3855737	best: 0.3855737 (500)	total: 7.8s	remaining: 2m 27s
600:	learn: 0.3411254	test: 0.3849073	best: 0.3845271 (584)	total: 8.96s	remaining: 2m 20s
700:	learn: 0.3365212	test: 0.3834406	best: 0.3832365 (678)	total: 10.1s	remaining: 2m 13s
800:	learn: 0.3326704	test: 0.3828956	best: 0.3828956 (800)	total: 11.2s	remaining: 2m 9s
900:	learn: 0.3293426	test: 0.3837283	best: 0.3824926 (822)	total: 12.4s	remaining: 2m 5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.382492611

  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5875837	test: 0.5916127	best: 0.5916127 (0)	total: 15.3ms	remaining: 2m 32s
100:	learn: 0.3822799	test: 0.4298303	best: 0.4298303 (100)	total: 1.67s	remaining: 2m 43s
200:	learn: 0.3671037	test: 0.4225030	best: 0.4225030 (200)	total: 3.29s	remaining: 2m 40s
300:	learn: 0.3573331	test: 0.4185942	best: 0.4182948 (288)	total: 4.42s	remaining: 2m 22s
400:	learn: 0.3501088	test: 0.4169611	best: 0.4169611 (400)	total: 5.59s	remaining: 2m 13s
500:	learn: 0.3439278	test: 0.4152031	best: 0.4150734 (471)	total: 6.74s	remaining: 2m 7s
600:	learn: 0.3391469	test: 0.4156221	best: 0.4149586 (519)	total: 8.71s	remaining: 2m 16s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.414958649
bestIteration = 519

Shrink model to first 520 iterations.
0.8860542140151515
0.8860542140151515


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5883710	test: 0.5875696	best: 0.5875696 (0)	total: 10.9ms	remaining: 1m 48s
100:	learn: 0.3846848	test: 0.4020208	best: 0.4020208 (100)	total: 1.13s	remaining: 1m 51s
200:	learn: 0.3694045	test: 0.3937912	best: 0.3937912 (200)	total: 2.26s	remaining: 1m 50s
300:	learn: 0.3597703	test: 0.3887750	best: 0.3887750 (300)	total: 4.31s	remaining: 2m 18s
400:	learn: 0.3529900	test: 0.3868541	best: 0.3867522 (396)	total: 6.71s	remaining: 2m 40s
500:	learn: 0.3474385	test: 0.3847700	best: 0.3846294 (493)	total: 7.87s	remaining: 2m 29s
600:	learn: 0.3421211	test: 0.3827126	best: 0.3826283 (593)	total: 9s	remaining: 2m 20s
700:	learn: 0.3372784	test: 0.3810517	best: 0.3810517 (700)	total: 10.1s	remaining: 2m 14s
800:	learn: 0.3330375	test: 0.3795432	best: 0.3795281 (799)	total: 11.3s	remaining: 2m 9s
900:	learn: 0.3296061	test: 0.3788819	best: 0.3786739 (883)	total: 12.4s	remaining: 2m 5s
1000:	learn: 0.3264781	test: 0.3782703	best: 0.3780721 (992)	total: 13.5s	remaining: 2m 1s
1100:	l

  warn('Method %s cannot handle constraints.' % method,


0.9060776515151514
0:	learn: 0.5883524	test: 0.5891780	best: 0.5891780 (0)	total: 11.1ms	remaining: 1m 51s
100:	learn: 0.3851995	test: 0.4031009	best: 0.4031009 (100)	total: 1.13s	remaining: 1m 50s
200:	learn: 0.3702074	test: 0.3915534	best: 0.3915534 (200)	total: 2.27s	remaining: 1m 50s
300:	learn: 0.3608140	test: 0.3853826	best: 0.3853255 (293)	total: 3.42s	remaining: 1m 50s
400:	learn: 0.3534884	test: 0.3816169	best: 0.3815827 (389)	total: 5.93s	remaining: 2m 21s
500:	learn: 0.3477387	test: 0.3797362	best: 0.3797362 (500)	total: 7.83s	remaining: 2m 28s
600:	learn: 0.3423062	test: 0.3782311	best: 0.3780831 (590)	total: 8.99s	remaining: 2m 20s
700:	learn: 0.3379110	test: 0.3774457	best: 0.3774457 (700)	total: 10.1s	remaining: 2m 14s
800:	learn: 0.3341877	test: 0.3759110	best: 0.3757787 (781)	total: 11.3s	remaining: 2m 9s
900:	learn: 0.3308493	test: 0.3753811	best: 0.3753811 (900)	total: 12.4s	remaining: 2m 5s
1000:	learn: 0.3277480	test: 0.3756710	best: 0.3752284 (901)	total: 13.6s	re

  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5882175	test: 0.5886826	best: 0.5886826 (0)	total: 10.9ms	remaining: 1m 49s
100:	learn: 0.3823687	test: 0.4211491	best: 0.4209913 (97)	total: 1.16s	remaining: 1m 54s
200:	learn: 0.3672025	test: 0.4140583	best: 0.4140583 (200)	total: 2.34s	remaining: 1m 54s
300:	learn: 0.3580396	test: 0.4111324	best: 0.4111324 (300)	total: 3.48s	remaining: 1m 52s
400:	learn: 0.3509849	test: 0.4095509	best: 0.4093778 (397)	total: 4.63s	remaining: 1m 50s
500:	learn: 0.3448200	test: 0.4084836	best: 0.4084681 (486)	total: 5.78s	remaining: 1m 49s
600:	learn: 0.3397803	test: 0.4075530	best: 0.4074939 (587)	total: 6.94s	remaining: 1m 48s
700:	learn: 0.3353122	test: 0.4074678	best: 0.4069489 (634)	total: 8.8s	remaining: 1m 56s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.406948899
bestIteration = 634

Shrink model to first 635 iterations.
0.8908983601776985
0.8908983601776985


  warn('Method %s cannot handle constraints.' % method,


0:	learn: 0.5889018	test: 0.5840727	best: 0.5840727 (0)	total: 11ms	remaining: 1m 49s
100:	learn: 0.3831191	test: 0.4127169	best: 0.4127169 (100)	total: 1.12s	remaining: 1m 49s
200:	learn: 0.3676914	test: 0.4049816	best: 0.4048391 (197)	total: 2.22s	remaining: 1m 48s
300:	learn: 0.3585315	test: 0.4008807	best: 0.4007936 (291)	total: 4.36s	remaining: 2m 20s
400:	learn: 0.3514018	test: 0.3986942	best: 0.3986646 (398)	total: 6.71s	remaining: 2m 40s
500:	learn: 0.3458648	test: 0.3964472	best: 0.3964472 (500)	total: 7.84s	remaining: 2m 28s
600:	learn: 0.3404227	test: 0.3933625	best: 0.3930780 (592)	total: 8.95s	remaining: 2m 20s
700:	learn: 0.3358552	test: 0.3930583	best: 0.3930354 (614)	total: 10.1s	remaining: 2m 13s
800:	learn: 0.3322970	test: 0.3935714	best: 0.3928495 (712)	total: 11.2s	remaining: 2m 8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3928495408
bestIteration = 712

Shrink model to first 713 iterations.
0.8977652536421495


  warn('Method %s cannot handle constraints.' % method,


0.8977652536421495
0:	learn: 0.5884274	test: 0.5873365	best: 0.5873365 (0)	total: 10.9ms	remaining: 1m 49s
100:	learn: 0.3819514	test: 0.4076149	best: 0.4076149 (100)	total: 1.12s	remaining: 1m 50s
200:	learn: 0.3673256	test: 0.4005830	best: 0.4005830 (200)	total: 2.24s	remaining: 1m 49s
300:	learn: 0.3582657	test: 0.3946079	best: 0.3946079 (300)	total: 3.36s	remaining: 1m 48s
400:	learn: 0.3509656	test: 0.3933809	best: 0.3933809 (400)	total: 4.51s	remaining: 1m 47s
500:	learn: 0.3458520	test: 0.3929178	best: 0.3926872 (492)	total: 6.33s	remaining: 1m 59s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3926872
bestIteration = 492

Shrink model to first 493 iterations.
0.8979493996202048
0.8979493996202048
CV: 0.8989589277937148


  warn('Method %s cannot handle constraints.' % method,


### 데이터 프레임 구성 및 파일 생성

In [None]:
train = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/train.csv' )
test = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/test.csv' )

In [None]:
# test_preds는 이전 단계에서 각 폴드에서 얻은 예측 결과
# (fold+1)은 폴드의 수로 나누어 결과를 평균화
submission[ "booking_status" ] = test_preds / ( fold + 1 )
y = 'booking_status'

# 테스트 데이터에서 'id' 열을 제외한 중복된 특성 가져옴
dup_features = test.drop( columns = 'id').columns.tolist( )

# 이너 조인하여 중복된 샘플을 찾음
# 데이터프레임 생성
values_to_assign = test.merge( train.drop( columns = 'id' ), on = dup_features, how = 'inner' )[[ 'id', y ] ]

# 중복된 샘플의 예측 값을 수정하기 위해 사용할 매핑 딕셔너리를 생성
# booking_status 최대값 : 0, 최소값 : 1 매핑
map_di = { 0 : submission[ y ].max( ), 1 : submission[ y ].min( ) }
# 중복된 샘플의 booking_status 값 수정
# 매핑 딕셔너리를 사용해 수정된 값 할당
submission.loc[ submission.id.isin( values_to_assign.id ), y ] = values_to_assign[ y ].map( map_di ).values

In [None]:
submission_2 = submission
submission_2

Unnamed: 0,id,booking_status
0,42100,0.048162
1,42101,0.013577
2,42102,0.111172
3,42103,0.017520
4,42104,0.130589
...,...,...
28063,70163,0.320713
28064,70164,0.016052
28065,70165,0.023209
28066,70166,0.164267


In [None]:
submission_2.to_csv( "submission_2+10Fold+Random+HgB+CatB.csv", index = False )

# 4. 모델 학습 - 포기
- VS 코드에서 진행

- 10 - Fold Cross Check
- GridSearchCV
- RandomForest
- HistGradientBoost
- CatBoost

### 모듈 다운

In [None]:
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from scipy.optimize import minimize
import numpy as np

### 모델 제작 및 학습

In [None]:
# 각 폴드에 대해 반복
for fold, ( train_idx, valid_idx ) in enumerate( kf.split( train, train_targets ) ):

    #  훈련 데이터와 검증 데이터를 나눔
    X_train, X_valid = train.iloc[ train_idx ], train.iloc[ valid_idx ]
    # 훈련 타겟과 검증 타겟을 나눔
    y_train, y_valid = train_targets.iloc[ train_idx ], train_targets.iloc[ valid_idx ]

    # 추가 데이터를 기존 훈련 데이터에 추가
    X_train = pd.concat( ( X_train, extra_data ) )
    y_train = pd.concat( ( y_train, extra_targets ) )

    # 결측 값 처리를 위한 SimpleImputer
    imputer = SimpleImputer( strategy = 'mean' )
    X_train_imputed = imputer.fit_transform( X_train )
    X_valid_imputed = imputer.transform( X_valid )

    # 랜덤포레스트 모델 정의
    rf_classifier = RandomForestClassifier( n_estimators = 100, random_state = 42 )

    # HistGradientBoostingClassifier 모델 정의
    hgbc = HistGradientBoostingClassifier( learning_rate = 0.1,
                                           max_iter = 100,
                                           random_state = 42 )


    # CatBoost 모델 정의
    ctb = CatBoostClassifier( n_estimators = 10000,  # 트리 수 설정
                              early_stopping_rounds = 100,  # 조기 정지 설정
                              **{ # 모델의 하이퍼파라미터 설정
                                  'depth': 3,
                                  'subsample': 0.9,
                                  'l2_leaf_reg': 4.0,
                                  'learning_rate': 0.3 } )
    '''
      < subsample >
        - 각 트리가 학습에 사용하는 샘플의 비율을 결정
        - 값이 1보다 작으면 부트스트랩 샘플링 사용
        - 이 값이 클수록 더 많은 다양성을 가진 트리를 생성하며,
          작을수록 더 안정적인 트리를 생성

      < l2_leaf_reg >
        -  L2 정규화의 강도를 결정
        - 과적합을 방지
        - 값이 클수록 정규화의 강도 큼

      < learning_rate >
        - 가중치 업데이트에 대한 학습률
        - 작은 학습률은 모델을 안정적으로 만들지만 오래 걸림
    '''

    # 탐색할 하이퍼파라미터 그리드
    param_grid = { 'depth':         [ 3, 6, 9 ],  # 트리 깊이
                   'learning_rate': [ 0.1, 0.3, 0.5 ],  # 학습률
                   'l2_leaf_reg':   [ 1, 3, 5 ] } # L2 정규화 강도
    # 그리드 서치 초기화
    grid_search = GridSearchCV( estimator = ctb,
                                param_grid = param_grid,
                                cv = 5,
                                scoring = 'roc_auc',
                                n_jobs = -1 )

    # 모델 학습
    # 그리드 서치를 사용하여 모델 피팅
    grid_search.fit( X_train, y_train )
    # 랜덤포레스트 모델 학습
    rf_classifier.fit( X_train, y_train )
    # HistGradientBoostingClassifier 모델 학습
    hgbc.fit( X_train_imputed, y_train )
    # CatBoost 모델 학습
    ctb.fit( X_train, y_train, eval_set = ( X_valid, y_valid ), verbose = 100 )
    # eval_set: 검증 세트를 지정하여 모델의 성능 평가

    # 검증 데이터에 대한 클래스 확률 예측
    ctb_preds = ctb.predict_proba( X_valid )[ : , 1 ]
    ctb_test_preds = ctb.predict_proba( test )[ : , 1 ]

    #  ROC AUC 점수를 계산하여 모델의 성능 평가
    print (roc_auc_score ( y_valid, ctb_preds ) )

    # 메타 트레이닝 및 테스트 예측을 위해
    # CatBoost 모델의 검증 및 테스트 예측을 리스트에 저장
    meta_train = [ ctb_preds ]
    meta_test = [ ctb_test_preds ]

    # 최적의 가중치를 찾기 위한 목적 함수를 정의
    def roc_auc( weights ):
        fpred = np.zeros( len( meta_train[ 0 ] ) )
        for i, pred in enumerate( meta_train ):
            fpred += weights[ i ] * pred

        return -roc_auc_score( y_valid, fpred )

    starting_values = [ 0.33 ]*len( meta_train )
    cons = ( { 'type' : 'eq', 'fun' : lambda w : 1 - sum( w ) } )
    bounds = [ ( -1, 1 ) ] * len( meta_train )
    # 최적의 가중치 탐색
    res = minimize( roc_auc,
                    starting_values,
                    method = 'Nelder-Mead',
                    bounds = bounds,
                    constraints = cons )

    print( -res [ "fun" ] )

    # 교차 검증 점수에 최적 가중치에 대한 목적 함수 값의 음수를 더함
    # 최적화 과정에서 최대값 찾음
    cv -= res[ "fun" ]

    # 최적 가중치를 사용하여 테스트 데이터에 대한 예측을 업데이트
    for i, pred in enumerate( meta_test ):
        test_preds += res[ "x" ][ i ] * pred

# 모든 폴드에 대한 교차 검증 점수의 평균을 출력
print( f"CV: { cv / ( fold + 1) }" )

### 데이터 프레임 구성 및 파일 생성

In [None]:
train = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/train.csv' )
test = pd.read_csv( '/content/drive/MyDrive/멋사_데이터분석스쿨/9.파이널_프로젝트/Binary Classification with a Tabular Reservation Cancellation Dataset/test.csv' )

In [None]:
# test_preds는 이전 단계에서 각 폴드에서 얻은 예측 결과
# (fold+1)은 폴드의 수로 나누어 결과를 평균화
submission[ "booking_status" ] = test_preds / ( fold + 1 )
y = 'booking_status'

# 테스트 데이터에서 'id' 열을 제외한 중복된 특성 가져옴
dup_features = test.drop( columns = 'id').columns.tolist( )

# 이너 조인하여 중복된 샘플을 찾음
# 데이터프레임 생성
values_to_assign = test.merge( train.drop( columns = 'id' ), on = dup_features, how = 'inner' )[[ 'id', y ] ]

# 중복된 샘플의 예측 값을 수정하기 위해 사용할 매핑 딕셔너리를 생성
# booking_status 최대값 : 0, 최소값 : 1 매핑
map_di = { 0 : submission[ y ].max( ), 1 : submission[ y ].min( ) }
# 중복된 샘플의 booking_status 값 수정
# 매핑 딕셔너리를 사용해 수정된 값 할당
submission.loc[ submission.id.isin( values_to_assign.id ), y ] = values_to_assign[ y ].map( map_di ).values

In [None]:
submission_3 = submission
submission_3

In [None]:
submission_3.to_csv( "submission_3_10Fold+Random+Grid+HgB+CatB.csv", index = False )

# 5. 모델 학습
- Min - Max Scaler
- XGBoost

### 모듈 다운

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

### 데이터 준비

In [None]:
df_train = pd.read_csv( '/content/drive/MyDrive/멋사/파이널프로젝트/이진 분류 - 예약 취소/train.csv' )

y = df_train[ 'booking_status' ]
X = df_train.drop( [ 'booking_status' ], axis = 1 )

df_test = pd.read_csv( '/content/drive/MyDrive/멋사/파이널프로젝트/이진 분류 - 예약 취소/test.csv' )

X = pd.concat( [ X, df_test ], ignore_index = True )
X = X.drop( [ 'id' ], axis = 1 )

### 모델 제작 및 학습

In [None]:
# Min-Max Scaler
scaler = MinMaxScaler( )
X_scaled = scaler.fit_transform( X )

X_train = X_scaled[ : len( df_train ) ]
X_test = X_scaled[ len( df_train ) : ]
X_train, X_val, y_train, y_val = train_test_split( X_train, y,
                                                   random_state = 104,
                                                   train_size   = 0.8,
                                                   shuffle      = True )

xgb_params = { 'learning_rate'     : 0.01,
               'max_depth'         : 6,
               'min_child_weight'  : 7,
               'subsample'         : 0.9,
               'colsample_bytree'  : 0.5,
               'objective'         : 'binary:logistic',
               'eval_metric'       : 'auc',
               'seed'              : 10,
               'n_estimators'      : 3500,
               'reg_alpha'         : 0.1,
               'reg_lambda'        : 0.1 }

model = XGBClassifier( **xgb_params )
model.fit( X_train, y_train )

# accuracy
print( "Accuracy : ", model.score( X_val, y_val ) )

# predict_proba
predictions_proba = model.predict_proba( X_test )[ : , 1 ]

### 데이터 프레임 구성 및 파일 생성

In [None]:
submission_4 = pd.DataFrame( { 'id' : df_test[ 'id' ],
                            'booking_status' : predictions_proba } )
submission_4.head( )
submission_4.to_csv( 'submission_4_MinMax+XGB.csv', index = False )

Accuracy: 0.8309976247030879


Unnamed: 0,id,booking_status
0,42100,0.087195
1,42101,0.098004
2,42102,0.315955
3,42103,0.033935
4,42104,0.488868


# 학습 결과

1. 첫 번째 모델

  - **[ 0.8095 ]**
  - DecisionTree
  - MinMaxScaler


2. 두 번째 모델

  - **[ 0.9133 ]**
  - 10 - Fold Cross Check
  - DecisionTreeClassifier
  - HistGradientBoosting
  - CatBoost


3. 세 번째 모델

  - **[ 0.9146 ]**
  - 10 - Fold Cross Check
  - RandomForestClassifier
  - HistGradientBoosting
  - CatBoost

포기

  - **[ ? ]**
  - 10 - Fold Cross Check
  - GridSearchCV
  - RandomForestClassifier
  - HistGradientBoosting
  - CatBoost

4. 네 번째 모델

  - **[ 0.9016 ]**
  - Min - Max Scaler
  - XGBoost

# 모델 선정 + 추가 학습

- [모델 선정 + 추가 학습](https://colab.research.google.com/drive/1XgfTXm2DsnWENTCM7_Ztpp6-EwtyoPNU#scrollTo=KbP6P407rc-k)