# 1. Hyperparameter Tuning

## 1) 베이지안 최적화

In [2]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl.metadata (543 bytes)
Collecting numpy>=1.9.0 (from bayesian-optimization)
  Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy>=1.0.0 (from bayesian-optimization)
  Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting scikit-learn>=0.18.0 (from bayesian-optimization)
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting joblib>=1.1.1 (from scikit-learn>=0.18.0->bayesian-optimization)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn>=0.18.0->bayesian-optimization)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (1

In [1]:
1

1

In [None]:
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# 베이지안 최적화가 주어진 하이퍼 파라미터로 학습 및 평가를 돌려보는 함수

# 탐색 대상 함수 (XGBClassifier)
def XGB_cv(max_depth,learning_rate, n_estimators):
    # 모델 정의
    model = XGBClassifier(max_depth=int(max_depth),
                            learning_rate=learning_rate,
                            n_estimators=int(n_estimators)
                            )
    # 모델 훈련
    model.fit(X_train, y_train)
    # 예측값 출력
    y_pred= model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    # 오차 최적화로 사용할 metric 반환 (베이지안은 이것을 성능수치로 받아들이게됨)
    return f1

In [None]:
#  bayesian-optimization 라이브러리의 BayesianOptimization 클래스 import
from bayes_opt import BayesianOptimization
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# make_moons 분류 데이터 500 x 2 피처 데이터
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 실험해보고자하는 hyperparameter 집합
# max_depth(int, default: 3): 기본 학습자를 위한 최대 트리 깊이
# learning_rate(float, default: 0.1) : Boosting 학습률
# n_estimators(int, default: 100) : fit하기 위한 Boosted tree의 수

pbounds = {'max_depth': (3, 7),   # 3~7
            'learning_rate': (0.001, 0.2), # 0.001 ~ 0.2
            'n_estimators': (5000, 10000) # 5000~10000
            }

# Bayesian optimization 객체 생성
# f : 탐색 대상 함수, pbounds : hyperparameter 집합
# verbose = 2 항상 출력, verbose = 1 최댓값일 때 출력, verbose = 0 출력 안함
bo=BayesianOptimization(f=XGB_cv, pbounds=pbounds, verbose=2, random_state=1 )

# 메소드를 이용해 최대화 과정 수행
# init_points :  초기 Random Search 갯수
# n_iter : 반복 횟수 (몇개의 입력값-함숫값 점들을 확인할지! 많을 수록 정확한 값을 얻을 수 있다.)
# acq : Acquisition Function들 중 Expected Improvement(EI) 를 사용
# xi : exploration 강도 (기본값은 0.0)
bo.maximize(init_points=2, n_iter=10)


# ‘iter’는 반복 회차, ‘target’은 목적 함수의 값, 나머지는 입력값을 나타냄.
# 현재 회차 이전까지 조사된 함숫값들과 비교하여, 현재 회차에 최댓값이 얻어진 경우,
# bayesian-optimization 라이브러리는 이를 자동으로 다른 색 글자로 표시하는 것을 확인.

# 찾은 파라미터 값 확인
print(bo.max)

In [None]:
model = XGBClassifier(max_depth=int(6.054141795109241),
                            learning_rate=0.06517207912793385,
                            n_estimators=int(9284.808327884073)
                            )
# 모델 훈련
model.fit(X_train, y_train)
# 예측값 출력
y_pred= model.predict(X_test)
f1 = f1_score(y_test, y_pred)
f1

## 2) Hyperparameter Tuning - optuna

In [None]:
!pip install optuna

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost
import optuna

In [None]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
def objective(trial): # 옵튜나 객체
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10), #옵튜나한테 int 타입을 제안한다 maxdepth 1 ~ 10 카운팅 값으로 뽑아낸다.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0), # 옵튜나한테 float타입으로 제안한다.
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred)

In [None]:
# Create the study
study = optuna.create_study(direction='maximize') # maximize
              # 탐색함수, 시도할 횟수
study.optimize(objective, n_trials=100)

In [None]:
# best parameters
print('Best parameters', study.best_params)

In [None]:
model = XGBClassifier(**study.best_params) # 최적의 하이퍼 파라미터
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('f1_score: ', f1_score(y_test, y_pred))

# 2. AutoML

## 1) pycaret

In [None]:
!pip install pycaret

In [1]:
#pycaret에서 제공하는 'juice' 데이터
from pycaret.datasets import get_data
data = get_data('juice')
#data.info()
print(data)

Unnamed: 0,Id,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,1,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,2,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,3,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,4,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,5,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


        Id Purchase  WeekofPurchase  StoreID  PriceCH  PriceMM  DiscCH  \
0        1       CH             237        1     1.75     1.99    0.00   
1        2       CH             239        1     1.75     1.99    0.00   
2        3       CH             245        1     1.86     2.09    0.17   
3        4       MM             227        1     1.69     1.69    0.00   
4        5       CH             228        7     1.69     1.69    0.00   
...    ...      ...             ...      ...      ...      ...     ...   
1065  1066       CH             252        7     1.86     2.09    0.10   
1066  1067       CH             256        7     1.86     2.18    0.00   
1067  1068       MM             257        7     1.86     2.18    0.00   
1068  1069       CH             261        7     1.86     2.13    0.00   
1069  1070       CH             270        1     1.86     2.18    0.00   

      DiscMM  SpecialCH  SpecialMM   LoyalCH  SalePriceMM  SalePriceCH  \
0       0.00          0          0  0

In [2]:
from pycaret.classification import *  # setup, compare_models
setup_clf = setup(data=data, target='Purchase')
class_top = compare_models()
class_top

Unnamed: 0,Description,Value
0,Session id,1151
1,Target,Purchase
2,Target type,Binary
3,Target mapping,"CH: 0, MM: 1"
4,Original data shape,"(1070, 19)"
5,Transformed data shape,"(1070, 19)"
6,Transformed train set shape,"(749, 19)"
7,Transformed test set shape,"(321, 19)"
8,Numeric features,17
9,Categorical features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8171,0.8837,0.8171,0.8175,0.8164,0.6135,0.6149,0.134
lda,Linear Discriminant Analysis,0.8158,0.8841,0.8158,0.816,0.8147,0.6095,0.6114,0.058
lr,Logistic Regression,0.8118,0.8825,0.8118,0.8119,0.8102,0.5994,0.602,1.797
rf,Random Forest Classifier,0.7997,0.8724,0.7997,0.8012,0.7987,0.5762,0.5791,0.352
gbc,Gradient Boosting Classifier,0.7971,0.8759,0.7971,0.7985,0.7959,0.5704,0.5733,0.182
lightgbm,Light Gradient Boosting Machine,0.7957,0.8671,0.7957,0.796,0.7949,0.568,0.5695,38.976
ada,Ada Boost Classifier,0.793,0.86,0.793,0.7937,0.7903,0.557,0.5616,0.204
et,Extra Trees Classifier,0.7677,0.8367,0.7677,0.7686,0.7673,0.5106,0.5119,0.114
nb,Naive Bayes,0.7463,0.8167,0.7463,0.7598,0.7482,0.4826,0.4888,0.095
knn,K Neighbors Classifier,0.7222,0.7599,0.7222,0.7182,0.717,0.3998,0.4036,0.174


In [None]:
!pip install xgboost

In [None]:
from pycaret.regression import * # setup, compare_models
from pycaret.datasets import get_data
dataset = get_data('diamond')
exp = setup(dataset, target='Price')
reg_top = compare_models()
reg_top

In [None]:
# pycaret 버전
import pycaret
print('PyCaret: %s' % pycaret.__version__)

In [None]:
from pandas import read_csv
# pycaret classification 모듈
from pycaret.classification import setup
# pycaret classification 모델들을 비교하기 위한 모듈
from pycaret.classification import compare_models
# sonar 데이터셋
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
# 데이터셋 로드
df = read_csv(url, header=None)
# 컬럼 갯수
n_cols = df.shape[1]
# 컬럼명 String으로 반환
df.columns = [str(i) for i in range(n_cols)]
# 데이터셋 설정
grid = setup(data=df, target=df.columns[-1],  verbose=True)
# 모델 학습
best = compare_models()
# best 모델 확인
print(best)

In [3]:
from pandas import read_csv
# pycaret classification 모듈
from pycaret.classification import setup
# pycaret classification 모델들을 비교하기 위한 모듈
from pycaret.classification import compare_models

# 데이터셋 로드
df = read_csv("titanic.csv")

# 컬럼 갯수
# n_cols = df.shape[1]

# 컬럼명 String으로 반환
# df.columns = [str(i) for i in range(n_cols)]

# 데이터셋 설정
grid = setup(data=df, target=df.columns[1],  verbose=True)

# 모델 학습
best = compare_models()

# best 모델 확인
print(best)

Unnamed: 0,Description,Value
0,Session id,8183
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 14)"
5,Transformed train set shape,"(623, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8139,0.8655,0.6864,0.812,0.7391,0.5964,0.6057,2.065
ridge,Ridge Classifier,0.7415,0.8634,0.4221,0.814,0.549,0.3961,0.44,0.047
et,Extra Trees Classifier,0.7286,0.7877,0.3757,0.8739,0.4913,0.3547,0.4264,0.246
nb,Naive Bayes,0.6806,0.781,0.2089,0.8619,0.3301,0.2122,0.3059,0.276
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.272
rf,Random Forest Classifier,0.6164,0.8226,0.0,0.0,0.0,0.0,0.0,0.238
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.052
qda,Quadratic Discriminant Analysis,0.6164,0.5638,0.0,0.0,0.0,0.0,0.0,0.093
gbc,Gradient Boosting Classifier,0.6164,0.4881,0.0,0.0,0.0,0.0,0.0,0.131
lda,Linear Discriminant Analysis,0.6164,0.5276,0.013,0.05,0.0207,0.0063,0.0087,0.123


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
from pandas import read_csv
# pycaret classification 모듈
from pycaret.classification import setup
# pycaret classification 모델들을 비교하기 위한 모듈
from pycaret.classification import compare_models

# 데이터셋 로드
df = read_csv("titanic.csv")

# 컬럼 갯수
# n_cols = df.shape[1]

# 컬럼명 String으로 반환
# df.columns = [str(i) for i in range(n_cols)]

# 데이터셋 설정
grid = setup(data=df, target=df.columns[1])

# 모델 학습
best = compare_models()

# best 모델 확인
print(best)

Unnamed: 0,Description,Value
0,Session id,7800
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 14)"
5,Transformed train set shape,"(623, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8251,0.8757,0.7025,0.8168,0.754,0.6197,0.6251,1.914
ridge,Ridge Classifier,0.7463,0.8524,0.4304,0.8297,0.5616,0.4089,0.4545,0.928
et,Extra Trees Classifier,0.7417,0.7981,0.4935,0.7514,0.5902,0.4151,0.4373,1.311
nb,Naive Bayes,0.6548,0.7824,0.1426,0.7755,0.234,0.1366,0.2158,0.249
lda,Linear Discriminant Analysis,0.6228,0.525,0.0304,0.07,0.0424,0.0257,0.0299,0.708
knn,K Neighbors Classifier,0.6212,0.6076,0.3645,0.5023,0.4173,0.1527,0.1563,0.283
rf,Random Forest Classifier,0.6212,0.7894,0.0214,0.25,0.0388,0.0196,0.0397,1.584
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.39
qda,Quadratic Discriminant Analysis,0.6164,0.485,0.0,0.0,0.0,0.0,0.0,0.932
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.953


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=7800, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


# 2) Autogluon

In [None]:
# AutoML - Autogluon

https://www.kaggle.com/datasets/parisrohan/credit-score-classification

In [None]:
!pip install autogluon

In [5]:
import pandas as pd
# Tabular Data를 다루기 때문에 아래 라이브러리를 호출
from autogluon.tabular import TabularDataset, TabularPredictor

In [6]:
train_df = pd.read_csv('./train2.csv').iloc[:50,:]
#test_df = pd.read_csv('./test.csv').iloc[:50,:]

In [None]:
train_df.shape

In [7]:
# autogluon 학습을 위한 데이터 형태로 변환
train = TabularDataset(train_df.drop(['ID'], axis=1))
#test = TabularDataset(test_df.drop(['ID'], axis=1))

In [None]:
type(train)

In [8]:
#학습       # AutoML 객체       credit_score를 맞출것이다. 평가 f1_macro
            # 곧바로 fit으로 train 데이터 학습
predictor = TabularPredictor(label='Credit_Score', eval_metric='f1_macro').fit(train)

No path specified. Models will be saved in: "AutogluonModels/ag-20250829_040758"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #149~20.04.1-Ubuntu SMP Wed Apr 16 08:29:56 UTC 2025
CPU Count:          8
Memory Avail:       11.92 GB / 31.34 GB (38.0%)
Disk Space Avail:   725.98 GB / 1006.36 GB (72.1%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Rec

In [9]:
# 각각의 모델의 훈련 성능을 평가할 수 있음
ld_board = predictor.leaderboard(train, silent=True)

ld_board

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,1.0,1.0,f1_macro,0.013519,0.006597,8.386375,0.013519,0.006597,8.386375,1,True,5
1,ExtraTreesEntr,1.0,1.0,f1_macro,0.05593,0.049036,0.63898,0.05593,0.049036,0.63898,1,True,7
2,ExtraTreesGini,1.0,1.0,f1_macro,0.056281,0.038059,0.71506,0.056281,0.038059,0.71506,1,True,6
3,RandomForestEntr,1.0,1.0,f1_macro,0.057568,0.061503,0.712144,0.057568,0.061503,0.712144,1,True,4
4,RandomForestGini,1.0,1.0,f1_macro,0.060023,0.050023,0.726552,0.060023,0.050023,0.726552,1,True,3
5,LightGBMLarge,0.937991,1.0,f1_macro,0.00407,0.003533,0.83632,0.00407,0.003533,0.83632,1,True,10
6,XGBoost,0.872666,1.0,f1_macro,0.012771,0.014201,0.812257,0.012771,0.014201,0.812257,1,True,9
7,WeightedEnsemble_L2,0.872666,1.0,f1_macro,0.01433,0.016321,1.062599,0.001559,0.00212,0.250342,2,True,11
8,NeuralNetFastAI,0.872666,1.0,f1_macro,0.024799,0.012758,2.974533,0.024799,0.012758,2.974533,1,True,8
9,LightGBM,0.358974,0.375,f1_macro,0.003211,0.003587,0.69557,0.003211,0.003587,0.69557,1,True,2


In [10]:
from sklearn.metrics import accuracy_score, f1_score
# 예측하기
pred_y = predictor.predict(train)
pred_y

0         Good
1         Good
2         Good
3         Good
4         Good
5         Good
6         Good
7         Good
8         Good
9         Good
10        Good
11        Good
12        Good
13        Good
14        Good
15        Good
16        Good
17        Good
18        Good
19        Good
20        Good
21        Good
22        Good
23        Good
24    Standard
25    Standard
26    Standard
27    Standard
28    Standard
29    Standard
30    Standard
31    Standard
32    Standard
33    Standard
34    Standard
35    Standard
36    Standard
37    Standard
38    Standard
39    Standard
40        Good
41        Good
42        Good
43        Good
44        Good
45        Good
46        Good
47        Good
48        Good
49        Good
Name: Credit_Score, dtype: object

In [11]:
accuracy_score(pred_y,train['Credit_Score'])

0.88