<a href="https://colab.research.google.com/github/seirah-yang/BootCamp/blob/main/AutoML_pycaret_ipynb_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Hyperparameter Tuning

## 1) 베이지안 최적화

In [1]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-3.1.0-py3-none-any.whl.metadata (11 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-3.1.0-py3-none-any.whl (36 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-3.1.0 colorama-0.4.6


In [2]:
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# 베이지안 최적화가 주어진 하이퍼 파라미터로 학습 및 평가를 돌려보는 함수

# 탐색 대상 함수 (XGBClassifier)
def XGB_cv(max_depth,learning_rate, n_estimators):
    # 모델 정의
    model = XGBClassifier(max_depth=int(max_depth),
                            learning_rate=learning_rate,
                            n_estimators=int(n_estimators)
                            )
    # 모델 훈련
    model.fit(X_train, y_train)
    # 예측값 출력
    y_pred= model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    # 오차 최적화로 사용할 metric 반환 (베이지안은 이것을 성능수치로 받아들이게됨)
    return f1

In [3]:
#  bayesian-optimization 라이브러리의 BayesianOptimization 클래스 import
from bayes_opt import BayesianOptimization
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# make_moons 분류 데이터 500 x 2 피처 데이터
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 실험해보고자하는 hyperparameter 집합
# max_depth(int, default: 3): 기본 학습자를 위한 최대 트리 깊이
# learning_rate(float, default: 0.1) : Boosting 학습률
# n_estimators(int, default: 100) : fit하기 위한 Boosted tree의 수

pbounds = {'max_depth': (3, 7),   # 3~7
            'learning_rate': (0.001, 0.2), # 0.001 ~ 0.2
            'n_estimators': (5000, 10000) # 5000~10000
            }

# Bayesian optimization 객체 생성
# f : 탐색 대상 함수, pbounds : hyperparameter 집합
# verbose = 2 항상 출력, verbose = 1 최댓값일 때 출력, verbose = 0 출력 안함
bo=BayesianOptimization(f=XGB_cv, pbounds=pbounds, verbose=2, random_state=1 )

# 메소드를 이용해 최대화 과정 수행
# init_points :  초기 Random Search 갯수
# n_iter : 반복 횟수 (몇개의 입력값-함숫값 점들을 확인할지! 많을 수록 정확한 값을 얻을 수 있다.)
# acq : Acquisition Function들 중 Expected Improvement(EI) 를 사용
# xi : exploration 강도 (기본값은 0.0)
bo.maximize(init_points=2, n_iter=10)


# ‘iter’는 반복 회차, ‘target’은 목적 함수의 값, 나머지는 입력값을 나타냄.
# 현재 회차 이전까지 조사된 함숫값들과 비교하여, 현재 회차에 최댓값이 얻어진 경우,
# bayesian-optimization 라이브러리는 이를 자동으로 다른 색 글자로 표시하는 것을 확인.

# 찾은 파라미터 값 확인

|   iter    |  target   | max_depth | learni... | n_esti... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.8360655[39m | [39m4.6680880[39m | [39m0.1443445[39m | [39m5000.5718[39m |
| [35m2        [39m | [35m0.8524590[39m | [35m4.2093302[39m | [35m0.0302044[39m | [35m5461.6929[39m |
| [39m3        [39m | [39m0.8196721[39m | [39m3.0      [39m | [39m0.2      [39m | [39m5461.2133[39m |
| [39m4        [39m | [39m0.8403361[39m | [39m5.8922282[39m | [39m0.0428427[39m | [39m8853.7396[39m |
| [39m5        [39m | [39m0.8196721[39m | [39m3.4620021[39m | [39m0.0608805[39m | [39m8073.8997[39m |
| [39m6        [39m | [39m0.8360655[39m | [39m4.9296680[39m | [39m0.0546924[39m | [39m9877.9414[39m |
| [35m7        [39m | [35m0.8852459[39m | [35m4.8519293[39m | [35m0.0044731[39m | [35m9877.8068[39m |
| [39m8        [39m | [39m0.8852459[39m | [39m3.7993347[39m | [39m0.0085697[39m | [

In [4]:
model = XGBClassifier(max_depth=int(6.054141795109241),
                            learning_rate=0.06517207912793385,
                            n_estimators=int(9284.808327884073)
                            )
# 모델 훈련
model.fit(X_train, y_train)
# 예측값 출력
y_pred= model.predict(X_test)
f1 = f1_score(y_test, y_pred)
f1

0.859504132231405

## 2) Hyperparameter Tuning - optuna

In [5]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.5-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.4/247.4 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.5 colorlog-6.9.0 optuna-4.5.0


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost
import optuna

In [7]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [8]:
def objective(trial): # 옵튜나 객체
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10), #옵튜나한테 int 타입을 제안한다 maxdepth 1 ~ 10 카운팅 값으로 뽑아낸다.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0), # 옵튜나한테 float타입으로 제안한다.
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    } #탐색함수
    model = XGBClassifier(**param) # 타입에 맞게 제안
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred)

In [9]:
# Create the study
study = optuna.create_study(direction='maximize') # maximize
              # 탐색함수, 시도할 횟수
study.optimize(objective, n_trials=100)

[I 2025-08-29 15:18:41,407] A new study created in memory with name: no-name-e3e143ff-768d-4ee5-a3aa-bb942333c4a6
[I 2025-08-29 15:18:41,525] Trial 0 finished with value: 0.8688524590163934 and parameters: {'max_depth': 10, 'learning_rate': 0.3649983010937496, 'n_estimators': 854, 'min_child_weight': 6, 'gamma': 0.15681437425947006, 'subsample': 0.9431302365237332, 'colsample_bytree': 0.847655573016871, 'reg_alpha': 0.6132288667645447, 'reg_lambda': 0.2800982318815841, 'random_state': 698}. Best is trial 0 with value: 0.8688524590163934.
[I 2025-08-29 15:18:41,560] Trial 1 finished with value: 0.8524590163934426 and parameters: {'max_depth': 9, 'learning_rate': 0.82563516565024, 'n_estimators': 259, 'min_child_weight': 9, 'gamma': 0.5897786296013523, 'subsample': 0.887463484214885, 'colsample_bytree': 0.5604204223712881, 'reg_alpha': 0.59693605941297, 'reg_lambda': 0.3515187854478527, 'random_state': 462}. Best is trial 0 with value: 0.8688524590163934.
[I 2025-08-29 15:18:41,594] Tria

In [10]:
# best parameters
print('Best parameters', study.best_params)

Best parameters {'max_depth': 1, 'learning_rate': 0.29450934445648447, 'n_estimators': 291, 'min_child_weight': 3, 'gamma': 0.8817745850410071, 'subsample': 0.8460120598541281, 'colsample_bytree': 0.11341254956769382, 'reg_alpha': 0.5554925921475257, 'reg_lambda': 0.23939874372408565, 'random_state': 885}


In [11]:
model = XGBClassifier(**study.best_params) # 최적의 하이퍼 파라미터/ 파이썬에서 ** 연산자는 딕셔너리 unpacking(언패킹)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('f1_score: ', f1_score(y_test, y_pred))

f1_score:  0.9193548387096774


# 2. AutoML

## 1) pycaret

In [12]:
!pip install pycaret==2.0.0

Collecting pycaret==2.0.0
  Downloading pycaret-2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting pyLDAvis (from pycaret==2.0.0)
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting gensim (from pycaret==2.0.0)
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting pyod (from pycaret==2.0.0)
  Downloading pyod-2.0.5-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catboost (from pycaret==2.0.0)
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pandas-profiling>=2.3.0 (from pycaret==2.0.0)
  Downloading pandas_profiling-3.2.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting kmodes>=0.10.1 (from pycaret==2.0.0)
  Downloading kmodes-0.12.2-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting datefinder>=0.7.0 (from pycaret==2.0.0)
  Downloading da

In [13]:
#pycaret에서 제공하는 'juice' 데이터
from pycaret.datasets import get_data
data = get_data('juice')
#data.info()
print(data)

ModuleNotFoundError: No module named 'pycaret'

In [None]:
from pycaret.classification import *  # setup, compare_models
setup_clf = setup(data=data, target='Purchase')
class_top = compare_models()
class_top

In [14]:
!pip install xgboost



In [15]:
from pycaret.regression import * # setup, compare_models
from pycaret.datasets import get_data
dataset = get_data('diamond')
exp = setup(dataset, target='Price')
reg_top = compare_models()
reg_top

ModuleNotFoundError: No module named 'pycaret'

In [16]:
# pycaret 버전
import pycaret
print('PyCaret: %s' % pycaret.__version__)

ModuleNotFoundError: No module named 'pycaret'

In [17]:
from pandas import read_csv
# pycaret classification 모듈
from pycaret.classification import setup
# pycaret classification 모델들을 비교하기 위한 모듈
from pycaret.classification import compare_models
# sonar 데이터셋
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
# 데이터셋 로드
df = read_csv(url, header=None)
# 컬럼 갯수
n_cols = df.shape[1]
# 컬럼명 String으로 반환
df.columns = [str(i) for i in range(n_cols)]
# 데이터셋 설정
grid = setup(data=df, target=df.columns[-1],  verbose=True)
# 모델 학습
best = compare_models()
# best 모델 확인
print(best)

ModuleNotFoundError: No module named 'pycaret'

In [18]:
from pandas import read_csv
# pycaret classification 모듈
from pycaret.classification import setup
# pycaret classification 모델들을 비교하기 위한 모듈
from pycaret.classification import compare_models

# 데이터셋 로드
df = read_csv("titanic.csv")

# 컬럼 갯수
n_cols = df.shape[1]

# 컬럼명 String으로 반환
df.columns = [str(i) for i in range(n_cols)]

# 데이터셋 설정
grid = setup(data=df, target=df.columns[1],  verbose=True)

# 모델 학습
best = compare_models()

# best 모델 확인
print(best)

ModuleNotFoundError: No module named 'pycaret'

# 2) Autogluon

In [19]:
# AutoML - Autogluon

https://www.kaggle.com/datasets/parisrohan/credit-score-classification

SyntaxError: invalid syntax (ipython-input-3006659799.py, line 3)

In [20]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-1.4.0-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.4.0 (from autogluon.core[all]==1.4.0->autogluon)
  Downloading autogluon.core-1.4.0-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.4.0 (from autogluon)
  Downloading autogluon.features-1.4.0-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.4.0 (from autogluon.tabular[all]==1.4.0->autogluon)
  Downloading autogluon.tabular-1.4.0-py3-none-any.whl.metadata (16 kB)
Collecting autogluon.multimodal==1.4.0 (from autogluon)
  Downloading autogluon.multimodal-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.timeseries==1.4.0 (from autogluon.timeseries[all]==1.4.0->autogluon)
  Downloading autogluon.timeseries-1.4.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3<2,>=1.10 (from autogluon.core==1.4.0->autogluon.core[all]==1.4.0->autogluon)
  Downloading boto3-1.40.20-py3-none-any.whl.metadata (6.7 kB)
Collecting autogluon.common==1.4

In [21]:
import pandas as pd
# Tabular Data를 다루기 때문에 아래 라이브러리를 호출
from autogluon.tabular import TabularDataset, TabularPredictor

In [25]:
train_df = pd.read_csv('./train2.csv').iloc[:50,:]
#test_df = pd.read_csv('./test.csv').iloc[:50,:]

In [26]:
train_df.shape

(50, 28)

In [27]:
# autogluon 학습을 위한 데이터 형태로 변환
train = TabularDataset(train_df.drop(['ID'], axis=1))
#test = TabularDataset(test_df.drop(['ID'], axis=1))

In [28]:
type(train)

In [29]:
#학습       # AutoML 객체       credit_score를 맞출것이다. 평가 f1_macro
            # 곧바로 fit으로 train 데이터 학습
predictor = TabularPredictor(label='Credit_Score', eval_metric='f1_macro').fit(train)

No path specified. Models will be saved in: "AutogluonModels/ag-20250829_152834"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
Memory Avail:       11.02 GB / 12.67 GB (87.0%)
Disk Space Avail:   62.50 GB / 107.72 GB (58.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recomm

In [30]:
# 각각의 모델의 훈련 성능을 평가할 수 있음
ld_board = predictor.leaderboard(train, silent=True)

ld_board

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,1.0,1.0,f1_macro,0.02275,0.005254,19.887638,0.02275,0.005254,19.887638,1,True,5
1,RandomForestEntr,1.0,1.0,f1_macro,0.086637,0.074458,0.706326,0.086637,0.074458,0.706326,1,True,4
2,ExtraTreesGini,1.0,1.0,f1_macro,0.088845,0.078861,0.608611,0.088845,0.078861,0.608611,1,True,6
3,ExtraTreesEntr,1.0,1.0,f1_macro,0.089459,0.06796,0.62016,0.089459,0.06796,0.62016,1,True,7
4,RandomForestGini,1.0,1.0,f1_macro,0.091129,0.082675,0.802261,0.091129,0.082675,0.802261,1,True,3
5,LightGBMLarge,0.937991,1.0,f1_macro,0.003584,0.003169,0.922095,0.003584,0.003169,0.922095,1,True,11
6,XGBoost,0.872666,1.0,f1_macro,0.0212,0.007579,0.734223,0.0212,0.007579,0.734223,1,True,9
7,WeightedEnsemble_L2,0.872666,1.0,f1_macro,0.022817,0.00984,1.033759,0.001617,0.002261,0.299536,2,True,12
8,NeuralNetTorch,0.872666,1.0,f1_macro,0.031647,0.016668,5.711283,0.031647,0.016668,5.711283,1,True,10
9,NeuralNetFastAI,0.69612,1.0,f1_macro,0.021891,0.012626,1.657567,0.021891,0.012626,1.657567,1,True,8


In [31]:
from sklearn.metrics import accuracy_score, f1_score
# 예측하기
pred_y = predictor.predict(train)
pred_y

Unnamed: 0,Credit_Score
0,Good
1,Good
2,Good
3,Good
4,Good
5,Good
6,Good
7,Good
8,Good
9,Good


In [32]:
accuracy_score(pred_y,train['Credit_Score'])

0.88