## Ch. 21 Serving Models with MLflow
- https://github.com/mattharrison/effective_xgboost_book/blob/main/xgbcode.ipynb

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right"> Initial upload: 2023.8.18</div>
<div style="text-align: right"> Last update: 2023.8.18</div>

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
import warnings; warnings.filterwarnings('ignore')
#plt.style.use('ggplot')
plt.style.use('seaborn-whitegrid')
%matplotlib inline

### 21.1 Installation and Setup

In [2]:
from feature_engine import encoding, imputation
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn import base, metrics, model_selection, \
pipeline, preprocessing
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
import urllib
import zipfile

from sklearn import model_selection, preprocessing
import xg_helpers as xhelp

In [3]:
# Archive holding the Kaggle 2018 survey, the CSV member to read from it, and
# the URL the archive is published under (the URL ends with the archive name).
fname = 'kaggle-survey-2018.zip'
member_name = 'multipleChoiceResponses.csv'
url = (
    'https://github.com/mattharrison/datasets/raw/master/data/'
    + fname
)

In [4]:
# Load the raw survey table through the project helper.
# NOTE(review): presumably fetches `url` into `fname` and reads `member_name`
# out of the archive — confirm in xg_helpers.extract_zip.
raw = xhelp.extract_zip(url, fname, member_name)
## Create raw X and raw y
# NOTE(review): 'Q6' looks like the survey column the target is derived from —
# confirm in xg_helpers.get_rawX_y.
kag_X, kag_y = xhelp.get_rawX_y(raw, 'Q6')

In [5]:
## Split data
# Hold out 30% of the rows for testing. The split is stratified on the target
# and seeded so it is repeatable.
kag_X_train, kag_X_test, kag_y_train, kag_y_test = model_selection.train_test_split(
    kag_X,
    kag_y,
    test_size=0.3,
    random_state=42,
    stratify=kag_y,
)

In [6]:
## Features: fit the pipeline on the training rows, then reuse it on the test rows
X_train = xhelp.kag_pl.fit_transform(kag_X_train)
X_test = xhelp.kag_pl.transform(kag_X_test)

## Target: encoder is fitted on the training labels and applied to both partitions
label_encoder = preprocessing.LabelEncoder().fit(kag_y_train)
y_train, y_test = (
    label_encoder.transform(labels) for labels in (kag_y_train, kag_y_test)
)

# Both partitions stitched back together (train rows first) for cross
# validation and similar whole-dataset work.
X = pd.concat([X_train, X_test], axis=0)
y = pd.Series(list(y_train) + list(y_test), index=X.index)

모델 학습을 위해 hyperopt를 사용   
로깅을 위해 mlflow 사용

In [7]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import mlflow
from sklearn import metrics
import xgboost as xgb

In [8]:
# mlflow.create_experiment raises if an experiment with this name already
# exists, which would make this cell fail on every re-run. Create it only the
# first time and otherwise reuse the id of the existing experiment.
EXPERIMENT_NAME = 'ex3'
_existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if _existing_experiment is None:
    ex_id = mlflow.create_experiment(name=EXPERIMENT_NAME, artifact_location='ex2path')
else:
    ex_id = _existing_experiment.experiment_id
# Last expression of the cell, so the active Experiment is still displayed.
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

<Experiment: artifact_location='ex2path', creation_time=1692576632427, experiment_id='1', last_update_time=1692576632427, lifecycle_stage='active', name='ex3', tags={}>

In [9]:
# Stage-wise hyperparameter search, all recorded under one MLflow run.
# Each stage adds its own search dimensions on top of the values already
# chosen by earlier stages, so `params` accumulates the winners as it goes.
with mlflow.start_run():
    params = {'random_state': 42}
    rounds = [{'max_depth': hp.quniform('max_depth', 1, 12, 1),  # tree
               'min_child_weight': hp.loguniform('min_child_weight', -2, 3)},
              {'subsample': hp.uniform('subsample', 0.5, 1),   # stochastic
               'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)},
              {'gamma': hp.loguniform('gamma', -10, 10)}, # regularization
              {'learning_rate': hp.loguniform('learning_rate', -7, 0)} # boosting
    ]

    # Named `search_space` rather than `round` so the builtin round() is not
    # shadowed for the rest of the notebook.
    for search_space in rounds:
        params = {**params, **search_space}
        trials = Trials()
        best = fmin(
            fn=lambda space: xhelp.hyperparameter_tuning(
                space, X_train, y_train, X_test, y_test),
            space=params,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials,
            timeout=60*5 # 5 minutes
        )
        # Replace this stage's search expressions with the values fmin picked.
        params = {**params, **best}
        # hyperopt hands max_depth back as a float; it must be cast to an
        # integer for it to work.
        params['max_depth'] = int(params['max_depth'])
        for param, val in params.items():
            mlflow.log_param(param, val)

        # Refit with everything chosen so far; both partitions are passed as
        # eval sets so their logloss is reported per boosting round.
        xg = xgb.XGBClassifier(eval_metric='logloss', early_stopping_rounds=50, **params)
        xg.fit(X_train, y_train,
               eval_set=[(X_train, y_train),
                         (X_test, y_test)
                        ]
              )
        # Predict once and score the same predictions with every metric.
        y_pred = xg.predict(X_test)
        for metric in [metrics.accuracy_score, metrics.precision_score, metrics.recall_score,
                       metrics.f1_score]:
            mlflow.log_metric(metric.__name__, metric(y_test, y_pred))

    # Only the classifier fitted in the final stage is stored with the run.
    model_info = mlflow.xgboost.log_model(xg, artifact_path='model')

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 10/10 [00:01<00:00,  7.20trial/s, best loss: -0.7635359116022099]
[0]	validation_0-logloss:0.63000	validation_1-logloss:0.62975
[1]	validation_0-logloss:0.59304	validation_1-logloss:0.59064
[2]	validation_0-logloss:0.56922	validation_1-logloss:0.56549
[3]	validation_0-logloss:0.55326	validation_1-logloss:0.55071
[4]	validation_0-logloss:0.54016	validation_1-logloss:0.53926
[5]	validation_0-logloss:0.53088	validation_1-logloss:0.53101
[6]	validation_0-logloss:0.52322	validation_1-logloss:0.52552
[7]	validation_0-logloss:0.51472	validation_1-logloss:0.51574
[8]	validation_0-logloss:0.51023	validation_1-logloss:0.51321
[9]	validation_0-logloss:0.50613	validation_1-logloss:0.51034
[10]	validation_0-logloss:0.50230	validation_1-logloss:0.50881
[11]	validation_0-logloss:0.49793	validation_1-logloss:0.50497
[12]	validation_0-logloss:0.49546	validation_1-logloss:0.50405
[13]	validation_0-logloss:0.49071	validation_1-logloss:0.50142
[14]	validation_0-logloss:0.48848	validation_

In [11]:
# Show the experiment id captured in the experiment-setup cell.
ex_id

'1'

In [12]:
# Show the run_id attribute of the object returned by mlflow.xgboost.log_model.
model_info.run_id

'625ba1cae49e40febb356b033a9dfec7'