# Import

In [None]:
import sys
import os
import datetime

import pandas as pd
import numpy as np
# ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from mlflow.tracking import MlflowClient
# viz
import matplotlib.pyplot as plt
# statistics
from scipy.stats import shapiro

# Data

In [None]:
# Locations of the raw training and test CSV files.
path_train, path_test = (
    "/workspace/Storage/template_structured/Data/raw/train.csv",
    "/workspace/Storage/template_structured/Data/raw/test.csv",
)

In [None]:
# Load both raw CSV files into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

In [None]:
# Columns stored with pandas' 'category' dtype. Only `train` is converted
# here; `test` is left as loaded.
categorical_columns = ['UID', '주거 형태', '현재 직장 근속 연수', '대출 목적', '대출 상환 기간']
train = train.astype(dict.fromkeys(categorical_columns, 'category'))

# Train

In [None]:
# Roughly 60/20/20: set aside 40% of the labelled data as a hold-out, then
# split that hold-out in half into validation and test parts.
# Note that `test` is rebound here, replacing the frame read from path_test.
train, holdout = train_test_split(train, test_size=0.4, random_state=421)
val, test = train_test_split(holdout, test_size=0.5, random_state=421)

In [None]:
# Display the (rows, columns) shape of each split.
tuple(split.shape for split in (train, val, test))

In [None]:
# Name of the label column.
target = "채무 불이행 여부"

# For every split: features are all columns except the label, and the label
# is kept as a separate Series.
# NOTE(review): 'UID' (cast to category earlier) remains among the features;
# if it is a per-row identifier it should probably be excluded -- confirm.
X_train, y_train = train.drop(columns=[target]), train[target]
X_val, y_val = val.drop(columns=[target]), val[target]
X_test, y_test = test.drop(columns=[target]), test[target]

In [None]:
# Build the LightGBM datasets. The validation dataset is created with
# reference=train_data so that it is tied to the training dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_val, reference=train_data, label=y_val)

In [None]:
# Custom callback: log metrics to MLflow for every boosting iteration

class MlflowLoggingCallbackLGBM:
    """LightGBM callback that logs a hold-out score and a validation result to MLflow.

    Args:
        X_test: Features passed to ``env.model.predict`` on every call.
        y_test: Ground-truth labels passed as the first argument of ``metric``.
        metric: Callable ``metric(y_true, y_pred)``. It receives the raw output
            of ``predict`` unchanged, so it must accept that output as-is.
    """

    def __init__(self, X_test, y_test, metric):
        self.X_test = X_test
        self.y_test = y_test
        self.metric = metric

    def __call__(self, env):
        """Score the current model and log the metrics for this iteration.

        Args:
            env: Callback environment supplied by LightGBM; this method reads
                ``env.iteration``, ``env.model`` and
                ``env.evaluation_result_list``.
        """
        # env.iteration is 0-indexed; shift to a 1-based step / tree count.
        iteration = env.iteration + 1
        y_pred = env.model.predict(self.X_test, num_iteration=iteration)
        score = self.metric(self.y_test, y_pred)

        # Start from the hold-out score so the dict always exists, even when
        # no evaluation results are available (previously this case raised
        # NameError because the dict was only created inside the loop).
        metrics = {"test": score}

        # Only the last evaluation entry is reported under "valid"; its value
        # is the third field of the result tuple.
        results = env.evaluation_result_list
        if results:
            metrics["valid"] = results[-1][2]

        mlflow.log_metrics(metrics, step=iteration, synchronous=False)

In [None]:

class MlflowLoggingCallbackLGBM:
    """LightGBM callback that logs a hold-out score plus all evaluation results to MLflow.

    Args:
        X_test: Features passed to ``env.model.predict`` on every call.
        y_test: Ground-truth labels passed as the first argument of ``metric``.
        metric: Callable ``metric(y_true, y_pred)`` applied to the raw output
            of ``predict``.
    """

    def __init__(self, X_test, y_test, metric):
        self.X_test, self.y_test, self.metric = X_test, y_test, metric

    def __call__(self, env):
        """Log the hold-out score and every evaluation value for this iteration."""
        # env.iteration is 0-indexed; MLflow steps are reported 1-based.
        step = env.iteration + 1
        predictions = env.model.predict(self.X_test, num_iteration=step)

        logged = {"test": self.metric(self.y_test, predictions)}
        # One entry per evaluation result, keyed by its first field (the
        # dataset name); a later entry with the same name overwrites an
        # earlier one.
        logged.update(
            (dataset_name, value)
            for dataset_name, _metric_name, value, _ in env.evaluation_result_list
        )

        mlflow.log_metrics(logged, step=step, synchronous=False)

In [None]:

class MlflowLoggingCallbackLGBM:
    """LightGBM callback that logs a hold-out score plus all evaluation results to MLflow.

    When ``metric`` is ``f1_score`` the predictions are thresholded at 0.5
    into 0/1 labels before scoring; any other metric receives the raw output
    of ``predict``.

    Args:
        X_test: Features passed to ``env.model.predict`` on every call.
        y_test: Ground-truth labels passed as the first argument of ``metric``.
        metric: Callable ``metric(y_true, y_pred)``.
    """

    def __init__(self, X_test, y_test, metric):
        self.X_test, self.y_test, self.metric = X_test, y_test, metric

    def __call__(self, env):
        """Score the current model on the hold-out data and log this iteration's metrics."""
        # env.iteration is 0-indexed; MLflow steps are reported 1-based.
        step = env.iteration + 1
        raw_pred = env.model.predict(self.X_test, num_iteration=step)

        # f1_score needs class labels, so binarise the predictions at 0.5;
        # other metrics are given the predictions unchanged.
        scored_pred = (raw_pred > 0.5).astype(int) if self.metric == f1_score else raw_pred

        logged = {
            "test": self.metric(self.y_test, scored_pred),
            # Evaluation results keyed by their first field (dataset name);
            # a repeated name keeps the last value.
            **{
                dataset_name: value
                for dataset_name, _metric_name, value, _ in env.evaluation_result_list
            },
        }

        mlflow.log_metrics(logged, step=step, synchronous=False)

In [None]:
# Model training
# Point MLflow at the tracking server and select the experiment.
mlflow.set_tracking_uri("http://175.214.62.133:50001/")
mlflow.set_experiment("lightgbm_rmsle_experiment")

with mlflow.start_run():

    # Hyperparameters
    params = {
        'objective': 'binary',          # binary classification
        'metric': 'binary_logloss',     # evaluation metric
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
    }

    # Log the configuration before training so that a run which fails or is
    # interrupted part-way still records its hyperparameters.
    mlflow.log_params(params)

    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
        # Per-iteration F1 on the hold-out split, logged to MLflow.
        MlflowLoggingCallbackLGBM(X_test, y_test, f1_score),
    ]

    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, valid_data],
        num_boost_round=1000,
        callbacks=callbacks,
    )

    # Final evaluation on the hold-out split, thresholding the predictions
    # at 0.5 to obtain class labels for F1.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_binary = (y_pred > 0.5).astype(int)

    f1 = f1_score(y_test, y_pred_binary)
    print("f1:", f1)

    # Record the final metric in MLflow.
    mlflow.log_metric("f1", f1)

    # Store the trained model as an MLflow artifact.
    mlflow.lightgbm.log_model(model, "lightgbm_model")
    print("Model saved to MLflow")