[LightGBM]
- gradient boosting 라이브러리 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

< 데이터 불러오기 >

In [5]:
data_path = '../data/'

train_df = pd.read_csv(data_path+'train.csv', index_col='id')
test_df = pd.read_csv(data_path+'test.csv', index_col='id')
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

< 데이터 분리>

In [7]:
# 피처와 타겟 변수 분리 
X = train_df.drop('defects', axis=1)
y = train_df['defects']

# 훈련 데이터와 검증 데이터로 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



< LightGBM 모델 훈련 >

In [25]:
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# 모델 파라미터 설정
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# 모델 훈련
num_round = 1000
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data])

# 검증 데이터에 대한 예측 수행
y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)

# ROC-AUC 점수 계산
score = roc_auc_score(y_val, y_pred)
print(f"Validation ROC-AUC Score: {score:.4f}")

#결과해석: ROC-AUC Score (0.7834)

[LightGBM] [Info] Number of positive: 18536, number of negative: 62874
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3559
[LightGBM] [Info] Number of data points in the train set: 81410, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227687 -> initscore=-1.221418
[LightGBM] [Info] Start training from score -1.221418
Validation ROC-AUC Score: 0.7834


< 성능 개선 > 
- Grid Search를 사용하여 파라미터 튜닝
- L1 및 L2 정규화를 추가로 적용

In [28]:
from sklearn.model_selection import GridSearchCV

In [31]:
# 파라미터 튜닝을 위한 그리드
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 1, 1.2],   # L1 정규화
    'reg_lambda': [0, 1, 1.2],  # L2 정규화
    'objective': ['binary'],
    'metric': ['auc'],
    'boosting_type': ['gbdt']
}

gbm = lgb.LGBMClassifier()

# GridSearchCV를 사용하여 최적의 파라미터 찾기
grid = GridSearchCV(gbm, param_grid, cv=3)
grid.fit(X_train, y_train)

print(f"Best parameters found: {grid.best_params_}")

# 최적의 파라미터로 모델 훈련
best_params = grid.best_params_
num_round = 1000
bst = lgb.train(best_params, train_data, num_round, valid_sets=[val_data])

y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)
score = roc_auc_score(y_val, y_pred)
print(f"Validation ROC-AUC Score: {score:.4f}")


[LightGBM] [Info] Number of positive: 12357, number of negative: 41916
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3503
[LightGBM] [Info] Number of data points in the train set: 54273, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227682 -> initscore=-1.221445
[LightGBM] [Info] Start training from score -1.221445
[LightGBM] [Info] Number of positive: 12357, number of negative: 41916
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3476
[LightGBM] [Info] Number of data points in the train set: 54273, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227682 -> initscore=-1.221445
[L