In [16]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

drive.mount('/content/drive/')

cust_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Santander/train.csv", encoding='latin-1')

# 다른 값에 비해 편차가 큰 var3의 -999999 컬럼은 가장 값이 많은 2로 변환
cust_df['var3'].replace(-999999, 2, inplace = True)

# 피처 세트와 레이블 세트 분리. 레이블 칼럼은 DataFrame의 맨 마지막에 위치해 칼럼 위치
X_features = cust_df.iloc[:,:-1]
y_labels = cust_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size = 0.2, random_state=0)


lgbm_clf = LGBMClassifier(n_estimators=500)
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=evals, verbose=True)
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1], average='macro')
print ('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
[1]	valid_0's binary_logloss: 0.165052	valid_0's auc: 0.817877
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.160082	valid_0's auc: 0.817734
[3]	valid_0's binary_logloss: 0.156395	valid_0's auc: 0.826796
[4]	valid_0's binary_logloss: 0.15354	valid_0's auc: 0.831011
[5]	valid_0's binary_logloss: 0.151312	valid_0's auc: 0.833917
[6]	valid_0's binary_logloss: 0.149504	valid_0's auc: 0.833856
[7]	valid_0's binary_logloss: 0.148079	valid_0's auc: 0.83643
[8]	valid_0's binary_logloss: 0.146815	valid_0's auc: 0.837348
[9]	valid_0's binary_logloss: 0.145641	valid_0's auc: 0.839167
[10]	valid_0's binary_logloss: 0.144916	valid_0's auc: 0.838244
[11]	valid_0's binary_logloss: 0.144231	valid_0's auc: 0.838277
[12]	valid_0's binary_logloss: 0.143605	valid_0's auc: 0.838228
[13]	valid_0's binary_logloss: 0.142999	valid_0's

In [19]:
from sklearn.model_selection import GridSearchCV

# 하이퍼 파라미터 테스트의 수행 속도를 향상시키기 위해 n_estimators를 200으로 감소
lgbm_clf = LGBMClassifier(n_estimators=200)

params = {'num_leaves':[32, 64],
          'max_depth':[128, 160],
          'min_child_samples':[60, 100],
          'subsample':[0.8, 1]}

# cv는 3으로 지정
gridcv = GridSearchCV(lgbm_clf, param_grid=params, cv=3)
evalset = [(X_train, y_train), (X_test, y_test)]
gridcv.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc", eval_set=evalset)

print ('GridSearchCV 최적 파라미터:', gridcv.best_params_)
lgbm_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:,1], average='macro')
print ('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	valid_0's binary_logloss: 0.156062	valid_0's auc: 0.819388	valid_1's binary_logloss: 0.16496	valid_1's auc: 0.81298
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's binary_logloss: 0.150891	valid_0's auc: 0.825885	valid_1's binary_logloss: 0.159835	valid_1's auc: 0.821538
[3]	valid_0's binary_logloss: 0.147109	valid_0's auc: 0.83246	valid_1's binary_logloss: 0.156285	valid_1's auc: 0.827605
[4]	valid_0's binary_logloss: 0.144157	valid_0's auc: 0.83795	valid_1's binary_logloss: 0.15334	valid_1's auc: 0.832662
[5]	valid_0's binary_logloss: 0.14169	valid_0's auc: 0.840556	valid_1's binary_logloss: 0.151055	valid_1's auc: 0.835268
[6]	valid_0's binary_logloss: 0.139652	valid_0's auc: 0.844322	valid_1's binary_logloss: 0.149164	valid_1's auc: 0.835763
[7]	valid_0's binary_logloss: 0.138004	valid_0's auc: 0.845086	valid_1's binary_logloss: 0.147653	valid_1's auc: 0.836108
[8]	valid_0's binary_logloss: 0.136505	valid_0's auc: 0.847126	valid_1's binary_logloss: 0

In [21]:
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=32, subsample=0.8, min_child_samples=100, max_depth=128)

evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=evals, verbose=True)

lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1], average='macro')
print ('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	valid_0's binary_logloss: 0.165119	valid_0's auc: 0.818835
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.159978	valid_0's auc: 0.820752
[3]	valid_0's binary_logloss: 0.156308	valid_0's auc: 0.828507
[4]	valid_0's binary_logloss: 0.153379	valid_0's auc: 0.83378
[5]	valid_0's binary_logloss: 0.151078	valid_0's auc: 0.837554
[6]	valid_0's binary_logloss: 0.149194	valid_0's auc: 0.838158
[7]	valid_0's binary_logloss: 0.147594	valid_0's auc: 0.839522
[8]	valid_0's binary_logloss: 0.146314	valid_0's auc: 0.840293
[9]	valid_0's binary_logloss: 0.145185	valid_0's auc: 0.840526
[10]	valid_0's binary_logloss: 0.144195	valid_0's auc: 0.84088
[11]	valid_0's binary_logloss: 0.143481	valid_0's auc: 0.841402
[12]	valid_0's binary_logloss: 0.142753	valid_0's auc: 0.841863
[13]	valid_0's binary_logloss: 0.142188	valid_0's auc: 0.842266
[14]	valid_0's binary_logloss: 0.141685	valid_0's auc: 0.841986
[15]	valid_0's binary_logloss: 0.141273	valid_0's au