# Configuration

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preprocessing

In [None]:
# Read the training CSV, print its (rows, columns) shape and preview the first three rows.
train_csv_path = '../input/santander-customer-satisfaction/train.csv'
cust_df = pd.read_csv(train_csv_path)
print(cust_df.shape)
cust_df.head(n=3)

In [None]:
# Print a concise summary of the frame (index, column dtypes, memory usage).
cust_df.info()

### 만족 / 불만족 비율 확인

In [None]:
# Class balance of the label: counts per TARGET value, then the share of rows with TARGET == 1.
target_col = cust_df['TARGET']
print(target_col.value_counts())

unsatisfied_cnt = target_col[target_col == 1].count()
total_cnt = target_col.count()
unsatisfied_ratio = unsatisfied_cnt / total_cnt
print('Unsatisfied Percentage: {0:.4f}'.format(unsatisfied_ratio))

### 각 피처의 값 분포 확인

In [None]:
# Display pandas' default descriptive statistics for the frame's columns.
cust_df.describe()

In [None]:
## var3 contains the outlier value -999999; replace it with 2.
## The result is assigned back to the column: calling replace(..., inplace=True)
## on a column selection is not guaranteed to write through to cust_df
## (it warns on recent pandas and is a no-op under copy-on-write).
cust_df['var3'] = cust_df['var3'].replace(-999999, 2)

## Drop the ID column. errors='ignore' keeps this cell re-runnable once ID
## has already been removed.
cust_df = cust_df.drop(columns='ID', errors='ignore')

## Split into features (X) and label (Y): the label is taken from the last
## column, the features are every column before it.
x_features = cust_df.iloc[:, :-1]
y_labels = cust_df.iloc[:, -1]

### Training Set 과 Validation Set 구분

In [None]:
from sklearn.model_selection import train_test_split

## Hold out 20% of the rows for validation.
## random_state fixes the shuffle so the split (and every score computed from
## it) is reproducible across runs; stratify keeps the class ratio of y_labels
## the same in the training and validation parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_features,
    y_labels,
    test_size=0.2,
    random_state=156,
    stratify=y_labels,
)

# *XGBoost*

In [None]:
## GPU information: run the nvidia-smi shell command and show its output.
!nvidia-smi

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

## Baseline XGBoost classifier trained on the GPU.
## 'gpu_exact' has been removed from XGBoost; 'gpu_hist' is the GPU tree
## method the other XGBoost cells in this notebook already use.
xgb_clf = XGBClassifier(n_estimators=500, tree_method='gpu_hist', random_state=156)

## AUC is tracked on both the training and the held-out set.
## NOTE(review): XGBoost uses the last eval_set entry for early stopping, so
## x_test drives the stopping point and the AUC below is optimistic.
xgb_clf.fit(
    x_train,
    y_train,
    early_stopping_rounds=100,
    eval_metric='auc',
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

## ROC AUC of the positive-class probability on the held-out set.
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(x_test)[:, 1], average='macro')

print('ROC AUC: {0:.4f}'.format(xgb_roc_score))

### *XGBoost Hyper Parameters Tuning Using GridSearchCV*

In [None]:
from sklearn.model_selection import GridSearchCV

## Fewer estimators than the baseline to keep the grid search affordable.
xgb_clf = XGBClassifier(n_estimators=100, tree_method='gpu_hist')

## 2 x 2 x 2 = 8 candidate parameter combinations.
params = {
    'max_depth': [5, 7],
    'min_child_weight': [1, 3],
    'colsample_bytree': [0.5, 0.75]
}

## scoring='roc_auc' makes the search rank candidates by the same metric this
## notebook reports; without it GridSearchCV falls back to the estimator's
## default score (accuracy for classifiers).
gridcv = GridSearchCV(xgb_clf, param_grid=params, scoring='roc_auc', cv=3)

## The extra keyword arguments are forwarded to XGBClassifier.fit for every fit.
gridcv.fit(
    x_train,
    y_train,
    early_stopping_rounds=30,
    eval_metric='auc',
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

print('Best Params: \n', gridcv.best_params_)

## Score the refit best estimator on the held-out set.
xgb_roc_score = roc_auc_score(y_test, gridcv.predict_proba(x_test)[:, 1], average='macro')
print('ROC AUC: {0: .4f}'.format(xgb_roc_score))

In [None]:
## XGBoost model with hand-set hyper parameters: many estimators with a small
## learning rate, stopped early on the AUC evaluation metric.
xgb_tuned_params = dict(
    n_estimators=1000,
    random_state=156,
    learning_rate=0.02,
    max_depth=5,
    min_child_weight=3,
    colsample_bytree=0.5,
    reg_alpha=0.03,
    tree_method='gpu_hist',
)
xgb_clf = XGBClassifier(**xgb_tuned_params)

xgb_eval_sets = [(x_train, y_train), (x_test, y_test)]
xgb_clf.fit(
    x_train,
    y_train,
    early_stopping_rounds=200,
    eval_metric='auc',
    eval_set=xgb_eval_sets,
)

## Positive-class probabilities on the held-out set, scored with ROC AUC.
xgb_test_proba = xgb_clf.predict_proba(x_test)[:, 1]
xgb_roc_score = roc_auc_score(y_test, xgb_test_proba, average='macro')

print('ROC AUC: {0: .4f}'.format(xgb_roc_score))

In [None]:
from xgboost import plot_importance

## Plot the 20 most important features of the fitted XGBoost model on a single 10x8 axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
plot_importance(
    xgb_clf,
    ax=ax,
    max_num_features=20,
    height=0.4,
)

# *LightGBM(Continued)*

In [None]:
from lightgbm import LGBMClassifier

## Baseline LightGBM classifier.
## The tree_method argument copied from the XGBoost cells was dropped: it is
## an XGBoost parameter that LightGBM does not define.
## NOTE(review): LightGBM selects the GPU through its own device parameter and
## needs a GPU-enabled build; confirm before enabling it here.
lgb_clf = LGBMClassifier(n_estimators=500, random_state=156)

## AUC is tracked on both the training and the held-out set, with early stopping.
lgb_clf.fit(
    x_train,
    y_train,
    early_stopping_rounds=100,
    eval_metric='auc',
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

## ROC AUC of the positive-class probability on the held-out set.
lgb_roc_score = roc_auc_score(y_test, lgb_clf.predict_proba(x_test)[:, 1], average='macro')

print('ROC AUC: {0:.4f}'.format(lgb_roc_score))

### *LightGBM Hyper Parameters Tuning Using GridSearchCV*

In [None]:
from sklearn.model_selection import GridSearchCV

## Fewer estimators than the baseline to keep the grid search affordable.
## tree_method is an XGBoost parameter that LightGBM does not define, so it
## is not passed here.
lgb_clf = LGBMClassifier(n_estimators=100)

## 2 x 2 x 2 x 2 = 16 candidate parameter combinations.
## NOTE(review): per the LightGBM docs, subsample only takes effect when
## subsample_freq is non-zero, and with num_leaves <= 64 both max_depth values
## are deeper than any tree can grow, so these two axes may not change the
## fitted models. Confirm and trim the grid if so.
params = { 'num_leaves': [32, 64 ],
           'max_depth':[128, 160],
           'min_child_samples':[60, 100],
           'subsample':[0.8, 1]
         }

## scoring='roc_auc' makes the search rank candidates by the same metric this
## notebook reports; without it GridSearchCV falls back to the estimator's
## default score (accuracy for classifiers).
gridcv = GridSearchCV(lgb_clf, param_grid=params, scoring='roc_auc', cv=3)

## The extra keyword arguments are forwarded to LGBMClassifier.fit for every fit.
gridcv.fit(
    x_train,
    y_train,
    early_stopping_rounds=30,
    eval_metric='auc',
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

print('Best Params: \n', gridcv.best_params_)

## Score the refit best estimator on the held-out set.
lgb_roc_score = roc_auc_score(y_test, gridcv.predict_proba(x_test)[:, 1], average='macro')
print('ROC AUC: {0: .4f}'.format(lgb_roc_score))

In [None]:
## LightGBM model with hand-set hyper parameters: many estimators with a small
## learning rate, stopped early on the AUC evaluation metric.
## tree_method is an XGBoost parameter that LightGBM does not define, so it
## is not passed here.
lgb_clf = LGBMClassifier(
    n_estimators=1000,
    max_depth=128,
    min_child_samples=100,
    num_leaves=64,
    subsample=0.8,
    learning_rate=0.02,
    reg_alpha=0.03,
)

lgb_clf.fit(
    x_train,
    y_train,
    early_stopping_rounds=200,
    eval_metric='auc',
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

## ROC AUC of the positive-class probability on the held-out set.
lgb_roc_score = roc_auc_score(y_test, lgb_clf.predict_proba(x_test)[:, 1], average='macro')

print('ROC AUC: {0: .4f}'.format(lgb_roc_score))

In [None]:
from lightgbm import plot_importance

## Plot the 20 most important features of the fitted LightGBM model on a single 10x8 axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
plot_importance(
    lgb_clf,
    ax=ax,
    max_num_features=20,
    height=0.4,
)