In [1]:
import datetime
from copy import copy
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
now = datetime.datetime.now()

In [3]:
train_df = pd.read_csv('dataset/train_01.csv')
test_df = pd.read_csv('dataset/test_01.csv')

In [4]:
ID = test_df['id']
x = train_df.drop(columns=['y', 'id'])
y = train_df['y']
test_df = test_df.drop(columns='id')

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    shuffle=True,
                                                    random_state=18
                                                   )

# RF

In [None]:
def objective_rf(trial):
 
    param_rf = {      
        'n_estimators': 1800,
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'random_state': 18,
        'n_jobs': 2
    }
    
    rf = RandomForestClassifier(**param_rf)
    rf.fit(x_train, y_train)
    predictions = rf.predict(x_test)
    auc_score = roc_auc_score(predictions, y_test)
    
    return auc_score

In [None]:
%%time

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=300, n_jobs=2)
print(study_rf.best_params)
print(study_rf.best_value)
rf_best_params = study_rf.best_params

In [None]:
rf = RandomForestClassifier(**rf_best_params)
rf.fit(x_train, y_train)

In [None]:
train_score_rf = rf.score(x_train, y_train)
test_score_rf = rf.score(x_test, y_test)

rf_p = rf.predict(x_test)

acc_rf = accuracy_score(rf_p, y_test)
auc_rf = roc_auc_score(rf_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_rf, test_score_rf))
print('acc:{}, auc:{}'.format(acc_rf, auc_rf))

In [None]:
features = x_train.columns
importances = rf.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(6,6))
plt.barh(range(len(indices)), importances[indices], color='darkviolet', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.show()

In [None]:
rf_name = 'models/RF_' + now.strftime('%d_%H%M') + '.pkl'
pickle.dump(rf, open(rf_name, 'wb'))

In [None]:
pred_rf = rf.predict_proba(test_df)
rf_p = pd.DataFrame(pred_rf)
rf_p.head()

In [None]:
submission = pd.DataFrame({
    'ID':ID,
    'pred':rf_p[1]
})


now = datetime.datetime.now()
file_name = 'submit/submit_RF' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, index=False, header=False)

# XGB

In [10]:
def objective_xgb(trial):
    
    param_xgb ={
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.05,
        'n_estimators': 1800,
        'early_stopping_rounds': 100,
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'gamma': 0.0,
        'alpha': 0.0,
        'seed': 18,
        'n_jobs': 2
    }
    
    xgb = XGBClassifier(**param_xgb)
    xgb.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], eval_metric='logloss', verbose=False)
    predictions = xgb.predict(x_test)
    auc_score = roc_auc_score(predictions, y_test)
    
    return auc_score

In [11]:
%%time

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=300, n_jobs=4)
print(study_xgb.best_params)
print(study_xgb.best_value)
xgb_best_params = study_xgb.best_params

[I 2020-08-10 20:50:51,587] Finished trial#0 resulted in value: 0.8146358097467363. Current best value is 0.8146358097467363 with parameters: {'max_depth': 3}.
[I 2020-08-10 20:50:55,567] Finished trial#3 resulted in value: 0.7671764762401514. Current best value is 0.8146358097467363 with parameters: {'max_depth': 3}.
[I 2020-08-10 20:50:59,897] Finished trial#1 resulted in value: 0.751707509456286. Current best value is 0.8146358097467363 with parameters: {'max_depth': 3}.
[I 2020-08-10 20:51:16,537] Finished trial#2 resulted in value: 0.7347909028368521. Current best value is 0.8146358097467363 with parameters: {'max_depth': 3}.
[I 2020-08-10 20:51:33,311] Finished trial#4 resulted in value: 0.7347909028368521. Current best value is 0.8146358097467363 with parameters: {'max_depth': 3}.
[I 2020-08-10 20:51:43,485] Finished trial#5 resulted in value: 0.7347909028368521. Current best value is 0.8146358097467363 with parameters: {'max_depth': 3}.
[I 2020-08-10 20:52:00,329] Finished tria

KeyboardInterrupt: 

In [None]:
xgb_model = XGBClassifier(**xgb_best_params)
xgb_model.fit(x_train, y_train)

In [None]:
train_score_xgb = xgb_model.score(x_train, y_train)
test_score_xgb = xgb_model.score(x_test, y_test)

xgb_p = xgb_model.predict(x_test)

acc_xgb = accuracy_score(xgb_p, y_test)
auc_xgb = roc_auc_score(xgb_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_xgb, test_score_xgb))
print('acc:{}, auc:{}'.format(acc_xgb, auc_xgb))

In [None]:
_, ax = plt.subplots(figsize=(8, 8))
xgb.plot_importance(xgb_model,
                    ax=ax,
                    importance_type='gain',
                    color='darkviolet',
                    show_values=False)
plt.show()

In [None]:
xgb_name = 'models/XG_' + now.strftime('%d_%H%M') + '.pkl'
pickle.dump(xgb_model, open(xgb_name, 'wb'))

In [None]:
pred_xgb = xgb_model.predict(test_df)
pred = pd.DataFrame({'XGBoost': pred_xgb})

In [None]:
submission = pd.DataFrame({
    'ID':ID,
    'pred':pred['XGBoost']
})


now = datetime.datetime.now()
file_name = 'submit/submit_XGBoost' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, header=False, index=False)

# LGB

In [None]:
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

In [None]:
def objective_lgb(trial):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)
    
    params_lgb = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 1800,
        'learning_rate': 0.05,
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_uniform('subsample', 0.3, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 35),
        'feature fraction': trial.suggest_uniform('feature fraction', 0.80, 0.95),
        'random_state': 18,
        'n_jobs': 2
    }
                          
    lgb = LGBMClassifier(**params_lgb)
    lgb = lgb.fit(x_train, y_train)
    
    predictions = lgb.predict(x_test)
    auc_score = roc_auc_score(predictions, y_test)
    
    return auc_score

In [None]:
%%time

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=100, n_jobs=4)

print(study_lgb.best_params)
print(study_lgb.best_value)
lgb_best_params = study_lgb.best_params

In [None]:
lgb_model = LGBMClassifier(**lgb_best_params)
lgb_model.fit(x_train, y_train)

In [None]:
train_score_lgb = lgb_model.score(x_train, y_train)
test_score_lgb = lgb_model.score(x_test, y_test)

lgb_p = lgb_model.predict(x_test)

acc_lgb = accuracy_score(lgb_p, y_test)
auc_lgb = roc_auc_score(lgb_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_lgb, test_score_lgb))
print('acc:{}, auc:{}'.format(acc_lgb, auc_lgb))

In [None]:
lgb.plot_importance(lgb_model, figsize=(8,8), color='darkviolet')

In [None]:
lgb_name = 'models/LGBM_' + now.strftime('%d_%H%M') + '.pkl'
pickle.dump(lgb_model, open(lgb_name, 'wb'))

In [None]:
pred_lgb = lgb_model.predict_proba(test_df)
lgb_p = pd.DataFrame(pred_lgb)
lgb_p.head()

In [None]:
submission = pd.DataFrame({
    'ID':ID,
    'pred':lgb_p[1]
})


now = datetime.datetime.now()
file_name = 'submit/submit_LGBM' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, index=False, header=False)

# CB

In [21]:
def objective_cb(trial):
    
    param_cb = {
        'iterations': 1800,
        'learning_rate': 0.05,
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 9),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50),
        'random_seed': 18
    }
    
    cb = CatBoostClassifier(**param_cb)
    cb.fit(x_train, y_train, verbose=False)
    predictions = cb.predict(x_test)
    auc_score = roc_auc_score(predictions, y_test)
    
    return auc_score

In [22]:
%%time

study_cb = optuna.create_study(direction='maximize')
study_cb.optimize(objective_cb, n_trials=100, n_jobs=4)

print(study_cb.best_params)
print(study_cb.best_value)
cb_best_params = study_cb.best_params

[W 2020-08-10 21:46:55,208] Setting status of trial#5 as TrialState.FAIL because of the following error: ValueError('y should be a 1d array, got an array of shape (8130, 2) instead.')
Traceback (most recent call last):
  File "/home/awax2/miniconda3/envs/lab/lib/python3.7/site-packages/optuna/study.py", line 648, in _run_trial
    result = func(trial)
  File "<ipython-input-19-6a59f22e5a7a>", line 16, in objective_cb
    auc_score = roc_auc_score(y_test, predictions)
  File "/home/awax2/miniconda3/envs/lab/lib/python3.7/site-packages/sklearn/utils/validation.py", line 73, in inner_f
    return f(**kwargs)
  File "/home/awax2/miniconda3/envs/lab/lib/python3.7/site-packages/sklearn/metrics/_ranking.py", line 393, in roc_auc_score
    sample_weight=sample_weight)
  File "/home/awax2/miniconda3/envs/lab/lib/python3.7/site-packages/sklearn/metrics/_base.py", line 77, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "/home/awax2/miniconda

KeyboardInterrupt: 

In [None]:
cb = CatBoostClassifier(**params_cb)
cb.fit(x_train, y_train)

In [None]:
train_score_cb = cb.score(x_train, y_train)
test_score_cb = cb.score(x_test, y_test)

cb_p = cb_model.predict(x_test)

acc_cb = accuracy_score(cb_p, y_test)
auc_cb = roc_auc_score(cb_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_cb, test_score_cb))
print('acc:{}, auc:{}'.format(acc_cb, auc_cb))

In [None]:
pred_cb = cb.predict_proba(test_df)

In [None]:
pred = pd.DataFrame({

    'CatBoost': pred_cab
})

## submit

In [None]:
pred_rfc = rfc.predict(test_df)
pred_xgb = xgb_model.predict(test_df)
pred_lgb = lgb_model.predict(test_df)
pred_cab = cab.predict(test_df)

In [None]:
pred = pd.DataFrame({
    'RandomForest': pred_rfc,
    'XGBoost': pred_xgb,
    'LightGBM': pred_lgb,
    'CatBoost': pred_cab
})

pred['sum'] = pred.sum(axis=1)
pred['pred'] = [i/4 for i in pred['sum']]
pred.head()

In [None]:
submission = pd.DataFrame({
    'ID':ID,
    'pred':pred['pred']
})


now = datetime.datetime.now()
file_name = 'submit/submit_' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, header=False, index=False)

In [None]:
pred = pd.DataFrame({
    'RandomForest': rf_p[1],
    'LightGBM': lgb_p[1]
})

pred['sum'] = pred.sum(axis=1)
pred['pred'] = [i/2 for i in pred['sum']]
pred.head()