In [1]:
import datetime
from copy import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [2]:
now = datetime.datetime.now()

In [3]:
train_df = pd.read_csv('dataset/train_13_2335.csv')
test_df = pd.read_csv('dataset/test_13_2335.csv')

In [4]:
ID = test_df['id']
x = train_df.drop(columns=['y', 'id'])
y = train_df['y']
test_df = test_df.drop(columns=['y', 'id'])

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    shuffle=True,
                                                    random_state=18)

In [6]:
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=18)

# ・RANDOM FOREST

In [7]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

RandomForestClassifier()

In [8]:
train_score_rf = rf.score(x_train, y_train)
test_score_rf = rf.score(x_test, y_test)

rf_p = rf.predict(x_test)

acc_rf = accuracy_score(rf_p, y_test)
auc_rf = roc_auc_score(rf_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_rf, test_score_rf))
print('acc:{}, auc:{}'.format(acc_rf, auc_rf))

train score:0.9999538745387454, test_score:0.9293357933579336
acc:0.9293357933579336, auc:0.836468890692528


In [9]:
%%time

rf_params = {
   'n_estimators': [i for i in np.arange(1000,1200,100)],
    'criterion': ['gini', 'entropy'],
    'max_depth': [i for i in np.arange(3,10)],
    'min_samples_split': [i for i in np.arange(5,15)],
    'min_samples_leaf': [i for i in np.arange(1,5)]
}

rf = RandomForestClassifier(random_state=18) 
gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=skf, n_jobs=-1)
gs_rf.fit(x_train, y_train)

print(gs_rf.best_score_)
print(gs_rf.best_params_)

KeyboardInterrupt: 

In [10]:
train_score_gsrf = gs_rf.score(x_train, y_train)
test_score_gsrf = gs_rf.score(x_test, y_test)

gsrf_p = gs_rf.predict(x_test)

acc_gsrf = accuracy_score(gsrf_p, y_test)
auc_gsrf = roc_auc_score(gsrf_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_gsrf, test_score_gsrf))
print('acc:{}, auc:{}'.format(acc_gsrf, auc_gsrf))

AttributeError: 'GridSearchCV' object has no attribute 'scorer_'

In [None]:
features = x_train.columns
importances = gs_rf.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(6,6))
plt.barh(range(len(indices)), importances[indices], color='darkviolet', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.show()

In [None]:
pred_rf = gs_rf.predict_proba(test_df)
rf_p = pd.DataFrame(pred_rf)
rf_p.head()

# ・LightGBM

In [None]:
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=1000, random_state=18, n_jobs=2)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=30, verbose=100)

In [None]:
train_score_lgb = lgb_model.score(x_train, y_train)
test_score_lgb = lgb_model.score(x_test, y_test)

lgb_p = lgb_model.predict(x_test)

acc_lgb = accuracy_score(lgb_p, y_test)
auc_lgb = roc_auc_score(lgb_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_lgb, test_score_lgb))
print('acc:{}, auc:{}'.format(acc_lgb, auc_lgb))

In [None]:
%%time

params_lgb = {
    'learning_rate': [i for i in np.arange(0.01, 0.05, 0.01)],
    'max_depth': [i for i in np.arange(3, 10)],
    'min_child_weight': [i for i in np.arange(1, 5)],
    'feature fraction': [i for i in np.arange(0.80, 0.95, 0.05)],
    'subsample': [i for i in np.arange(0.3, 0.9, 0.1)]
}

lgb_model = lgb.LGBMClassifier(n_estimators=1000, random_state=18, n_jobs=3)
gs_lgb = GridSearchCV(lgb_model, param_grid=params_lgb, cv=skf, n_jobs=2)
gs_lgb.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=30)

print(gs_lgb.best_score_)
print(gs_lgb.best_params_)

In [None]:
train_score_gslgb = gs_lgb.score(x_train, y_train)
test_score_gslgb = gs_lgb.score(x_test, y_test)

gslgb_p = gs_lgb.predict(x_test)

acc_gslgb = accuracy_score(gslgb_p, y_test)
auc_gslgb = roc_auc_score(gslgb_p, y_test)

print('train score:{}, test_score:{}'.format(train_score_gslgb, test_score_gslgb))
print('acc:{}, auc:{}'.format(acc_gslgb, auc_gslgb))

In [None]:
lgb.plot_importance(ga_lgb, figsize=(8,8), color='darkviolet')

In [None]:
pred_lgb = gs_lgb.predict_proba(test_df)
lgb_p = pd.DataFrame(pred_lgb)
lgb_p.head()

# ・SUBMIT

In [None]:
pred = pd.DataFrame({
    'RandomForest': rf_p[1],
    'LightGBM': lgb_p[1]
})

pred.head()

In [None]:
pred['sum'] = pred.sum(axis=1)
pred['pred'] = [i/2 for i in pred['sum']]
pred.head()

In [None]:
submission = pd.DataFrame({
    'ID':ID,
    'pred':pred['pred']
})

file_name = 'submit/submit_' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, header=False, index=False)

In [None]:
# RandomForest

submission = pd.DataFrame({
    'ID':ID,
    'pred':pred['RandomForest']
})

file_name = 'submit/submit_RF' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, index=False, header=False)

In [None]:
# LightGBM

submission = pd.DataFrame({
    'ID':ID,
    'pred':pred['LightGBM']
})

file_name = 'submit/submit_LGBM' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, index=False, header=False)