In [1]:
from copy import copy
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

In [None]:
# Read the training and test tables from the local dataset directory.
train_path, test_path = 'dataset/train_01.csv', 'dataset/test_01.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# NOTE(review): `x` and `y` are not defined in any visible cell; they are
# presumably the feature table and target derived from train_df -- confirm
# that the cell creating them still exists before Restart & Run All.
holdout_split = train_test_split(x, y, random_state=0)
x_train, x_test, y_train, y_test = holdout_split

# Five-fold stratified splitter (default settings: no shuffling).
# NOTE(review): not referenced by any later visible cell.
kfold = StratifiedKFold(n_splits=5)

# RFC

In [None]:
# Seed the forest (same seed as the train/test split) so that the reported
# scores and feature importances are reproducible across kernel restarts;
# an unseeded forest draws different bootstrap samples on every run.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train, y_train)

In [None]:
# Score the fitted random forest on the training split and on the held-out
# split (classifier .score() reports mean accuracy).
train_score_rfc, test_score_rfc = (
    rfc.score(split_x, split_y)
    for split_x, split_y in ((x_train, y_train), (x_test, y_test))
)

score_report = 'train score:{}, test_score:{}'
print(score_report.format(train_score_rfc, test_score_rfc))

In [None]:
# Plot the random forest's feature importances as a horizontal bar chart.
# argsort is ascending, so the most important feature ends up as the top bar.
features = x_train.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))

# Explicit figure/axes instead of the pyplot state machine, plus a title and
# axis label so the figure can be understood without reading the code.
fig, ax = plt.subplots(figsize=(6, 6))
ax.barh(positions, importances[indices], color='b', align='center')
ax.set_yticks(positions)
ax.set_yticklabels(features[indices])
ax.set_xlabel('Importance')
ax.set_title('Random forest feature importances')
plt.show()

# XGB

In [None]:
xgb = xgb.XGBClassifier()
xgb.fit(x_train, y_train)

In [None]:
train_score_xgb = xgb.score(x_train, y_train)
test_score_xgb = xgb.score(x_test, y_test)

print('train score:{}, test_score:{}'.format(train_score_xgb, test_score_xgb))

# LGB

In [None]:
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test)

params_lgb = {
    'objective': 'binary',
    'metric':'auc',
    'num_iterations':1000,
    'eary_stopping': 50,
}

lgb = lgb.train(params_lgb, lgb_train, valid_sets=lgb_eval, verbose_eval=100)

# CB

In [None]:
params_cab = {
    'iterations':1000,
    'custom_loss':['Accuracy'],
    'early_stopping_rounds':50,
}

In [None]:
cab = CatBoostClassifier(**params_cab)

cab.fit(x_train, y_train)

In [None]:
train_score_cab = cab.score(x_train, y_train)
test_score_cab = cab.score(x_test, y_test)

print('train score:{}, test_score:{}'.format(train_score_cab, test_score_cab))

## submit

In [None]:
pred_rfc = rfc.predict(test_df)
pred_xgb = xgb.predict(test_df)
pred_lgb = lgb.predict(test_df)
pred_cab = cab.predict(test_df)

In [None]:
# Collect one vote column per model.
# pred_lgb comes from a LightGBM Booster trained with the 'binary' objective,
# so it holds probabilities; convert it to a 0/1 vote first. Summing the raw
# probability with the other models' hard labels would leave LightGBM with
# almost no influence on the `sum >= 2.0` vote.
pred = pd.DataFrame({
    'RandomForest': pred_rfc,
    'XGBoost': pred_xgb,
    'LightGBM': (np.asarray(pred_lgb) >= 0.5).astype(int),
    'CatBoost': pred_cab
})

# Number of models voting for the positive class in each row.
pred['sum'] = pred.sum(axis=1)
# Placeholder column; the final label is filled in by the voting cell.
pred['pred'] = 0
pred.head()

In [None]:
# Label a row 1 when its summed votes reach 2, otherwise 0.
reaches_threshold = pred['sum'] >= 2.0
pred['pred'] = reaches_threshold.astype(int)
pred.head()

In [None]:
# NOTE(review): `ID` is not defined in any visible cell; presumably it is the
# identifier column taken from the test table -- confirm before running.
submission = pd.DataFrame({'ID': ID, 'pred': pred['pred']})

# Write the two columns without a header row or an index column.
submission.to_csv('submit/initial_submit.csv', index=False, header=False)