In [None]:
# Turn off Jedi for IPython tab completion so the default completer is used instead.
%config Completer.use_jedi = False

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# Load the competition files. The `id` column is dropped from train/test right
# after reading; the sample submission is kept untouched as the output template.
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv').drop(columns=['id'])
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv').drop(columns=['id'])
sub = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
sub.head(n=5)

In [None]:
# Histogram of the label column, as a quick look at how its values are distributed.
train['target'].plot.hist()

In [None]:
# Separate the label from the features: `pop` removes the 'target' column from
# `train` in place and returns it, leaving `train` with feature columns only.
label = train.pop('target').values

In [None]:
# Count (frequency) encoding: for every feature column, add `<feature>_count`
# holding how many rows of train and test combined share that row's exact value.
n_train = len(train)
feature_cols = list(train.columns)

# ignore_index gives the stacked frame a unique RangeIndex (train and test would
# otherwise repeat the same labels), so the column-wise concat below aligns
# unambiguously.
all_data = pd.concat([train, test], axis=0, ignore_index=True)

# Build every count column first and attach them with a single concat; inserting
# columns one at a time in a loop fragments the frame (pandas emits a
# PerformanceWarning once enough columns have been added this way).
count_features = pd.DataFrame(
    {col + '_count': all_data.groupby(col)[col].transform('count') for col in feature_cols},
    index=all_data.index,
)
all_data = pd.concat([all_data, count_features], axis=1)

# Split back by position; test gets a fresh 0-based index, matching what it had
# before stacking.
train = all_data.iloc[:n_train]
test = all_data.iloc[n_train:].reset_index(drop=True)

In [None]:
# Stratified K-fold cross-validation with LightGBM.
# Per fold: fit on the training split, early-stop on the validation split's
# metrics, keep the out-of-fold predictions, and add this fold's test
# predictions (divided by the fold count) to the running test average.
N_FOLDS = 10                 # single source of truth for splitter, averaging and importance shape
SEED = 2021
EARLY_STOPPING_ROUNDS = 200
LOG_PERIOD = 500             # print evaluation results every LOG_PERIOD boosting rounds

fold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=SEED)
val_label_list = []
val_pre_list = []
test_pre_numpy = np.zeros(len(test))
importance = np.zeros((train.shape[1], N_FOLDS))

# Materialise the feature matrices once instead of calling `.values` inside the
# loop, and use plain arrays for both fitting and prediction so the model never
# sees a mix of ndarray and DataFrame inputs.
train_matrix = train.values
test_matrix = test.values

for fold_, (trn_id, val_id) in enumerate(fold.split(train, label)):
    print("Fold {} Train".format(fold_ + 1))
    train_fold, val = train_matrix[trn_id], train_matrix[val_id]
    train_label, val_label = label[trn_id], label[val_id]

    # `verbose=-1` replaces the removed `silent=True` constructor argument.
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=3000,
        subsample_for_bin=200000, objective='binary', class_weight=None, min_split_gain=0.0,
        min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0,
        colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0,
        random_state=SEED, n_jobs=-1, verbose=-1,
    )
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keywords of `fit` were deprecated in
    # LightGBM 3.3 and removed in 4.0. (`lgb.log_evaluation` needs LightGBM >= 3.3.)
    clf.fit(
        train_fold, train_label,
        eval_set=[(val, val_label)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=LOG_PERIOD),
        ],
    )

    # Probability of the positive class (column 1) on the held-out rows.
    val_pre = clf.predict_proba(val)[:, 1]
    val_label_list.append(val_label)
    val_pre_list.append(val_pre)

    test_pre = clf.predict_proba(test_matrix)[:, 1]
    test_pre_numpy += test_pre / N_FOLDS
    importance[:, fold_] = clf.feature_importances_

# Mean and standard deviation of each feature's importance across folds,
# sorted so the most important features come first.
importance_df = pd.DataFrame({
    'features': train.columns,
    'importance': np.mean(importance, axis=1),
    'std': np.std(importance, axis=1),
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Out-of-fold AUC: pool the validation labels and predictions from every fold
# and score them in one call.
oof_labels = np.concatenate(val_label_list, axis=0)
oof_preds = np.concatenate(val_pre_list, axis=0)
print('oof auc score is {}'.format(roc_auc_score(oof_labels, oof_preds)))

# Bar chart of the first 20 rows of `importance_df`, with its 'std' column
# drawn as error bars.
plt.rc('font', family='Times New Roman', size=11, weight='bold')
plt.figure(figsize=(8, 8))
top_rows = importance_df.iloc[: 20]
lgb_feature = top_rows['features'].values
x = np.arange(len(lgb_feature))
err_attr = {"elinewidth": 1, "ecolor": "black", "capsize": 2, 'alpha': 0.6}
plt.bar(
    x,
    top_rows['importance'].values,
    width=0.65,
    label='LGB Importance',
    yerr=top_rows['std'].values,
    error_kw=err_attr,
    color='cornflowerblue',
    alpha=0.9,
)
# One call places the ticks, labels them with the feature names and rotates them.
plt.xticks(x, lgb_feature, rotation=45)
plt.legend()
plt.show()

In [None]:
# Store the fold-averaged test predictions in the submission template and write it out.
# Bracket assignment is used instead of `sub.target = ...`: attribute-style
# assignment only updates an existing column and would otherwise set a plain
# Python attribute, leaving the written CSV without the predictions.
sub['target'] = test_pre_numpy
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the fold-averaged test predictions as the cell output.
test_pre_numpy

In [None]:
# Write the averaged test predictions into the 'target' column of `sub`.
# NOTE(review): the submission cell above already makes this assignment before
# writing the CSV, so this looks redundant — confirm and consider removing.
sub['target'] = test_pre_numpy

In [None]:
# Preview the first five rows of the submission frame.
sub.head(n=5)