In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from tqdm import tqdm
import matplotlib.pyplot as plt 
from sklearn.metrics import roc_auc_score
import warnings
# Silence every Python warning for the remainder of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings
# (pandas / LightGBM API removals) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition train and test tables; the first CSV column becomes the index.
X = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv', index_col=0)

# Separate the label column from the training features.
y = X['target']
X = X.drop(['target'], axis=1)

# Stack the train rows on top of the test rows (original index labels kept) so
# that later preprocessing can be applied to both sets in one pass.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
temp = pd.concat([X, test], ignore_index=False, sort=False)

In [None]:
# Integer-encode every non-float64 column of the combined train+test frame,
# then split the frame back into its train and test parts.
label = LabelEncoder()

# Positional indices (for LightGBM) and names (for the loop below) of every
# column whose dtype is not float64; those are treated as categorical.
categorical_feature = np.where(temp.dtypes != 'float64')[0].tolist()
categorical_feature_columns = temp.select_dtypes(exclude=['float64']).columns

# The encoder is re-fit for each column, so codes are per-column and are
# learned from train and test values together.
for column in categorical_feature_columns:
    temp[column] = label.fit_transform(temp[column])

# Capture the train row count before X is rebound, then split positionally:
# the first n_train rows are the training set, the rest are the test set.
n_train = len(X)
X = temp.iloc[:n_train]
test = temp.iloc[n_train:]

# Drop the reference to the combined frame by rebinding the name to an empty
# DataFrame instance (the original bound it to the DataFrame *class* itself).
temp = pd.DataFrame()

In [None]:
# Hyper-parameters for LGBMClassifier, as obtained from an Optuna search.
# NOTE(review): `subsample` is set but `subsample_freq` is not; per the LightGBM
# parameter docs bagging is only active when the frequency is > 0 — confirm
# whether row subsampling is actually meant to be in effect.
lgbm_parameters = dict(
    # categorical handling / evaluation metric
    cat_feature=categorical_feature,
    metric='auc',
    # boosting budget (upper bound on the number of trees)
    n_estimators=20000,
    # regularisation
    reg_alpha=0.000721024661208569,
    reg_lambda=47.79748127808107,
    # column / row sampling
    colsample_bytree=0.24493010466517195,
    subsample=0.12246675404710294,
    # step size
    learning_rate=0.013933182980403087,
    # tree shape
    max_depth=21,
    num_leaves=90,
    min_child_samples=144,
    # categorical smoothing
    cat_smooth=63,
)

In [None]:
# K-fold cross-validated training.
# For each fold: fit a fresh LGBMClassifier on the training part with early
# stopping on the held-out part, add this fold's share of the test-set
# probabilities to the running average, and record the held-out AUC.
n_splits = 10
lgbm_test_pred = np.zeros(len(test))
auc = []

# Fixed random_state so the shuffled folds (and therefore the reported AUC and
# the submission) are reproducible across Restart & Run All.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# kf.split() is a generator without a length, so tell tqdm how many folds to expect.
for trn_idx, val_idx in tqdm(kf.split(X, y), total=n_splits):
    x_train_idx = X.iloc[trn_idx]
    y_train_idx = y.iloc[trn_idx]
    x_valid_idx = X.iloc[val_idx]
    y_valid_idx = y.iloc[val_idx]

    lgbm_model = LGBMClassifier(**lgbm_parameters)
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keywords of fit() were deprecated in
    # LightGBM 3.3 and removed in 4.0. Fresh callback objects are built per fold.
    lgbm_model.fit(
        x_train_idx,
        y_train_idx,
        eval_set=[(x_valid_idx, y_valid_idx)],
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=200),
            lightgbm.log_evaluation(period=1000),
        ],
    )

    # Each fold contributes 1/n_splits of the final test prediction.
    lgbm_test_pred += lgbm_model.predict_proba(test)[:, 1] / n_splits

    # Held-out AUC for this fold, from the positive-class probability.
    valid_pred = lgbm_model.predict_proba(x_valid_idx)[:, 1]
    auc.append(roc_auc_score(y_valid_idx, valid_pred))

# Mean of the per-fold validation AUCs.
print(f'AUC: {np.mean(auc)}')

# Write the averaged test predictions, keyed by the test frame's index.
pd.DataFrame({'id':test.index,'target':lgbm_test_pred}).to_csv('submission.csv', index=False)

In [None]:
# Feature-importance chart for `lgbm_model`, i.e. the model left bound by the
# last iteration of the training loop. Shows the 16 top-ranked features.
plt.rc("figure", figsize=(6, 5))
lightgbm.plot_importance(
    lgbm_model,
    height=.9,
    max_num_features=16,
)