In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as gbm

# Load data 

In [None]:
# Locations of the competition's training and test CSV files.
train_path, test_path = (
    "/kaggle/input/tabular-playground-series-mar-2021/train.csv",
    "/kaggle/input/tabular-playground-series-mar-2021/test.csv",
)

In [None]:
# Read both competition files into DataFrames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))


In [None]:
# Number of rows in the training frame.
df_train.shape[0]

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# List the column names of the training frame.
df_train.columns

# Encode categorical labels as integers

In [None]:
# Categorical feature columns: cat0 .. cat18.
cat_col = [f"cat{idx}" for idx in range(19)]
# Continuous feature columns that are kept: cont0 .. cont9 without cont2.
cont_col = [f"cont{idx}" for idx in range(10) if idx != 2]
# Columns dropped from both frames before training.
remove = ['cont10', 'cont2']

In [None]:
# Integer-encode every categorical column, then drop the unused features.
for col in cat_col:
    encoder = LabelEncoder()
    # Fit on train and test together so that a category present in only one
    # of the two files still receives a code and `transform` cannot fail on
    # an unseen label. Concatenating the Series avoids materialising two
    # large Python lists per column.
    encoder.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
    df_train[col] = encoder.transform(df_train[col].values)
    df_test[col] = encoder.transform(df_test[col].values)
    print("done ", col)
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns in `remove` are already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns=remove, errors="ignore")
df_test = df_test.drop(columns=remove, errors="ignore")

In [None]:
# Separate the label from the features; `id` and `target` are excluded from X.
y = df_train["target"]
X = df_train.drop(columns=["id", "target"])


In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Shape of the label vector; its length should match the row count of X.
y.shape

# Split into training and validation sets

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
# Note: the *_test names hold this validation split, not the competition test file.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=111,
)

In [None]:
# LightGBM training set; the columns named in cat_col are declared categorical.
train_data = gbm.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=cat_col,
)

In [None]:
# Validation set, built with the training Dataset as its reference.
validation_data = gbm.Dataset(
    data=X_test,
    label=y_test,
    reference=train_data,
)

In [None]:
# LightGBM hyper-parameters for the binary classifier.
lgbm_params = dict(
    # Task / booster
    boosting='gbdt',
    application='binary',
    metric='binary_logloss',
    learning_rate=0.005,
    # Regularisation
    lambda_l1=5e-05,
    lambda_l2=1.35e-08,
    # Tree shape
    num_leaves=300,
    min_child_samples=100,
    # Column / row subsampling
    feature_fraction=0.75,
    bagging_fraction=0.4,
    bagging_freq=5,
)


In [None]:
# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 3000
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of `train` were
# deprecated and later removed (LightGBM 4.0), so the old call raises
# TypeError on current releases. `log_evaluation` requires LightGBM >= 3.3.
bst = gbm.train(
    lgbm_params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[validation_data],
    callbacks=[
        # Stop once the validation metric has not improved for 10 rounds.
        gbm.early_stopping(stopping_rounds=10),
        # Print the validation metric every 500 rounds.
        gbm.log_evaluation(period=500),
    ],
)

# Prediction and evaluation

In [None]:
# A test frame that carries a "target" column means we are running locally
# with labels available; in that case the label is dropped alongside "id".
local_flag = "target" in df_test.columns
non_feature_cols = ["id", "target"] if local_flag else ["id"]
df_test2 = df_test.drop(non_feature_cols, axis=1)

In [None]:
# for col in cat_col:
#     df_test2[col] = df_test2[col].apply(hash_cat_two)
#     print("done ", col)

In [None]:
# ?bst

In [None]:
# Preview the first five rows of the test features.
df_test2.head(n=5)

In [None]:
# Score the test features using the booster's recorded best iteration.
best_round = bst.best_iteration
pred = bst.predict(df_test2, num_iteration=best_round)

In [None]:
# Display the test feature frame (id/target removed) that was passed to predict.
df_test2

In [None]:
# Start the output frame from the test ids. The explicit copy makes it an
# independent DataFrame, so adding a column to it later does not write into a
# slice of df_test (which would trigger pandas' SettingWithCopyWarning).
df_pred_final = df_test[["id"]].copy()

In [None]:
# Attach the model's predictions as the "target" column.
df_pred_final["target"] = pred

In [None]:
# Preview the first five rows of the prediction frame.
df_pred_final.head(n=5)

In [None]:
# With labels available locally, report ROC AUC; otherwise write the submission file.
if local_flag:
    from sklearn.metrics import roc_auc_score
    score = roc_auc_score(df_test["target"], pred)
    print(score)
else:
    df_pred_final.to_csv('submission.csv', index=False)

In [None]:
# y_test_label = df_test["target"]

In [None]:
# roc_auc_score(y_test_label, pred)

In [None]:
# Number of trees held by the trained booster.
bst.num_trees()


In [None]:
# Best iteration recorded on the booster (the value passed as num_iteration when predicting).
bst.best_iteration