In [None]:
import lightgbm
import numpy as np
import optuna
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the three competition CSVs (train, test, sample submission) from the input directory.
path = '../input/tabular-playground-series-mar-2021/'
train , test , sub = (
    pd.read_csv(f'{path}{stem}.csv')
    for stem in ('train' , 'test' , 'sample_submission')
)

In [None]:
# Feature groups are picked by substring match on the column names:
# 'cat' columns are taken from train, 'cont' columns from test.
cat = list(filter(lambda name: 'cat' in name , train.columns))
cont = list(filter(lambda name: 'cont' in name , test.columns))
# Categorical columns come first, then the continuous ones.
all_features = [*cat , *cont]

In [None]:
# Scratch cell: the integers 0..18 as a list.
# NOTE(review): presumably the column positions of the categorical features — confirm.
list(range(19))

# Label Encoder

In [None]:
# Label-encode every categorical column on train and test stacked together,
# so that both frames share a single category -> integer mapping.
# Remember the train row count up front; it is the split point when un-stacking.
n_train = train.shape[0]
all_df = pd.concat([train , test]).reset_index(drop = True)

le = LabelEncoder()
for col in cat:
    # fit_transform re-fits the encoder on each column, so one instance can be reused.
    all_df[col] = le.fit_transform(all_df[col])

# Split back into the original row blocks. Explicit copies make train/test
# independent frames rather than slices of all_df, so later column assignments
# on them are unambiguous.
train = all_df.iloc[:n_train].copy()
test = all_df.iloc[n_train:].copy()

In [None]:
# Model inputs: the selected feature columns and the 'target' label column of train.
data = train.loc[: , all_features]
target = train.loc[: , 'target']

# optuna

In [None]:
def objective(trial , data = data , target = target):
    """Optuna objective: hold-out ROC AUC of one LightGBM configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : feature table to split (defaults to the notebook-level ``data``).
    target : labels aligned with ``data`` (defaults to the notebook-level ``target``).

    Returns
    -------
    float
        ROC AUC of the fitted model on the held-out split; the study should
        be created with ``direction='maximize'``.
    """
    # Fixed split (random_state = 42) so every trial is scored on the same rows.
    train_x , test_x , train_y , test_y = train_test_split(
        data , target , test_size = 0.028059109276941666 , random_state = 42)

    params = {
        'reg_alpha' : trial.suggest_float('reg_alpha' , 1e-5 , 10 , log = True),
        'reg_lambda' : trial.suggest_float('reg_lambda' , 1e-5 , 10 , log = True),
        'num_leaves' : trial.suggest_int('num_leaves' , 11 , 300),
        # LightGBM requires learning_rate > 0, so the range must not start at 0.
        'learning_rate' : trial.suggest_float('learning_rate' , 1e-3 , 0.1),
        'max_depth' : trial.suggest_int('max_depth' , 5 , 20),
        # Upper bound only: early stopping below decides the effective tree count.
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 100),
        'min_child_weight' : trial.suggest_float('min_child_weight' , 1e-5 , 1 , log = True),
        # The row-sampling fraction must be strictly positive as well.
        'subsample' : trial.suggest_float('subsample' , 0.1 , 1.0),
        # Row subsampling is only applied when the bagging frequency is > 0;
        # without this the sampled 'subsample' value would have no effect.
        'subsample_freq' : 1,
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , 1e-5 , 1 , log = True),
        # NOTE(review): the seed is sampled like a hyperparameter, so the study can
        # end up preferring a seed that merely suits this particular split.
        'random_state' : trial.suggest_categorical('random_state' , [0,42,2021,555]),
        'metric' : 'auc',
        'device_type' : 'gpu',
    }
    model = lightgbm.LGBMClassifier(**params)
    # NOTE(review): the early_stopping_rounds / verbose fit arguments are not accepted
    # by LightGBM >= 4.0; switch to callbacks if the environment is upgraded.
    model.fit(train_x , train_y , eval_set = [(test_x , test_y)] , early_stopping_rounds = 200 ,
              verbose = False)
    preds = model.predict_proba(test_x)[:,1]
    return roc_auc_score(test_y , preds)

In [None]:
# study = optuna.create_study(direction = 'maximize' , study_name = 'lgbm')
# study.optimize(objective , n_trials = 50)
# print('numbers of the finished trials:' , len(study.trials))
# print('the best params:' , study.best_trial.params)
# print('the best value:' , study.best_value)

In [None]:
# import plotly
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline


In [None]:
# optuna.visualization.plot_optimization_history(study)

In [None]:
# optuna.visualization.plot_param_importances(study)

Parameters copied from https://www.kaggle.com/vitnam/mar-2021-single-lgbm

In [None]:
# Parameter source: https://www.kaggle.com/vitnam/mar-2021-single-lgbm
params = {
    # regularisation
    'reg_alpha': 4.203457823159052,
    'reg_lambda': 6.34173530304477,
    # tree shape
    'num_leaves': 148,
    'min_child_samples': 55,
    'max_depth': 16,
    # boosting schedule
    'learning_rate': 0.01,
    'colsample_bytree': 0.22290988791359692,
    'n_estimators': 2703,
    # categorical-feature handling; the indices 0..18 are column positions
    'cat_smooth': 37,
    'cat_l2': 10,
    'min_data_per_group': 97,
    'cat_feature': list(range(19)),
    # runtime
    'device': 'gpu',
    'random_state': 26,
    'n_jobs': -1,
    'boosting_type': 'gbdt',
    'metric': 'AUC',
}

In [None]:
# The best score I obtained was 0.89211,
# with the parameters below.
# params = {
#     'reg_alpha': 0.014335154764390193, 'reg_lambda': 0.008054411322239597, 'num_leaves': 214, \
#     'learning_rate': 0.007986033003932509, 'max_depth': 14, 'n_estimators': 5581, \
#     'min_child_samples': 94, 'min_child_weight': 4.579485567290539e-05, \
#     'subsample': 0.3380435315962088, 'colsample_bytree': 0.16829994859168315, 'random_state': 555
# }
# params['metric'] = 'auc'
# params['device'] = 'gpu'


In [None]:
# 20-fold stratified cross-validation.
#   preds     : test-set probabilities, averaged over the fold models
#   oof_preds : out-of-fold probabilities; each training row is predicted only
#               by the one model that did not see it during fitting
#   roc       : validation AUC of every fold
preds = np.zeros(test.shape[0])
oof_preds = np.zeros(train.shape[0])
kf = StratifiedKFold(n_splits = 20 , random_state = 0 , shuffle = True)
roc = []
# The test feature matrix is the same for every fold, so select it once.
test_features = test[all_features]
for fold , (trn_idx , val_idx) in enumerate(kf.split(data , target) , start = 1):
    train_x , train_y = data.iloc[trn_idx] , target.iloc[trn_idx]
    val_x , val_y = data.iloc[val_idx] , target.iloc[val_idx]

    model = lightgbm.LGBMClassifier(**params)
    # NOTE(review): the early_stopping_rounds / verbose fit arguments are not accepted
    # by LightGBM >= 4.0; switch to callbacks if the environment is upgraded.
    model.fit(train_x , train_y , eval_set = [(val_x , val_y)] , early_stopping_rounds = 200 ,
              verbose = False)

    preds += model.predict_proba(test_features)[:,1] / kf.n_splits

    # Write only the held-out rows. Predicting the whole training set in every
    # fold and averaging would mix in-sample predictions into the "OOF" vector.
    val_preds = model.predict_proba(val_x)[:,1]
    oof_preds[val_idx] = val_preds

    roc.append(roc_auc_score(val_y , val_preds))
    print(fold , roc[-1])

# Every row has now been predicted exactly once, out of fold.
print('OOF AUC:' , roc_auc_score(target , oof_preds))

In [None]:
# Put the averaged test predictions into the 'target' column and save the submission file.
sub = sub.assign(target = preds)
sub.to_csv('slgbmsubmission.csv' , index = False)

In [None]:
# Pair each training id with its entry in oof_preds and save the table.
output = train[['id']].assign(target = oof_preds)
output.to_csv('slgbmoof_predictions.csv' , index = False)

In [None]:
# Show the submission frame as the cell output for a final visual check.
sub