In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import optuna

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training frame, the test frame and the submission template,
# in that order, from the competition input directory.
df_train, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Preview the first five rows of the submission template.
sample_submission.head(n=5)

In [None]:
# Feature columns: every column of the test frame except its first one
# (presumably the row identifier -- confirm against the CSV header).
columns = df_test.columns[1:]

# Same column selection for both frames, converted to plain NumPy arrays.
X, X_test = (frame[columns].to_numpy() for frame in (df_train, df_test))

# Labels as a flat 1-D array.
target = df_train['claim'].to_numpy().ravel()

In [None]:
# Learn per-column means from the training matrix only, then fill NaNs in
# both the training and the test matrices with those same means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X)
X, X_test = imputer.transform(X), imputer.transform(X_test)

In [None]:
def objective(trial, data=X, target=target):
    """Optuna objective: hold-out ROC AUC of a LightGBM model for one trial.

    The feature matrix is split 80/20 with a fixed seed, a LightGBM booster is
    trained on the 80% part with hyper-parameters sampled from ``trial``, and
    the ROC AUC on the remaining 20% is returned for Optuna to maximise.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``reg_alpha``, ``reg_lambda``, ``learning_rate``,
        ``colsample_bytree``, ``subsample``, ``max_depth``, ``num_leaves``
        and ``min_child_samples``.
    data : array-like
        Feature matrix to split; defaults to the module-level ``X``.
    target : array-like
        Binary labels aligned with ``data``; defaults to the module-level
        ``target``.

    Returns
    -------
    float
        ROC AUC of the trained model on the hold-out split.
    """
    # Split the `data` argument rather than the global X, so that a caller
    # passing a different matrix is actually honoured.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    dtrain = lgb.Dataset(train_x, label=train_y)

    params = {
        # lgb.train defaults to a regression objective when none is given.
        # The final model is an LGBMClassifier (binary log-loss), so tune
        # under the same loss that will be used for the real fit.
        'objective': 'binary',
        'metric': 'AUC',
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
        # NOTE(review): LightGBM documents that row subsampling only takes
        # effect when `subsample_freq` (bagging_freq) is non-zero; it is not
        # set here -- confirm whether tuning `subsample` is intended.
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 200),
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }
    # Trained with the library's default number of boosting rounds.
    model = lgb.train(params, dtrain)

    # With the binary objective the booster returns positive-class
    # probabilities, which is the score roc_auc_score expects.
    preds = model.predict(test_x)

    auc = roc_auc_score(test_y, preds)

    return auc

In [None]:
# Run a 25-trial search that maximises the value returned by `objective`
# (the hold-out AUC), then report how many trials ran and the best settings.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=25)

finished_trials = len(study.trials)
best_trial_params = study.best_trial.params
print('Number of finished trials:', finished_trials)
print('Best trial:', best_trial_params)

In [None]:
# Display the study's trials as a table (one row per trial) for inspection.
study.trials_dataframe()

In [None]:
# Final model settings: the best hyper-parameters found by the study plus the
# evaluation metric, which was fixed (not searched) during tuning.
params = {**study.best_params, 'metric': 'AUC'}
print(params)

In [None]:
%%time
# 10-fold stratified cross-validation with the tuned parameters.
# For each fold: fit on the training part, score the validation part with
# ROC AUC, and add this fold's share of the test-set prediction so that
# `test_preds` ends up as the mean over all folds.
test_preds = np.zeros((df_test.shape[0],))
kf = StratifiedKFold(n_splits=10, random_state=48, shuffle=True)
auc = []
for n, (trn_idx, test_idx) in enumerate(kf.split(X, target), start=1):
    X_tr, X_val = X[trn_idx], X[test_idx]
    y_tr, y_val = target[trn_idx], target[test_idx]
    model = lgb.LGBMClassifier(**params,
                                device='gpu')
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # the LightGBM 3.x sklearn API; newer releases expect callbacks instead --
    # confirm against the installed version.
    # NOTE(review): n_estimators is left at its default while early stopping
    # waits 100 rounds -- verify that early stopping can actually trigger.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    # Use positive-class probabilities, not predict(): LGBMClassifier.predict
    # returns hard class labels, which would make the AUC a score of 0/1
    # labels and the submission an average of labels instead of a ranking.
    test_preds += model.predict_proba(X_test)[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    print(n, auc[-1])

In [None]:
# Put the fold-averaged test predictions into the template's `claim` column,
# write the submission file without the index, and preview the result.
sample_submission = sample_submission.assign(claim=test_preds)
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head()