In [None]:
import numpy as np 
import pandas as pd 
import os
import random
import warnings
warnings.simplefilter('ignore')
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
import optuna

In [None]:
# Read the competition's train / test / sample-submission CSVs, in that order.
train_df, test_df, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

In [None]:
# Cast every object/category column of both frames to pandas' 'category' dtype
# (train first, then test), assigning the converted column back in place.
for frame in (train_df, test_df):
    for column in frame.columns:
        dtype = frame[column].dtype
        if dtype == 'object' or dtype.name == 'category':
            frame[column] = frame[column].astype('category')

In [None]:
# Print the training frame's column dtypes and non-null counts (run after the categorical cast above).
train_df.info()

In [None]:
# Print the test frame's column dtypes and non-null counts, for comparison with the training frame.
test_df.info()

In [None]:
# Display summary statistics for the training frame (describe() defaults to the numeric columns).
train_df.describe()

In [None]:
# Display summary statistics for the test frame (describe() defaults to the numeric columns).
test_df.describe()

In [None]:
def train_fn(params, dataset, callbacks):
    """Cross-validate a LightGBM configuration and report its best mean log-loss.

    Runs ``lgb.cv`` for at most 20,000 boosting rounds with early stopping after
    100 rounds, tracking the ``binary_logloss`` metric.

    Args:
        params: LightGBM parameter dict forwarded to ``lgb.cv``.
        dataset: the ``lgb.Dataset`` to cross-validate on.
        callbacks: list of LightGBM callbacks, or ``None``.

    Returns:
        A ``(score, cvbooster)`` tuple: the minimum of the per-round mean
        ``binary_logloss`` values, and the ``'cvbooster'`` entry of the CV result.
    """
    cv_options = dict(
        return_cvbooster=True,
        num_boost_round=20_000,
        early_stopping_rounds=100,
        metrics='binary_logloss',
        callbacks=callbacks,
    )
    cv_result = lgb.cv(params, dataset, **cv_options)
    best_logloss = min(cv_result["binary_logloss-mean"])
    return best_logloss, cv_result['cvbooster']

In [None]:
# Wrap the training data for LightGBM: every column except 'Survived' is a
# feature, and 'Survived' is the label.
train_dataset = lgb.Dataset(
    train_df.drop(columns=['Survived']),
    label=train_df['Survived'],
)

In [None]:
# Wrap the test frame in a LightGBM Dataset.
# NOTE(review): no later cell in this notebook references test_dataset; the
# prediction cell passes test_df to predict() directly.
test_dataset = lgb.Dataset(test_df)

In [None]:
# Reference: https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization

def objective(trial):
    """Optuna objective: cross-validated log-loss of one sampled LightGBM configuration.

    Args:
        trial: the Optuna trial used to sample hyper-parameters; it is also handed
            to the LightGBM pruning callback.

    Returns:
        The score returned by ``train_fn`` for the sampled parameters (lower is better).
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 256),
        'objective': 'binary',
        # Every suggestion needs its own name: Optuna returns the first sampled
        # value when a name is suggested twice in a trial, so registering the
        # learning rate as 'lambda_l1' tied lambda_l1 to the learning rate and
        # left no 'learning_rate' entry in the trial's recorded parameters.
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        "bagging_freq": trial.suggest_int('bagging_freq', 1, 10),
        "bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'cat_smooth': trial.suggest_loguniform('cat_smooth', 1.0, 100.0),
        'cat_l2': trial.suggest_loguniform('cat_l2', 1e-8, 10.0),
        'path_smooth': trial.suggest_float('path_smooth', 0.0, 10.0),
        "verbosity": -1,
    }

    # Lets Optuna stop unpromising trials early based on the reported binary_logloss.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'binary_logloss')

    score, _ = train_fn(params, train_dataset, [pruning_callback])
    return score

In [None]:
# Search budget passed to Optuna as `timeout` (seconds).
SEARCH_TIMEOUT_SECONDS = 30000

# `objective` returns a log-loss, so the study minimises it (Optuna's default
# direction, spelled out here for the reader).
study = optuna.create_study(direction='minimize')
study.optimize(objective, timeout=SEARCH_TIMEOUT_SECONDS)

In [None]:
# Report the best trial's objective value and the parameters Optuna sampled for it.
best_trial = study.best_trial
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

In [None]:
# Tabulate all trials of the study as a DataFrame and display it.
trials_df = study.trials_dataframe()
trials_df

In [None]:
# Plot the objective value of each trial over the course of the search.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the intermediate values reported within each trial
# (presumably the per-round CV metric fed to the pruning callback — verify).
optuna.visualization.plot_intermediate_values(study)

In [None]:
# Plot each sampled parameter against the objective value, one panel per parameter.
optuna.visualization.plot_slice(study)

In [None]:
# Plot the sampled parameter combinations and their objective values as parallel coordinates.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# study.best_trial.params only contains the values Optuna sampled, so the fixed
# settings used during the search must be restored here. Without 'objective',
# LightGBM falls back to its default (regression) objective for the final model.
final_params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'verbosity': -1,
    **study.best_trial.params,
}

# Re-run the cross-validation with the best configuration (no pruning callback)
# and keep the resulting CV booster for prediction.
score, model = train_fn(final_params, train_dataset, None)
print(score)

In [None]:
# `model` is the CV booster returned by train_fn; predict() is forwarded to every
# fold booster and yields one prediction array per fold. Early stopping records the
# best round on the CV booster itself, so pass it explicitly instead of scoring
# with the extra rounds trained after the optimum (-1 means "use all rounds").
preds = model.predict(test_df, num_iteration=model.best_iteration)

# Average the fold predictions, threshold at 0.5 to get the 0/1 label, and write
# the submission file.
sample_sub['Survived'] = np.where(np.mean(preds, axis=0) > 0.5, 1, 0)
sample_sub.to_csv('submission.csv', index=False)