In [None]:
import numpy as np 
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from skimage.filters import threshold_otsu
import lightgbm as lgb
import gc
from tqdm import tqdm

# Random seed for the notebook; passed as `random_state` to the StratifiedKFold
# that drives the final cross-validated training.
SEED = 0

In [None]:
# Read the competition train/test tables, using the `id` column as the index
# so that it is not carried along as a data column.
train_path = "/kaggle/input/tabular-playground-series-nov-2021/train.csv"
test_path = "/kaggle/input/tabular-playground-series-nov-2021/test.csv"
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (train_path, test_path)
)

In [None]:
# Names of the raw input columns: every training column whose name starts with "f".
features = [
    column_name
    for column_name in train.columns.values
    if column_name[0] == "f"
]

In [None]:
def add_row_stats(frame, columns):
    """Add per-row summary statistics of `columns` to `frame`, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to extend; six new columns are written onto it.
    columns : list of str
        Columns the statistics are computed over. The new columns are not
        part of this list, so they never feed into one another.

    Returns
    -------
    None
        `frame` is modified in place.
    """
    values = frame[columns]
    frame['abs_sum'] = values.abs().sum(axis=1)
    frame['sem'] = values.sem(axis=1)
    frame['std'] = values.std(axis=1)
    frame['avg'] = values.mean(axis=1)
    frame['max'] = values.max(axis=1)
    frame['min'] = values.min(axis=1)


# Same statistics, same column order, for both tables.
add_row_stats(train, features)
add_row_stats(test, features)

In [None]:
# Separate the label from the predictors; the test frame is used as-is.
y = train["target"]
X = train.drop(columns=["target"])
X_test = test

In [None]:
# Fit the scaler on the training rows only, then apply the identical transform
# to both sets. `X` and `X_test` are rebound to the transformed output.
scaler = RobustScaler()
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
# The source frames and the fitted scaler are not used again below;
# drop the names and ask the collector to reclaim the memory.
del test
del train
del scaler
gc.collect()

In [None]:
import optuna
from optuna.samplers import TPESampler
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# Single stratified 70/30 hold-out split used for the hyperparameter search.
# `objective` captures these four names as default arguments at the moment it
# is defined. The seed comes from the notebook-level SEED constant instead of
# a second hard-coded literal.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

def objective(trial, X_train=X_train, X_valid=X_valid, y_train=y_train, y_valid=y_valid):
    """
    Train one LightGBM classifier with hyperparameters sampled by Optuna and
    score it on the validation data.

    The data arguments default to the hold-out split that exists when this
    function is defined, so Optuna can call it with only `trial`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every hyperparameter below.
    X_train, y_train
        Data the model is fitted on.
    X_valid, y_valid
        Data used for early stopping and for the returned score.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on the validation data
        (higher is better).
    """
    # Search space. The single-choice categoricals are fixed settings; they go
    # through the trial so that `trial.params` holds the complete keyword set
    # that is later passed to LGBMClassifier.
    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ['binary']),
        "boosting_type": trial.suggest_categorical("boosting_type", ['gbdt']),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, step=0.001),
        "n_estimators": trial.suggest_int("n_estimators", 2000, 15000),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 50.0, step=0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 200.0, step=0.1),
        "random_state": trial.suggest_categorical("random_state", [0]),
        "bagging_seed": trial.suggest_categorical("bagging_seed", [0]),
        "feature_fraction_seed": trial.suggest_categorical("feature_fraction_seed", [0]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.01),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1, step=0.001),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10, step=1e-3),
        'metric': trial.suggest_categorical('metric', ['AUC'])
    }

    # Early stopping is requested through a callback: the `early_stopping_rounds`
    # and `verbose` keyword arguments of `fit` were removed in LightGBM 4.0.
    model = LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric="auc",
              callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)])

    print(f"Number of boosting rounds: {model.best_iteration_}")
    valid_proba = model.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, valid_proba)

In [None]:
def create_optuna_study(objective, study_name, train_time):
    """Run a time-boxed Optuna search that maximises `objective`.

    Parameters
    ----------
    objective : callable
        Function taking an Optuna trial and returning the score to maximise.
    study_name : str
        Name given to the created study.
    train_time : int or float
        Passed to `study.optimize` as `timeout`.

    Returns
    -------
    tuple
        `(best_trial, study)`.
    """
    # Seed the sampler from the notebook-level SEED so that the sequence of
    # suggested hyperparameters does not change from run to run.
    study = optuna.create_study(direction='maximize',
                                sampler=TPESampler(seed=SEED),
                                study_name=study_name)
    study.optimize(objective, timeout=train_time)
    best = study.best_trial

    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    print('\tValue: {}'.format(best.value))
    print('\tParams: ')
    for key, value in best.params.items():
        print("\t\t'{}': {},".format(key, value))

    return best, study

In [None]:
# Choose where the model hyperparameters come from: a time-boxed Optuna search,
# or a fixed parameter set that skips the search.

train_time = 1 * 60 * 60  # passed to the search as its timeout: 1 hour in seconds
OPTUNA = True

# Optimize
if OPTUNA:
    trial, study = create_optuna_study(objective, 'Trial', train_time)
    params = trial.params
else:
    # String-valued settings must be quoted; as bare names they raise
    # NameError as soon as this branch runs.
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'num_leaves': 233,
        'max_depth': 2,
        'learning_rate': 0.059000000000000004,
        'n_estimators': 12003,
        'reg_alpha': 4.9,
        'reg_lambda': 116.1,
        'random_state': 0,
        'bagging_seed': 0,
        'feature_fraction_seed': 0,
        'n_jobs': 4,
        'subsample': 0.86,
        'subsample_freq': 4,
        'colsample_bytree': 0.997,
        'min_child_samples': 91,
        'min_child_weight': 5.48,
        'metric': 'AUC',
    }

In [None]:
%%time

splits = 10
kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=SEED)

# Running average of the positive-class probability for each test row,
# accumulated over the fold models.
preds = np.zeros(len(X_test))

for train_idx, valid_idx in kf.split(X, y):
    # The fold indices are positional. `y` is a pandas Series indexed by `id`,
    # so it must be sliced with .iloc; plain `y[idx]` is a label lookup and is
    # only correct when the ids happen to be 0..n-1.
    # Fold-local names keep the earlier hold-out split (X_train, X_valid, ...)
    # from being overwritten.
    X_fold_train, y_fold_train = X[train_idx], y.iloc[train_idx]
    X_fold_valid, y_fold_valid = X[valid_idx], y.iloc[valid_idx]

    print('#' * 40)

    # Early stopping and periodic logging are requested through callbacks:
    # the `early_stopping_rounds` and `verbose` keyword arguments of `fit`
    # were removed in LightGBM 4.0.
    model = LGBMClassifier(**params)
    model.fit(X_fold_train, y_fold_train,
              eval_set=[(X_fold_valid, y_fold_valid)],
              eval_metric="auc",
              callbacks=[lgb.early_stopping(stopping_rounds=10),
                         lgb.log_evaluation(period=200)])

    # Average probabilities, not hard 0/1 labels: `predict` returns class
    # labels, which discards the ranking information an AUC score depends on.
    preds += model.predict_proba(X_test)[:, 1] / splits

    gc.collect()

In [None]:
# Write the submission file: load the sample submission (indexed by `id`) and
# overwrite its `target` column with the averaged test predictions.
# NOTE(review): the assignment is positional, so this assumes the sample
# submission rows are in the same order as test.csv; confirm.
submission_path = '../input/tabular-playground-series-nov-2021/sample_submission.csv'
submission = pd.read_csv(submission_path, index_col='id').assign(target=preds)
submission.to_csv('submission.csv')