In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import optuna

import eli5
from eli5.sklearn import PermutationImportance

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
pip install feature-engine

In [None]:
from feature_engine import encoding

In [None]:
# Load the training table, using the first CSV column as the row index,
# then separate the label column from the feature columns.
df = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/train.csv',
    index_col=[0],
)
y_train = df['target']
X_train = df.drop(columns='target')

In [None]:
# Load the test features, using the first CSV column as the row index.
X_test = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/test.csv',
    index_col=[0],
)

In [None]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, everything else as numeric. Column order is preserved.
vars_num = [name for name, kind in X_train.dtypes.items() if kind != 'O']
vars_cat = [name for name, kind in X_train.dtypes.items() if kind == 'O']

### Pipeline for preprocessing with Feature-Engine

In [None]:
# Two-step preprocessing applied to the categorical columns only:
#   1. RareLabelEncoder  - handles infrequent categories (tol=0.01, n_categories=4;
#      see the Feature-engine documentation for the exact grouping rule).
#   2. OrdinalEncoder    - replaces each category by an integer; with
#      encoding_method='ordered' the target passed to fit() drives the ordering.
pre_pipe = Pipeline(steps=[
    (
        'encoder_rare_label',
        encoding.RareLabelEncoder(tol=0.01, n_categories=4, variables=vars_cat),
    ),
    (
        'categorical_encoder',
        encoding.OrdinalEncoder(encoding_method='ordered', variables=vars_cat),
    ),
])

In [None]:
# Learn the encodings from the full training set (the target is passed because
# the pipeline's ordinal encoder is configured with encoding_method='ordered').
# NOTE(review): the Optuna objective later carves its validation split out of
# this same X_train, so those validation rows have already influenced the
# encoding - confirm this is acceptable.
pre_pipe.fit(X=X_train, y=y_train)

In [None]:
# Encode both feature frames with the fitted pipeline.
# The frames are overwritten in place of the raw ones, so re-running this cell
# would feed already-encoded data back into the pipeline; reload the CSVs first.
X_train, X_test = (pre_pipe.transform(frame) for frame in (X_train, X_test))

In [None]:
# Preview the first rows of the encoded training features.
X_train.head(n=5)

In [None]:
# Preview the first rows of the encoded test features.
X_test.head(n=5)

### Parameter Search with Optuna

In [None]:
def objective(trial, data=X_train, target=y_train):
    """Optuna objective: hold-out ROC AUC of a LightGBM classifier.

    One stratified 80/20 split of ``data``/``target`` is drawn with a fixed
    seed, so every trial is fitted and scored on the same rows. The model is
    trained with early stopping (100 rounds) monitored on the validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyper-parameters.
    data : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``. Defaults to the
        global ``X_train`` as it was when this function was defined.
    target : pandas.Series
        Labels aligned row-by-row with ``data``. Defaults to the global
        ``y_train`` as it was when this function was defined.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation part of the split.

    Side effects
    ------------
    Stores the model's ``best_iteration_`` on the trial as the user attribute
    ``'best_iteration'`` so the early-stopped round count is not lost.
    """
    seed = 1234
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

    # n_splits=1: the generator yields exactly one (train, valid) index pair.
    fit_idx, valid_idx = next(splitter.split(data, target))
    X_fit, y_fit = data.iloc[fit_idx], target.iloc[fit_idx]
    X_valid, y_valid = data.iloc[valid_idx], target.iloc[valid_idx]

    lgbm_params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 170, 250),
        'min_child_samples': trial.suggest_int('min_child_samples', 40, 60),
        'max_depth': trial.suggest_int('max_depth', 15, 25),
        # Single-choice categorical: keeps the fixed learning rate inside
        # trial.params so it travels with the best parameters.
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 4500),
        'random_state': seed,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        # 'device': 'gpu'
    }

    model = LGBMClassifier(**lgbm_params)

    # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keywords of
    # the LightGBM version this notebook was written for - confirm the
    # installed version still accepts them.
    model.fit(
        X_fit,
        y_fit,
        early_stopping_rounds=100,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )

    # n_estimators is only an upper bound here because of early stopping;
    # keep the round count that was actually selected alongside the trial.
    trial.set_user_attr('best_iteration', model.best_iteration_)

    y_valid_pred = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_valid_pred)

In [None]:
# Run the hyper-parameter search.
# The sampler is seeded explicitly (same value as the model/split seed used in
# this notebook) so that a sequential re-run proposes the same trials.
# TPESampler is also what Optuna falls back to when no sampler is given, so
# the search algorithm itself is unchanged.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1234),
)
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Objective value per trial, to see how the search converged.
optuna.visualization.plot_optimization_history(study=study)

In [None]:
# Estimated importance of each searched hyper-parameter for the objective.
optuna.visualization.plot_param_importances(study=study)

In [None]:
# Parameters for the final model: the best trial's sampled values plus the
# fixed settings that were not part of the search space.
seed = 1234
paramsLGBM = {
    **study.best_trial.params,
    'boosting_type': 'gbdt',
    'random_state': seed,
    # 'device': 'gpu'
}

In [None]:
# Display the final parameter dictionary passed to LGBMClassifier.
paramsLGBM

In [None]:
# Fit the final model on the complete (encoded) training set.
# The test-set prediction is intentionally left to the following cell, which
# computes exactly the same `y_test_pred`; doing it here as well only repeated
# the work.
# NOTE(review): there is no eval_set / early stopping in this fit, so all
# `n_estimators` rounds from the best trial are trained, whereas the tuning
# objective stopped early on its validation split - confirm this is intended.
model = LGBMClassifier(**paramsLGBM).fit(X_train, y_train)

In [None]:
# Predicted class probabilities for the test rows; column index 1 is kept
# (the second class in the model's class ordering).
test_proba = model.predict_proba(X_test)
y_test_pred = test_proba[:, 1]

### Submission

In [None]:
# Build the submission table: the test index becomes a regular column and the
# predictions go into a column named 'target'; then write it without the
# positional row index.
target_by_id = pd.Series(y_test_pred, index=X_test.index, name='target')
sub = target_by_id.reset_index()
sub.to_csv('optuna_final.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
sub