In [None]:
import pandas as pd

### Load Data

In [None]:
# Show up to 32 columns whenever a DataFrame is displayed
pd.options.display.max_columns = 32

# Read the training data, the test data and the sample submission file
train = pd.read_csv("../input/tabular-playground-series-mar-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv")
submit = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv")

In [None]:
# Row counts of the three loaded frames, printed on a single line
print(*map(len, (train, test, submit)))

In [None]:
# Preview the first five training rows
train.head(n=5)

In [None]:
# Preview the first five test rows
test.head(n=5)

### Check for missing data

In [None]:
# Separate the label column from the feature columns of the training data
train_work = train.drop(columns='target')
y_train = train['target']
train_work.head(n=5)

In [None]:
# Stack the training features on top of the test rows, keeping only the columns both share
marge_data = pd.concat(objs=[train_work, test], join='inner')
print(marge_data.shape[0])

In [None]:
# Number of missing values per column across the combined train + test rows
marge_data.isna().sum()

We were able to confirm that there was no missing data.

### Feature Engineering with Xfeat

In [None]:
!pip install --quiet git+https://github.com/pfnet-research/xfeat.git

In [None]:
from xfeat import SelectCategorical, LabelEncoder, Pipeline, SelectNumerical, GBDTFeatureSelector, GBDTFeatureExplorer
# Keep only the categorical columns of the combined frame and preview them
categorical_df = SelectCategorical().fit_transform(marge_data)
categorical_df.head(n=5)

In [None]:
# Keep only the numerical columns of the combined frame and preview them
numerical_df = SelectNumerical().fit_transform(marge_data)
numerical_df.head(n=5)

In [None]:
# Label-encode the categorical columns: select them, then encode with an empty output suffix
encoder = Pipeline([SelectCategorical(), LabelEncoder(output_suffix='')])
encoded_df = encoder.fit_transform(marge_data)
encoded_df.head(n=5)

In [None]:
# Place the numerical columns and the label-encoded columns side by side
marge_data_encoded = pd.concat(objs=[numerical_df, encoded_df], axis='columns')
marge_data_encoded.head(n=5)

In [None]:
# The combined frame was built as train rows followed by test rows,
# so the first len(train) positions are the training part and the rest is the test part.
X_train = marge_data_encoded.iloc[:len(train)]
X_test = marge_data_encoded.iloc[len(train):]

### Feature selection and hyperparameter search using xfeat and Optuna

In [None]:
from functools import partial
import optuna
import lightgbm as lgb

# Fixed LightGBM settings passed to the feature explorer and merged into every trial's parameters
LGBM_PARAMS = dict(
    objective='binary',
    metric='binary_error',
    verbosity=-1,
)


def objective(df, selector, trial):
    """Optuna objective: select features for this trial and score them with LightGBM CV.

    Args:
        df: DataFrame containing the candidate feature columns and a 'target' column.
        selector: feature explorer providing set_trial / fit / get_selected_cols.
        trial: Optuna trial used to sample num_leaves and max_depth.

    Returns:
        1 minus the mean cross-validated binary error after the last boosting round.
    """
    # Let the explorer pick the feature subset for this trial
    selector.set_trial(trial)
    selector.fit(df)
    chosen_cols = selector.get_selected_cols()

    # Sampled tree-shape parameters first, then the fixed base settings on top
    trial_params = {
        'num_leaves': trial.suggest_int('num_leaves', 3, 10),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        **LGBM_PARAMS,
    }

    # Cross-validate on the chosen columns only
    cv_data = lgb.Dataset(df[chosen_cols], label=df['target'])
    cv_result = lgb.cv(trial_params, cv_data, num_boost_round=100, stratified=False, seed=1)

    final_error = cv_result['binary_error-mean'][-1]
    return 1 - final_error


# Create a feature searcher.
selector = GBDTFeatureExplorer(
    input_cols=X_train.columns.tolist(),
    target_col='target',
    fit_once=True,
    threshold_range=(0.8, 1.0),
    lgbm_params=LGBM_PARAMS,
)

# Hyper Parameter Tuning
# objective() returns 1 - binary_error (i.e. accuracy), so higher is better and the
# study has to maximize it; minimizing would select the worst-performing trial.
study = optuna.create_study(direction='maximize')
study.optimize(
    partial(objective, pd.concat([X_train, y_train], axis=1), selector),
    n_trials=100,
)

# Check the selected features.
selector.from_trial(study.best_trial)

In [None]:
# Feature columns reported by the explorer after it was restored from the study's best trial
print('Selected columns:', selector.get_selected_cols())

In [None]:
# Parameter values sampled in the study's best trial
print(study.best_trial.params)

In [None]:
# Objective value (1 - mean CV binary error) of the study's best trial
print(study.best_trial.value)

In [None]:
# LightGBM settings for the final model, using the tree-shape values found by the study
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error',
    num_leaves=study.best_params['num_leaves'],
    max_depth=study.best_params['max_depth'],
    verbose=0,
)

In [None]:
# Training matrix restricted to the columns kept by the feature explorer
lgb_train_data = lgb.Dataset(
    X_train[selector.get_selected_cols()],
    label=y_train,
    feature_name='auto',
)

# 5-fold stratified cross-validation over 2000 boosting rounds, logging every 50 rounds
lgb_cv = lgb.cv(
    params,
    lgb_train_data,
    num_boost_round=2000,
    nfold=5,
    stratified=True,
    verbose_eval=50,
    seed=23,
)

In [None]:
# Lowest mean binary error reached at any boosting round of the cross-validation
best_cv_score = min(lgb_cv['binary_error-mean'])
print(best_cv_score)

In [None]:
# Train the final model for the number of boosting rounds at which the mean CV error
# was lowest. Without num_boost_round, lgb.train falls back to its default round count
# and the result of the 2000-round cross-validation above is ignored.
cv_errors = list(lgb_cv['binary_error-mean'])
best_num_rounds = cv_errors.index(min(cv_errors)) + 1  # rounds are 1-based

model = lgb.train(
    params=params,
    train_set=lgb_train_data,
    num_boost_round=best_num_rounds,
)

In [None]:
import numpy as np

# Probability cut-off above which a row is labelled 1
DECISION_THRESHOLD = 0.49

# Predict on the test rows using the same feature columns the model was trained on
y_pred = model.predict(X_test[selector.get_selected_cols()])

# Work on a copy so the loaded sample-submission frame is not modified in place
sub = submit.copy()
sub['target'] = np.where(y_pred > DECISION_THRESHOLD, 1, 0)
# NOTE(review): if the competition is scored with a ranking metric such as ROC AUC,
# submitting y_pred directly would likely score better than hard 0/1 labels — confirm.
sub.to_csv('./submission.csv', index=False)