In [None]:
import os

import matplotlib.pyplot as plt
import optuna
import pandas as pd

from lightgbm import LGBMClassifier, early_stopping
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
pip install feature-engine

In [None]:
from feature_engine.encoding import OrdinalEncoder

In [None]:
# Load the training data, using the first CSV column as the row index.
df = pd.read_csv(
    '../input/tabular-playground-series-jun-2021/train.csv',
    index_col=[0],
)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Replace the labels in the 'target' column with arbitrary integer codes.
# NOTE: `df` is overwritten with the encoded frame, so this cell is meant to be
# run once per freshly loaded `df`.
le = OrdinalEncoder(variables=['target'], encoding_method='arbitrary')
df = le.fit_transform(df)

In [None]:
# Show the label -> integer code mapping learned by the encoder.
le.encoder_dict_

In [None]:
# Separate the encoded label column from the feature columns.
y_train = df['target']
X_train = df.drop(columns=['target'])

In [None]:
# Load the test features (first CSV column as the row index) and preview them.
X_test = pd.read_csv(
    '../input/tabular-playground-series-jun-2021/test.csv',
    index_col=[0],
)
X_test.head()

In [None]:
def objective(trial, data=X_train, target=y_train):
    """Optuna objective: hold-out multi-class log loss of one LightGBM configuration.

    A stratified 80/20 split of ``data``/``target`` is drawn with a fixed seed,
    so every trial is scored on the same hold-out rows. An ``LGBMClassifier``
    is trained on the 80% part with the hyper-parameters sampled from
    ``trial``, using early stopping (100 rounds) on the hold-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix. Defaults to the notebook-level ``X_train`` as it was
        when this function was defined (default arguments are bound once).
    target : pandas.Series
        Integer-encoded class labels aligned with ``data``. Defaults to the
        notebook-level ``y_train``.

    Returns
    -------
    float
        Log loss of the predicted probabilities on the hold-out split
        (lower is better).
    """
    seed = 1234
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

    # n_splits=1, so take the single (train, validation) index pair directly.
    # Local names are distinct from the notebook-level X_train / y_train.
    fit_idx, valid_idx = next(splitter.split(data, target))
    X_fit, y_fit = data.iloc[fit_idx], target.iloc[fit_idx]
    X_valid, y_valid = data.iloc[valid_idx], target.iloc[valid_idx]

    # The suggest_* calls are kept in their original order so that the sampled
    # search space is unchanged.
    lgbm_params = {
        'objective': 'multiclass',
        'reg_alpha': trial.suggest_float('reg_alpha', 10.0, 20.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 5.0, 15.0),
        'num_leaves': trial.suggest_int('num_leaves', 170, 250),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 60),
        'max_depth': trial.suggest_int('max_depth', 15, 30),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 3500),
        'random_state': seed,
        'boosting_type': 'gbdt',
        'metric': 'multi_logloss',
        # Derived from the labels instead of hard-coding the class count.
        'num_class': int(target.nunique()),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
        # 'device': 'gpu'
    }

    model = LGBMClassifier(**lgbm_params)

    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_valid, y_valid)],
        eval_metric='multi_logloss',
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
    )

    valid_proba = model.predict_proba(X_valid)
    # Pass the label order explicitly so the probability columns are always
    # matched to the classes the model was trained on.
    return log_loss(y_valid, valid_proba, labels=model.classes_)

In [None]:
# Search for the hyper-parameters that minimise the hold-out log loss returned
# by objective(). The sampler is seeded so that a fresh Restart & Run All
# proposes the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=1234),
)
study.optimize(objective, n_trials = 10)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Show the hyper-parameters of the best trial found by the study.
print("Best trial: ", study.best_params)

In [None]:
# Keep the best trial's hyper-parameters for the final model. This dict holds
# only the values sampled through `trial.suggest_*`; the fixed settings used
# inside objective() (objective, metric, num_class, random_state,
# boosting_type) are not part of it.
best_params = study.best_params
# Optional overrides, currently disabled.
# NOTE(review): objective() uses the key 'metric', not 'metrics' - confirm the
# intended key before re-enabling the second line.
# best_params["objective"] = "multiclass"
# best_params["metrics"] = "multi_logloss"
# best_params["num_class"] = 9

In [None]:
# Toggle for rendering the Optuna diagnostic plots in the following cell.
OPTUNA_OPTIMIZATION = True

In [None]:
# Render the optimisation history and the per-parameter slice plot of the study.
if OPTUNA_OPTIMIZATION:
    history_fig = optuna.visualization.plot_optimization_history(study)
    display(history_fig)
    slice_fig = optuna.visualization.plot_slice(study)
    display(slice_fig)

In [None]:
# Refit LightGBM on the full training set with the tuned hyper-parameters,
# then predict class probabilities for the test set.
model = LGBMClassifier(**best_params)
model = model.fit(X_train, y_train)

y_test_pred = model.predict_proba(X_test)

In [None]:
# predict_proba returns one column per entry of model.classes_, i.e. per
# integer code assigned by the OrdinalEncoder. Map every code back to its
# original label instead of hard-coding the encoder's arbitrary ordering, so
# the probabilities can never be attached to the wrong class name.
code_to_label = {code: label for label, code in le.encoder_dict_['target'].items()}
columns = [code_to_label[code] for code in model.classes_]
index = X_test.index
sub = pd.DataFrame(y_test_pred,
                   columns=columns,
                   index=index)

In [None]:
# Put the probability columns in Class_1..Class_9 order, turn the index back
# into a regular column, and write the submission file.
ordered_classes = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']
submission = sub.loc[:, ordered_classes].reset_index()
submission.to_csv("submission1.csv", index=False)