# Idea
  
**See the previous kernel for details**

**success**  
Blending LGB+NN  
Mish activation function (NN)  
Cosine Annealing LR (NN)  
Automated feature generation and selection with AutoFeat (CV)   
hyperparameter tuning by Optuna (LGB)   
Label-Encoding  
Splitting f_27 into one feature per character  
number of unique characters in f_27  

**failures (not used)**  
Simple Target-Encoding    
combine a small number of labels (8,9 or more) of categorical variables into one label  
Counting the longest run of consecutive identical characters in f_27  

In [None]:
# base
import os
import random
import math
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# CV
from sklearn.model_selection import KFold, StratifiedKFold

# encoding
from sklearn.preprocessing import LabelEncoder

# scaler
from sklearn.preprocessing import StandardScaler

# lgb
import lightgbm as lgb

# tensorflow/keras
import tensorflow as tf

# metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score

# rank
from scipy.stats import rankdata 

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# warning
import warnings
warnings.filterwarnings('ignore')

# param
# Number of cross-validation folds; referenced by the KFold call in the
# (currently commented-out) mk_predict cell further down.
n_splits=5
# Global random seed; referenced by the commented-out LightGBM params and
# mk_predict cell. set_seed() takes its seed as an argument.
seed=2022

In [None]:
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator with ``seed``, and exports the value as
    the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int
        Seed value passed to every generator.

    Notes
    -----
    ``PYTHONHASHSEED`` is read at interpreter start-up, so setting it here
    only affects Python processes launched after this call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

# Data

In [None]:
# Load the three competition CSV files from the Kaggle input directory
# (paths are relative to the notebook's working directory).
# train: features plus the `target` column used later as train_y.
train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv')
# test: same layout as train; `id` and `f_27` are dropped from it during preprocessing.
test = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
# sub: sample submission; only its `id` column is referenced, in commented-out code below.
sub = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')

In [None]:
# Names of the float64 columns of the raw training frame (used by the EDA
# scatter plots). `np.float` was only an alias of the builtin `float`, was
# deprecated in NumPy 1.20 and removed in 1.24; comparing a dtype with it
# matched float64 columns only, so `np.float64` selects the same columns.
float_col = [c for c in train.columns if train[c].dtype == np.float64]

In [None]:
# Base FE
def split_txt(df, col='f_27', n_chars=10):
    """Return a copy of ``df`` with a string column split into single characters.

    One new column per character position is appended, named
    ``{col}_0`` ... ``{col}_{n_chars - 1}``. The input frame is not modified.

    Characters are taken by position with ``Series.str[i]`` rather than by
    splitting on an empty pattern, which depended on regex empty-match
    behaviour and on discarding the leading empty field by column position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the string column ``col``.
    col : str, default 'f_27'
        Name of the string column to split.
    n_chars : int, default 10
        Number of leading character positions to extract.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the ``n_chars`` per-character columns. Positions
        beyond the end of a shorter string are filled with NaN.
    """
    source = df[col]
    char_cols = pd.DataFrame(
        {f'{col}_{i}': source.str[i] for i in range(n_chars)}
    )
    # concat builds a new frame, so the caller's ``df`` is left untouched.
    return pd.concat([df, char_cols], axis=1)

# Reference : features from https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model
def n_unique(row):
    """Return the number of distinct characters in ``row["f_27"]``.

    ``row`` is anything indexable by the key ``"f_27"`` (the notebook passes
    DataFrame rows via ``apply(..., axis=1)``).
    """
    distinct_chars = set(row["f_27"])
    return len(distinct_chars)

# Reference Awesome Features from https://www.kaggle.com/code/ambrosm/tpsmay22-gradient-boosting-quickstart/notebook
def ternary_int(df):
    """Add three ternary (-1 / 0 / +1) interaction features to ``df`` in place.

    Each feature is built from a sum of float columns: +1 when the sum is
    above an upper threshold, -1 when it is below a lower threshold, and 0
    otherwise.

    * ``i_02_21``    : ``f_21 + f_02``         (> 5.2 / < -5.3)
    * ``i_05_22``    : ``f_22 + f_05``         (> 5.1 / < -5.4)
    * ``i_00_01_26`` : ``f_00 + f_01 + f_26``  (> 5.0 / < -5.0)

    The frame is modified in place and also returned.
    """
    def _ternary(total, upper, lower):
        # +1 above ``upper``, -1 below ``lower``, 0 in between.
        above = (total > upper).astype(int)
        below = (total < lower).astype(int)
        return above - below

    df['i_02_21'] = _ternary(df['f_21'] + df['f_02'], 5.2, -5.3)
    df['i_05_22'] = _ternary(df['f_22'] + df['f_05'], 5.1, -5.4)
    df['i_00_01_26'] = _ternary(df['f_00'] + df['f_01'] + df['f_26'], 5.0, -5.0)
    return df

In [None]:
# train
train = split_txt(train)
# Number of distinct characters in each f_27 string. Mapping over the single
# column yields the same integers as the former row-wise
# `train.apply(n_unique, axis=1)` without materialising every row as a Series.
train["unique_characters"] = train["f_27"].map(lambda s: len(set(s)))
# train = ternary_int(train)
cat_cols = [f'f_27_{i}' for i in range(10)]
train_x = train.drop(['id', 'target', 'f_27'],axis=1)
train_y = train.target

# test
test = split_txt(test)
test['unique_characters'] = test["f_27"].map(lambda s: len(set(s)))
test = test.drop(['id', 'f_27'], axis=1)
# Overwrite two specific test labels with 'B' before encoding.
# NOTE(review): presumably 'O' (position 1) and 'N' (position 4) never occur in
# train, so LabelEncoder.transform would raise on them - confirm against the data.
test.loc[test.f_27_1 == 'O', 'f_27_1'] = 'B'
test.loc[test.f_27_4 == 'N', 'f_27_4'] = 'B'
# test = ternary_int(test)

# label encoding
# One encoder per character column, fitted on train only and applied to both frames.
for c in cat_cols:
    le = LabelEncoder()
    le.fit(train_x[c])
    train_x[c] = le.transform(train_x[c])
    test[c] = le.transform(test[c])

# standard scaler
# Fitted on train only; every column of train_x is scaled, including the
# label-encoded f_27_* columns and unique_characters. Wrapping the result in a
# new DataFrame restores the column names (the index becomes a fresh RangeIndex).
scaler = StandardScaler()
scaler.fit(train_x)
train_x = pd.DataFrame(scaler.transform(train_x), columns=train_x.columns)
test = pd.DataFrame(scaler.transform(test), columns=test.columns)

In [None]:
train_x.head(3)

In [None]:
train_y

In [None]:
test.head(3)

# EDA

In [None]:
# Reference from https://www.kaggle.com/code/ambrosm/tpsmay22-gradient-boosting-quickstart/notebook
from matplotlib.colors import ListedColormap

def plot_feat_scatter(df_x, df_y, col):
    """Scatter-plot ``col`` against every other column of ``df_x``.

    One subplot is drawn per column of ``df_x`` other than ``col``, with
    ``col`` on the x-axis, the other column on the y-axis and points coloured
    by ``df_y``. The grid slot that would belong to ``col`` itself is left
    empty. The figure is saved as ``projections_{col}.png`` in the working
    directory and then shown.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Feature columns to plot; must contain ``col``.
    df_y : array-like
        Per-row values used for colouring (the notebook passes the target).
    col : str
        Column placed on the x-axis of every subplot.

    Notes
    -----
    Mutates the global ``plt.rcParams['axes.facecolor']``: black while
    drawing, and ``'#0057b8'`` on return (kept as in the original so later
    plots in the notebook look the same).
    """
    n_cols = 5
    # Ceiling division: just enough rows for one slot per column of df_x
    # (the old `// 5 + 1` added a blank row when the count was a multiple of 5).
    n_rows = max(1, (len(df_x.columns) + n_cols - 1) // n_cols)

    plt.rcParams['axes.facecolor'] = 'k'
    plt.figure(figsize=(n_cols*5, n_rows*5))
    cmap = ListedColormap(["#ffd700", "#0057b8"])
    # target == 0 → yellow; target == 1 → blue (assuming df_y holds 0/1 labels)

    for i, c in enumerate(df_x.columns):
        if c == col:
            # Skip the feature plotted against itself; its slot stays empty.
            continue
        ax = plt.subplot(n_rows, n_cols, i+1)
        ax.scatter(df_x[col], df_x[c], s=1, c=df_y, cmap=cmap)
        ax.set_xlabel(col)
        ax.set_ylabel(c)
        ax.set_aspect('equal')

    plt.tight_layout(w_pad=1.0)
    plt.savefig(f'projections_{col}.png')
    plt.show()
    plt.rcParams['axes.facecolor'] = '#0057b8'

In [None]:
# %%time
# plot_feat_scatter(train_x[train_x.columns], train_y, col=train_x.columns[0])

In [None]:
%%time
# Draw and save one scatter grid per float feature, each preceded by a
# printed banner naming the feature on the x-axis.
banner = '#'*5
for col in tqdm(float_col):
    print(banner + f'  {col}  ' + banner)
    plot_feat_scatter(train_x[float_col], train_y, col)

# Model

In [None]:
# # lightgbm
# class ModelLgb:

#     def __init__(self):
#         self.model = None

#     def fit(self, tr_x, tr_y, va_x, va_y):
#         params = {
#         'objective':'binary',
#         'metric':'auc',
#         'seed': seed,
#         'verbosity':-1,
#         'learning_rate':0.1,
#         'reg_alpha':0,
#         'reg_lambda':1,
#         'num_leaves': 480, 
#         'max_depth': 31,
#         'feature_fraction': 0.9558908495366608, 
#         'bagging_fraction': 0.9018494038054344, 
#         'bagging_freq': 5, 
#         'min_child_samples': 8,
#         }
        
#         num_round = 10000
#         early_stopping_rounds=100
        
#         lgb_train = lgb.Dataset(tr_x, tr_y)
#         lgb_eval = lgb.Dataset(va_x, va_y)
        
#         self.model = lgb.train(params, lgb_train, valid_sets=lgb_eval, 
#                                num_boost_round=num_round, early_stopping_rounds=early_stopping_rounds,
#                                verbose_eval=100
#                               )
        
#         lgb.plot_importance(self.model, figsize=(20,30))
        
#     def predict(self, x):
#         pred = self.model.predict(x, num_iteration=self.model.best_iteration)
#         return pred

In [None]:
# # run model & make prediction feature
# def mk_predict(model, train_x, train_y, test_x):
    
#     set_seed(seed)
    
#     va_preds = []
#     va_idxes = []
    
#     test_preds = []
    
#     aucs = []
    
#     kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
#     for i, (tr_idx, va_idx) in tqdm(enumerate(kf.split(train_x))):
        
#         print('='*15 + f'fold{i+1}' + '='*15)

#         tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#         tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        
#         model.fit(tr_x, tr_y, va_x, va_y)

#         # valid predict & index
#         va_pred = model.predict(va_x)
#         va_preds.append(va_pred)
#         va_idxes.append(va_idx)
        
#         # test predict
#         test_pred = model.predict(test_x)
#         test_preds.append(test_pred)
        
#         # valid AUC
#         fpr, tpr, _ = roc_curve(va_y, va_pred)
#         va_auc = auc(fpr, tpr)
#         print(f'AUC : {va_auc}')
#         aucs.append(va_auc)
        
#     # sort valid pred    
#     va_idxes = np.concatenate(va_idxes)
#     va_preds = np.concatenate(va_preds, axis=0)
#     order = np.argsort(va_idxes)
#     train_preds = va_preds[order]
    
#     test_preds = np.mean(test_preds, axis=0)
    
#     # mean AUC
#     print(f'Mean AUC : {np.mean(aucs)}')
         
#     return train_preds, test_preds

# LGB

In [None]:
# %%time
# model_lgb = ModelLgb()
# pred_train_lgb, pred_test_lgb = mk_predict(model_lgb, train_x, train_y, test)

In [None]:
# # lgb first layer pred
# lgb_train_x_2 = pd.DataFrame({'id': train.id, 'target': pred_train_lgb})
# lgb_test_x_2 = pd.DataFrame({'id': sub.id, 'target': pred_test_lgb})

# lgb_train_x_2.to_csv('lgb_first_layer_train_preds.csv', index=False)
# lgb_test_x_2.to_csv('lgb_first_layer_test_preds.csv', index=False)