# Purpose

In this notebook, I'd like to compare LabelEncoder and OneHotEncoder in the LightGBM model.

Thanks to [@maostack](https://www.kaggle.com/maostack): I referred to [https://www.kaggle.com/maostack/tps-mar-baseline](https://www.kaggle.com/maostack/tps-mar-baseline) to build the LightGBM model.


---Conclusion---

**OneHotEncoder for cat1,cat5 and cat10** can slightly boost the score.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
from itertools import product

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import lightgbm as lgb

Loading Data (though I will not use the test data in this notebook)

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-mar-2021/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-mar-2021/test.csv'
)

In [None]:
# Keep the label aside, then remove the label and the row id from the features.
y = train['target']
train.drop(columns=['id', 'target'], inplace=True)

# Keep the test ids aside before removing them from the test features.
test_id = test['id']
test.drop(columns=['id'], inplace=True)

# Column groups by dtype: object columns are handled as categorical features,
# numeric columns as numerical features.
col_cat = train.select_dtypes('object').columns.to_list()
col_num = train.select_dtypes('number').columns.to_list()

# Last expression of the cell, so the notebook displays the three shapes.
train.shape, y.shape, test.shape

Class to encode categorical features separately (it can also add PCA columns derived from the numerical features)

In [None]:
class SeparateEncoder():
    """Build a feature table with a per-column choice of categorical encoder.

    Each categorical column is either one-hot encoded or label encoded
    according to a flag; numerical columns are standardized and can optionally
    be extended with PCA components.  Every intermediate frame keeps the row
    index of the input frame so that the column-wise concatenations line up
    row by row even when the input does not use a default ``RangeIndex``.
    """

    def cat_transform(self, df, cat_cols, encoding_list):
        """Encode the categorical columns of ``df`` one by one.

        Parameters
        ----------
        df : pandas.DataFrame
            Source table containing every column named in ``cat_cols``.
        cat_cols : sequence of str
            Names of the categorical columns, in output order.
        encoding_list : sequence of numbers
            One flag per entry of ``cat_cols``: ``0`` selects one-hot encoding
            (``pd.get_dummies`` with a ``<column>_`` prefix), ``1`` selects
            ``LabelEncoder``.  A column whose flag is neither value is left
            out of the result.  The two sequences are paired with ``zip``, so
            extra entries in the longer one are ignored.

        Returns
        -------
        (pandas.DataFrame, int)
            The encoded columns, indexed like ``df``, and their count.
        """
        pieces = []
        for flag, col in zip(encoding_list, cat_cols):
            if flag == 0:  # One Hot
                pieces.append(pd.get_dummies(df[col], prefix=col, prefix_sep='_'))
            elif flag == 1:  # LabelEncoder
                codes = LabelEncoder().fit_transform(df[col])
                # Re-attach df's index: a bare ndarray would otherwise get a
                # fresh 0..n-1 index and misalign with the one-hot pieces.
                pieces.append(pd.DataFrame(codes, columns=[col], index=df.index))

        if pieces:
            df_mod = pd.concat(pieces, axis=1)
        else:
            # Nothing selected: an empty, zero-column frame.
            df_mod = pd.DataFrame(index=df.index)
        return df_mod, df_mod.shape[1]

    def num_transform(self, df, num_cols, add_pca=True, n_components=5):
        """Standardize the numerical columns and optionally append PCA columns.

        Parameters
        ----------
        df : pandas.DataFrame
            Source table containing every column named in ``num_cols``.
        num_cols : list of str
            Names of the numerical columns.
        add_pca : bool, default True
            When true, PCA is fitted on the standardized columns and its
            components are appended to the right of them.
        n_components : int, default 5
            Number of PCA components to append (used only if ``add_pca``).

        Returns
        -------
        pandas.DataFrame
            Standardized columns named as in ``num_cols``, followed (when
            ``add_pca``) by the PCA columns, which carry the default integer
            labels ``0 .. n_components-1``.  Indexed like ``df``.
        """
        scaled = StandardScaler().fit_transform(df[num_cols])
        df_num = pd.DataFrame(scaled, columns=num_cols, index=df.index)
        if add_pca:
            pca = PCA(n_components=n_components, random_state=0)
            df_pca = pd.DataFrame(pca.fit_transform(df_num), index=df.index)
            df_num = pd.concat([df_num, df_pca], axis=1)
        return df_num

    def merge_transform(self, df, cat_cols, num_cols, encoding_list, add_pca=True, n_components=5):
        """Encode categorical and numerical columns and join them side by side.

        Parameters are forwarded to ``cat_transform`` and ``num_transform``.

        Returns
        -------
        (pandas.DataFrame, int)
            The merged table (categorical columns first, numerical columns
            after) and the number of categorical feature columns in it.
        """
        df_cat, cat_feature_count = self.cat_transform(df, cat_cols, encoding_list)
        df_num = self.num_transform(df, num_cols, add_pca=add_pca, n_components=n_components)
        df_merge = pd.concat([df_cat, df_num], axis=1)
        return df_merge, cat_feature_count

In [None]:
# Hyper-parameters handed to LightGBM for every training run in this notebook.
# NOTE(review): 'n_estimators' is documented as an alias of LightGBM's
# num_iterations; when it is present in params it is expected to take
# precedence over the num_boost_round argument of lgb.train -- confirm that
# 100 boosting rounds is the intended budget.
lgbm_params = dict(
    max_depth=16,
    subsample=0.8,
    colsample_bytree=0.2,
    learning_rate=0.01,
    reg_lambda=10,
    reg_alpha=17,
    min_child_samples=31,
    num_leaves=66,
    max_bin=522,
    cat_smooth=81,
    cat_l2=0.03,
    metric='auc',
    objective='binary',
    n_jobs=-1,
    n_estimators=100,
    force_col_wise=True,
    verbosity=-2,
)

In [None]:
# Shared cross-validation setup for the encoding experiments.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=0,  # fixed seed for the shuffled split
)
se = SeparateEncoder()

# One row per experiment: the encoder flags that were used and the
# fold-averaged AUC obtained with them.
results = pd.DataFrame(columns=['encoding_list', 'score'])

If I try every combination (i.e. cat0: LabelEncoding/OneHotEncoding, cat1: LabelEncoding/OneHotEncoding, etc.), it will take a tremendous amount of time.

Therefore, in the first trial, I apply OneHotEncoding to one categorical feature at a time, to see whether each choice improves the score or not.

In [None]:
%%time
for i in range(len(col_cat)+1):
    print(f'-----Cat{i}')
    encoding_list=np.ones(len(col_cat))
    if i==len(col_cat):pass
    else:encoding_list[i]=0
    df_merge,cat_feature_count=se.merge_transform(train,col_cat,col_num,encoding_list,add_pca=False)
    score=0
    for k,(train_idx,val_idx) in enumerate(skf.split(df_merge,y)):
        print(f'------Fold{k+1}')
        X_train,y_train=df_merge.iloc[train_idx,:],y[train_idx]
        X_val,y_val=df_merge.iloc[val_idx,:],y[val_idx]

        l_train=lgb.Dataset(X_train,y_train)
        l_val=lgb.Dataset(X_val,y_val)
        model = lgb.train(params=lgbm_params,
                      num_boost_round=1000,
                      early_stopping_rounds=400,
                      train_set=l_train,
                      valid_sets=[l_val,l_train],
                      verbose_eval=500)         
        val_pred=model.predict(X_val)
        score+=roc_auc_score(y_val,val_pred)/n_splits

    results.loc[i,'encoding_list']=encoding_list
    results.loc[i,'score']=score

In [None]:
# Display the experiment-1 results, best fold-averaged AUC first.
results.sort_values(by='score', ascending=False)

Since index=19 is the control (all LabelEncoder), applying OneHotEncoder to most of the categories (except cat3,9) might improve the score.

From now on, I'd like to focus on the top 5 categories (cat1,5,6,10,16) to figure out the exact combination of LabelEncoder and OneHotEncoder for those categories.

In [None]:
# Empty result table for the second experiment: one row per encoder
# combination, holding the flags used and the resulting score.
results_2 = pd.DataFrame(columns=['encoding_list', 'score'])

In [None]:
%%time
from itertools import product
for i,e_list in enumerate(product([0,1],repeat=5)):
    if sum(e_list)==5 or sum(e_list)==4: continue

    print(f'-----Cat{i}')
    encoding_list=np.ones(len(col_cat))
    for l,m in zip([1,5,6,10,16],list(range(6))):
        encoding_list[l]=e_list[m]

    df_merge,cat_feature_count=se.merge_transform(train,col_cat,col_num,encoding_list,add_pca=False)
    score=0
    for k,(train_idx,val_idx) in enumerate(skf.split(df_merge,y)):
        print(f'------Fold{k+1}')
        X_train,y_train=df_merge.iloc[train_idx,:],y[train_idx]
        X_val,y_val=df_merge.iloc[val_idx,:],y[val_idx]

        l_train=lgb.Dataset(X_train,y_train)
        l_val=lgb.Dataset(X_val,y_val)
        model = lgb.train(params=lgbm_params,
                      num_boost_round=1000,
                      early_stopping_rounds=400,
                      train_set=l_train,
                      valid_sets=[l_val,l_train],
                      verbose_eval=500)         
        val_pred=model.predict(X_val)
        score+=roc_auc_score(y_val,val_pred)/n_splits

    results_2.loc[i,'encoding_list']=encoding_list
    results_2.loc[i,'score']=score

In [None]:
# Display the ten best-scoring combinations of the second experiment.
results_2.sort_values(by='score', ascending=False).head(10)

In [None]:
# Rank the combinations by score and read back the encoder flags of the winner.
ranked = results_2.sort_values(by='score', ascending=False)
best_idx = ranked.index[0]
best_encoding_list = results_2.loc[best_idx, 'encoding_list']
best_encoding_list

The "best_encoding_list" shows that 


# OneHotEncoder for cat1, cat5, cat10

can slightly boost the score.

**Thanks for reading!**

**I'm sorry for being messy in some parts.**

**Please let me know if you have any questions and advice.**