# Summary

A simple LightGBM model without hyperparameter tuning.

For more details about the encoding, see [TPS Mar 21| LabelEncoder vs. OneHotEncoder](https://www.kaggle.com/mayasakaguchi/tps-mar-21-labelencoder-vs-onehotencoder).

The LightGBM parameters are taken from [https://www.kaggle.com/svyatoslavsokolov/tps-mar-2021-lgbm](https://www.kaggle.com/svyatoslavsokolov/tps-mar-2021-lgbm).

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings 
# Install an "ignore" filter so warnings are not shown in the cell outputs.
warnings.filterwarnings(action='ignore')
# Remove the cap on how many columns pandas prints for a DataFrame.
pd.set_option('display.max_columns', None)
from itertools import product

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import lightgbm as lgb

In [None]:
# Load the competition train and test tables.
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')

# Keep the label aside, then strip the label and the row identifier from the
# feature tables.
y = train['target']
train.drop(columns=['id', 'target'], inplace=True)
test.drop(columns=['id'], inplace=True)

# Stack train on top of test (fresh 0..n-1 index) so both are encoded together.
total = pd.concat([train, test], axis=0, ignore_index=True)

# Column names split by dtype: object columns vs. numeric columns.
col_cat = total.select_dtypes('object').columns.to_list()
col_num = total.select_dtypes('number').columns.to_list()

# Displayed by the notebook: shapes and the number of columns of each kind.
train.shape, total.shape, len(col_cat), len(col_num)

In [None]:
class SeparateEncoder():
    """Encode categorical and numeric columns of a DataFrame into one table.

    Each categorical column is either one-hot encoded or label encoded,
    chosen per column; numeric columns are standardized and can optionally
    be extended with PCA components. Every frame produced keeps the row
    index of the input ``df`` so the pieces line up when concatenated
    column-wise.
    """

    def cat_transform(self,df,cat_cols,encoding_list):
        """Encode the categorical columns of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Source table.
        cat_cols : sequence of str
            Names of the categorical columns to encode.
        encoding_list : sequence of numbers
            One code per entry of ``cat_cols`` (paired with ``zip``):
            ``0`` one-hot encodes the column, ``1`` label encodes it.
            A column paired with any other code is left out of the result.

        Returns
        -------
        (pandas.DataFrame, int)
            The encoded columns, indexed like ``df``, and their count.
        """
        pieces=[]
        for idx,cat in zip(encoding_list,cat_cols):
            if idx==0:#One Hot
                pieces.append(pd.get_dummies(df[cat],prefix=cat,prefix_sep='_'))
            elif idx==1:#LabelEncoder
                le=LabelEncoder()
                # Reuse df's index: a default RangeIndex would misalign with
                # the one-hot frames whenever df is not indexed 0..n-1.
                pieces.append(pd.DataFrame(le.fit_transform(df[cat]),columns=[cat],index=df.index))
        if pieces:
            # Single concat instead of growing a frame inside the loop.
            df_mod=pd.concat(pieces,axis=1)
        else:
            # Nothing selected: zero columns, but still aligned with df.
            df_mod=pd.DataFrame(index=df.index)
        cat_feature_count=df_mod.shape[1]
        return df_mod,cat_feature_count

    def num_transform(self,df,num_cols,add_pca=True,n_components=5):
        """Standardize the numeric columns and optionally append PCA components.

        Parameters
        ----------
        df : pandas.DataFrame
            Source table.
        num_cols : list of str
            Names of the numeric columns.
        add_pca : bool, default True
            When true, PCA is fitted on the standardized columns and its
            components are appended as extra columns (named 0..n_components-1).
        n_components : int, default 5
            Number of PCA components; only used when ``add_pca`` is true.

        Returns
        -------
        pandas.DataFrame
            Standardized columns (plus PCA columns if requested), indexed
            like ``df``.
        """
        SS=StandardScaler()
        # Keep df's index so the result aligns with the categorical frame.
        df_num=pd.DataFrame(SS.fit_transform(df[num_cols]),columns=num_cols,index=df.index)
        if add_pca:
            pca=PCA(n_components=n_components,random_state=0)
            df_pca=pd.DataFrame(pca.fit_transform(df_num),index=df.index)
            df_num=pd.concat([df_num,df_pca],axis=1)
        return df_num

    def merge_transform(self,df,cat_cols,num_cols,encoding_list,add_pca=True,n_components=5):
        """Encode categorical and numeric columns and join them column-wise.

        Parameters are forwarded to ``cat_transform`` and ``num_transform``.

        Returns
        -------
        (pandas.DataFrame, int)
            The merged table (categorical columns first, then numeric) and
            the number of encoded categorical columns.
        """
        df_cat,cat_feature_count=self.cat_transform(df,cat_cols,encoding_list)
        df_num=self.num_transform(df,num_cols,add_pca=add_pca,n_components=n_components)
        df_merge=pd.concat([df_cat,df_num],axis=1)
        return df_merge,cat_feature_count

In [None]:
# LightGBM settings handed to lgb.train as `params`; per the original note
# these values are the result of an Optuna search.
lgbm_params = dict(
    # Task, evaluation metric and runtime options.
    objective='binary',
    metric='auc',
    n_jobs=-1,
    verbosity=-2,
    n_estimators=20000,
    # Regularization and sampling.
    reg_alpha=0.000721024661208569,
    reg_lambda=47.79748127808107,
    colsample_bytree=0.24493010466517195,
    subsample=0.12246675404710294,
    # Learning rate and tree shape.
    learning_rate=0.013933182980403087,
    max_depth=21,
    num_leaves=90,
    min_child_samples=144,
    cat_smooth=63,
)

In [None]:
# Cross-validation setup: 10 shuffled, stratified folds with a fixed seed,
# plus the encoder object used to build the feature table.
n_splits = 10
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=0,
)
se = SeparateEncoder()

In [None]:
%%time
# Encoding plan, one code per categorical column: 1 = label encode,
# 0 = one-hot encode. Everything is label encoded except the columns at
# positions 1, 5 and 10 of col_cat, which receive the codes in e_list.
encoding_list=np.ones(len(col_cat))
e_list=[0,0,0]
encoding_list[[1,5,10]]=e_list

# Encode train and test together, then split them back by row position.
df_merge_total,cat_feature_count=se.merge_transform(total,col_cat,col_num,encoding_list,add_pca=False)
df_merge_train=df_merge_total.iloc[:len(train),:]
df_merge_test=df_merge_total.iloc[len(train):,:]
score=0
preds=pd.DataFrame(columns=[f'pred{c}' for c in range(n_splits)])

for k,(train_idx,val_idx) in enumerate(skf.split(df_merge_train,y)):
    print(f'------Fold{k+1}')
    # skf.split yields positional indices, so select rows and labels with
    # .iloc; plain y[...] is a label lookup that only matches positions
    # while y keeps a default 0..n-1 index.
    X_train,y_train=df_merge_train.iloc[train_idx,:],y.iloc[train_idx]
    X_val,y_val=df_merge_train.iloc[val_idx,:],y.iloc[val_idx]

    l_train=lgb.Dataset(X_train,y_train)
    l_val=lgb.Dataset(X_val,y_val)
    # NOTE(review): lgbm_params also sets 'n_estimators', which overlaps with
    # num_boost_round below -- confirm which value LightGBM actually uses.
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older lgb.train signatures; newer LightGBM releases expect callbacks
    # instead -- confirm against the installed version.
    model = lgb.train(params=lgbm_params,
                    num_boost_round=1000,
                    early_stopping_rounds=400,
                    train_set=l_train,
                    valid_sets=[l_val,l_train],
                    verbose_eval=500)
    val_pred=model.predict(X_val)
    # Accumulate the mean validation AUC across folds.
    score+=roc_auc_score(y_val,val_pred)/n_splits
    # Store each fold's test prediction pre-divided by n_splits, so summing
    # the pred{k} columns later gives the fold average.
    preds[f'pred{k}']=model.predict(df_merge_test)/n_splits

print('\n')
print('='*50)
print(score)
print('='*50)

# Submission Files

In [None]:
# Read the sample submission from the same absolute input directory as
# train.csv/test.csv, so the read does not depend on the working directory.
submission=pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')
# Each pred{k} column holds one fold's prediction divided by n_splits, so the
# row-wise sum is the average prediction over the folds.
submission['target']=preds.sum(axis=1)
submission.to_csv('submission_lgbm.csv',index=False)
submission

**Thanks for reading!**

**Please let me know if you have any questions or advice.**