# Required Library

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm_notebook as tqdm
import datetime
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import gc
import os




# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Load data

In [None]:
# Read the train and test CSV files from the cat-in-the-dat-ii input directory.
train = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")
test = pd.read_csv("../input/cat-in-the-dat-ii/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Preprocessing

**1. Ordinal data**

Convert Ordinal data to numbers representing order.

ord_0 already appears to be encoded, so I'll convert only 'ord_1', 'ord_2', 'ord_3', 'ord_4' and 'ord_5'.

In [None]:
# Show the ord_1 .. ord_5 columns of train as loaded, before they are mapped to numbers.
train.loc[:,['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']]

In [None]:
def ret_ord_1(ord_1_data):
    """
    Map one ord_1 value to its position in the ordered list of levels.

    Parameters
    ----------
    ord_1_data : str or float
        One of 'Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster',
        or NaN for a missing value.

    Returns
    -------
    int
        0-based rank of the level in the list below, or -1 if the value is NaN.

    Raises
    ------
    ValueError
        If the value is neither NaN nor one of the known levels. This includes
        non-NaN numbers, e.g. when the mapping is applied a second time to an
        already-encoded column.
    """
    ord_1_list = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']

    # math.isnan only accepts real numbers; a TypeError means the value is a
    # string (or other non-numeric object) and therefore not a missing value.
    try:
        is_missing = math.isnan(ord_1_data)
    except TypeError:
        is_missing = False

    if is_missing:
        return -1
    # list.index raises ValueError for anything that is not a known level,
    # instead of letting unexpected input fall through and return None.
    return ord_1_list.index(ord_1_data)

In [None]:
def ret_ord_2(ord_2_data):
    """
    Map one ord_2 value to its position in the ordered list of levels.

    Parameters
    ----------
    ord_2_data : str or float
        One of 'Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot',
        or NaN for a missing value.

    Returns
    -------
    int
        0-based rank of the level in the list below, or -1 if the value is NaN.

    Raises
    ------
    ValueError
        If the value is neither NaN nor one of the known levels. This includes
        non-NaN numbers, e.g. when the mapping is applied a second time to an
        already-encoded column.
    """
    ord_2_list = ['Freezing', 'Cold', 'Warm', 'Hot','Boiling Hot', 'Lava Hot']

    # math.isnan only accepts real numbers; a TypeError means the value is a
    # string (or other non-numeric object) and therefore not a missing value.
    try:
        is_missing = math.isnan(ord_2_data)
    except TypeError:
        is_missing = False

    if is_missing:
        return -1
    # list.index raises ValueError for anything that is not a known level,
    # instead of letting unexpected input fall through and return None.
    return ord_2_list.index(ord_2_data)

In [None]:
def ret_ord_3(ord_3_data):
    """
    Map one ord_3 value to its position in the ordered list of levels.

    Parameters
    ----------
    ord_3_data : str or float
        A single lowercase letter from 'a' to 'o', or NaN for a missing value.

    Returns
    -------
    int
        0-based rank of the letter in the list below, or -1 if the value is NaN.

    Raises
    ------
    ValueError
        If the value is neither NaN nor one of the known levels. This includes
        non-NaN numbers, e.g. when the mapping is applied a second time to an
        already-encoded column.
    """
    ord_3_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o']

    # math.isnan only accepts real numbers; a TypeError means the value is a
    # string (or other non-numeric object) and therefore not a missing value.
    try:
        is_missing = math.isnan(ord_3_data)
    except TypeError:
        is_missing = False

    if is_missing:
        return -1
    # list.index raises ValueError for anything that is not a known level,
    # instead of letting unexpected input fall through and return None.
    return ord_3_list.index(ord_3_data)

In [None]:
def ret_ord_4(ord_4_data):
    """
    Map one ord_4 value to its position in the ordered list of levels.

    Parameters
    ----------
    ord_4_data : str or float
        A single uppercase letter from 'A' to 'Z', or NaN for a missing value.

    Returns
    -------
    int
        0-based rank of the letter in the list below, or -1 if the value is NaN.

    Raises
    ------
    ValueError
        If the value is neither NaN nor one of the known levels. This includes
        non-NaN numbers, e.g. when the mapping is applied a second time to an
        already-encoded column.
    """
    ord_4_list = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']

    # math.isnan only accepts real numbers; a TypeError means the value is a
    # string (or other non-numeric object) and therefore not a missing value.
    try:
        is_missing = math.isnan(ord_4_data)
    except TypeError:
        is_missing = False

    if is_missing:
        return -1
    # list.index raises ValueError for anything that is not a known level,
    # instead of letting unexpected input fall through and return None.
    return ord_4_list.index(ord_4_data)

In [None]:
# Known ord_5 levels in the order used for encoding. The list is built once
# at definition time instead of on every call.
_ORD_5_LEVELS = ['AG','AI','AU','AW','Ay','BL','BX','Bx','CN','CU','Cn','DI','DN','DR','DT','Dj','Dn',
                 'EC','Ey','FB','FH','Fl','GZ','HF','HK','HO','Hk','IA','IS','Ib','In','Io','Iq','JQ',
                 'JT','Ji','Kq','LS','LY','Lo','MF','MU','MV','MX','Mg','Mq','NS','NT','Nh','OM','OZ',
                 'Oe','Ox','PG','PS','Pk','Pw','QV','Qm','RB','RD','RT','RV','Re','Rj','Ro','Rv','Rz',
                 'SL','SS','Sk','Sz','TH','TL','TP','TZ','Tg','Ty','Tz','US','UV','WC','WW','Wr','XC',
                 'XI','XM','XR','XU','YJ','Yb','Yi','Yr','Zv','aA','aE','al','be','cR','cY','cg','cy',
                 'dh','dp','eA','eN','ep','fF','fO','fV','fe','gK','gL','gV','gc','gj','gt','hG','hT',
                 'ht','hx','iS','iv','ja','jf','jp','kB','kP','kT','kv','lA','lR','lS','ly','mD','mP',
                 'mX','mi','mo','nS','ne','nf','nj','nn','oI','oJ','oU','oh','ok','pB','pE','pT','pZ',
                 'pl','qN','qP','rA','rM','rg','rl','sF','sY','sc','sf','tT','th','tn','uI','uP','uQ',
                 'uW','uZ','ur','us','vQ','vq','vw','vx','wJ','wU','wa','xB','xF','xG','yE','yK','zc',
                 'ze','zf','zp']


def ret_ord_5(ord_5_data):
    """
    Map one ord_5 value to its position in the sorted list of known levels.

    The rank is the position of the value in ``_ORD_5_LEVELS`` exactly as
    written. An earlier version passed the list through ``set()`` first,
    which discarded the ordering and made the codes depend on the
    interpreter's string hash seed, so they could change between runs.

    Parameters
    ----------
    ord_5_data : str or float
        One of the two-character codes in ``_ORD_5_LEVELS``, or NaN for a
        missing value.

    Returns
    -------
    int
        0-based rank of the code in ``_ORD_5_LEVELS``, or -1 if the value is NaN.

    Raises
    ------
    ValueError
        If the value is neither NaN nor one of the known levels. This includes
        non-NaN numbers, e.g. when the mapping is applied a second time to an
        already-encoded column.
    """
    # math.isnan only accepts real numbers; a TypeError means the value is a
    # string (or other non-numeric object) and therefore not a missing value.
    try:
        is_missing = math.isnan(ord_5_data)
    except TypeError:
        is_missing = False

    if is_missing:
        return -1
    # list.index raises ValueError for anything that is not a known level,
    # instead of letting unexpected input fall through and return None.
    return _ORD_5_LEVELS.index(ord_5_data)


In [None]:
# Replace each ordinal column of train with its numeric rank, one encoder per column.
train_ord_encoders = {
    'ord_1': ret_ord_1,
    'ord_2': ret_ord_2,
    'ord_3': ret_ord_3,
    'ord_4': ret_ord_4,
    'ord_5': ret_ord_5,
}
for ord_col, encoder in train_ord_encoders.items():
    train[ord_col] = train[ord_col].map(encoder)

In [None]:
# Inspect ord_0 .. ord_5 of train after ord_1 .. ord_5 were mapped to numbers.
train.loc[:,['ord_0','ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']]

In [None]:
# Apply the same per-column ordinal encoders to the test frame.
test_ord_encoders = [ret_ord_1, ret_ord_2, ret_ord_3, ret_ord_4, ret_ord_5]
for ord_number, encoder in enumerate(test_ord_encoders, start=1):
    ord_col = 'ord_%d' % ord_number
    test[ord_col] = test[ord_col].map(encoder)

In [None]:
# Inspect ord_0 .. ord_5 of test after ord_1 .. ord_5 were mapped to numbers.
test.loc[:,['ord_0','ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']]

**2. Label encoding**

I'll apply label encoding to "bin_3", "bin_4", "nom_0", "nom_1", "nom_2", "nom_3", "nom_4", "nom_5", "nom_6", "nom_7", "nom_8" and "nom_9".
These columns look categorical.

In [None]:
# Columns that are label-encoded to integer codes.
cat_cols = ["bin_3", "bin_4", "nom_0", "nom_1", "nom_2", "nom_3", "nom_4", "nom_5", "nom_6", 
            "nom_7", "nom_8", "nom_9"]
for c in cat_cols:
    # Turn missing values into an explicit 'NaN' category. Assigning the whole
    # column (instead of chained indexing like train[c][mask] = ...) avoids
    # SettingWithCopyWarning and still works under pandas Copy-on-Write.
    train[c] = train[c].fillna('NaN')
    test[c] = test[c].fillna('NaN')

    # Fit on train and test together so that a value present in only one of
    # the two frames still receives a code and transform() cannot fail on it.
    le = LabelEncoder()
    le.fit(pd.concat([train[c], test[c]], ignore_index=True))
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

**3.Standard Scale**

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: bin_0..bin_4, nom_0..nom_9, ord_0..ord_5, then day and month.
col_StandardScale = (
    ['bin_' + str(n) for n in range(5)]
    + ['nom_' + str(n) for n in range(10)]
    + ['ord_' + str(n) for n in range(6)]
    + ['day', 'month']
)

# Learn mean and variance on train only, then apply the same scaling to both frames.
obj = StandardScaler()
obj.fit(train[col_StandardScale])
train[col_StandardScale] = obj.transform(train[col_StandardScale])
test[col_StandardScale] = obj.transform(test[col_StandardScale])

**4.Check data for training & prediction.**

In [None]:
# Check the training frame after preprocessing.
train.head()

In [None]:
# Check the test frame after preprocessing.
test.head()

# Training & Prediction

I referred to https://www.kaggle.com/drcapa/categorical-feature-engineering-2-xgb to decide the hyperparameters.
Thank you so much.

**1.Grid Search**

In [None]:
# Binary and nominal columns, listed first in the feature order.
categorical = ['bin_%d' % n for n in range(5)] + ['nom_%d' % n for n in range(10)]

# Every remaining column except the label and the row identifier.
non_numeric = categorical + ['target', 'id']
numeric_cols = [name for name in train.columns if name not in non_numeric]

# Final feature order: categorical columns followed by the remaining ones.
features = categorical + numeric_cols

In [None]:
def search_XGBoost(max_depth,subsample,colsample_bytree,lr, folds=3, random_state=2020):
    """
    Score one XGBoost hyper-parameter combination by stratified K-fold CV.

    Reads the module-level ``train`` frame and ``features`` list. Prints the
    ROC AUC of every fold and the mean over all folds.

    Parameters
    ----------
    max_depth : int
        Passed to ``XGBClassifier(max_depth=...)``.
    subsample : float
        Passed to ``XGBClassifier(subsample=...)``.
    colsample_bytree : float
        Passed to ``XGBClassifier(colsample_bytree=...)``.
    lr : float
        Passed to ``XGBClassifier(learning_rate=...)``.
    folds : int, default 3
        Number of stratified folds.
    random_state : int, default 2020
        Seed for both the fold shuffling and the classifier. A fixed seed
        means every hyper-parameter combination is scored on the same splits,
        so scores are comparable and reproducible.

    Returns
    -------
    float
        Mean ROC AUC over the validation folds.
    """
    scores = []
    X = train[features]
    y = train['target']

    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
    for tr_idx, val_idx in tqdm(kf.split(X, y), total = folds):
        X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # `eta` is an alias of `learning_rate` in XGBoost, so only
        # `learning_rate` is passed. Supplying both made it ambiguous which
        # value was actually used.
        model = xgb.XGBClassifier(objective ='binary:logistic',
                      colsample_bytree = colsample_bytree,
                      learning_rate = lr,
                      max_depth = max_depth,
                      n_estimators = 400,
                      scale_pos_weight = 2,
                      random_state = random_state,
                      subsample = subsample)

        # eval_set is only tracked here; no early stopping is configured.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # Column 1 of predict_proba is used as the score for roc_auc_score.
        preds_val = model.predict_proba(X_val)[:,1]
        score = roc_auc_score(y_val ,preds_val)
        print("score: %f" % (score))
        scores.append(score)

    cv_score = sum(scores)/len(scores)
    print("CV score: ",cv_score)
    return cv_score

In [None]:
# Candidate values per hyper-parameter; a one-element list pins that value.
max_depths = [16]  # commented-out alternatives: 14, 16, 18, 20
subsamples = [0.86]
# NOTE(review): XGBoost documents the range of colsample_bytree as (0, 1];
# confirm that 0 is an intended candidate.
colsample_bytrees = [0, 0.05, 0.1, 0.15]
lrs = [0.25]

# Start below any possible AUC so the first evaluated combination is always
# recorded and the best_* names are guaranteed to exist afterwards.
max_auc = float('-inf')

# Same iteration order as four nested loops: depth, subsample, colsample, lr.
param_grid = [
    (depth, sub, col_frac, rate)
    for depth in max_depths
    for sub in subsamples
    for col_frac in colsample_bytrees
    for rate in lrs
]

for depth, sub, col_frac, rate in param_grid:
    tauc = search_XGBoost(depth, sub, col_frac, rate)
    print("max_depth,subsample,colsample_bytree,learning_rate: ", depth, sub, col_frac, rate)
    # Keep the combination with the highest mean CV AUC (first one wins ties).
    if max_auc < tauc:
        max_auc = tauc
        best_max_depths = depth
        best_subsamples = sub
        best_colsample_bytrees = col_frac
        best_learning_rate = rate

**2.Train Model for Prediction**

In [None]:
# Refit on the full training set with the best hyper-parameters from the grid
# search. `eta` is an alias of `learning_rate` in XGBoost, so only
# `learning_rate` is passed; supplying both made it ambiguous which value won.
model = xgb.XGBClassifier(objective ='binary:logistic',
                      colsample_bytree = best_colsample_bytrees,
                      learning_rate = best_learning_rate,
                      max_depth = best_max_depths,
                      n_estimators = 400,
                      scale_pos_weight = 2,
                      random_state = 2020,
                      subsample = best_subsamples)
model.fit(train[features], train['target'])

**3.Prediction**

In [None]:
# Keep column 1 of predict_proba for every test row, i.e. the probability of
# the second class (presumably target == 1; confirm against model.classes_).
pred = model.predict_proba(test[features])[:,1]

# Submit

In [None]:
# Load the sample submission file to use as the template for the output.
submit = pd.read_csv("../input/cat-in-the-dat-ii/sample_submission.csv")

In [None]:
# Fill in the predictions positionally.
# NOTE(review): assumes the sample submission rows are in the same order as test; verify.
submit["target"] = pred

In [None]:
# Preview the submission frame before writing it out.
submit.head()

In [None]:
# Write the submission to submit.csv in the working directory, without the index column.
submit.to_csv("./submit.csv", index=False)