In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import xgboost as xgb

## https://www.kaggle.com/subinium/11-categorical-encoders-and-benchmark
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
#Encoding techniques
#Taken reference from https://www.kaggle.com/discdiver/category-encoders-examples
#https://www.kaggle.com/ruchibahl18/categorical-data-encoding-techniques

#bin_3, bin_4 :- Convert T/F and Y/N to 1/0
#nom_0 - nom_4 :- Encode using one-hot encoding
#nom_5 - nom_9 :- Target-encode them, as they are high-cardinality variables
#ord_1, ord_2 :- Convert to numerical order using hard-coded values, as a label encoder might not be able to infer the intended order
#ord_3 - ord_4 :- Encode using ASCII order, as they are alphabetical values
#ord_5 :- Separate the two letters and then label-encode each of them
#day, month :- Encode using sine and cosine values, as they are cyclic in nature
#NOTE: the cells below deviate from this plan in places - nom_0 - nom_4 and ord_1 - ord_2 are label-encoded.

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/cat-in-the-dat/train.csv',
        '/kaggle/input/cat-in-the-dat/test.csv',
    )
)

In [None]:
# Count the rows of each target class; the ratio of these counts is used
# further down as the positive-class weight.
is_positive = train.target == 1
is_negative = train.target == 0
positive_count = int(is_positive.sum())
negative_count = int(is_negative.sum())

print(f'positive_count: {positive_count}')
print(f'negative_count: {negative_count}')

In [None]:
## FE new columns from ord_5

# Replace ord_5 with two columns holding its first and second character.
# The same steps are applied to train and test so their columns stay aligned.
for frame in (train, test):
    frame["ord_5a"] = frame["ord_5"].str.get(0)
    frame["ord_5b"] = frame["ord_5"].str.get(1)
    frame.drop(columns=['ord_5'], inplace=True)

In [None]:
def intersec_label_encode(vals, df1, df2, col):
    """Integer-encode ``col`` in both frames using one shared set of labels.

    The distinct values of ``vals`` are sorted and numbered ``0..k-1``; the same
    numbering is then applied to ``df1[col]`` and ``df2[col]`` so that codes
    agree across the two frames. A value that is not in ``vals`` (including a
    missing value) is encoded as ``-1`` instead of raising: ``vals`` is
    typically the intersection of the two frames' values, so either frame may
    legitimately contain labels outside it.

    Args:
        vals: Array-like of labels that define the code book.
        df1: First DataFrame; ``df1[col]`` is overwritten in place.
        df2: Second DataFrame; ``df2[col]`` is overwritten in place.
        col: Name of the column to encode.

    Returns:
        The tuple ``(df1, df2)`` - the same objects that were passed in.
    """
    # np.unique sorts and de-duplicates, which yields the same label order a
    # LabelEncoder fitted on `vals` would use.
    classes = np.unique(np.asarray(vals))
    for frame in (df1, df2):
        # Categorical codes are positions in `classes`; labels outside it get -1.
        codes = pd.Categorical(frame[col], categories=classes).codes
        frame[col] = codes.astype(np.int64)
    return df1, df2

# Encode each ordinal column with a code book built from the labels that
# occur in BOTH train and test.
# NOTE(review): the resulting codes presumably follow sorted label order rather
# than a hand-specified ranking for ord_1 / ord_2 - confirm this is intended.
intersec_cols = ['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5a', 'ord_5b']
for col in intersec_cols:
    train_levels = train[col].unique()
    test_levels = test[col].unique()
    intersec_vals = np.intersect1d(train_levels, test_levels)
    train, test = intersec_label_encode(intersec_vals, train, test, col)
    

In [None]:
## FE sin and cos columns from month and day

def date_cyc_enc(df, col, max_vals):
    """Add sine/cosine encodings of a cyclic column to ``df``.

    The column is mapped onto a circle with period ``max_vals`` and two new
    columns, ``<col>_sin`` and ``<col>_cos``, are written into ``df`` in place.

    Args:
        df: DataFrame that holds ``col``; it is modified in place.
        col: Name of the numeric cyclic column.
        max_vals: Length of one full cycle (e.g. 7 for a day of the week).

    Returns:
        The same ``df`` object, now with the two extra columns.
    """
    angle = 2 * np.pi * df[col] / max_vals
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        df[col + suffix] = trig(angle)
    return df

# Encode day (period 7) and then month (period 12) as sin/cos pairs, and
# drop the raw columns afterwards. Train and test get identical treatment.
train = date_cyc_enc(date_cyc_enc(train, 'day', 7), 'month', 12)
train.drop(columns=['day', 'month'], inplace=True)

test = date_cyc_enc(date_cyc_enc(test, 'day', 7), 'month', 12)
test.drop(columns=['day', 'month'], inplace=True)

In [None]:
# nom_5 .. nom_9 are encoded with a CatBoost encoder: fitted on the training
# rows together with their target, then applied unchanged to the test rows.
# NOTE(review): the encoder is fitted before the hold-out split made further
# down, so the hold-out rows take part in the fit - confirm this is acceptable.
high_cardinal_cols = [f'nom_{idx}' for idx in range(5, 10)]

target_encoder = CatBoostEncoder()
train_encoded = target_encoder.fit_transform(train[high_cardinal_cols], train.target)
train[high_cardinal_cols] = train_encoded
test[high_cardinal_cols] = target_encoder.transform(test[high_cardinal_cols])

In [None]:
# Map the text flags to integers: 'F' (bin_3) and 'N' (bin_4) become 0,
# every other value becomes 1.
for frame in (train, test):
    frame['bin_3'] = frame['bin_3'].ne('F').astype('int64')
    frame['bin_4'] = frame['bin_4'].ne('N').astype('int64')

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Label-encode the low-cardinality nominal columns. Each encoder is fitted on
# the training column and reused for the test column so the codes agree.
label_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
for col in label_cols:
    label_encoder = LabelEncoder()
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.transform(test[col])

# Fit the imputer on the training data only and apply that same fitted model
# to the test data. Re-fitting on test (fit_transform) would impute train and
# test with two different models.
imp = IterativeImputer(max_iter=10, random_state=0)
train[label_cols] = imp.fit_transform(train[label_cols])
test[label_cols] = imp.transform(test[label_cols])

In [None]:
# Show the first rows of the training frame to inspect the encoded columns.
train.head()

In [None]:
# Set the row identifier and the label aside, leaving only model features.
train_id = train['id']
target = train['target']
train = train.drop(columns=['id', 'target'])

# pop() removes the id column from test in place and hands it back.
test_id = test.pop('id')

# Row-count check: label vs. training features, then the test set.
print(f'{len(target)} {len(train)}')
print(f'{len(test)}')

In [None]:
from sklearn.model_selection import train_test_split

features = train.columns.tolist()  # you can customize later.

# Hold out 25% of the labelled rows for validation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.25,
    random_state=42,
)

for split_name, split_frame in (('Train', X_train), ('Test', X_test)):
    print(f'{split_name} set: {split_frame.shape}')

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Candidate values sampled by the randomized search.
params = {
        'learning_rate' : [0.07, 0.1, 0.09],
        'min_child_weight': [10, 8, 6],
        'gamma': [1, 2],
        'subsample': [0.8],
        'colsample_bytree': [0.1, 0.4, 0.3, 0.5],
        'max_depth': [3, 4, 5],
        'reg_alpha':[0.009, 0.01, 0.02],
        'reg_lambda':[0.04, 0.05, 0.06]
        }

# Up-weight the positive class by the negative/positive ratio computed from
# the class counts in an earlier cell.
class_weight = negative_count/positive_count
print(f'class_weight: {class_weight}')

# The estimator handed to the search must be a model instance. The name `xgb`
# refers to the imported xgboost module, so the model gets its own name rather
# than shadowing (or being confused with) that module.
xgb_estimator = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    scale_pos_weight=class_weight,
    verbosity=0,
    n_jobs=1,  # parallelism comes from the search itself (n_jobs below)
)

folds = 3
SEED = 42

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = SEED)
search = RandomizedSearchCV(
    xgb_estimator,
    param_distributions=params,
    n_iter=20,
    # NOTE(review): the final model below is monitored with AUC; consider
    # scoring='roc_auc' if that is the metric of interest.
    scoring='f1',
    n_jobs=4,
    # Pass the splitter itself rather than a one-shot generator from
    # skf.split(...), so the search object can be fitted more than once.
    cv=skf,
    verbose=2,
    # Fix the sampled candidates so repeated runs are comparable.
    random_state=SEED )

search.fit(X_train, y_train)

In [None]:
# Report the best parameter combination found by the randomized search.
print('\n Best hyperparameters:')
print(search.best_params_)

# Results recorded from earlier runs, kept for reference.
# NOTE(review): the first two list learning_rate 0.3, which is not one of the
# learning rates in the `params` grid of the search cell, so they presumably
# come from an older version of that grid.
#Best hyperparameters:
#{'subsample': 0.8, 'reg_lambda': 0.05, 'reg_alpha': 0.01, 'min_child_weight': 8, 'max_depth': 4, 'learning_rate': 0.3, 'gamma': 1, 'colsample_bytree': 0.4}
#{'subsample': 0.8, 'reg_lambda': 0.05, 'reg_alpha': 0.01, 'min_child_weight': 6, 'max_depth': 3, 'learning_rate': 0.3, 'gamma': 1, 'colsample_bytree': 0.3}
#{'subsample': 0.8, 'reg_lambda': 0.05, 'reg_alpha': 0.02, 'min_child_weight': 6, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.1}

In [None]:
# Weight for the positive class: ratio of negative to positive training rows.
class_weight = negative_count/positive_count
print(f'class_weight: {class_weight}')

# Settings for the final model: a low learning rate with a large tree budget,
# relying on early stopping (configured in fit below) to pick the stopping point.
param_dist = dict(
    objective='binary:logistic',
    n_estimators=2000,
    learning_rate=0.01,
    scale_pos_weight=class_weight,
    max_depth=5,
    min_child_weight=6,
    colsample_bytree=0.1,  # 0.6
    subsample=0.8,
    gamma=1,  # 0, 1 or 5
    reg_lambda=0.05,
    reg_alpha=0.02,
)

clf = XGBClassifier(**param_dist)
# Or you can use: clf = xgb.XGBClassifier(**param_dist)

# Both the training split and the hold-out split are tracked during fitting,
# each with AUC and log loss.
eval_sets = [(X_train, y_train), (X_test, y_test)]
clf.fit(
    X_train,
    y_train,
    eval_set=eval_sets,
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=100,
    verbose=True,
)

#results = cross_val_score(model, X_train_enc, y_train, cv=kfold)
#print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
from matplotlib import pyplot


# Fetch the per-round metric history recorded for each eval set during fit.
results = clf.evals_result()

epochs = len(results['validation_0']['auc'])
x_axis = range(0, epochs)

# One figure per metric: log loss first, then AUC. 'validation_0' is the
# training split and 'validation_1' the hold-out split, matching the order of
# eval_set in the fit call.
for metric_key, y_label, plot_title in (
    ('logloss', 'Log Loss', 'XGBoost Log Loss'),
    ('auc', 'Classification AUC', 'XGBoost Classification AUC'),
):
    fig, ax = pyplot.subplots()
    ax.plot(x_axis, results['validation_0'][metric_key], label='Train')
    ax.plot(x_axis, results['validation_1'][metric_key], label='Test')
    ax.legend()
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    pyplot.show()


In [None]:
# Build the submission table: one row per test id with the predicted
# probability from column 1 of predict_proba (presumably the positive class),
# then write it without the index.
sub_df = pd.DataFrame({
    'id': test_id,
    'target': clf.predict_proba(test)[:, 1],
})
sub_df.to_csv('submission.csv', index=False)

In [None]:
# Show the first ten rows of the submission as a quick sanity check.
sub_df.head(10)