In [None]:
import os
import pandas as pd
import numpy as np
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


# Raise pandas' display limits to 500 rows / 500 columns so that wide or long
# frames shown in later cells are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
# List the files present in the data directory that the next cell reads from.
os.listdir('cat_in_dat/')

In [None]:
# Read the train/test feature tables and the separately stored test labels
# (the label file has no header row).
train = pd.read_csv('cat_in_dat/train_cat_kaggle.csv')
test = pd.read_csv('cat_in_dat/test_cat_kaggle.csv')
y_test = pd.read_csv('cat_in_dat/y_test_cat_kaggle.csv', header=None).values.flatten()

# Pull the label out of the training frame so that only features remain.
y_train = np.asarray(train['target']).flatten()
train = train.drop(columns=['target'])

# Number of training rows; used later to split the stacked frame back apart.
ntrain = len(train)

# All labels in the same order in which train and test are stacked.
y = np.concatenate([y_train, y_test])

In [None]:
# Stack train on top of test so the feature-engineering cells below can
# process both at once (the first ntrain rows are the training rows).
#
# ignore_index=True gives the stacked frame a unique 0..N-1 index. Without it
# the test rows keep their own labels 0..ntest-1, which duplicate the labels of
# the first training rows; any later column assignment that aligns on labels
# (for example assigning an encoder's DataFrame/Series output) would then fill
# the test rows with values that belong to training rows.
train_test = pd.concat([train, test], ignore_index=True)

# The row identifier carries no signal for the model.
train_test = train_test.drop(columns='id')

In [None]:
# Preview the first rows of the stacked train+test frame.
train_test.head()

## Feature engineering

In [None]:
# Derive four helper columns from ord_5: its first and second character, and
# for each of those a 1/0 flag telling whether the character equals its own
# upper-cased form. Non-string (missing) inputs propagate as NaN.
def _char_at(value, position):
    """Return value[position] for strings, NaN for anything else."""
    return value[position] if type(value) is str else np.nan


def _upper_flag(char):
    """Return 1 if char is unchanged by upper(), 0 otherwise; NaN for non-strings."""
    return int(char.upper() == char) if type(char) is str else np.nan


# Character columns first, flag columns second: later cells depend on the
# resulting column order.
for position in (0, 1):
    train_test['ord_5_' + str(position + 1)] = train_test['ord_5'].apply(_char_at, args=(position,))
for suffix in ('1', '2'):
    train_test['ord_5_' + suffix + '_u'] = train_test['ord_5_' + suffix].apply(_upper_flag)

In [None]:
# Converting ordinal labels into ordered values
# Converting ordinal labels into ordered values: each label is replaced by its
# position in the list, so the list order defines the ranking.
ord_1 = {label: rank for rank, label in enumerate(
    ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'])}

ord_2 = {label: rank for rank, label in enumerate(
    ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'])}

# Labels that are absent from a mapping (including missing values) become NaN.
for ord_name, ord_mapping in (('ord_1', ord_1), ('ord_2', ord_2)):
    train_test[ord_name] = train_test[ord_name].map(ord_mapping)

# Number of missing cells in each row, taken over every column present now.
# NOTE(review): the ord_5_* helper columns already exist at this point, so a
# missing ord_5 is counted once for ord_5 and once per derived column; confirm
# that this weighting is intended.
train_test['num_nan_count'] = train_test.isna().sum(axis=1)

In [None]:
# Group the columns by name prefix; each group gets its own encoding below.
def _columns_with_prefix(prefix):
    """Return the current train_test columns whose name starts with prefix."""
    return [name for name in train_test.columns if name.startswith(prefix)]


bin_col = _columns_with_prefix('bin_')
nom_col = _columns_with_prefix('nom_')

# Nominal columns are split at 10 distinct values; NaN counts as one value
# because unique() keeps it.
_nom_cardinality = {name: len(train_test[name].unique()) for name in nom_col}
nom_col_low = [name for name in nom_col if _nom_cardinality[name] <= 10]
nom_col_high = [name for name in nom_col if _nom_cardinality[name] > 10]

# 'ord_' also matches the ord_5_* helper columns created earlier.
ord_col = _columns_with_prefix('ord_')
num_col = _columns_with_prefix('num_')

# Everything that is not a 'num_' column is treated as a raw categorical
# column; a later cell drops these after the encoded versions are added.
all_cat_columns = [name for name in train_test.columns if name not in num_col]

In [None]:
%%time
# Cat boost encoder
for col in nom_col_high:
    fill_value = -1
    if train_test[col].dtype == 'O':
        fill_value = 'missing'
    si = SimpleImputer(strategy='constant', fill_value=fill_value)
    tr = CatBoostEncoder()
    temp = si.fit_transform(train_test[col].values.reshape(-1, 1))
    tr.fit(temp[:ntrain], y_train)
    train_test[col + '_te'] = tr.transform(temp)

In [None]:
%%time
# Label encoder
for col in ord_col:
    fill_value = -1
    if train_test[col].dtype == 'O':
        fill_value = 'missing'
    si = SimpleImputer(strategy='constant', fill_value=fill_value)
    tr = LabelEncoder()
    temp = si.fit_transform(train_test[col].values.reshape(-1, 1))
    train_test[col + '_le'] = tr.fit_transform(temp.flatten())

In [None]:
%%time
# One hot encoder
for col in bin_col + nom_col_low + ['day'] + ['month']:
    fill_value = -1
    if train_test[col].dtype == 'O':
        fill_value = 'missing'
    si = SimpleImputer(strategy='constant', fill_value=fill_value)
    tr = OneHotEncoder(categories='auto', sparse=False)
    temp = si.fit_transform(train_test[col].values.reshape(-1, 1))
    temp = tr.fit_transform(temp.reshape(-1, 1))
    columns = [col + '_' + col_names for col_names in tr.get_feature_names()]
    res = pd.DataFrame(temp, columns=columns)
    train_test = pd.concat([train_test.reset_index(drop=True), res.reset_index(drop=True)], axis=1)

In [None]:
%%time
# Freq encoding
for col in nom_col:
    fill_value = -1
    if train_test[col].dtype == 'O':
        fill_value = 'missing'
    si = SimpleImputer(strategy='constant', fill_value=fill_value)
    temp = pd.Series(si.fit_transform(train_test[col].values.reshape(-1, 1)).flatten())
    frequencies = temp.value_counts().to_dict()
    train_test[col + '_freq'] = temp.map(frequencies)

In [None]:
# Keep only the engineered features: remove the raw categorical columns that
# were listed before the encoded versions were created.
train_test = train_test.drop(columns=all_cat_columns)

In [None]:
# Preview the engineered feature frame after the raw columns were dropped.
train_test.head()

In [None]:
# (rows, columns) of the engineered feature frame.
train_test.shape

In [None]:
# Undo the stacking by position: the first ntrain rows are the training
# features, everything after them is the test features.
train, test = train_test.iloc[:ntrain], train_test.iloc[ntrain:]

## Random hyperparameter search

In [None]:
from sklearn.model_selection import ParameterSampler

# Settings shared by every candidate model. 'patience' is read by the
# training cell as the number of early-stopping rounds.
params = dict(
    objective='binary:logistic',
    n_estimators=10000,
    n_jobs=-1,
    verbosity=1,
    patience=20,
    random_state=0,
    tree_method='gpu_hist',
)

# Candidate values for each tuned hyper-parameter.
grid_params = dict(
    learning_rate=[.01, .05, .1],
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5],
    subsample=[.6, .7, .8, .9],
    colsample_bytree=[.6, .7, .8, .9, 1.0],
    reg_lambda=[.01, .025, .05, .075, .1, .5],
    reg_alpha=[0., .01, .025, .05, .1, .5],
    gamma=[.05, .075, .1, .3, .5, .7, 1.],
)

# Draw 20 random combinations (fixed seed) and merge the shared settings into
# each one.
list_params = [
    {**sampled, **params}
    for sampled in ParameterSampler(grid_params, n_iter=20, random_state=0)
]

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows (stratified, fixed seed) for early
# stopping and for ranking the candidates. Using the test set for either
# purpose leaks test information into model selection and makes the reported
# test AUC optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    train.values, y_train, test_size=0.2, stratify=y_train, random_state=0)
X_test = test.values

aucs = []       # validation AUC per candidate: use this to choose parameters
test_aucs = []  # test AUC per candidate: for reporting only
clfs_grid = []
for param in list_params:
    print(param)
    # 'patience' is this notebook's early-stopping setting, not an XGBoost
    # parameter, so it is not forwarded to the model constructor.
    model_kwargs = {key: value for key, value in param.items() if key != 'patience'}
    clf = XGBClassifier(**model_kwargs)
    clf.fit(X_fit, y_fit,
            eval_set=[(X_fit, y_fit), (X_val, y_val)],
            eval_metric='auc',
            early_stopping_rounds=params['patience'],
            verbose=5)
    aucs.append(roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]))
    test_aucs.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    clfs_grid.append(clf)

In [None]:
# Print each candidate's AUC followed by the parameters that produced it.
for score, candidate in zip(aucs, list_params):
    print(score, candidate, sep='\n')

## Cross-validation

In [None]:
# Parameter set used for the cross-validated models below: the tuned values
# first, then the same fixed settings as in the search. 'patience' is read by
# the training cell as the number of early-stopping rounds.
best_params = dict(
    subsample=0.7,
    reg_lambda=0.075,
    reg_alpha=0.01,
    min_child_weight=5,
    max_depth=5,
    learning_rate=0.01,
    gamma=0.5,
    colsample_bytree=0.7,
    objective='binary:logistic',
    n_estimators=10000,
    n_jobs=-1,
    verbosity=1,
    patience=20,
    random_state=0,
    tree_method='gpu_hist',
)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Train one model per stratified fold. Each model early-stops on its own
# validation fold and predicts the test set; the per-fold test predictions
# are collected for averaging in the next cell.
X_all = train.values   # materialise the arrays once instead of once per fold
X_test = test.values

# 'patience' is this notebook's early-stopping setting, not an XGBoost
# parameter, so it is not forwarded to the model constructor.
model_kwargs = {key: value for key, value in best_params.items() if key != 'patience'}

clfs = []
predictions = []
skf = StratifiedKFold(n_splits=5)
for train_index, valid_index in skf.split(X_all, y_train):
    X_train, Y_train = X_all[train_index], y_train[train_index]
    X_valid, Y_valid = X_all[valid_index], y_train[valid_index]
    clf = XGBClassifier(**model_kwargs)
    clf.fit(X_train, Y_train,
            eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
            eval_metric='auc',
            early_stopping_rounds=best_params['patience'],
            verbose=5)
    clfs.append(clf)
    prediction = clf.predict_proba(X_test)
    predictions.append(prediction)

In [None]:
from sklearn.metrics import roc_auc_score

# Average the per-fold probability arrays element-wise, keep column 1 of the
# result, and score that blended prediction against the test labels.
fold_mean = np.mean(predictions, axis=0)
test_predictions = fold_mean[:, 1]
auc = roc_auc_score(y_test, test_predictions)
auc

In [None]:
# Display the AUC value currently bound to `auc`.
auc

In [None]:
# Display the nominal columns that were classed as high-cardinality.
nom_col_high

In [None]:
import matplotlib
%matplotlib inline  
imp = pd.DataFrame(clf.feature_importances_, index=train.columns, columns=['Feature_importance'])
imp.sort_values('Feature_importance').plot(kind='barh', figsize=(8,20))