In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
pip install feature-engine

In [None]:
from feature_engine.encoding import OrdinalEncoder
from feature_engine.encoding import RareLabelEncoder

In [None]:
# Read the training CSV with its first column as the index, then separate the
# 'target' column (labels) from the remaining columns (features).
df = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/train.csv',
    index_col=[0],
)
y_train = df['target']
X_train = df.drop(columns='target')

In [None]:
# Read the test CSV with its first column as the index.
X_test = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/test.csv',
    index_col=[0],
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Partition the feature columns by dtype: object-dtype ('O') columns are treated
# as categorical, every other column as numerical. Column order is preserved.
vars_num = [col for col, kind in X_train.dtypes.items() if kind != 'O']
vars_cat = [col for col, kind in X_train.dtypes.items() if kind == 'O']

In [None]:
# Report how many numerical columns were found, then display their names.
print(f'Number of numerical variables: {len(vars_num)}')
vars_num

In [None]:
# Report how many categorical columns were found, then display their names.
print(f'Number of categorical variables: {len(vars_cat)}')
vars_cat

In [None]:
# Number of distinct values in each categorical column.
X_train.loc[:, vars_cat].nunique()

In [None]:
for col in ['cat3']:

    # Share of training rows that fall in each category of `col`.
    category_share = X_train[col].value_counts() / len(X_train)

    # Bar chart of the shares, most frequent category first.
    # (`plot.bar()` returns a matplotlib Axes, hence the name `ax`.)
    ax = category_share.sort_values(ascending=False).plot.bar()
    ax.set_xlabel(col)

    # Red line at 1 % (0.01) marks the threshold below which a category is
    # considered rare; it matches the tol=0.01 given to RareLabelEncoder.
    ax.axhline(y=0.01, color='red')
    ax.set_ylabel('Fraction of observations')
    plt.show()

In [None]:
# Build a rare-label encoder restricted to the 'cat3' column (tol=0.01,
# n_categories=4) and fit it on the training features.
rare_label_enc = RareLabelEncoder(
    variables=['cat3'],
    tol=0.01,
    n_categories=4,
)
rare_label_enc.fit(X_train)

In [None]:
# Apply the fitted rare-label encoder to the training features and keep the
# result under a separate name (X_train_rare).
X_train_rare = rare_label_enc.transform(X_train)

In [None]:
for col in ['cat3']:

    # Share of rows that fall in each category of `col` after rare-label
    # encoding has been applied.
    category_share = X_train_rare[col].value_counts() / len(X_train)

    # Bar chart of the shares, most frequent category first.
    # (`plot.bar()` returns a matplotlib Axes, hence the name `ax`.)
    ax = category_share.sort_values(ascending=False).plot.bar()
    ax.set_xlabel(col)

    # Red line at 1 % (0.01) marks the rare-category threshold; it matches the
    # tol=0.01 given to RareLabelEncoder.
    ax.axhline(y=0.01, color='red')
    ax.set_ylabel('Fraction of observations')
    plt.show()

In [None]:
# Modelling pipeline: rare-label grouping -> ordinal encoding -> XGBoost.
pipe = Pipeline([

    # Group infrequent labels in every categorical column. `tol` is not set
    # here; the grid search supplies it via 'encoder_rare_label__tol'.
    ('encoder_rare_label',
     RareLabelEncoder(n_categories=4, variables=vars_cat)),

    # Replace category labels with integers using the 'ordered' method.
    ('categorical_encoder',
     OrdinalEncoder(encoding_method='ordered', variables=vars_cat)),

    # Gradient-boosted tree classifier. The declared scikit-learn parameter
    # names (learning_rate, reg_alpha) are used instead of the native aliases
    # (eta, alpha): aliases only reach the estimator through **kwargs, which
    # XGBoost documents as not guaranteed to interact properly with
    # scikit-learn, and this estimator is cloned by GridSearchCV.
    # NOTE(review): tree_method='gpu_hist' needs a CUDA-capable GPU; newer
    # XGBoost releases appear to prefer tree_method='hist' with device='cuda'
    # -- confirm against the installed version.
    ('xgb', XGBClassifier(learning_rate=0.1, min_child_weight=7,
                          colsample_bytree=0.8, subsample=0.8,
                          reg_alpha=0, reg_lambda=1, gamma=0.2,
                          tree_method='gpu_hist'))
])

In [None]:
# Hyper-parameter grid, keyed by '<pipeline step>__<parameter>'.
param_grid = dict(
    # Frequency threshold passed to the rare-label encoder step.
    encoder_rare_label__tol=[0.01, 0.05],
    # Maximum tree depth passed to the XGBoost step.
    xgb__max_depth=[11, 13],
)

In [None]:
# Exhaustive search over `param_grid` with 4-fold cross-validation, scored by
# ROC AUC, using all available cores.
# NOTE(review): n_jobs=-1 runs fits in parallel while the xgb step requests GPU
# training -- confirm the concurrent fits do not exhaust GPU memory.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_