In [None]:
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":

    # Fixed seed for the shuffle below. Without it every execution of this
    # cell writes a different fold assignment, so fold-wise scores from
    # different runs cannot be compared.
    seed = 42

    # Read the training data
    df = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")

    # create a new column called kfold and fill it with -1;
    # -1 acts as a "no fold assigned yet" placeholder
    df["kfold"] = -1

    # randomize the rows (reproducibly) and rebuild a 0..n-1 index
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # fetch the labels used for stratification
    y = df.target.values

    # init the kfold class from model selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    # split() yields positional row indices; using them with .loc is valid
    # here only because the index was reset to 0..n-1 above
    for fold, (train_, val_) in enumerate(kf.split(X=df, y=y)):
        df.loc[val_, 'kfold'] = fold

    # save the new csv with kfold column
    df.to_csv("train_KFolds.csv", index=False)

In [None]:
# All the data is Label Encoded
# Using XGBoost

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Train an XGBoost classifier for one cross-validation fold and print its AUC.

    Reads ``./train_KFolds.csv``, label encodes every feature column, fits the
    model on all rows whose ``kfold`` differs from ``fold`` and evaluates it on
    the rows whose ``kfold`` equals ``fold``.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to hold out as the validation set.

    Raises
    ------
    ValueError
        If no row in the file has ``kfold == fold``.
    """

    # load the full training data with folds
    df = pd.read_csv("./train_KFolds.csv")

    # all columns are features except id, target and kfold columns
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    # convert every feature to strings and mark missing values as "NONE"
    # the missing-value mask is taken BEFORE the string conversion: once a
    # NaN has been turned into the text "nan", fillna can no longer see it
    for col in features:
        missing = df[col].isna()
        # plain column assignment replaces the column (and its dtype);
        # df.loc[:, col] = ... would write into the existing column in place
        df[col] = df[col].astype(str).mask(missing, "NONE")

    # label encode the features
    for col in features:

        # init LabelEncoder for each feature column
        lbl = preprocessing.LabelEncoder()

        # fit on all the data (training and validation rows together) so
        # every category present in the validation fold has a code
        lbl.fit(df[col])

        # transform all the data; assigning the column directly makes it
        # take the encoder's integer dtype instead of staying object
        df[col] = lbl.transform(df[col])

    # training data: every row that is NOT in the held-out fold
    # (filtering on -1 here would keep the validation rows in the training
    # set and leak the targets the model is scored on)
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # validation data: only the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # fail early with a clear message instead of after a full model fit
    if df_valid.empty:
        raise ValueError(f"no rows with kfold == {fold} in ./train_KFolds.csv")

    # get the training data
    x_train = df_train[features].values

    # get the validation data
    x_valid = df_valid[features].values

    # init XGBoost Model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200
    )

    # fit the model on training data (Label Encoded)
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get the auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    # evaluate the folds one after another: 0, 1, 2, 3, 4
    fold_id = 0
    while fold_id < 5:
        run(fold_id)
        fold_id += 1