# In [None]:
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Build a stratified 5-fold assignment for the training data and save it
    # as a new CSV with an extra "kfold" column (values 0..4).

    # Fixed seed so the shuffle, and therefore the fold each row lands in,
    # is identical on every run; without it CV scores are not comparable
    # between runs.
    seed = 42

    # Read the training data
    df = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")

    # -1 marks "not yet assigned"; every row is overwritten in the loop below
    df["kfold"] = -1

    # Shuffle the rows reproducibly and rebuild a 0..n-1 index so the
    # positional indices returned by split() can be used as .loc labels
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Labels used for stratification
    y = df.target.values

    # StratifiedKFold is left unshuffled on purpose: the rows were already
    # shuffled above
    kf = model_selection.StratifiedKFold(n_splits=5)

    # Each row appears in exactly one validation split; record which one
    for fold, (_, val_) in enumerate(kf.split(X=df, y=y)):
        df.loc[val_, "kfold"] = fold

    # Save the new csv with the kfold column
    df.to_csv("train_KFolds.csv", index=False)

# In [None]:
# All features are one-hot encoded, which produces a large sparse matrix.
# Truncated SVD is then used to reduce that sparse matrix to a smaller,
# dense set of components before training the model.

import pandas as pd
from scipy import sparse
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Train and score a random forest on one cross-validation fold.

    Reads ``./train_KFolds.csv``, one-hot encodes every feature column,
    reduces the encoded matrix to 120 components with truncated SVD, fits a
    random forest on the rows whose ``kfold`` differs from ``fold`` and
    evaluates it on the rows whose ``kfold`` equals ``fold``. The fold's
    ROC AUC is printed.

    Args:
        fold: Value of the ``kfold`` column to hold out as validation data.

    Returns:
        The ROC AUC obtained on the validation rows.
    """
    # load the full training dataset with folds
    df = pd.read_csv("./train_KFolds.csv")

    # all columns are features except id, target and kfold columns
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    # Treat every feature as a string category and give missing values their
    # own "NONE" category. The missing-value mask is taken BEFORE the string
    # conversion: converting first turns NaN into the literal text "nan", so
    # a later fillna() would find nothing to fill. Assigning with df[col]
    # replaces the column outright instead of writing strings in place into
    # a numeric column.
    for col in features:
        is_missing = df[col].isna()
        df[col] = df[col].astype(str).where(~is_missing, "NONE")

    # get the training data
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get the validation data
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # init OneHotEncoder from sklearn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features so that every category seen
    # in either split is known to the encoder (targets are not involved)
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0
    )
    ohe.fit(full_data[features])

    # transform training and validation data
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    # init TRUNCATED SVD
    # we are reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)

    # fit the svd on the stacked train + validation encoded matrices
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    # project both splits onto the SVD components
    x_train = svd.transform(x_train)
    x_valid = svd.transform(x_valid)

    # init Random Forest Model
    model = ensemble.RandomForestClassifier(n_jobs=-1)

    # fit the model on the SVD-reduced training data
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need probability values as we are calculating AUC
    # we will use probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

    return auc

if __name__ == "__main__":
    # Evaluate the five folds one after another, in order 0..4;
    # run() prints the AUC of each fold itself.
    for fold_ in (0, 1, 2, 3, 4):
        run(fold_)