In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import model_selection
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)



In [None]:
## create fold.py
# Assigns every training row to one of five stratified folds and saves the
# result to ./cat_train_fold.csv for the modelling cells below.
if __name__ == "__main__":

    df = pd.read_csv("../input/cat-in-the-dat/train.csv")

    # we create a new column called kfold and fill it with -1
    # (-1 means "not assigned to any fold yet")
    df['kfold'] = -1

    # the next step is to randomize the rows of the data.
    # A fixed random_state keeps fold membership identical across kernel
    # restarts, so the saved CSV and every score computed from it are
    # reproducible.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # fetch labels
    y = df.target.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column.
    # split() yields positional indices; they are valid as .loc labels here
    # because the index was reset to 0..n-1 right after shuffling.
    for fold, (train_, valid_) in enumerate(kf.split(X=df, y=y)):
        df.loc[valid_, 'kfold'] = fold

    # every row must have landed in exactly one validation fold
    assert (df['kfold'] != -1).all(), "some rows were not assigned a fold"

    # save the new csv with kfold column
    df.to_csv("./cat_train_fold.csv", index=False)


In [None]:
# Number of rows assigned to each fold.
df["kfold"].value_counts()

In [None]:
# Target class counts among the rows of fold 0.
df.loc[df["kfold"] == 0, "target"].value_counts()

In [None]:
# Target class counts among the rows of fold 1.
df.loc[df["kfold"] == 1, "target"].value_counts()

In [None]:
# Target class counts among the rows of fold 2.
df.loc[df["kfold"] == 2, "target"].value_counts()

In [None]:
# Target class counts among the rows of fold 3.
df.loc[df["kfold"] == 3, "target"].value_counts()

In [None]:
# Target class counts among the rows of fold 4.
df.loc[df["kfold"] == 4, "target"].value_counts()

In [None]:
# Read the saved fold file back from disk and preview its first five rows.
df_fold = pd.read_csv("./cat_train_fold.csv")
df_fold.head(n=5)

We see that the distribution of targets is the same in every fold, which is exactly what we need. The distributions do not have to be identical every time; being very similar is enough. Now, when we build our models, every fold will have the same (or nearly the same) distribution of targets.

In [None]:
# One of the simplest models we can build is by one-hot encoding all the data and using logistic regression

from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

def logistic_regression(fold):
    """Train and score a one-hot + logistic-regression baseline on one fold.

    Reads ``./cat_train_fold.csv``, treats every column except ``id``,
    ``target`` and ``kfold`` as a categorical feature, one-hot encodes the
    features, fits a liblinear logistic regression on the rows whose
    ``kfold`` differs from ``fold`` and prints the ROC AUC obtained on the
    rows whose ``kfold`` equals ``fold``.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.

    Returns:
        The validation ROC AUC computed by ``metrics.roc_auc_score``.
    """
    # load the full training data with folds
    df = pd.read_csv('./cat_train_fold.csv')

    # all columns are features except id, target and kfold columns
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    # Convert every feature to strings and mark missing values as "NONE".
    # astype(str) stringifies NaN (it becomes "nan"), so calling fillna()
    # afterwards never fills anything; mask on the original column instead.
    # Assigning with df[col] replaces the column outright, whereas
    # df.loc[:, col] tries to write in place into the existing dtype.
    for col in features:
        df[col] = df[col].astype(str).where(df[col].notna(), "NONE")

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features so that every category
    # present in the validation rows is known to the encoder
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data[features])

    # transform training data
    x_train = ohe.transform(df_train[features])

    # transform validation data
    x_valid = ohe.transform(df_valid[features])

    # initialize Logistic Regression model
    model = linear_model.LogisticRegression(solver='liblinear')

    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

    # also hand the score back so callers can aggregate it across folds
    return auc
    

# Evaluate the logistic-regression baseline once per fold (folds 0 through 4).
if __name__ == "__main__":
    for fold_ in range(0, 5):
        logistic_regression(fold=fold_)
    

We see that AUC scores are quite stable across all folds.<br>
The average AUC is:
0.78631449527.<br> Quite good for our first model!
<br>
<br>
<hr>
Many people will start this kind of problem with a tree-based model, such as random forest. To apply random forest to this dataset, instead of one-hot encoding, we can use label encoding and convert every category in every feature column to an integer, as discussed previously.


In [None]:
from sklearn import ensemble
def Random_forest(fold, random_state=42):
    """Train and score a label-encoded random-forest baseline on one fold.

    Reads ``./cat_train_fold.csv``, treats every column except ``id``,
    ``target`` and ``kfold`` as a categorical feature, label-encodes each
    feature to integers, fits a ``RandomForestClassifier`` on the rows whose
    ``kfold`` differs from ``fold`` and prints the ROC AUC obtained on the
    rows whose ``kfold`` equals ``fold``.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.
        random_state: Seed passed to ``RandomForestClassifier`` so repeated
            runs give the same score.

    Returns:
        The validation ROC AUC computed by ``metrics.roc_auc_score``.
    """
    # load the full training data with folds
    df = pd.read_csv('./cat_train_fold.csv')

    # all columns are features except id, target and kfold columns
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    for col in features:
        # Convert to strings and mark missing values as "NONE".
        # astype(str) stringifies NaN (it becomes "nan"), so calling
        # fillna() afterwards never fills anything; mask on the original
        # column instead.
        filled = df[col].astype(str).where(df[col].notna(), "NONE")

        # Label encode the feature; the encoder is fit on all rows
        # (training and validation together).
        # Assigning with df[col] replaces the column so it takes the
        # integer dtype of the codes, whereas df.loc[:, col] tries to
        # write in place into the existing string column.
        lbl = preprocessing.LabelEncoder()
        df[col] = lbl.fit_transform(filled)

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # get training data
    x_train = df_train[features].values

    # get validation data
    x_valid = df_valid[features].values

    # initialize random forest model (seeded for reproducible scores)
    model = ensemble.RandomForestClassifier(
        n_jobs=-1, random_state=random_state
    )

    # fit model on training data (label encoded)
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

    # also hand the score back so callers can aggregate it across folds
    return auc
    

# Evaluate the random-forest baseline once per fold (folds 0 through 4).
if __name__ == "__main__":
    for fold_ in range(0, 5):
        Random_forest(fold=fold_)

Wow! Huge difference! The random forest model, without any tuning of hyperparameters, performs a lot worse than simple logistic regression.<br><br>
<h3><i>
    And this is a reason why we should always start with simple models first. A fan of random forest would begin with it here and will ignore logistic regression model thinking it’s a very simple model that cannot bring any value better than random forest. That kind of person will make a huge mistake. In our implementation of random forest, the folds take a much longer time to complete compared to logistic regression. So, we are not only losing on AUC but also taking much longer to complete the training. Please note that inference is also time-consuming with random forest and it also takes much larger space.</i></h3>