## Creating folds for validation

In [None]:
import enum
import pandas as pd
from pandas.core.algorithms import mode
from sklearn import model_selection

if __name__ == "__main__":
    # Assign every training row to one of five stratified folds and persist
    # the assignment in a new "kfold" column.
    df = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv")

    # Every row starts out unassigned (-1); the loop below overwrites it.
    df["kfold"] = -1

    # Shuffle all rows, then rebuild a clean 0..n-1 index so the positional
    # indices yielded by the splitter can be used directly with .loc.
    df = (
        df.sample(frac=1)
        .reset_index(drop=True)
    )

    # Stratification is driven by the class labels.
    y = df["target"].values

    splitter = model_selection.StratifiedKFold(n_splits=5)  # k = 5

    # Only the validation indices are needed: the fold number is recorded
    # on the rows that are held out in that fold.
    for fold, (_, val_idx) in enumerate(splitter.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold

    # Persist the data together with its fold assignment.
    df.to_csv("train_kfolds", index=False)

## Cross Validation

In [None]:
# Using Logistic Regression

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression

def run(fold):
    """Train and score a one-vs-rest logistic regression on a single fold.

    Rows of ``./train_kfolds`` whose ``kfold`` value equals ``fold`` form the
    validation set; all remaining rows are used for training. The AUC,
    accuracy and log loss on the validation set are printed; nothing is
    returned.
    """
    data = pd.read_csv("./train_kfolds")

    # Everything except the identifier, the label and the fold marker is
    # treated as a model input.
    excluded = ("id", "target", "kfold")
    feature_cols = [col for col in data.columns if col not in excluded]

    # Encode the string labels "Class_1".."Class_4" as the integers 1..4.
    label_to_int = {f"Class_{k}": k for k in range(1, 5)}
    data["target"] = data["target"].map(label_to_int)

    # One boolean mask selects the held-out rows; its complement is the
    # training portion.
    held_out = data["kfold"] == fold
    train_part = data[~held_out].reset_index(drop=True)
    valid_part = data[held_out].reset_index(drop=True)

    X_train, y_train = train_part[feature_cols], train_part["target"].values
    X_valid, y_valid = valid_part[feature_cols], valid_part["target"].values

    model = LogisticRegression(multi_class='ovr')
    model.fit(X_train, y_train)

    # Class probabilities feed AUC and log loss; hard labels feed accuracy.
    pred_probs = model.predict_proba(X_valid)
    predictions = model.predict(X_valid)

    auc = roc_auc_score(y_valid, pred_probs, multi_class='ovr')
    acc_sc = accuracy_score(y_valid, predictions)
    ll = log_loss(y_valid, pred_probs)

    print(f" AUC Score : {auc} || Accuracy : {acc_sc} || log_loss = {ll}")


if __name__ == "__main__":
    # Evaluate the model once per fold (fold ids 0..4).
    for fold_id in range(5):
        run(fold_id)

## Fitting on the Whole Data

In [None]:
# Reload the raw training data for the final fit on all rows.
df = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv")

# Model inputs are every column except the identifier and the label.
features = [col for col in df.columns if col not in ("id", "target")]

X_train = df.loc[:, features]
y_train = df.target.values

In [None]:
# Fit the final one-vs-rest model; fit() returns the estimator itself.
lr = LogisticRegression(multi_class="ovr").fit(X_train, y_train)

## Testing and Submission

In [None]:
# Load the test set and keep the same feature columns used for training.
test = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")
X_test = test.loc[:, features]

In [None]:
# Predict class probabilities for every test row
pred_probs = lr.predict_proba(X_test)

In [None]:
# Inspect the dimensions of the predicted-probability array
pred_probs.shape

In [None]:
# Submission time !!
# Load the sample submission; its non-id columns are overwritten with the
# predicted probabilities further down.
sub = pd.read_csv("../input/tabular-playground-series-may-2021/sample_submission.csv")

In [None]:
# Preview the first rows of the sample submission
sub.head()

In [None]:
# Every submission column except the row identifier receives a prediction.
sub_cols = [name for name in sub.columns if name != "id"]

In [None]:
# Overwrite the placeholder columns with the predicted probabilities.
# NOTE(review): assumes the column order of the sample submission matches
# the class order of lr.classes_ — confirm before submitting.
sub[sub_cols] = pred_probs

In [None]:
# Preview the submission after filling in the predictions
sub.head()

In [None]:
# Write the submission file, leaving out the DataFrame index
sub.to_csv("Log_res.csv", index=False)