In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

In [None]:
# Load the three competition CSVs in one pass: labelled training rows,
# unlabelled test rows, and the sample submission used as the output template.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-nov-2021/train.csv",
        "../input/tabular-playground-series-nov-2021/test.csv",
        "../input/tabular-playground-series-nov-2021/sample_submission.csv",
    )
)

In [None]:
# Pull the labels out before the "target" column is dropped below.
target = train_df["target"].to_numpy()

# From here on `train_df` / `test_df` are plain NumPy feature matrices, not
# DataFrames: the "id" column (and the label column for train) is removed.
train_df = train_df.drop(["id", "target"], axis="columns").to_numpy()
test_df = test_df.drop(["id"], axis="columns").to_numpy()

In [None]:
# Feature augmentation: fit a Linear Discriminant Analysis model on the
# labelled training matrix and append its projection as extra column(s) to
# both the training and the test features.
#
# NOTE(review): the LDA is fit on *all* training rows and their labels here,
# before the cross-validation split performed later in the notebook. The
# appended column(s) therefore already encode label information for rows that
# later serve as validation data, so the per-fold scores are likely optimistic.
# NOTE(review): this cell overwrites `train_df` / `test_df`, so running it
# twice appends the projection twice.
lda = LDA()
lda.fit(train_df, target)

train_df = np.concatenate((train_df, lda.transform(train_df)), axis=1)
test_df = np.concatenate((test_df, lda.transform(test_df)), axis=1)

In [None]:
# Model: min-max scale every feature, then fit a liblinear logistic regression.
clf_pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("logistic regression", LogisticRegression(solver="liblinear"))
])

# Single place to change the number of folds; the averages below are derived
# from the number of folds actually run instead of repeating this literal.
N_SPLITS = 5
cv = StratifiedKFold(n_splits=N_SPLITS)

test_preds = []   # one vector of test-set probabilities per fold
fold_scores = []  # validation ROC AUC of each fold

for fold, (train_idx, val_idx) in enumerate(cv.split(train_df, target)):
    X_train, y_train = train_df[train_idx, :], target[train_idx]
    X_val, y_val = train_df[val_idx, :], target[val_idx]

    # Refit the whole pipeline (scaler + classifier) on this fold's training rows.
    clf_pipe.fit(X_train, y_train)

    # Column 1 of predict_proba is scored against the labels; presumably this
    # is the positive class (label 1) -- confirm against clf_pipe.classes_.
    y_pred = clf_pipe.predict_proba(X_val)
    score = roc_auc_score(y_val, y_pred[:, 1])
    fold_scores.append(score)

    print(f"FOLD {fold} Score: {score}")

    # Keep this fold's test predictions so they can be averaged afterwards.
    test_preds.append(clf_pipe.predict_proba(test_df)[:, 1])

# Average over the folds that were actually evaluated.
mean_score = sum(fold_scores) / len(fold_scores)

print()
print(f"Mean score of all folds: {mean_score}")

In [None]:
# Blend the per-fold test predictions by simple averaging. Dividing by the
# number of collected prediction vectors (rather than a hard-coded fold count)
# keeps the average correct if the number of folds changes.
if not test_preds:
    raise ValueError("test_preds is empty; run the cross-validation cell first")

sub_df["target"] = sum(test_preds) / len(test_preds)
sub_df.to_csv("submission.csv", index=False)

# Preview the first rows of the submission as the cell output.
sub_df.head()