In [None]:
import datetime
import os
import sys

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

if "ipykernel" in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

In [None]:
# Cache the competition CSVs as pickles in the working directory so that
# re-running the notebook does not have to re-parse the CSV files.
data_src_dir = "/kaggle/input/tabular-playground-series-nov-2021"
data_dst_dir = "/kaggle/working/data"
filename_list = ["train", "test", "sample_submission"]
# exist_ok=True makes directory creation idempotent without a separate
# (race-prone) existence check.
os.makedirs(data_dst_dir, exist_ok=True)
for filename in filename_list:
    data_src_path = os.path.join(data_src_dir, filename) + ".csv"
    data_dst_path = os.path.join(data_dst_dir, filename) + ".pkl"
    # Convert only the files that have not been cached yet.
    if not os.path.exists(data_dst_path):
        # The first CSV column becomes the row index.
        df = pd.read_csv(data_src_path, index_col=0)
        df.to_pickle(data_dst_path)

# Load all three frames from the pickle cache written above.
df_train = pd.read_pickle(os.path.join(data_dst_dir, "train.pkl"))
df_test = pd.read_pickle(os.path.join(data_dst_dir, "test.pkl"))
df_submission = pd.read_pickle(os.path.join(data_dst_dir, "sample_submission.pkl"))
# Test rows as a plain ndarray (column names are not carried over).
X_test = df_test.values

In [None]:
# Stratified k-fold cross-validation of a logistic regression on df_train.
# Each fold's model also scores the test set; the fold predictions are
# averaged and written out as a submission CSV.
n_splits = 10
# Fixed seed so the shuffled fold assignment (and hence the reported AUCs and
# the submission) is reproducible across Restart & Run All.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
# Separate features and label once instead of re-dropping the column per fold.
features_all = df_train.drop("target", axis=1)
target_all = df_train["target"]
# Accumulates the mean positive-class probability over all folds.
predict_test = np.zeros(len(df_test))
for index_train, index_val in tqdm(skf.split(features_all, target_all), total=n_splits):
    # max_iter must be an integer; scikit-learn's parameter validation
    # rejects the float 1e8.
    clf = LogisticRegression(max_iter=100_000_000)
    X_train = features_all.iloc[index_train]
    y_train = target_all.iloc[index_train]
    X_val = features_all.iloc[index_val]
    y_val = target_all.iloc[index_val]
    clf = clf.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    predict_val = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, predict_val)
    tqdm.write(f"auc: {auc:.2f}")
    # Predict on the DataFrame rather than a bare ndarray so the input carries
    # the same feature names the model was fitted with (avoids scikit-learn's
    # feature-name mismatch warning).
    # NOTE(review): assumes df_test has exactly the training feature columns
    # (everything except "target"), in the same order.
    predict_test += clf.predict_proba(df_test)[:, 1] / n_splits
df_submission["target"] = predict_test
# Timestamp formatted as YYYY-MM-DD-HH-MM-SS for use in the file name.
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
predict_dir = os.path.join(data_dst_dir, "predict")
os.makedirs(predict_dir, exist_ok=True)
# File name reflects the model actually trained in this cell (previously it
# was mislabelled as a decision tree).
df_submission.to_csv(os.path.join(predict_dir, f"{now}_logistic_regression.csv"))