In [None]:
import scipy
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.model_selection import cross_validate
from statistics import mean, stdev
from math import sqrt
import category_encoders as ce

In [None]:
# Read the train and test tables, using the "id" column as the row index of each.
D0, D_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "/kaggle/input/cat-in-the-dat-ii/train.csv",
        "/kaggle/input/cat-in-the-dat-ii/test.csv",
    )
)

# Separate the label column from the training features (D0 itself is left intact).
D = D0.drop(columns="target")
y_train = D0["target"]

# Row ids of the test table, kept for the submission file written at the end.
test_ids = D_test.index

In [None]:
# Category orderings for the columns that are rank-encoded.
# ord_1 has a hand-written ranking; ord_3, ord_4 and ord_5 are ranked by the sorted
# order of the non-missing values found in D.
# Candidates that are currently left out (kept here for reference):
#   "ord_0": [1.0, 2.0, 3.0]
#   "ord_2": ["Freezing", "Cold", "Warm", "Hot", "Boiling Hot", "Lava Hot"]
#   "day", "month" (sorted-values variant)
ordinal_order = {
    "ord_1": ["Novice", "Contributor", "Expert", "Master", "Grandmaster"],
}
ordinal_order.update(
    (col, sorted(D[col].dropna().unique())) for col in ("ord_3", "ord_4", "ord_5")
)

# Turn every level into a string and give each column an extra literal "nan" level
# placed at the middle position of its ordering.
for k in list(ordinal_order):
    levels = [str(level) for level in ordinal_order[k]]
    levels.insert(len(levels) // 2, "nan")
    ordinal_order[k] = levels

In [None]:
# Supervised encoders, keyed by the single column each one is applied to.
# Only nom_6 gets one: a leave-one-out encoder from category_encoders.
target_encs = {col: ce.LeaveOneOutEncoder(cols=[col]) for col in ("nom_6",)}

In [None]:
# Cast every feature column to str so train and test share one uniform dtype.
# Missing values become ordinary strings as well — presumably the "nan" level
# that ordinal_order reserves for them; TODO confirm.
X_train, X_test = (frame.astype(str) for frame in (D, D_test))

In [None]:
def _centered_square(x):
    """Return 4 * (x - 0.5) ** 2, element-wise.

    A named function is used instead of a lambda so that a FunctionTransformer
    wrapping it can be pickled with the standard pickle module.
    """
    return 4 * (x - 0.5) ** 2


def make_cat_trans(onehot_cols, ordinal_order, target_encs):
    """Build the feature union that encodes the categorical columns.

    Up to four branches are concatenated, in this order:

    1. one-hot encoding of ``onehot_cols`` (int8, unknown categories ignored);
    2. ordinal encoding of the ``ordinal_order`` columns, min-max scaled;
    3. the same scaled ordinal encoding passed through ``4 * (x - 0.5) ** 2``;
    4. the per-column encoders given in ``target_encs``.

    Parameters
    ----------
    onehot_cols : list of str
        Columns to one-hot encode. An empty list skips this branch.
    ordinal_order : dict
        Maps a column name to the ordered list of its category values.
        An empty dict skips both ordinal branches.
    target_encs : dict
        Maps a column name to the encoder instance applied to that column alone.
        An empty dict skips this branch.

    Returns
    -------
    sklearn.pipeline.FeatureUnion
        Unfitted transformer combining all requested branches.

    Raises
    ------
    ValueError
        If all three arguments are empty, so there is nothing to combine.
    """
    encs = []

    if onehot_cols:
        # The `sparse` keyword is deliberately not passed: sparse output is the
        # default, and the keyword was renamed in newer scikit-learn releases.
        onehot_enc = OneHotEncoder(dtype="int8", handle_unknown="ignore")
        encs.append(make_column_transformer((onehot_enc, onehot_cols)))

    if ordinal_order:
        def scaled_ordinal_steps():
            # Fresh estimator objects on every call, so the two ordinal branches
            # below never share (and refit) the same instance.
            ordinal_enc = make_column_transformer(
                *[
                    # `categories` must be passed by keyword: it is keyword-only
                    # in current scikit-learn versions.
                    (OrdinalEncoder(categories=[vals], dtype="float32"), [col])
                    for col, vals in ordinal_order.items()
                ]
            )
            return [ordinal_enc, MinMaxScaler(copy=False)]

        encs.append(make_pipeline(*scaled_ordinal_steps()))
        encs.append(
            make_pipeline(
                *scaled_ordinal_steps(),
                FunctionTransformer(_centered_square, validate=False),
            )
        )

    if target_encs:
        encs.append(make_column_transformer(*[(enc, [col]) for col, enc in target_encs.items()]))

    if not encs:
        # Fail here with a clear message instead of an unpacking error raised
        # from inside FeatureUnion.
        raise ValueError(
            "make_cat_trans needs at least one of onehot_cols, ordinal_order "
            "or target_encs to be non-empty"
        )

    trans = make_union(*encs)

    return trans

# Every column that has no explicit ordering is one-hot encoded.
onehot_cols = X_train.columns.difference(list(ordinal_order)).tolist()

# NOTE(review): nom_6 is not a key of ordinal_order, so (if it is a column of
# X_train) it is one-hot encoded here in addition to being handled by
# target_encs — confirm this is intended.
trans = make_cat_trans(
    onehot_cols=onehot_cols,
    ordinal_order=ordinal_order,
    target_encs=target_encs,
)

In [None]:
%%time
# Chain the feature transformer with the classifier and fit on the training data.
clf = LogisticRegression(C=0.05, solver="lbfgs", max_iter=5000)
model = make_pipeline(trans, clf).fit(X_train, y_train)

# Keep the second column of predict_proba as the score of each test row.
pred = model.predict_proba(X_test)[:, 1]

# Write one (id, target) row per test example, without the frame's own index.
submission = pd.DataFrame({"id": test_ids, "target": pred})
submission.to_csv("submission.csv", index=False)