In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    if names:
        print("\n".join(os.path.join(folder, name) for name in names))

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

# Show every column when a DataFrame is displayed (no truncation limit).
pd.set_option("display.max_columns", None)

In [None]:
# Load the competition's test set (used for the final submission) and its
# training set from the Kaggle input directory.
submission_test = pd.read_csv("/kaggle/input/cat-in-the-dat-ii/test.csv")
train = pd.read_csv("/kaggle/input/cat-in-the-dat-ii/train.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Preview the first rows of the submission (test) data.
submission_test.head()

In [None]:
# Summary statistics of the submission (test) data.
submission_test.describe()

In [None]:
# For each training column, print its name followed by its distinct values.
for col in train.columns:
    print(col, train[col].unique(), sep="\n")

In [None]:
# Shared encoder instance; one_hot_encoding() re-fits it on every call.
encoder = OneHotEncoder()
def change_column_order(df, col_number, number_of_columns):
    """Move the last ``number_of_columns`` columns of ``df`` to position ``col_number``.

    Args:
        df: DataFrame whose trailing columns should be relocated.
        col_number: Positional index at which the trailing columns are inserted.
        number_of_columns: How many columns, counted from the end, to move.

    Returns:
        A new DataFrame selection with the reordered columns; ``df`` itself is
        not modified.
    """
    cols = df.columns.tolist()
    if number_of_columns == 0:
        # ``cols[-0:]`` is the whole list, so the slicing below would duplicate
        # every column. With nothing to move, keep the existing order.
        return df[cols]
    cols = cols[:col_number] + cols[-number_of_columns:] + cols[col_number:-number_of_columns]
    return df[cols]

def one_hot_encoding(df, col_name):
    """Replace ``col_name`` with one indicator column per observed category.

    Missing values are encoded as their own category (the string ``"NaN"``)
    and that category's indicator column is then dropped, so a row with a
    missing value ends up with zeros in all remaining indicator columns.
    The indicator columns are named ``f"{col_name}_{category}"`` and are placed
    where ``col_name`` used to be.

    Uses the module-level ``encoder``, which is re-fitted on every call.

    Args:
        df: Input DataFrame. It is not modified.
        col_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame with ``col_name`` replaced by its indicator columns.
    """
    # Work on a filled copy of the column so the caller's frame is left untouched.
    filled = df[col_name].fillna("NaN")
    encoded = encoder.fit_transform(filled.to_frame()).toarray()
    # Name the columns from the categories the encoder actually learned, and
    # reuse df's index so rows line up even when df has no default RangeIndex.
    categories = list(encoder.categories_[0])
    enc_df = pd.DataFrame(
        encoded,
        index=df.index,
        columns=[f"{col_name}_{category}" for category in categories],
    )
    out = pd.concat([df, enc_df], axis=1)
    out = change_column_order(out, out.columns.get_loc(col_name), len(categories))
    # The "NaN" indicator only exists when the column had missing values.
    nan_col = f"{col_name}_NaN"
    to_drop = [col_name]
    if nan_col in enc_df.columns:
        to_drop.append(nan_col)
    return out.drop(columns=to_drop)

def hexstr2int(df, col_name):
    """Convert a column of hexadecimal strings to numbers, in place.

    String entries are parsed as base-16 integers; non-string entries (such as
    missing values) pass through unchanged and are then replaced by the median
    of the parsed column.

    Args:
        df: DataFrame to update; ``df[col_name]`` is overwritten.
        col_name: Name of the column holding hexadecimal strings.

    Returns:
        The same ``df`` object that was passed in.
    """
    def _parse(value):
        if isinstance(value, str):
            return int(value, 16)
        return value

    parsed = df[col_name].map(_parse)
    df[col_name] = parsed.fillna(parsed.median())
    return df

def preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw feature columns into numeric features.

    Operates on a copy of ``df``. Per column group:

    * ``bin_0``-``bin_2``, ``ord_0``, ``day``, ``month``: missing values are
      filled with the column median.
    * ``bin_3``, ``bin_4``, ``ord_1``, ``ord_2``: labels are mapped to integer
      codes, then missing values are filled with the median.
    * ``nom_0``-``nom_4``: one-hot encoded via ``one_hot_encoding``.
    * ``nom_5``-``nom_9``: parsed as hexadecimal via ``hexstr2int``.
    * ``ord_3``, ``ord_4``: strings replaced by ``ord(value)``; ``ord_5``:
      strings replaced by the sum of their character codes; then median-filled.

    All medians are computed from the frame being processed.

    Args:
        df: Raw feature frame containing the columns listed above.

    Returns:
        A new, fully numeric DataFrame.
    """
    def impute_median(series):
        return series.fillna(series.median())

    def char_code(value):
        return ord(value) if isinstance(value, str) else value

    def char_code_total(value):
        if isinstance(value, str):
            return sum(ord(ch) for ch in value)
        return value

    label_codes = {
        "bin_3": {"F": 0, "T": 1},
        "bin_4": {"N": 0, "Y": 1},
        "ord_1": {"Novice": 0, "Contributor": 1, "Expert": 2, "Master": 3, "Grandmaster": 4},
        "ord_2": {"Freezing": 0, "Cold": 1, "Warm": 2, "Hot": 3, "Boiling Hot": 4, "Lava Hot": 5},
    }
    value_encoders = {
        "ord_3": char_code,
        "ord_4": char_code,
        "ord_5": char_code_total,
    }

    out = df.copy()

    # Binary features.
    for name in ("bin_0", "bin_1", "bin_2"):
        out[name] = impute_median(out[name])
    for name in ("bin_3", "bin_4"):
        out[name] = impute_median(out[name].map(label_codes[name]))

    # Nominal features: low-numbered ones are one-hot encoded, the rest are hex strings.
    for i in range(5):
        out = one_hot_encoding(out, f"nom_{i}")
    for i in range(5, 10):
        out = hexstr2int(out, f"nom_{i}")

    # Ordinal features.
    out["ord_0"] = impute_median(out["ord_0"])
    for name in ("ord_1", "ord_2"):
        out[name] = impute_median(out[name].map(label_codes[name]))
    for name, encode in value_encoders.items():
        out[name] = impute_median(out[name].map(encode))

    # Date features.
    for name in ("day", "month"):
        out[name] = impute_median(out[name])

    return out

In [None]:
class Normalizer:
    """Min-max scaler fitted on one frame and applied to others.

    The per-column minimum and maximum are taken from the frame given to the
    constructor. Calling the instance rescales each column of another frame
    with ``(value - min) / (max - min)`` using those stored statistics.
    """

    def __init__(self, X_train: pd.DataFrame):
        """Record the per-column min and max of ``X_train``."""
        self.columns = list(X_train.columns)
        self.mins = [X_train[col].min() for col in self.columns]
        self.maxes = [X_train[col].max() for col in self.columns]

    def __call__(self, df: pd.DataFrame):
        """Return a scaled copy of ``df``; ``df`` itself is left unchanged.

        Statistics are looked up by column name, so the column order of ``df``
        does not have to match the frame used for fitting.

        Raises:
            ValueError: If ``df`` has a column that was not present at fit time.
        """
        position = {col: idx for idx, col in enumerate(self.columns)}
        unknown = [col for col in df.columns if col not in position]
        if unknown:
            raise ValueError(f"Columns not seen when fitting the Normalizer: {unknown}")

        scaled = df.copy()
        for col in scaled.columns:
            idx = position[col]
            low = self.mins[idx]
            span = self.maxes[idx] - low
            if span == 0:
                # Constant column at fit time: only shift it, avoiding 0/0 -> NaN.
                span = 1
            scaled[col] = (df[col] - low) / span
        return scaled

In [None]:
# Model features: every training column apart from the label and the row id.
cols = list(train.columns)
cols.remove("target")
cols.remove("id")

# Run the same preprocessing over the training data and the submission data.
preprocessed = preprocessing(train.loc[:, cols])
preprocessed_submission = preprocessing(submission_test.loc[:, cols])

# Hold out 20% of the training rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed,
    train["target"],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Take the min/max statistics from the training split only, then scale every
# frame (train split, hold-out split, full training set, submission set) with them.
normalizer = Normalizer(X_train)
X_train = normalizer(X_train)
X_test = normalizer(X_test)
preprocessed = normalizer(preprocessed)
preprocessed_submission = normalizer(preprocessed_submission)

In [None]:
# Summary statistics of the scaled training features.
X_train.describe()

In [None]:
# Summary statistics of the training labels.
y_train.describe()

In [None]:
# Summary statistics of the scaled submission features.
preprocessed_submission.describe()

In [None]:
def logistic_regression_grid_search(solver: tuple, penalties: tuple) -> GridSearchCV:
    """Grid-search ``LogisticRegression`` on the global ``X_train`` / ``y_train``.

    The grid covers the given solvers and penalties crossed with fixed ``tol``
    and ``C`` candidates. When ``"elasticnet"`` is among the penalties it is
    searched in a separate sub-grid that also supplies ``l1_ratio``, because an
    elastic-net fit without ``l1_ratio`` fails.

    Candidates are ranked by ROC AUC computed from the model's continuous
    scores (the ``"roc_auc"`` scorer) rather than from hard 0/1 predictions.

    Args:
        solver: Solver names to try.
        penalties: Penalty names to try.

    Returns:
        The fitted ``GridSearchCV`` object.

    Raises:
        ValueError: If ``penalties`` is empty.
    """
    if not penalties:
        raise ValueError("penalties must contain at least one penalty name")

    tol = (0.01, 0.001, 0.0001)
    C = (0.1, 0.5, 1.0, 1.5, 10.0)
    shared = {"tol": tol, "C": C, "solver": solver, "random_state": (42,), "dual": (False, )}

    param_grid = []
    plain_penalties = tuple(p for p in penalties if p != "elasticnet")
    if plain_penalties:
        param_grid.append({**shared, "penalty": plain_penalties})
    if "elasticnet" in penalties:
        # l1_ratio is only meaningful (and required) for the elastic-net penalty.
        param_grid.append({**shared, "penalty": ("elasticnet",), "l1_ratio": (0.25, 0.5, 0.75)})

    search = GridSearchCV(
        LogisticRegression(),
        param_grid=param_grid,
        verbose=5,
        return_train_score=True,
        scoring="roc_auc",
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search

In [None]:
# Grid search for the newton-cg solver with the "l2" and "none" penalties.
newton = logistic_regression_grid_search(("newton-cg",), ("l2", "none"))

In [None]:
# Grid search for the lbfgs solver with the "l2" and "none" penalties.
lbfgs = logistic_regression_grid_search(("lbfgs",), ("l2", "none"))

In [None]:
# Grid search for the liblinear solver with the "l1" penalty.
liblinear = logistic_regression_grid_search(("liblinear",), ("l1",))

In [None]:
# Grid search for the sag solver with the "l2" and "none" penalties.
sag = logistic_regression_grid_search(("sag",), ("l2", "none"))

In [None]:
# Grid search for the saga solver with the "l1", "l2", "elasticnet" and "none" penalties.
saga = logistic_regression_grid_search(("saga",), ("l1", "l2", "elasticnet", "none"))

In [None]:
# Best cross-validated score and parameters of each logistic-regression search,
# one search per paragraph, separated by a blank line.
print(
    "\n\n".join(
        f"{search.best_score_}\n{search.best_params_}"
        for search in (newton, lbfgs, liblinear, sag, saga)
    )
)

In [None]:
def linear_svc_grid_search():
    """Grid-search ``LinearSVC`` on the global ``X_train`` / ``y_train``.

    Only penalty/loss/dual combinations that ``LinearSVC`` supports are
    searched: the squared hinge loss with either penalty in the primal
    (``dual=False``), and the plain hinge loss with the l2 penalty in the dual
    (``dual=True``). Unsupported combinations would only produce failed fits.

    Candidates are ranked by ROC AUC computed from the model's decision scores
    (the ``"roc_auc"`` scorer) rather than from hard 0/1 predictions.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    tol = (0.1, 0.01, 0.001, 0.0001)
    C = (0.1, 0.5, 1.0, 1.5, 10.0)
    # Fixed seed so repeated searches give the same result.
    shared = {"tol": tol, "C": C, "random_state": (42,)}
    param_grid = [
        {**shared, "penalty": ("l1", "l2"), "loss": ("squared_hinge",), "dual": (False, )},
        {**shared, "penalty": ("l2",), "loss": ("hinge",), "dual": (True, )},
    ]
    search = GridSearchCV(
        LinearSVC(),
        param_grid=param_grid,
        verbose=5,
        return_train_score=True,
        scoring="roc_auc",
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search


In [None]:
# Run the LinearSVC grid search on the training split.
svc_search = linear_svc_grid_search()

In [None]:
# Best cross-validated score and parameters of the LinearSVC search.
print(svc_search.best_score_, svc_search.best_params_, sep="\n")

In [None]:
def find_best_threshold(model, X_test, y_test) -> tuple:
    """Find the probability cut-off that maximises F1 on the given data.

    Thresholds 0.00, 0.01, ..., 1.00 are applied to the second column of
    ``model.predict_proba(X_test)``. On ties the larger threshold wins.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Features to score.
        y_test: True labels for ``X_test``.

    Returns:
        ``(threshold, f1)`` for the best threshold.
    """
    positive_proba = model.predict_proba(X_test)[:, 1]
    best_threshold, best_score = 0, 0
    for step in range(101):
        candidate = step / 100
        labels = (positive_proba >= candidate).astype(int)
        candidate_score = f1_score(y_test, labels)
        # ">=" so that later (higher) thresholds replace earlier ones on ties.
        if candidate_score >= best_score:
            best_threshold, best_score = candidate, candidate_score
    return (best_threshold, best_score)

In [None]:
def print_sorted_coefs(model):
    """Print ``(column, coefficient)`` pairs, largest absolute coefficient first.

    Coefficients come from ``model.coef_[0]`` and are paired positionally with
    the columns of the global ``X_train``.
    """
    weights = model.coef_[0]
    pairs = [(name, weights[pos]) for pos, name in enumerate(X_train.columns)]
    pairs.sort(key=lambda pair: abs(pair[1]), reverse=True)
    for pair in pairs:
        print(pair)

In [None]:
# Keep the logistic-regression search with the highest best score
# (the earliest one listed wins a tie) and show its parameters.
best_lr = max((newton, lbfgs, liblinear, sag, saga), key=lambda search: search.best_score_)
print(best_lr.best_params_)

In [None]:
# Fit a logistic regression with the winning parameters on the training split
# and list its coefficients by magnitude.
model = LogisticRegression(**best_lr.best_params_).fit(X_train, y_train)
print_sorted_coefs(model)

In [None]:
# Hold-out evaluation of the logistic regression: ROC curve, F1 of the model's
# own predictions, ROC AUC from the positive-class probabilities, and the
# probability threshold that maximises F1 on the hold-out split.
plot_roc_curve(model, X_test, y_test)
print(f"f1 score: {f1_score(y_test, model.predict(X_test))}")
print(f"roc auc:  {roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])}")
# `best` is (threshold, f1); best[0] is reused later to binarise the submission.
best = find_best_threshold(model, X_test, y_test)
print(f"Best f1 score: {best[1]}; for threshold: {best[0]}")

In [None]:
# Fit a LinearSVC with the winning parameters on the training split and list
# its coefficients by magnitude.
model_svc = LinearSVC(**svc_search.best_params_).fit(X_train, y_train)
print_sorted_coefs(model_svc)

In [None]:
# Hold-out evaluation of the LinearSVC: ROC curve, F1 of its predictions, and ROC AUC.
plot_roc_curve(model_svc, X_test, y_test)
print(f"f1 score: {f1_score(y_test, model_svc.predict(X_test))}")
# ROC AUC needs a continuous ranking score. Hard 0/1 labels from predict()
# reduce the curve to a single operating point, so use the signed distance
# to the separating hyperplane instead.
print(f"roc auc:  {roc_auc_score(y_test, model_svc.decision_function(X_test))}")

In [None]:
# Refit the logistic regression with the winning parameters on all training
# rows and list its coefficients by magnitude.
model = LogisticRegression(**best_lr.best_params_).fit(preprocessed, train["target"])
print_sorted_coefs(model)

In [None]:
# Label a submission row 1 when its positive-class probability reaches the
# threshold stored in best[0], otherwise 0.
predicted = (model.predict_proba(preprocessed_submission)[:, 1] >= best[0]).astype(int)

In [None]:
# Pair each submission id with its predicted label, write the CSV without the
# index column, and display the frame.
submission = pd.DataFrame(
    {
        "id": submission_test["id"].to_numpy(),
        "target": predicted,
    }
)
submission.to_csv("submission.csv", index=False)
submission

In [None]:
# Summary statistics of the submission frame.
submission.describe()