In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Lift the pandas display limits on both columns and rows (None = no limit).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Default figure size first, then the "ggplot" style sheet.
plt.rcParams.update({"figure.figsize": (20, 10)})
plt.style.use("ggplot")

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train_csv_path = "/kaggle/input/tabular-playground-series-may-2022/train.csv"
test_csv_path = "/kaggle/input/tabular-playground-series-may-2022/test.csv"
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

In [None]:
# Columns passed to the model as categorical: f_07 .. f_18 (zero-padded to two
# digits) plus f_27, f_29 and f_30.
categorical_columns = (
    [f"f_0{i}" for i in range(7, 10)]
    + [f"f_{i}" for i in range(10, 19)]
    + ["f_27", "f_29", "f_30"]
)

# The other feature columns: f_00 .. f_06 and f_19 .. f_28, with f_27 taken
# back out because it is already listed as categorical above.
real_value_columns = [f"f_0{i}" for i in range(0, 7)]
real_value_columns.extend(f"f_{i}" for i in range(19, 29))
f27_position = real_value_columns.index("f_27")
real_value_columns.pop(f27_position)

In [None]:
# Fixed seed so both splits are reproducible across kernel restarts. Without
# random_state, train_test_split shuffles differently on every run, so the
# validation set (and any score computed on it) changes between runs.
SPLIT_SEED = 42

# Features are every column except the row id and the label.
train_features = train_df.drop(["id", "target"], axis=1)
train_labels = train_df.target

# First split: 70% of the rows for training, 30% held out.
x_train, x_rem, y_train, y_rem = train_test_split(
    train_features, train_labels, test_size=0.3, random_state=SPLIT_SEED
)

# Second split of the held-out rows: 70% validation, 30% test.
# NOTE(review): x_test / y_test are not used in the cells visible here.
x_val, x_test, y_val, y_test = train_test_split(
    x_rem, y_rem, test_size=0.3, random_state=SPLIT_SEED
)

In [None]:
# Hyper-parameters collected in one place so they are easy to scan and tweak.
catboost_params = dict(
    cat_features=categorical_columns,
    n_estimators=10000,
    learning_rate=0.080315154739037707,
    depth=3,
    l2_leaf_reg=1,
    task_type="GPU",
    verbose=1000,
)
model = CatBoostClassifier(**catboost_params)

# Fit on the training split, passing the validation split as eval_set together
# with use_best_model=True.
validation_pair = (x_val, y_val)
model.fit(x_train, y_train, eval_set=validation_pair, use_best_model=True)

In [None]:
# Fit the same model object again, this time on every row of train_df and
# without an eval_set.
# NOTE(review): presumably this is the final full-data fit used for the
# submission and is meant to replace the validated fit above -- confirm.
full_train_features = train_df.drop(columns=["id", "target"])
model.fit(full_train_features, train_df["target"])

In [None]:
# Plot the fitted model's importance score for each feature, one marker per
# feature name.
importance_labels = model.feature_names_
importance_scores = model.feature_importances_
sns.lineplot(x=importance_labels, y=importance_scores, marker="o")

In [None]:
# Score the test rows (id column removed) and keep the second probability
# column returned by predict_proba.
test_features = test_df.drop(columns=["id"])
class_probabilities = model.predict_proba(test_features)
prediction = class_probabilities[:, 1]

In [None]:
# Build the submission frame (id + predicted target) in a single step.
# The previous version took a column subset of test_df and then assigned a new
# column into it, which is the pattern pandas flags with SettingWithCopyWarning.
# assign() returns a new DataFrame, so nothing is written into a slice.
prediction_df = test_df[["id"]].assign(target=prediction)

In [None]:
# Write the submission file to the working directory, without the index column.
submission_path = "submission.csv"
prediction_df.to_csv(submission_path, index=False)