In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Raise pandas' display limits so long and wide frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and the old
# bare names were removed afterwards, so prefer the new name and fall back to the
# legacy one only on installs that do not know it yet.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory lazily and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('../input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set and discard the row identifier, which is not a feature.
train_df = (
    pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
    .drop(columns='id')
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Total number of missing cells across the whole training frame.
train_df.isna().sum().sum()

In [None]:
# Per-column summary statistics, one row per column, shaded by magnitude.
train_df.describe().transpose().style.background_gradient()

In [None]:
# Print pandas' summary of the training frame as loaded (before any downcasting).
train_df.info()

In [None]:
# Count how many columns there are of each dtype.
train_df.dtypes.value_counts()

In [None]:
# Names of the integer-typed and floating-typed columns; both lists are reused
# later to downcast the train and test frames.
int_cols = list(train_df.select_dtypes(include="integer").columns)
float_cols = list(train_df.select_dtypes(include="floating").columns)

In [None]:
# Shrink each numeric column to the smallest dtype of its kind that pandas selects.
for cols, kind in ((int_cols, 'integer'), (float_cols, 'float')):
    train_df[cols] = train_df[cols].apply(pd.to_numeric, downcast=kind)

In [None]:
# Print the summary again after downcasting, for comparison with the earlier one.
train_df.info()

In [None]:
# Count columns per dtype after downcasting.
train_df.dtypes.value_counts()

In [None]:
# Plot a histogram for every column, laid out as rows of `num_of_charts` small panels.
columns = train_df.columns.tolist()
num_of_charts = 8
divided_columns = [columns[i:i+num_of_charts] for i in range(0, len(columns), num_of_charts)]

for sub_col in divided_columns:
    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when num_of_charts is 1 (subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(1, num_of_charts, figsize=(num_of_charts * 2, 2), squeeze=False)
    axes = axes.ravel()

    for panel, name in zip(axes, sub_col):
        sns.histplot(data=train_df, x=name, ax=panel)

    # The last row usually has fewer columns than panels; hide the unused ones
    # instead of rendering empty frames.
    for panel in axes[len(sub_col):]:
        panel.set_visible(False)

    fig.tight_layout()
    plt.show()
    # Release the figure: one is created per row and they otherwise stay in memory.
    plt.close(fig)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier

In [None]:
# Separate the label column from the feature matrix.
y = train_df['target']
X = train_df.drop(columns='target')

In [None]:
# from skopt import BayesSearchCV
# from skopt.space import Categorical, Real, Integer

In [None]:
# # https://catboost.ai/en/docs/concepts/parameter-tuning

# search_spaces = dict(
#     depth= Integer(6, 14),
#     l2_leaf_reg= Integer(2, 30),
#     random_strength=Real(1e-5, 10, 'log-uniform'),
#     bagging_temperature=Real(0.0, 1.0),
#     scale_pos_weight=Real(0.01, 1.0, 'uniform')
# )

# search_cv = BayesSearchCV(estimator=CatBoostClassifier(verbose=False, iterations=100, task_type='GPU'), n_iter=20, search_spaces=search_spaces, cv=3, verbose=1)
# search_cv.fit(X,y)

In [None]:
# search_cv.best_estimator_.get_params()

In [None]:
# pd.DataFrame(search_cv.cv_results_)[['rank_test_score', 'mean_test_score', 'std_test_score', "mean_score_time", 'mean_fit_time', 'params']].style.background_gradient(axis=1)

In [None]:
# Keyword arguments passed to every CatBoostClassifier created during cross-validation.
model_hyperparams = dict(
    random_state=1,
    verbose=False,
    task_type='GPU',
    depth=12,
    bagging_temperature=0.04460451018213408,
    l2_leaf_reg=30,
    random_strength=0.012178576224275596,
    scale_pos_weight=0.9409642487395263,
)

# Number of cross-validation folds (one model is trained per fold).
n_splits = 10

In [None]:
# Stratified K-fold cross-validation: fit one CatBoost model per fold, keep every
# fitted model for later averaging, and collect out-of-fold (OOF) predictions.
estimator_list = []

# One OOF probability per training row, filled by the model of the fold in which
# that row was held out.
oof_predict = np.zeros(y.shape)

kfold = StratifiedKFold(n_splits=n_splits)

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):

    print(":: Split ::", i)

    # split() yields positional indices, so select positionally on both X and y;
    # plain y[train_index] is label-based and only matches for a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(f"{X_train.shape}, {X_test.shape}, {y_train.shape}, {y_test.shape}")

    estimator = CatBoostClassifier(**model_hyperparams)

    print("Fitting...")
    estimator.fit(X_train, y_train)
    estimator_list.append(estimator)

    print("Prediction...")
    # Probability of the positive class for the held-out rows.
    predict = estimator.predict_proba(X_test)[:, 1]

    score = roc_auc_score(y_test, predict)
    print("Score:", score)

    # Each row is held out in exactly one fold, so store its probability as-is.
    # Dividing by n_splits (as is done when averaging test predictions) would
    # shrink the values and leave them no longer interpretable as probabilities.
    oof_predict[test_index] = predict

# AUC over all OOF predictions, i.e. every training row scored by a model that
# did not see it during fitting.
final_score = roc_auc_score(y, oof_predict)

print("Final Score", final_score)

In [None]:
# Load the test set and discard the row identifier, mirroring the training frame.
test_df = (
    pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
    .drop(columns='id')
)

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Print pandas' summary of the test frame as loaded (before any downcasting).
test_df.info()

In [None]:
# The test set has no label, so reuse train's integer column list minus 'target'.
int_cols_no_target = int_cols.copy()
int_cols_no_target.remove('target')

# Apply the same per-kind downcasting that was applied to the training frame.
for cols, kind in ((int_cols_no_target, 'integer'), (float_cols, 'float')):
    test_df[cols] = test_df[cols].apply(pd.to_numeric, downcast=kind)

test_df.info()

In [None]:
# Count test columns per dtype after downcasting.
test_df.dtypes.value_counts()

In [None]:
# Average the positive-class probability over the models fitted on each fold.
fold_probabilities = [model.predict_proba(test_df)[:, 1] for model in estimator_list]
test_pred = np.mean(fold_probabilities, axis=0)

In [None]:
# Peek at the first ten averaged test predictions.
test_pred[:10]

In [None]:
# Fill the sample submission's 'target' column with the averaged predictions
# and write the result to the working directory.
submit_df = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/sample_submission.csv'
).assign(target=test_pred)
submit_df.to_csv('submission.csv', index=False)