In [None]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics
import seaborn as sns
from functools import partial
from catboost import CatBoostClassifier
from tqdm import tqdm
import datatable as dt

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    limits contain the column's min and max; float columns to the first of
    float16/float32 that does, falling back to float64. Limits are inclusive,
    so a column whose extreme equals a dtype limit (e.g. 127 for int8) still
    gets that dtype. Non-numeric columns are left untouched.

    Only the value *range* is checked: converting to float16/float32 can
    round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (min/max are NaN for an empty column) the
            # column keeps its current dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: use full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-sized starting footprint cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time
# Read each competition CSV with datatable, convert it to pandas and downcast
# its numeric columns. The `id` column is dropped from train/test but kept in
# the sample submission.
train = reduce_memory_usage(
    dt.fread('../input/tabular-playground-series-oct-2021/train.csv')
    .to_pandas()
    .drop('id', axis=1)
)
test = reduce_memory_usage(
    dt.fread('../input/tabular-playground-series-oct-2021/test.csv')
    .to_pandas()
    .drop('id', axis=1)
)
sample = reduce_memory_usage(
    dt.fread('../input/tabular-playground-series-oct-2021/sample_submission.csv')
    .to_pandas()
)

In [None]:
# Display the dtype of every training column after loading and downcasting.
train.dtypes

In [None]:
# Positional indices of the boolean-typed columns in the training frame.
bool_cols_train = [
    position
    for position, name in enumerate(train.columns)
    if train[name].dtypes == bool
]

print(bool_cols_train)

In [None]:
# Positional indices of the boolean-typed columns in the test frame.
# The dtype is read from `test` itself: the indices are later used to index
# `test`, so they must reflect the dtypes of `test`, not those of `train`.
bool_cols_test = []
for i, col in enumerate(test.columns):
    if test[col].dtypes == bool:
        bool_cols_test.append(i)

print(bool_cols_test)

In [None]:
# Convert the boolean columns to integers.
# Assign by column label so the columns are replaced outright: a positional
# `df.iloc[:, cols] = ...` first tries to write into the existing bool columns,
# and recent pandas releases warn about (or reject) the dtype change it implies.
# NOTE(review): `astype(int)` yields the platform default integer, which is
# wider than bool, so this step does not itself reduce memory.
bool_names_train = list(train.columns[bool_cols_train])
if bool_names_train:
    train[bool_names_train] = train[bool_names_train].astype(int)

bool_names_test = list(test.columns[bool_cols_test])
if bool_names_test:
    test[bool_names_test] = test[bool_names_test].astype(int)

In [None]:
# Report the shape of both frames.
print("Train set shape", train.shape, "\n", "Test set shape", test.shape)

In [None]:
# Split the training frame into a feature matrix and a label vector (NumPy arrays).
y = train['target'].to_numpy()
X = train.drop(columns='target').to_numpy()

In [None]:
# Remove the `train` name now that X and y hold the data, so the frame can be
# garbage-collected once nothing else references it.
del train

In [None]:
# NOTE: the Optuna objective below is disabled. It is wrapped in a string
# literal, so this cell only evaluates that string and `optimize_CB` is never
# defined.
# What the disabled code does: for each of depth / grow_policy /
# random_strength / l2_leaf_reg it asks the trial for a value when the argument
# is None, uses a fixed value when it is 'default', and otherwise uses the
# argument as given; it then runs 5-fold stratified CV with CatBoost and
# returns the mean validation AUC.
# NOTE(review), if re-enabled: the bare `except: pass` silently drops folds
# whose AUC cannot be computed, and `== None` should be `is None`.
"""def optimize_CB(trial, x, y, 
                  depth=None, grow_policy=None, random_strength=None, l2_leaf_reg=None):
    
    # if you want to optimize the parameter, don't pass anything
    # if you want to use the default value, pass 'default'
    # if you want to specify a value, pass that value
    
    if depth == None:
        depth = trial.suggest_int("depth", 3, 12)
    elif depth == 'default':
        depth = 6
    else:
        depth = depth
        
    if grow_policy == None:
        grow_policy = trial.suggest_categorical("grow_policy", ["Depthwise","SymmetricTree","Lossguide"])
    elif grow_policy == 'default':
        grow_policy = "SymmetricTree"
    else:
        grow_policy = grow_policy
        
    if random_strength == None:
        random_strength = trial.suggest_float("random_strength", 0.0, 3.0)
    elif random_strength == 'default':
        random_strength = 1.0
    else:
        random_strength = random_strength
       
    if l2_leaf_reg == None:
        l2_leaf_reg = trial.suggest_loguniform("l2_leaf_reg", 1e-6, 1e2)
    elif l2_leaf_reg == 'default':
        l2_leaf_reg = 3.0
    else:
        l2_leaf_reg = l2_leaf_reg
    
    params = {
        "depth": depth,
        "grow_policy": grow_policy,
        "random_strength": random_strength,
        "l2_leaf_reg": l2_leaf_reg,
        }
            
    skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores_train = []
    scores_valid = []
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(x, y)):
        
        print(f"Fold {fold+1} -------------->")
        x_train, y_train = x[train_idx], y[train_idx]
        x_valid, y_valid = x[valid_idx], y[valid_idx]
       
        #params["cat_features"] = cat_feats

        model = CatBoostClassifier(
                                   **params,
                                   learning_rate=0.03,
                                   iterations=10000,
                                   loss_function='CrossEntropy',
                                   eval_metric='AUC',
                                   use_best_model=True,
                                   early_stopping_rounds=100,
                                   task_type='GPU',
                                   )

        
        model.fit(x_train, y=y_train,
              #embedding_features=None,
              use_best_model=True,
              eval_set=[(x_valid, y_valid)],
              verbose=100)

        #preds_train = model.predict_proba(x_train)[:, 1]
        preds_valid = model.predict_proba(x_valid)[:, 1]
        
        try:
            #score_train = metrics.roc_auc_score(y_train, preds_train)
            score_valid = metrics.roc_auc_score(y_valid, preds_valid)
            print(score_valid)
            #scores_train.append(score_train)
            scores_valid.append(score_valid)
        except:
            pass
    
    #print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
    print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))
    
    cv_score = np.mean(scores_valid)
    
    return cv_score"""

In [None]:
# NOTE: this Optuna study is disabled. The whole cell is a string literal, so
# nothing here is executed.
# NOTE(review), before re-enabling:
#   * the objective (`optimize_CB`) returns the mean validation AUC, so
#     `direction='minimize'` would favour the worst trial; 'maximize' looks
#     intended. Confirm.
#   * the `rmse` key / `dict_rmse` names are misleading for an AUC score.
#   * with every parameter pinned to 'default' the trial is never asked for a
#     suggestion, so `study.best_params` would presumably be empty.
"""# I will make only one trial to optimize the the depth, you can then optimize all the other parameters by just commenting out the parameter you 
# want to optimize

import warnings
warnings.filterwarnings('ignore')

optimization_function = partial(optimize_CB, x=X, y=y,
                               depth='default', 
                               grow_policy='default',
                               random_strength='default',
                               l2_leaf_reg='default'
                               )
study = optuna.create_study(direction='minimize')
study.optimize(optimization_function, n_trials=1)

dict_rmse = dict()
dict_2 = study.best_params
dict_2['rmse'] = study.best_value
dict_rmse['params'] = dict_2
dict_rmse['Number of finished trials'] = len(study.trials)

print(dict_rmse)"""

In [None]:

# Fixed CatBoost hyper-parameters used for every fold.
params = {
    "depth": 6,
    "grow_policy": "SymmetricTree",
    "l2_leaf_reg": 3.0,
    "random_strength": 1.0,
}

# Stratified K-fold cross-validation setup.
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
scores_train = []
scores_valid = []
# Out-of-fold predictions for the training rows and fold-averaged test predictions.
preds_valid_array = np.zeros((X.shape[0], ))
preds_test_array = np.zeros((test.shape[0], ))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    # Up to 10000 boosting rounds; training stops once the validation AUC has
    # not improved for 100 rounds and the best iteration is kept.
    model = CatBoostClassifier(
        **params,
        learning_rate=0.03,
        iterations=10000,
        loss_function='CrossEntropy',
        eval_metric='AUC',
        use_best_model=True,
        early_stopping_rounds=100,
        task_type='GPU',
    )

    model.fit(
        x_train, y=y_train,
        use_best_model=True,
        eval_set=[(x_valid, y_valid)],
        verbose=100,
    )

    # Probability of the positive class for each data split.
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test)[:, 1]

    # Each training row falls in exactly one validation fold, so a plain
    # assignment fills the out-of-fold vector; test predictions are averaged.
    preds_valid_array[valid_idx] = preds_valid
    preds_test_array += preds_test / n_splits

    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)

print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# NOTE(review): the out-of-fold file uses a 'claim' column although the label
# column in this notebook is 'target'; confirm what downstream consumers expect.
pd.DataFrame({'claim': preds_valid_array}).to_csv('catboost_valid.csv', index=False)
# Replace the prediction column by label rather than `sample.iloc[:, 1] = ...`:
# the column was downcast when the sample file was loaded, and a positional
# write of float64 values into it relies on an implicit upcast that recent
# pandas releases warn about (or reject).
sample[sample.columns[1]] = preds_test_array
sample.to_csv('catboost_test.csv', index=False)