In [None]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics
import seaborn as sns
from functools import partial
from catboost import CatBoostClassifier
import lightgbm as lgbm
from tqdm import tqdm
import datatable as dt

In [None]:
%%time
# Load the competition CSVs through datatable's reader and hand them over to
# pandas. The 'id' column is dropped from train and test; the sample
# submission is kept whole because it is later filled in and written back out.
train = (
    dt.fread('../input/tabular-playground-series-sep-2021/train.csv')
    .to_pandas()
    .drop(columns='id')
)
test = (
    dt.fread('../input/tabular-playground-series-sep-2021/test.csv')
    .to_pandas()
    .drop(columns='id')
)
sample = dt.fread('../input/tabular-playground-series-sep-2021/sample_solution.csv').to_pandas()

In [None]:
# Quick sanity check of the dimensions of both loaded frames.
print(
    "Train set shape", train.shape,
    "\n",
    "Test set shape", test.shape,
)

In [None]:
# Feature columns are the ones whose name begins with the letter "f".
features = [col for col in train.columns.values if col[0] == "f"]

# Row-wise count of missing feature values, stored as an extra column on both
# frames (the new column is not part of `features`, so re-running is harmless).
for frame in (train, test):
    frame['n_missing'] = frame[features].isna().sum(axis=1)

In [None]:
# Disabled EDA (kept as a bare string so the cell does not execute): overlays
# per-feature histograms of train and test (test drawn in red) on a 30x4
# subplot grid.
# NOTE(review): the loop runs len(train.columns)-1 times, but by this point
# train holds 'n_missing' in addition to 'claim', so f'f{i+1}' may reach a
# column name that does not exist -- confirm the bound before re-enabling.
"""plt.figure(figsize=(24, 6*(118/4)))
for i in tqdm(range(len(train.columns.tolist())-1)):
    plt.subplot(30, 4, i+1)
    sns.histplot(train[f'f{i+1}'], kde=True)
    sns.histplot(test[f'f{i+1}'], kde=True, color='red')
plt.show()"""

In [None]:
# Disabled EDA (bare string, not executed): computes the pairwise correlation
# matrix of all train columns and draws it as a 30x30-inch heatmap.
"""corr = train.corr()
plt.figure(figsize=(30, 30))
sns.heatmap(corr)
plt.show()"""

In [None]:
# Disabled EDA (bare string, not executed): for every train column, prints
# entries 1..10 of its correlation column sorted in descending order; the
# [1:11] slice skips the first (largest) entry, presumably the column's
# correlation with itself.
# Relies on `corr`, which is only defined inside the disabled heatmap cell, so
# both cells have to be re-enabled together.
"""cols = train.columns.tolist()
for col in cols:
    print(col)
    print(corr[col].sort_values(ascending=False)[1:11])
    print('=======================')"""

In [None]:
# Model inputs as plain NumPy arrays: every column except the target goes into
# X, and the 'claim' target is cast to integer class labels.
X = train.drop(columns='claim').values
y = train['claim'].values.astype(int)

In [None]:
# Disabled EDA (bare string, not executed): bar chart of how often each value
# of the target array `y` occurs.
"""plt.figure(figsize=(12, 7))
sns.countplot(y)
plt.show()"""

In [None]:
# Disabled Optuna objective for CatBoost (kept as a bare string, so the
# function is NOT defined when the notebook runs).
#
# optimize_CB(trial, x, y, depth, grow_policy, random_strength, l2_leaf_reg)
# trains a CatBoostClassifier on each split of a 5-fold stratified CV and
# returns the mean validation ROC AUC. Each hyperparameter argument is
# tri-state: None -> sampled from the trial, 'default' -> a fixed fallback
# value, anything else -> used as given.
#
# NOTE(review) before re-enabling:
#   * the bare `except: pass` around the scoring hides any metric failure; a
#     fold that fails is silently left out of the mean;
#   * the `== None` comparisons would be more idiomatic as `is None`;
#   * scores_train is created but never filled (the train-scoring lines are
#     commented out inside the string).
"""def optimize_CB(trial, x, y, 
                  depth=None, grow_policy=None, random_strength=None, l2_leaf_reg=None):
    
    # if you want to optimize the parameter, don't pass anything
    # if you want to use the default value, pass 'default'
    # if you want to specify a value, pass that value
    
    if depth == None:
        depth = trial.suggest_int("depth", 3, 12)
    elif depth == 'default':
        depth = 6
    else:
        depth = depth
        
    if grow_policy == None:
        grow_policy = trial.suggest_categorical("grow_policy", ["Depthwise","SymmetricTree","Lossguide"])
    elif grow_policy == 'default':
        grow_policy = "SymmetricTree"
    else:
        grow_policy = grow_policy
        
    if random_strength == None:
        random_strength = trial.suggest_float("random_strength", 0.0, 3.0)
    elif random_strength == 'default':
        random_strength = 1.0
    else:
        random_strength = random_strength
       
    if l2_leaf_reg == None:
        l2_leaf_reg = trial.suggest_loguniform("l2_leaf_reg", 1e-6, 1e2)
    elif l2_leaf_reg == 'default':
        l2_leaf_reg = 3.0
    else:
        l2_leaf_reg = l2_leaf_reg
    
    params = {
        "depth": depth,
        "grow_policy": grow_policy,
        "random_strength": random_strength,
        "l2_leaf_reg": l2_leaf_reg,
        }
            
    skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores_train = []
    scores_valid = []
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(x, y)):
        
        print(f"Fold {fold+1} -------------->")
        x_train, y_train = x[train_idx], y[train_idx]
        x_valid, y_valid = x[valid_idx], y[valid_idx]
       
        #params["cat_features"] = cat_feats

        model = CatBoostClassifier(
                                   **params,
                                   learning_rate=0.03,
                                   iterations=10000,
                                   loss_function='CrossEntropy',
                                   eval_metric='AUC',
                                   use_best_model=True,
                                   early_stopping_rounds=100,
                                   task_type='GPU',
                                   )

        
        model.fit(x_train, y=y_train,
              #embedding_features=None,
              use_best_model=True,
              eval_set=[(x_valid, y_valid)],
              verbose=100)

        #preds_train = model.predict_proba(x_train)[:, 1]
        preds_valid = model.predict_proba(x_valid)[:, 1]
        
        try:
            #score_train = metrics.roc_auc_score(y_train, preds_train)
            score_valid = metrics.roc_auc_score(y_valid, preds_valid)
            print(score_valid)
            #scores_train.append(score_train)
            scores_valid.append(score_valid)
        except:
            pass
    
    #print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
    print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))
    
    cv_score = np.mean(scores_valid)
    
    return cv_score"""

In [None]:
# Disabled Optuna run #1 (bare string, not executed): a single trial with every
# hyperparameter pinned to 'default', i.e. a baseline CV score.
# NOTE(review): direction='minimize' here, although optimize_CB returns a mean
# validation AUC and the other study cells use 'maximize'. With n_trials=1 the
# direction cannot change the outcome, but it must be fixed before raising
# n_trials.
# NOTE(review): the 'rmse' key and the dict_rmse name hold study.best_value,
# which is the AUC-based objective, not an RMSE.
"""import warnings
warnings.filterwarnings('ignore')

optimization_function = partial(optimize_CB, x=X, y=y,
                               depth='default', 
                               grow_policy='default',
                               random_strength='default',
                               l2_leaf_reg='default'
                               )
study = optuna.create_study(direction='minimize')
study.optimize(optimization_function, n_trials=1)

dict_rmse = dict()
dict_2 = study.best_params
dict_2['rmse'] = study.best_value
dict_rmse['params'] = dict_2
dict_rmse['Number of finished trials'] = len(study.trials)

print(dict_rmse)"""

In [None]:
# Disabled Optuna run #2 (bare string, not executed): depth fixed at 3,
# random_strength and l2_leaf_reg pinned to 'default'; grow_policy is left
# unset, so it is the only parameter sampled across the 10 trials.
# NOTE(review): the 'rmse' key and the dict_rmse name hold study.best_value,
# which is the AUC-based objective, not an RMSE.
"""import warnings
warnings.filterwarnings('ignore')

optimization_function = partial(optimize_CB, x=X, y=y,
                               depth=3, 
                               #grow_policy='default',
                               random_strength='default',
                               l2_leaf_reg='default'
                               )
study = optuna.create_study(direction='maximize')
study.optimize(optimization_function, n_trials=10)

dict_rmse = dict()
dict_2 = study.best_params
dict_2['rmse'] = study.best_value
dict_rmse['params'] = dict_2
dict_rmse['Number of finished trials'] = len(study.trials)

print(dict_rmse)"""

In [None]:
# Disabled Optuna run #3 (bare string, not executed): a single trial with
# depth=2 and grow_policy='Lossguide', the remaining parameters pinned to
# 'default' -- effectively one fixed-configuration CV evaluation.
# NOTE(review): the 'rmse' key and the dict_rmse name hold study.best_value,
# which is the AUC-based objective, not an RMSE.
"""import warnings
warnings.filterwarnings('ignore')

optimization_function = partial(optimize_CB, x=X, y=y,
                               depth=2, 
                               grow_policy='Lossguide',
                               random_strength='default',
                               l2_leaf_reg='default'
                               )
study = optuna.create_study(direction='maximize')
study.optimize(optimization_function, n_trials=1)

dict_rmse = dict()
dict_2 = study.best_params
dict_2['rmse'] = study.best_value
dict_rmse['params'] = dict_2
dict_rmse['Number of finished trials'] = len(study.trials)

print(dict_rmse)"""

In [None]:
# Disabled Optuna run #4 (bare string, not executed): depth=11 and
# grow_policy='Lossguide' are fixed; random_strength and l2_leaf_reg are left
# unset, so they are the parameters sampled across the 20 trials.
# NOTE(review): the 'rmse' key and the dict_rmse name hold study.best_value,
# which is the AUC-based objective, not an RMSE.
"""import warnings
warnings.filterwarnings('ignore')

optimization_function = partial(optimize_CB, x=X, y=y,
                               depth=11, 
                               grow_policy='Lossguide',
                               #random_strength='default',
                               #l2_leaf_reg='default'
                               )
study = optuna.create_study(direction='maximize')
study.optimize(optimization_function, n_trials=20)

dict_rmse = dict()
dict_2 = study.best_params
dict_2['rmse'] = study.best_value
dict_rmse['params'] = dict_2
dict_rmse['Number of finished trials'] = len(study.trials)

print(dict_rmse)"""

In [None]:
# Disabled 5-fold stratified CatBoost run with tuned parameters (bare string,
# not executed). When enabled it collects out-of-fold predictions, averages
# the per-fold test predictions, prints train/validation AUC statistics and
# writes catboost_valid_2.csv and catboost_test_2.csv.
# NOTE(review): the l2_leaf_reg / random_strength values presumably come from
# the depth=11 Lossguide Optuna study -- confirm before relying on them.
# NOTE(review): y_train_log / y_valid_log are assigned but never used.
"""# Using optimized parameters

params = {
        "depth": 11,
        "grow_policy": "Lossguide",
        "l2_leaf_reg": 4.1817146074043645,
        "random_strength": 2.096148200857938
        }


# KFold
n_splits=5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
scores_train = []
scores_valid = []
preds_valid_array = np.zeros((X.shape[0], ))
preds_test_array = np.zeros((test.shape[0], ))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    y_train_log = y_train
    y_valid_log = y_valid

    model = CatBoostClassifier(
                           **params,
                           learning_rate=0.03,
                           iterations=10000,
                           loss_function='CrossEntropy',
                           eval_metric='AUC',
                           use_best_model=True,
                           early_stopping_rounds=100,
                           task_type='GPU'
                           )

        
    model.fit(
          x_train, y=y_train,
          #embedding_features=None,
          use_best_model=True,
          eval_set=[(x_valid, y_valid)],
          verbose=100
             )



    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test)[:, 1]
    
    preds_valid_array[valid_idx] += preds_valid
    preds_test_array += preds_test / n_splits
    
    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)
        
print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

pd.DataFrame({'claim': preds_valid_array}).to_csv('catboost_valid_2.csv', index=False)
sample.iloc[:, 1] = preds_test_array
sample.to_csv('catboost_test_2.csv', index=False)"""

In [None]:
# Final CatBoost run: 5-fold stratified CV with a fixed, hand-picked
# configuration. Only random_strength and l2_leaf_reg match the 'default'
# values used by the (disabled) optimize_CB helper in this notebook; depth=3
# with Lossguide growth is a deliberate shallow-tree choice, not a default.
params = {
    "depth": 3,
    "grow_policy": "Lossguide",
    "random_strength": 1.0,
    "l2_leaf_reg": 3.0,
}

# KFold
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
scores_train = []
scores_valid = []
preds_valid_array = np.zeros((X.shape[0], ))    # out-of-fold predictions, one per train row
preds_test_array = np.zeros((test.shape[0], ))  # test predictions averaged over the folds

# The models are fitted on X (train without 'claim') and applied to `test`
# as-is, so both must expose the same number of feature columns. Fail fast
# here instead of after a long GPU training run.
assert X.shape[1] == test.shape[1], (
    f"train has {X.shape[1]} feature columns but test has {test.shape[1]}"
)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    # use_best_model is set once here; fit() inherits it from the constructor.
    model = CatBoostClassifier(
        **params,
        learning_rate=0.03,
        iterations=10000,
        loss_function='CrossEntropy',
        eval_metric='AUC',
        use_best_model=True,
        early_stopping_rounds=100,
        task_type='GPU',
    )

    # The validation fold doubles as the early-stopping eval set, so the
    # validation AUC reported below is measured on the same data that picked
    # the best iteration and is therefore slightly optimistic.
    model.fit(
        x_train, y=y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=100,
    )

    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test)[:, 1]

    # Each row belongs to exactly one validation fold, so a plain assignment
    # fills the out-of-fold array; test predictions are averaged across folds.
    preds_valid_array[valid_idx] = preds_valid
    preds_test_array += preds_test / n_splits

    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)

print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# Persist the out-of-fold predictions and the submission file; the second
# column of the sample submission is overwritten with the averaged test
# predictions.
pd.DataFrame({'claim': preds_valid_array}).to_csv('catboost_valid_3.csv', index=False)
sample.iloc[:, 1] = preds_test_array
sample.to_csv('catboost_test_3.csv', index=False)