In [None]:
import pandas as pd
import numpy as np
import random
import time
import os

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import gc

from tqdm import tqdm

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import StackingRegressor, StackingClassifier

import optuna
from optuna.samplers import TPESampler

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------
SEED = 2021            # seed shared by the RNGs, the KFold splitter and the model
TARGET = "claim"       # column name written to the submission file
N_SPLITS = 4           # number of cross-validation folds
N_ESTIMATORS = 10000   # NOTE(review): not referenced by any other visible cell
LOSS = 'CrossEntropy'  # passed to the model as `loss_function`
EVAL_METRIC = "AUC"    # passed to the model as `eval_metric`


def seed_everything(seed=2021):
    """Seed every global source of randomness used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

In [None]:
# Read the train and test CSV files; the first column of each file becomes
# the index of the resulting frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        r"../input/tabular-playground-series-sep-2021/train.csv",
        r"../input/tabular-playground-series-sep-2021/test.csv",
    )
)

# Every training column except the last one (the last one is used as the
# target in the next cell).
features = df_train.columns[:-1]

In [None]:
# Split the training frame into features (all columns but the last) and the
# target (last column).
#
# Explicit copies are taken because later cells add columns to `X` and
# `X_test`: without them `X` is a slice of `df_train` (pandas emits
# SettingWithCopyWarning on assignment) and `X_test` is just another name for
# `df_test`, so the raw test frame would be mutated and re-running this cell
# would not restore the original data.
X = df_train.iloc[:, :-1].copy()
y = df_train.iloc[:, -1]
X_test = df_test.copy()

In [None]:
# Basic preprocessing: row-wise summary statistics of the raw features.
def add_row_stats(frame, columns):
    """Add row-wise summary columns to ``frame`` in place.

    The statistics are computed over ``columns`` only, so each one is
    independent of the others. Computing them directly on the growing frame
    would make ``std`` include the freshly added ``mean`` column, ``min``
    include ``mean`` and ``std``, and so on.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; receives the columns ``mean``, ``std``, ``min``,
        ``max`` and ``n_na``.
    columns : list
        Names of the columns the statistics are computed from.
    """
    base = frame[columns]  # list selection -> independent snapshot of the inputs
    frame["mean"] = base.mean(axis=1)
    frame["std"] = base.std(axis=1)
    frame["min"] = base.min(axis=1)
    frame["max"] = base.max(axis=1)
    frame["n_na"] = base.isna().sum(axis=1)


# `features` holds the raw feature names taken from the training frame, so the
# same inputs are used for train and test and the cell can be re-run safely.
add_row_stats(X, list(features))
add_row_stats(X_test, list(features))

In [None]:
# Derive a 0/1 flag per selected feature: 1 when the value is strictly above
# the *training* median of that feature, 0 otherwise. The same training
# median is used as threshold for the test frame.
categorical_features = ['f40', 'f47', 'f65', 'f70']
for cf in categorical_features:
    flag_col = cf + '_cat'
    threshold = X[cf].median()
    X[flag_col] = (X[cf] > threshold).astype("int")
    X_test[flag_col] = (X_test[cf] > threshold).astype("int")

In [None]:
# Display the raw f40 values of the training rows whose f40_cat flag is 0.
name, cat_name = 'f40', "f40_cat"
X.loc[X[cat_name].eq(0), name]

In [None]:
%%time
def add_group_mean_features(train, test, cat_feature, feature_cols):
    """Append per-group feature means to both the train and the test frame.

    Rows are grouped by the binary flag column ``cat_feature + '_cat'``. For
    every column in ``feature_cols`` the mean over the *training* rows of each
    group is computed, and each row (train and test alike) receives the mean
    of the group it belongs to in a new column named
    ``<feature>_<cat_feature>mean``.

    Both frames get the same new columns in the same order, so a model fitted
    on the returned train frame can predict on the returned test frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that already contain the flag column and ``feature_cols``.
    cat_feature : str
        Base name of the flag column (without the ``_cat`` suffix).
    feature_cols : iterable of str
        Columns whose group means are added.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames; the inputs are not modified.
    """
    cat_col = cat_feature + '_cat'
    feature_cols = list(feature_cols)
    new_cols = [f + '_' + cat_feature + 'mean' for f in feature_cols]

    # One row per flag value, one column per feature; train statistics only.
    group_means = train.groupby(cat_col)[feature_cols].mean()

    def expand(frame):
        # Look up the group row for every sample. A flag value that never
        # occurs in the training data yields NaN instead of raising.
        values = group_means.reindex(frame[cat_col].to_numpy()).to_numpy()
        extra = pd.DataFrame(values, index=frame.index, columns=new_cols)
        # Drop stale copies first so re-running the cell does not create
        # duplicate column names.
        return pd.concat([frame.drop(columns=new_cols, errors="ignore"), extra], axis=1)

    return expand(train), expand(test)


# Only the first flag ('f40') is expanded, matching what the previous
# loop-with-break produced for the training frame; the test frame now
# receives the same columns.
X, X_test = add_group_mean_features(X, X_test, categorical_features[0], features)

In [None]:
def run_kfold(model, test_data=None):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y``.

    The data is split with a shuffled ``KFold`` (``N_SPLITS`` folds, seeded
    with ``SEED``). For each fold the model is fitted on the training part
    with the validation part as ``eval_set``, scored with ROC AUC on the
    validation part and, if ``test_data`` is given, used to predict it.

    Parameters
    ----------
    model
        Estimator exposing ``fit(..., eval_set=, verbose=,
        early_stopping_rounds=)``, ``predict_proba``, ``get_param`` and
        ``get_best_iteration``. Its ``learning_rate`` parameter must be set,
        since it determines the early-stopping patience.
    test_data : optional
        Data to predict with each fold's model; skipped when ``None``.

    Returns
    -------
    scores : numpy.ndarray
        ROC AUC of every fold.
    y_pred : dict
        Maps fold number to column 1 of ``predict_proba(test_data)``; empty
        when ``test_data`` is ``None``.
    """
    kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)

    # Patience scales inversely with the learning rate; it does not change
    # between folds, so compute it once.
    early_stopping_rounds = int(30 / model.get_param("learning_rate"))

    scores = []
    y_pred = dict()
    n_trees = []

    for i_fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(25*"=" + f" Fold {i_fold} " + 25*"=")
        # KFold yields *positional* indices. `y` carries the id index read
        # from the CSV, so it must be sliced with .iloc; plain y[idx] would do
        # a label lookup and only works when ids happen to be 0..n-1.
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=1000,
            early_stopping_rounds=early_stopping_rounds,
        )

        y_oof = model.predict_proba(X_val)

        fold_score = roc_auc_score(y_val, y_oof[:, 1])
        scores.append(fold_score)
        n_trees.append(model.get_best_iteration())
        print(f"*** Fold {i_fold} score :", fold_score, " ***")

        if test_data is not None:
            y_pred[i_fold] = model.predict_proba(test_data)[:, 1]

    scores = np.array(scores)
    n_trees = int(np.median(n_trees))  # reported only, not returned

    print('N trees : ', n_trees)
    print('CV auc scores: ', scores.mean(), " +/- ", scores.std())
    return scores, y_pred

In [None]:
# Hyper-parameters forwarded to the CatBoost constructor.
cat_param = dict(
    learning_rate=0.06,
    iterations=10000,
    depth=3,
    l2_leaf_reg=12.09463399692516,
    random_strength=3.5400249636744014,
)

# Fixed settings first, tunable ones unpacked from `cat_param`.
model = CatBoostClassifier(
    **cat_param,
    loss_function=LOSS,
    eval_metric=EVAL_METRIC,
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    bootstrap_type='Bernoulli',
    task_type='GPU',
    random_seed=SEED,
    silent=True,
)

# Cross-validate and collect one test prediction vector per fold.
scores, y_pred = run_kfold(model, X_test)

## save submission
# Average the fold predictions element-wise and write them, indexed like the
# test frame, under the target column name.
fold_predictions = np.vstack(list(y_pred.values()))
submission = pd.DataFrame(fold_predictions.mean(axis=0), index=X_test.index, columns=[TARGET])
submission.to_csv("cat_submission.csv")

In [None]:
# Bar charts of the model's feature importances, `feature_split` features
# per figure, in the column order of `X`.
feature_split = 50
for start in range(0, len(X.columns), feature_split):
    chunk = slice(start, start + feature_split)
    chunk_labels = X.columns[chunk]

    plt.figure(figsize=(20, 5))
    plt.bar(
        range(len(chunk_labels)),
        model.feature_importances_[chunk],
        tick_label=chunk_labels,
    )
    # NOTE(review): the y-axis is capped at 1, so any bar above 1 is clipped —
    # confirm this matches the scale of `feature_importances_`.
    plt.ylim([0, 1])

In [None]:
# Table of the 20 features with the highest importance, largest first.
(
    pd.DataFrame(
        model.feature_importances_,
        index=X.columns,
        columns=["features_importance"],
    )
    .sort_values(by="features_importance", ascending=False)
    .head(20)
)

In [None]:
# Display the column index of the training feature frame as it stands after
# the feature-engineering cells above.
X.columns

In [None]:
# Histogram (50 bins) of the f47 column of the training features.
X["f47"].plot(kind="hist", bins=50)