In [None]:
%%time

import os, psutil
import gc

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import itertools

from sklearn.model_selection import cross_validate,cross_val_score,train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score,make_scorer, precision_score, recall_score,f1_score, roc_curve,auc
from sklearn import ensemble,metrics,model_selection,neighbors,preprocessing, svm, tree
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
from optuna.integration import LightGBMPruningCallback,XGBoostPruningCallback

import scikitplot.metrics as skplot
import datatable as dt

from sklearn.cluster import KMeans
from pickle import *

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Helper functions

In [None]:
def cpu_usage():
    """Return a label describing the memory used by the current Python process.

    Despite the name, this reports memory: it takes the first field of
    ``psutil.Process(...).memory_info()`` for this process, divides it by 2**30
    and rounds to two decimals.

    Returns
    -------
    str
        Text of the form ``'Memory Usage : <value>'``.
    """
    this_process = psutil.Process(os.getpid())
    first_field = this_process.memory_info()[0]
    scaled = round(first_field / 2**30, 2)
    return f'Memory Usage : {scaled}'

In [None]:
# function to reduce data memory size
def reduce_memory_usage(df):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype covering their range.

    Integer columns are tried against int8, int16, int32 and int64 (in that order);
    every other numeric column is tried against float16 and float32, falling back to
    float64. Columns whose dtype is not in the numeric list (e.g. bool, object) are
    left untouched. Afterwards the frame's memory footprint before/after is printed,
    followed by the process memory label returned by ``cpu_usage()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    None
    """
    start_mem = df.memory_usage().sum()/1024**2
    numerics = ['int8', 'int16', 'int32','int64', 'float16','float32','float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    # float16 keeps only about three significant decimal digits: precision is traded for memory.
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits, fallback = int_candidates, np.iinfo, None
        else:
            candidates, limits, fallback = float_candidates, np.finfo, np.float64

        # Bounds are inclusive: a column whose extreme equals a dtype's limit still fits it.
        # A NaN min/max fails every comparison, so such a column lands on the fallback.
        new_type = next(
            (t for t in candidates if limits(t).min <= c_min and c_max <= limits(t).max),
            fallback,
        )
        if new_type is not None:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum()/1024**2
    print(f'Memory reduced from {round(start_mem,2)} -> {round(end_mem,2)}.\nReduction in memory size by {round(((start_mem - end_mem)/start_mem)*100,2)}%')
    # cpu_usage() only returns its label; print it so the figure is actually shown.
    print(cpu_usage())

In [None]:
def get_data_info(train_df=None, test_df=None):
    """Print shape, dtype counts and target balance for the train and test frames.

    Parameters
    ----------
    train_df : pandas.DataFrame, optional
        Training frame with a ``target`` column. Defaults to the notebook-level ``train``.
    test_df : pandas.DataFrame, optional
        Test frame. Defaults to the notebook-level ``test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    train_df = train if train_df is None else train_df
    test_df = test if test_df is None else test_df

    print(str.center(' Train Info ',40, '-'))
    print(f'Rows : {train_df.shape[0]}, Columns : {train_df.shape[1]}')
    dtype_counts = train_df.dtypes.value_counts()
    print([f'{dtype} : {count}' for dtype, count in dtype_counts.items()])
    # Iterate label/share pairs together: looking the label up by position and the share
    # by label can pair a class with the other class's percentage.
    for label, share in train_df['target'].value_counts(normalize = True).items():
        print(f'Target {label} : {share*100}%')

    print(str.center(' Test Info ',40, '-'))
    print(f'Rows : {test_df.shape[0]}, Columns : {test_df.shape[1]}')
    dtype_counts = test_df.dtypes.value_counts()
    print([f'{dtype} : {count}' for dtype, count in dtype_counts.items()])

## Get data and compress 

In [None]:
# Columns dropped from both train and test before modelling.
cols_to_remove = ['id']

In [None]:
%%time
# Parse both CSV files with datatable, then hand them over to pandas.
train_csv = '../input/tabular-playground-series-nov-2021/train.csv'
test_csv = '../input/tabular-playground-series-nov-2021/test.csv'
train, test = (dt.fread(csv_path).to_pandas() for csv_path in (train_csv, test_csv))

In [None]:
# Re-assign instead of dropping in place and tolerate already-removed columns,
# so re-running this cell does not raise a KeyError.
train = train.drop(columns=cols_to_remove, errors='ignore')
test = test.drop(columns=cols_to_remove, errors='ignore')

In [None]:
# Downcast numeric columns of both frames in place; each call prints its own memory report.
reduce_memory_usage(train)
reduce_memory_usage(test)

In [None]:
# Convert the label to integers, then store the column with object dtype.
# NOTE(review): presumably the object dtype keeps `target` out of dtype-based column
# selections — confirm the intent. It is cast back with .astype(int) when `y` is built.
train['target'] = train['target'].astype(int).astype(object)

In [None]:
# Column groups derived from the dtypes of `test`.
# cat_cols: boolean columns. cont_cols: only the columns that reduce_memory_usage
# downcast to float16 — a float column left as float32/float64 is not picked up here.
cat_cols = test.select_dtypes(include = bool).columns
cont_cols = test.select_dtypes(include = 'float16').columns

## Feature engineering: K-Means cluster distances on the top important features

In [None]:
def create_cluster_features(imp_features = ['f50', 'f41', 'f97', 'f91', 'f27', 'f43', 'f34', 'f8', 'f80', 'f55', 'f71'],
                           n_clusters = 12,
                           train = train,
                           test = test):
    """Append K-Means cluster-distance features computed on ``imp_features``.

    A KMeans model is fitted on the training rows only; the distance from every row to
    each cluster centre is then appended as new columns named ``f100`` … ``f{100+n_clusters-1}``.
    NOTE(review): assumes neither frame already has columns with those names — confirm.

    Parameters
    ----------
    imp_features : list of str
        Columns the clustering is computed on.
    n_clusters : int
        Number of clusters, and therefore of added columns.
    train, test : pandas.DataFrame
        Input frames. The defaults are the notebook-level frames as they were when this
        cell was executed (default arguments are bound at definition time).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with the distance columns appended; inputs are not modified.
    """
    imp_features = list(imp_features)
    cluster_cols = [f"f{i+100}" for i in range(n_clusters)]
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", max_iter=500, random_state=42)

    # Fit on train only; test rows are merely projected onto the learned centres.
    train_distances = pd.DataFrame(
        kmeans.fit_transform(train[imp_features]), columns = cluster_cols, index = train.index
    )
    test_distances = pd.DataFrame(
        kmeans.transform(test[imp_features]), columns = cluster_cols, index = test.index
    )

    # Reusing each frame's own index keeps concat from misaligning rows (and padding
    # with NaN) when a frame does not carry a default RangeIndex.
    train = pd.concat([train, train_distances], axis = 1)
    test = pd.concat([test, test_distances], axis = 1)
    return train, test

In [None]:
# Called without arguments, so the function works on the `train`/`test` frames that were
# bound as its defaults when the definition cell ran — re-run that cell after reloading data.
train, test = create_cluster_features()

## Scaling data

In [None]:
# Separate the integer label vector from the feature matrix and prepare the scaler.
y = train['target'].astype(int)
X = train.drop(columns = ['target'])
mm_scaler = MinMaxScaler()

In [None]:
# Learn the min/max ranges from the training features, then apply the same ranges to test.
scaled_train = mm_scaler.fit_transform(X)
X = pd.DataFrame(scaled_train, columns = X.columns)
scaled_test = mm_scaler.transform(test)
test = pd.DataFrame(scaled_test, columns = test.columns)

### CatBoost parameter tuning with Optuna

In [None]:
def objective(trial, X = X, y = y):
    """Optuna objective: mean ROC AUC of a CatBoost classifier over stratified 5-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix (defaults to the notebook-level ``X`` bound at definition time).
    y : array-like
        Binary labels aligned with the rows of ``X`` (defaults to the notebook-level ``y``).

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (the value the study should maximise).
    """
    n_splits = 5

    param_grid = {
        'bootstrap_type':'Poisson',
        'iterations' : trial.suggest_int('iterations', 50, 300),
        'reg_lambda': trial.suggest_loguniform("reg_lambda", 1e-8, 1.0),
        'subsample': trial.suggest_float("subsample",0.2, 1.0),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf",10, 100),
        'depth' : trial.suggest_int('depth', 4, 10),
        # Suggested exactly once: a repeated suggestion under the same name inside one trial
        # only returns the already-sampled value, so a second range would never be used.
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
    }

    cv = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 42)
    cv_scores = np.empty(n_splits)
    # Positional view of the labels: cv.split yields positions, not index labels.
    y_values = np.asarray(y)

    for idx, (train_idx, valid_idx) in enumerate(cv.split(X, y_values)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            task_type="GPU",
            random_seed=42,
            verbose= 0,
            border_count=64,
            **param_grid)

        # NOTE(review): early_stopping_rounds (400) is larger than the maximum sampled
        # `iterations` (300), so it looks like early stopping can never trigger — confirm.
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],early_stopping_rounds=400,verbose=False)

        # ROC AUC is a ranking metric: score the positive-class probabilities, not the
        # thresholded 0/1 labels from predict().
        valid_proba = model.predict_proba(X_valid)[:, 1]
        cv_scores[idx] = roc_auc_score(y_valid, valid_proba)

    return np.mean(cv_scores)

In [None]:
# %%time
# from optuna.samplers import TPESampler
# import multiprocessing
# study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed= 42))
# study.optimize(objective, n_trials = 100)

In [None]:
# import pickle
# pickle.dump(study.best_trial.params, open('CatBoost_Hyperparameter.pickle', 'wb'))
# print('CatBoost Hyperparameter:', study.best_trial.params)

In [None]:
# chosen from optuna 100 trials
# Tuned CatBoost hyper-parameters, passed as keyword arguments to the final model.
cb_best_params = dict(
    iterations=296,
    reg_lambda=0.053207743382150924,
    subsample=0.9427230636303613,
    min_data_in_leaf=26,
    depth=4,
    learning_rate=0.23100537129565457,
    random_strength=25,
    bagging_temperature=15.108913990665142,
    od_type='Iter',
    bootstrap_type='Poisson',
)

In [None]:
# Final classifier: fixed training settings combined with the tuned hyper-parameters.
cb_fixed_params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="GPU",
    random_seed=42,
    border_count=64,
)
cb_model = CatBoostClassifier(**cb_fixed_params, **cb_best_params)

In [None]:
# Stratified K-fold CV: collect out-of-fold probabilities and the ROC AUC of every fold.
cv = StratifiedKFold(shuffle= True, random_state= 42, n_splits = 5)
roc_score_all = np.empty(cv.n_splits)  # sized from the splitter so the two cannot drift apart
cat_oof = np.zeros(X.shape[0])
for idx, (train_idx, valid_idx) in enumerate(cv.split(X,y)):
    X_train, X_test = X.iloc[train_idx],X.iloc[valid_idx]
    y_train, y_test = y.iloc[train_idx],y.iloc[valid_idx]

    # The same estimator object is re-fitted on every fold.
    cb_model.fit(X_train, y_train,eval_set=[(X_test, y_test)],
              early_stopping_rounds=400,
              verbose=False )
    y_pred_proba = cb_model.predict_proba(X_test)[:,1]
    cat_oof[valid_idx] = y_pred_proba
    # ROC AUC is a ranking metric: score the probabilities, not thresholded 0/1 labels.
    roc_score = roc_auc_score(y_test,y_pred_proba)
    roc_score_all[idx] = roc_score
    print(f'Fold : {idx}')
    print(f'ROC: {roc_score}')
    print('--'*40)
print(f'Overall ROC : {np.mean(roc_score_all)}')

In [None]:
# Display the out-of-fold positive-class probabilities filled in by the CV loop.
cat_oof

In [None]:
# cb_model holds a single fit (the CV loop re-fits the same object on each fold, so only
# the last fit survives). Its probabilities are therefore used as they are: dividing one
# model's output by cv.n_splits would shrink every prediction rather than average anything.
predictions = cb_model.predict_proba(test)[:,1]

In [None]:
# Fill the sample submission with the test predictions, then persist it together
# with the out-of-fold predictions.
sample_submission_csv = "../input/tabular-playground-series-nov-2021/sample_submission.csv"
ss = pd.read_csv(sample_submission_csv).assign(target = predictions)
ss.to_csv('./catboost_submission.csv', index=False)
np.savez_compressed('oof_catboost.npz', cat_oof)