<div style="
            color:red;
            font-size:40px;
            font-weight:bold;
            text-align:center">
    <a href="https://www.kaggle.com/c/tabular-playground-series-oct-2021">Tabular Playground Series - Oct 2021</a>
    </div>

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import random 
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler

import lightgbm as lgb
import catboost as cat
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

from warnings import filterwarnings as warning
print('done!')
warning("ignore")

In [None]:
# --- Run configuration -------------------------------------------------------
# Cross-validation
N_SPLITS = 5                    # number of stratified folds

# Boosting / fitting
N_ESTIMATORS = 1_000            # NOTE(review): not referenced in the visible cells
EARLY_STOPPING_ROUNDS = 200     # passed to every `fit` call as early_stopping_rounds
VERBOSE = 1_000                 # passed as `verbose` to the first fit of each fold

# Random seeds
SEED, SEED_X = 42, 2021         # SEED feeds seed_everything and the fold splitter;
                                # NOTE(review): SEED_X is not referenced in the visible cells


In [None]:
def seed_everything(seed = 42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports the seed as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed value.
    """
    random.seed(seed)
    # The interpreter reads PYTHONHASHSEED (not "PYTHONHASSEED") at start-up, so
    # setting it here affects interpreters launched afterwards (e.g. worker
    # subprocesses); it does not change hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(SEED)

In [None]:
def reduce_memory(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Signed-integer columns are cast to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns are
    cast to float16, then float32, and otherwise float64. Columns of any other
    dtype (bool, object, unsigned ints, ...) are left untouched.

    The frame is modified in place and also returned.

    Args:
        df: pandas DataFrame to shrink.
        verbose: When true, print the final memory usage and the relative saving.
        allow_float16: When true (default, the historical behaviour) float columns
            may be stored as float16. Only the value *range* is checked, so this
            cast can discard precision; pass ``False`` to use float32 as the
            smallest float type.

    Returns:
        The same DataFrame object with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column whose extremes sit exactly on a type's
            # limits (e.g. -128..127) still fits that type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls through to float64 when no narrower type contains the range
            # (including when min/max are NaN or infinite, for which every
            # comparison below is False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time

path = "../input/tabular-playground-series-oct-2021/"

# The CSVs are parsed with datatable's fread (rather than pandas.read_csv),
# converted to pandas, and passed through reduce_memory to downcast dtypes.
# The 'id' column is dropped from train/test; the submission template keeps it.
train = reduce_memory(
    dt.fread(f"{path}train.csv").to_pandas().drop(columns='id')
)
test = reduce_memory(
    dt.fread(f"{path}test.csv").to_pandas().drop(columns='id')
)
sample = reduce_memory(
    dt.fread(f"{path}sample_submission.csv").to_pandas()
)

In [None]:
# Distinct dtypes present among the columns of `train`.
train.dtypes.unique()

In [None]:
# Positional indices of the boolean-typed columns in `train`.
bool_cols_train = [
    position
    for position, name in enumerate(train.columns)
    if train[name].dtypes == bool
]



In [None]:
# Positional indices of the boolean-typed columns in `test`.
# The dtype is read from `test` itself: the indices are later used to index
# `test`, so they must reflect how `test` was parsed, not `train`.
bool_cols_test = []

for i, col in enumerate(test.columns):
    if test[col].dtypes == bool:
        bool_cols_test.append(i)
        

In [None]:
# Convert the boolean columns to integers.
# Each column is replaced by label (`frame[name] = ...`) rather than written
# through `.iloc[:, positions] = ...`: positional assignment sets values into
# the existing columns in recent pandas versions and therefore does not reliably
# change their dtype from bool, whereas label assignment swaps in the new column.
for frame, positions in ((train, bool_cols_train), (test, bool_cols_test)):
    for position in positions:
        name = frame.columns[position]
        frame[name] = frame[name].astype(int)

In [None]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# Separate the label from the features and take independent copies, so the
# original frames can be released below.
y = train['target'].copy()
X = train.drop(columns='target').copy()
X_test = test.copy()

# Drop the notebook-level references to the source frames.
del train, test

In [None]:
# Row-wise summary features ('std', 'min', 'max').
# All three statistics are computed from the same set of columns *before* any of
# them is attached, so 'min' and 'max' are not influenced by the derived columns.
# Existing stat columns are excluded as well, which keeps the cell idempotent
# when it is re-run.
_ROW_STAT_COLS = ['std', 'min', 'max']


def _row_stats(frame):
    """Return row-wise (std, min, max) of ``frame``, ignoring any stat columns already present."""
    already_added = [c for c in _ROW_STAT_COLS if c in frame.columns]
    base = frame.drop(columns=already_added) if already_added else frame
    return base.std(axis=1), base.min(axis=1), base.max(axis=1)


X['std'], X['min'], X['max'] = _row_stats(X)
X_test['std'], X_test['min'], X_test['max'] = _row_stats(X_test)


In [None]:
# Constructor keyword arguments for the classifier trained in cross-validation.
params = dict(
    # tree shape / boosting schedule
    max_depth=6,
    n_estimators=9500,
    learning_rate=7e-3,
    # row / column sampling
    subsample=0.7,
    colsample_bytree=0.2,
    colsample_bylevel=0.6000000000000001,
    # regularisation
    min_child_weight=56.41980735551558,
    reg_lambda=75.56651890088857,
    reg_alpha=0.11766857055687065,
    gamma=0.6407823221122686,
    # booster, metric and device settings
    booster='gbtree',
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

In [None]:
def cross_validate_model(class_name, class_params, X, y, test_data, n_splits = N_SPLITS):
    """Fit ``class_name`` on stratified folds and collect test-set probabilities.

    For every fold a first model is built from ``class_params`` and fitted with
    early stopping on the validation split. Unless the estimator class is named
    ``CatBoostClassifier``, training is then continued twice from the previous
    model (handed over through the ``xgb_model`` fit argument) with learning
    rates 1e-2 and 457e-4. The last model of the fold is scored with ROC AUC on
    the validation split and used to predict ``test_data``.

    Args:
        class_name: Estimator class exposing ``fit`` and ``predict_proba``.
        class_params: Dict of constructor keyword arguments. It is not modified;
            each refinement stage works on its own copy.
        X: Training features; rows are selected positionally with ``.iloc``.
        y: Training target aligned with ``X``; also used for stratification.
        test_data: Features to predict with each fold's final model.
        n_splits: Number of stratified folds.

    Returns:
        A list with one array per fold holding the last column of
        ``predict_proba(test_data)``.
    """
    # Honour the caller's n_splits instead of always reading the global constant.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    is_catboost = class_name.__name__ == 'CatBoostClassifier'
    # Learning rates of the continued-training stages, applied in this order.
    refinement_learning_rates = (1e-2, 457e-4)

    test_predictions = []
    fold_scores = []

    for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"{'='*10} fold {fold+1} strated {'='*10}")

        xtrain, ytrain = X.iloc[trn_idx], y.iloc[trn_idx]
        xvalid, yvalid = X.iloc[val_idx], y.iloc[val_idx]

        start = time.time()

        # Stage 1: the model exactly as configured by the caller.
        clf = class_name(**class_params)

        if is_catboost:
            clf.fit(xtrain, ytrain,
                    eval_set=[(xvalid, yvalid)],
                    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                    verbose=VERBOSE)
        else:
            clf.fit(xtrain, ytrain,
                    eval_set=[(xtrain, ytrain), (xvalid, yvalid)],
                    eval_metric='auc',
                    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                    verbose=VERBOSE) # try False

            # Refinement stages rely on the `xgb_model` / `eval_metric` fit
            # arguments, which the CatBoost branch above does not use, so they
            # are only run on this branch.
            for learning_rate in refinement_learning_rates:
                # Per-stage copy: the override must neither leak into the next
                # fold's first stage nor alter the caller's dict.
                stage_params = {**class_params, 'learning_rate': learning_rate}
                previous_clf, clf = clf, class_name(**stage_params)

                clf.fit(xtrain, ytrain,
                        eval_set=[(xtrain, ytrain), (xvalid, yvalid)],
                        eval_metric='auc',
                        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                        verbose=False,
                        xgb_model=previous_clf
                       )

        preds_valid = clf.predict_proba(xvalid)[:, -1]
        preds_test  = clf.predict_proba(test_data)[:, -1]

        test_predictions.append(preds_test)

        elapsed = time.time() - start
        auc = roc_auc_score(yvalid, preds_valid)
        fold_scores.append(auc)

        print(f"fold {fold+1} - auc: {auc: .6f}, elapsed time: {elapsed:.2f} sec\n")

    print(f"Final roc auc = {np.mean(fold_scores)}")

    return test_predictions



In [None]:
# Run the cross-validated XGBoost training; `pred` holds one array of test
# probabilities per fold.
pred = cross_validate_model(
    class_name=xgb.XGBClassifier,
    class_params=params,
    X=X,
    y=y,
    test_data=X_test,
    n_splits=N_SPLITS,
)

In [None]:
# Put the per-fold test predictions side by side (one column per fold) and
# average across folds to get the final prediction for every test row.
fold_matrix = np.column_stack(pred)
predictions = fold_matrix.mean(axis=1)

# Fill the submission template and write it to disk.
sample['target'] = predictions
sample.to_csv("submission.csv", index=False)
sample.head()