In [None]:
import os
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, roc_auc_score
from tqdm.notebook import tqdm
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from joblib import dump, load
import xgboost as xgb


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the competition CSV files.
PATH = '../input/tabular-playground-series-nov-2021'

# Read the training table, the test table and the submission template, in
# that order, from the competition directory.
train, test, sub = (
    pd.read_csv(os.path.join(PATH, file_name))
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
def random_seed(seed=42):
    """Seed the global Python and NumPy random generators.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
            Defaults to 42.

    Returns:
        None.
    """
    # Same order as before: stdlib generator first, then NumPy's global one.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # NOTE(review): changing PYTHONHASHSEED at runtime presumably only affects
    # interpreters started afterwards, not this process -- confirm if hash
    # determinism in the running kernel is expected.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Model inputs: every column of `train` except the row identifier and the label.
columns = [
    name
    for name in train.columns
    if name not in ['id', 'target']
]

In [None]:
def run_fold(train, test, n_folds, seed):
    """Run one seeded, stratified K-fold round of HistGradientBoostingClassifier.

    The rows of ``train`` are shuffled, split into ``n_folds`` stratified
    folds, and one classifier is fitted per fold on the module-level
    ``columns`` feature list. Each fold's model scores its validation rows
    (out-of-fold predictions) and the whole of ``test``.

    Args:
        train: DataFrame containing the ``columns`` features and a
            ``'target'`` column.
        test: DataFrame containing the ``columns`` features.
        n_folds: Number of stratified folds.
        seed: Seed passed to ``random_seed`` and to ``StratifiedKFold``.

    Returns:
        A ``(test_preds, oof)`` tuple:
        ``test_preds`` is the mean over folds of ``predict_proba(...)[:, 1]``
        on ``test``; ``oof`` is a 1-D array with one out-of-fold
        ``predict_proba(...)[:, 1]`` value per row of ``train``, in the
        row order of the ``train`` argument (not the internal shuffled order),
        so results from different seeds line up row by row.
    """
    params = {
        'l2_regularization': 1.5575355843851526e-05,
        # Must be the boolean False. The string 'False' is truthy, so it does
        # not disable early stopping, and strict parameter validation in
        # newer scikit-learn releases rejects it outright.
        'early_stopping': False,
        'learning_rate': 0.02927207351391731,
        'max_iter': 1000,
        'max_depth': 28,
        'max_bins': 162,
        'min_samples_leaf': 2499,
        'max_leaf_nodes': 47,
    }

    random_seed(seed)

    auc_score = []

    # Shuffle the rows, but first replace the index with 0..n-1 so the
    # shuffled index records each row's original position. The shuffle itself
    # draws from the global NumPy generator seeded just above.
    shuffled = train.reset_index(drop=True).sample(frac=1)
    original_pos = shuffled.index.to_numpy()
    shuffled = shuffled.reset_index(drop=True)

    targets = shuffled['target'].values
    features = shuffled[columns]
    # Loop-invariant: select the test features once instead of once per fold.
    test_features = test[columns]

    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    oof_shuffled = np.zeros((shuffled.shape[0],))
    test_preds = 0

    fold_iter = tqdm(enumerate(kf.split(features, targets)), total=n_folds)
    for f, (train_idx, val_idx) in fold_iter:
        val_target = targets[val_idx]

        model = HistGradientBoostingClassifier(**params)
        model.fit(features.iloc[train_idx], targets[train_idx])

        oof_tmp = model.predict_proba(features.iloc[val_idx])[:, 1]
        test_tmp = model.predict_proba(test_features)[:, 1]

        oof_shuffled[val_idx] = oof_tmp
        # Equal-weight average of the per-fold test predictions.
        test_preds += test_tmp / n_folds

        auc = roc_auc_score(val_target, oof_tmp)
        auc_score.append(auc)
        print(f'FOLD: {f} SEED:{seed} AUC: {auc} Mean AUC: {np.mean(auc_score)}')

    # Undo the shuffle: position i of the shuffled frame came from row
    # original_pos[i] of the input, so scatter the predictions back there.
    oof = np.empty_like(oof_shuffled)
    oof[original_pos] = oof_shuffled

    return test_preds, oof


In [None]:
def run_model(train, test, n_folds, seeds=None):
    """Average ``run_fold`` results over several random seeds.

    Args:
        train: Training DataFrame, forwarded unchanged to ``run_fold``.
        test: Test DataFrame, forwarded unchanged to ``run_fold``.
        n_folds: Number of folds, forwarded to ``run_fold``.
        seeds: Optional iterable of seeds, one ``run_fold`` call per seed.
            Defaults to ``[42, 43, 1019, 1020, 2019, 2021]``.

    Returns:
        A ``(predictions, oof)`` tuple holding the equal-weight mean over all
        seeds of the two values returned by ``run_fold``.

    Raises:
        ValueError: If ``seeds`` is given but empty.
    """
    if seeds is None:
        seeds = [42, 43, 1019, 1020, 2019, 2021]
    # Materialise so generators work and len() is available.
    seeds = list(seeds)
    if not seeds:
        raise ValueError('seeds must contain at least one seed')

    n_seeds = len(seeds)
    _predictions = 0
    _oof = 0

    for seed in seeds:
        predictions, oof = run_fold(train, test, n_folds, seed)
        _predictions += predictions / n_seeds
        # NOTE(review): this element-wise mean is only meaningful if run_fold
        # returns its out-of-fold vector in the same row order for every seed.
        _oof += oof / n_seeds

    return _predictions, _oof

In [None]:
if __name__=='__main__':

    # Seed-averaged test predictions and out-of-fold predictions, 5 folds.
    # `oof` is kept in the namespace but is not used further in this cell.
    predictions, oof = run_model(train, test, n_folds=5)

    # Fill the submission template with the averaged test predictions and
    # write it to the working directory without the DataFrame index.
    sub['target'] = predictions
    sub.to_csv('submission.csv', index=False)