In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf 
import random

  
import warnings  
warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed to every seeding call (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [None]:
def binary_mean_encoding(data, test_data, cat_to_encode, target, nfolds):
    """Add a target-mean encoding of one categorical column to both frames.

    Intended for binary targets (binary classification). The encoding is
    regularised with an unshuffled ``StratifiedKFold``: each row first receives
    the target mean of its category computed on the *other* folds, and those
    out-of-fold values are then averaged per category to obtain a single value
    per category, which is what ends up in both frames.

    Both frames are modified in place; a column named
    ``'Mean_enc_' + cat_to_encode`` is added (or overwritten).

    Args:
        data: Training frame containing ``cat_to_encode`` and ``target``.
        test_data: Frame containing ``cat_to_encode`` that receives the same
            per-category values.
        cat_to_encode: Name of the categorical column to encode.
        target: Name of the target column in ``data``.
        nfolds: Number of stratified folds used for the out-of-fold means.

    Returns:
        None.
    """
    enc_col = 'Mean_enc_' + cat_to_encode
    global_mean = data[target].mean()

    # The splitter yields *positional* indices, so the out-of-fold values are
    # collected in a positional array instead of being written with .loc
    # (label-based), which is only equivalent for a default RangeIndex.
    oof_enc = np.full(len(data), np.nan)
    kf = StratifiedKFold(n_splits=nfolds, shuffle=False)
    for train_idx, val_idx in kf.split(data, data[target]):
        fold_train = data.iloc[train_idx]
        mean_map = fold_train.groupby(cat_to_encode)[target].mean()
        fold_values = data[cat_to_encode].iloc[val_idx].map(mean_map)
        oof_enc[val_idx] = fold_values.to_numpy(dtype=float)

    # If a fold's training part does not contain a category, fall back to the
    # global target mean.
    oof_enc = np.where(np.isnan(oof_enc), global_mean, oof_enc)
    data[enc_col] = oof_enc

    # Collapse the out-of-fold values to one value per category and apply it
    # to both frames.
    value_map = data.groupby(cat_to_encode)[enc_col].mean()
    data[enc_col] = data[cat_to_encode].map(value_map)
    # Assign the filled result instead of calling fillna(inplace=True) on a
    # column selection, which is a chained in-place write.
    test_data[enc_col] = test_data[cat_to_encode].map(value_map).fillna(global_mean)

In [None]:
def getModel(seed):
    """Create the unfitted XGBoost classifier used for each fold.

    Args:
        seed: Run seed; the classifier's ``random_state`` is ``1 + seed``.

    Returns:
        A new ``XGBClassifier`` built from the hyper-parameters listed below.
    """
    # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build, and both
    # 'gpu_hist' and 'use_label_encoder' appear to be deprecated in newer
    # XGBoost releases - confirm against the installed version.
    hyper_params = dict(
        tree_method='gpu_hist',
        eval_metric='logloss',
        use_label_encoder=False,
        n_estimators=5000,
        learning_rate=0.005,
        max_depth=9,
        min_child_weight=6.9800,
        max_delta_step=2.5706,
        gamma=0,
        subsample=0.65,
        colsample_bytree=0.6522,
        random_state=(1 + seed),
    )
    return XGBClassifier(**hyper_params)

In [None]:
def run_fold(train_df, test_df, fold_number, seed_number, output_size, eval_function, id_name='id', target_name='target', pred_proba=False):
    """Train the model on all folds except one and predict the held-out fold and the test set.

    Args:
        train_df: Training frame containing ``id_name``, ``target_name`` and a
            ``'kfold'`` column holding each row's fold number.
        test_df: Test frame; only the columns used for training are read.
        fold_number: Fold held out for validation.
        seed_number: Seed passed to ``seed_everything`` and ``getModel``.
        output_size: Number of columns of the returned arrays.
        eval_function: Callable ``(y_true, y_pred)`` used for the printed scores.
        id_name: Identifier column excluded from the features.
        target_name: Target column.
        pred_proba: If True, store ``predict_proba`` outputs; otherwise store
            ``predict`` outputs.

    Returns:
        ``(oof, preds)``: ``oof`` has one row per training row and is zero
        except on the held-out rows; ``preds`` has one row per test row.
    """
    seed_everything(seed_number)

    valid_mask = (train_df['kfold'] == fold_number).to_numpy()
    # Row *positions* of the held-out fold. `oof` is a NumPy array, so it must
    # be indexed by position rather than by the frame's index labels.
    valid_pos = np.flatnonzero(valid_mask)

    features = train_df.drop(columns=[id_name, target_name, 'kfold'])
    target = train_df[target_name]

    X_train = features.loc[~valid_mask].reset_index(drop=True)
    y_train = target.loc[~valid_mask].reset_index(drop=True)
    X_val = features.loc[valid_mask].reset_index(drop=True)
    y_val = target.loc[valid_mask].reset_index(drop=True)

    oof = np.zeros((train_df.shape[0], output_size))

    model = getModel(seed_number)
    model.fit(X_train, y_train)  # we can add more parameters

    # NOTE(review): both printed scores are computed from model.predict()
    # outputs even when pred_proba=True; with a ranking metric such as
    # roc_auc_score this scores class labels, not probabilities.
    train_loss = eval_function(y_train, model.predict(X_train))
    print(f"Seed: {seed_number}, FOLD: {fold_number}, train_loss: {train_loss}")

    valid_preds = model.predict(X_val)
    valid_out = model.predict_proba(X_val) if pred_proba else valid_preds
    oof[valid_pos] = valid_out.reshape((len(valid_preds), output_size))
    valid_loss = eval_function(y_val, valid_preds)
    print(f"Seed: {seed_number}, FOLD: {fold_number}, val_loss: {valid_loss}")

    # we can add model save

    # Select the test columns in the same order as the training features.
    test_features = test_df[X_train.columns]
    if pred_proba:
        test_out = model.predict_proba(test_features)
    else:
        test_out = model.predict(test_features)
    preds = test_out.reshape((len(test_df), output_size))

    return oof, preds

In [None]:
def run_k_fold(train_df, test_df, seed_number, output_size, eval_function, id_name='id', target_name='target', pred_proba=False):
    """Run ``run_fold`` for every fold and combine the results.

    Fold numbers ``0 .. N_FOLDS - 1`` are used, where ``N_FOLDS`` is read from
    the notebook's global namespace. All arguments are forwarded to
    ``run_fold`` unchanged.

    Returns:
        ``(oof, predictions)``: the element-wise sum of the per-fold
        out-of-fold arrays and the average of the per-fold test predictions.
    """
    oof_total = np.zeros((len(train_df), output_size))
    test_mean = np.zeros((len(test_df), output_size))

    for fold_id in range(N_FOLDS):
        fold_oof, fold_preds = run_fold(
            train_df, test_df, fold_id, seed_number, output_size,
            eval_function, id_name, target_name, pred_proba,
        )
        oof_total += fold_oof
        test_mean += fold_preds / N_FOLDS

    return oof_total, test_mean

In [None]:
def execution(train_df, test_df, seeds, output_size, eval_function, id_name='id', target_name='target', pred_proba=False, stratification=False):
    """Run the full K-fold training once per seed and average the results.

    For every seed a shuffled ``N_FOLDS``-fold split (``N_FOLDS`` is read from
    the notebook's global namespace) is drawn, written to ``train_df['kfold']``
    and passed on to ``run_k_fold``.

    Side effect: ``train_df`` is modified in place; its ``'kfold'`` column is
    created or overwritten for each seed.

    Args:
        train_df: Training frame containing ``id_name`` and ``target_name``.
        test_df: Test frame.
        seeds: Iterable of integer seeds; results are averaged over them.
        output_size: Number of columns of the returned arrays.
        eval_function: Callable ``(y_true, y_pred)`` used for scoring.
        id_name: Identifier column name.
        target_name: Target column name.
        pred_proba: Forwarded to ``run_k_fold``; when False the overall score
            of the averaged out-of-fold predictions is printed.
        stratification: Use ``StratifiedKFold`` if True, ``KFold`` otherwise.

    Returns:
        ``(oof, preds)``: seed-averaged out-of-fold and test predictions.
    """
    n_seeds = len(seeds)
    oof = np.zeros((train_df.shape[0], output_size))
    preds = np.zeros((test_df.shape[0], output_size))

    for seed in seeds:
        splitter_cls = StratifiedKFold if stratification else KFold
        kf = splitter_cls(n_splits=N_FOLDS, shuffle=True, random_state=seed)

        # split() yields positional indices, so fold ids are written into a
        # positional integer array rather than via label-based .loc.
        fold_ids = np.zeros(len(train_df), dtype=int)
        for f, (_, v_idx) in enumerate(kf.split(train_df, train_df[target_name])):
            fold_ids[v_idx] = f
        train_df['kfold'] = fold_ids

        oof_, preds_ = run_k_fold(train_df, test_df, seed, output_size, eval_function, id_name, target_name, pred_proba)
        oof += oof_ / n_seeds
        preds += preds_ / n_seeds

    if not pred_proba:
        over_loss = eval_function(train_df[target_name], oof)
        print(f"The Loss is {over_loss}")
    return oof, preds

In [None]:
# Load the training CSV and preview its first two rows.
# NOTE(review): the path is hard-coded and relative to the working directory.
tr_df = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
tr_df.head(2)

In [None]:
# Load the test CSV and preview its first two rows.
# NOTE(review): the path is hard-coded and relative to the working directory.
tt_df = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
tt_df.head(2)

In [None]:
# Column-name lists selected by prefix: names starting with 'cat' and names
# starting with 'cont'.
cat_cols = tr_df.columns[tr_df.columns.str.startswith('cat')].tolist()
num_cols = tr_df.columns[tr_df.columns.str.startswith('cont')].tolist()

In [None]:
# Add a 'Mean_enc_<col>' column to both frames for every column in cat_cols,
# using 5 stratified folds against the 'target' column.
for col in cat_cols:
    binary_mean_encoding(tr_df, tt_df, col, 'target', 5)

In [None]:
# Preview the training frame, which now includes the 'Mean_enc_*' columns.
tr_df.head(2)

In [None]:
# Preview the test frame, which now includes the 'Mean_enc_*' columns.
tt_df.head(2)

In [None]:
# Integer-code every column in cat_cols with a LabelEncoder fitted on the
# training values; values not among the fitted classes are coded as -1.
# The fitted encoders are kept in `encoders`, in cat_cols order.
encoders = []
for elt in cat_cols:
    enc = LabelEncoder().fit(tr_df[elt])
    code_of = dict(zip(enc.classes_, enc.transform(enc.classes_)))
    encoded_name = 'L_enc_' + elt
    for frame in (tr_df, tt_df):
        frame[encoded_name] = frame[elt].apply(lambda value: code_of.get(value, -1))
    encoders.append(enc)

In [None]:
# Preview the training frame, which now includes the 'L_enc_*' columns.
tr_df.head(2)

In [None]:
# Preview the test frame, which now includes the 'L_enc_*' columns.
tt_df.head(2)

Frequency Encoding

In [None]:
# Frequency encoding: replace each category by its share of the training rows.
# Categories that do not occur in the training frame get a frequency of 0.
for elt in cat_cols:
    f_enc = tr_df.groupby(elt).size() / len(tr_df)
    # Assign the filled result rather than calling fillna(inplace=True) on a
    # column selection, which is a chained in-place write.
    tr_df['F_enc_' + elt] = tr_df[elt].map(f_enc).fillna(0)
    tt_df['F_enc_' + elt] = tt_df[elt].map(f_enc).fillna(0)

In [None]:
# Preview the training frame, which now includes the 'F_enc_*' columns.
tr_df.head(2)

In [None]:
# Preview the test frame, which now includes the 'F_enc_*' columns.
tt_df.head(2)

Weight of evidence

In [None]:
# 'WoE' encoding: each category is replaced by 100 * log(p / (1 - p)), where p
# is the category's mean target over the whole training frame. Categories
# without a mapping get 0. The per-column mappings are kept in WoE_encoders,
# in cat_cols order.
# NOTE(review): p is computed from all training rows (no out-of-fold scheme),
# so these columns contain each row's own target.
WOE_EPS = 1e-6  # keeps p away from 0 and 1 so the log-odds stay finite
WoE_encoders = []
for elt in cat_cols:
    # Without the clip, a category whose targets are all 0 or all 1 would
    # produce -inf or a division by zero.
    rate = tr_df.groupby(elt)['target'].mean().clip(WOE_EPS, 1 - WOE_EPS)
    woe = (np.log(rate / (1 - rate)) * 100).to_dict()
    # Assign the filled result rather than calling fillna(inplace=True) on a
    # column selection, which is a chained in-place write.
    tr_df['WoE_enc_' + elt] = tr_df[elt].map(woe).fillna(0)
    tt_df['WoE_enc_' + elt] = tt_df[elt].map(woe).fillna(0)
    WoE_encoders.append(woe)

In [None]:
# Write both frames to the working directory without the index column.
# At this point they still contain the raw categorical columns.
tr_df.to_csv('train.csv', index=False)
tt_df.to_csv('test.csv', index=False)

In [None]:
# Remove the raw categorical columns from both frames; only their encoded
# versions remain.
tr_df = tr_df.drop(columns=cat_cols)
tt_df = tt_df.drop(columns=cat_cols)

In [None]:
# N_FOLDS is read as a global by execution() and run_k_fold().
N_FOLDS = 5
# Single seed (0), stratified folds, two output columns holding predict_proba
# results. With pred_proba=True, execution() does not print an overall score.
oof_, preds_ = execution(tr_df, tt_df, [0], 2, roc_auc_score, id_name='id', target_name='target', pred_proba=True, stratification=True)

In [None]:
# Wrap the averaged test predictions in a frame with one column per output.
# NOTE(review): 'p_0'/'p_1' presumably follow the column order of
# predict_proba - confirm against the fitted model's classes_.
preds = pd.DataFrame(preds_, columns=['p_0', 'p_1'])
preds.head(2)

In [None]:
# Wrap the out-of-fold predictions for the training rows in a frame with the
# same column names as `preds`.
oof = pd.DataFrame(oof_, columns=['p_0', 'p_1'])
oof.head(2)

In [None]:
# Load the sample submission file to use as the template for the output.
sub = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
sub.head(2)

In [None]:
# Use the 'p_1' column as the submitted target and write the submission file.
# NOTE(review): the assignment aligns on index, so this assumes `sub` and
# `preds` share the same row order and default index - confirm.
sub['target'] = preds['p_1']
sub.to_csv('submission.csv', index=False)

In [None]:
# Save the out-of-fold predictions to 'folds.csv' without the index column.
oof.to_csv('folds.csv', index=False)