In [None]:
import os
import gc
import time
import random
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler

import xgboost as xgb

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed every source of randomness this notebook controls directly.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and seeds
    both the standard-library ``random`` module and NumPy's global RNG.

    Args:
        seed: Integer seed applied to all three.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by the whole notebook (also reused by the config cell).
SEED = 2021
seed_everything(SEED)

In [None]:
class config:
    """Static settings for the notebook: file locations, seed and model parameters."""

    # Competition CSVs (train / test / sample submission) and the directory
    # where the per-fold XGBoost models are written and later read back.
    paths = dict(
        train_csv="../input/tabular-playground-series-oct-2021/train.csv",
        test_csv="../input/tabular-playground-series-oct-2021/test.csv",
        ss="../input/tabular-playground-series-oct-2021/sample_submission.csv",
        xgb_paths="./xgb",
    )

    # Same seed as the one passed to seed_everything().
    random_state = SEED

    # Hyper-parameters keyed by model name; the "xgb" entry is handed to
    # xgb.train() unchanged.
    model_params = dict(
        xgb=dict(
            subsample=0.65,
            colsample_bytree=0.4,
            max_depth=7,
            learning_rate=0.01,
            objective="binary:logistic",
            eval_metric="auc",
            nthread=-1,
            # NOTE(review): 'gpu_hist' presumably requires a GPU runtime — confirm
            # before running this notebook on a CPU-only machine.
            tree_method="gpu_hist",
            max_bin=192,
            min_child_weight=2,
            reg_lambda=0.003,
            reg_alpha=0.02,
            seed=SEED,
        )
    )

# Overview
- Train: 1,000,000 rows, 286 features, binary target.
- Test: 500,000 rows to predict; evaluation metric: ROC AUC.

In [None]:
def get_cols(df):
    """Split the feature columns of ``df`` into "binary" and continuous groups.

    A column is placed in the binary group when its dtype is ``int64``; the
    values themselves are not inspected. Every other column goes to the
    continuous group. The ``id`` and ``target`` columns are excluded from both.

    Args:
        df: DataFrame whose columns should be classified.

    Returns:
        Tuple ``(binary_features, cont_features)`` of column-name lists, each
        in the order the columns appear in ``df``.
    """
    excluded = {"id", "target"}

    binary_features = [
        col
        for col, dtype in df.dtypes.items()
        if dtype == "int64" and col not in excluded
    ]

    # Set membership keeps the second pass linear in the number of columns.
    binary_lookup = set(binary_features)
    cont_features = [
        col
        for col in df.columns
        if col not in binary_lookup and col not in excluded
    ]

    print(f"No of binary features: {len(binary_features)} \t No of continuous features: {len(cont_features)}")
    return binary_features, cont_features

# 5-fold cross-validation

In [None]:
class TrainFer:
    """K-fold cross-validated XGBoost trainer that pickles one booster per fold.

    Attributes:
        params: Parameter dict forwarded to ``xgb.train``.
        n_splits: Number of cross-validation folds.
        random_state: Seed used to shuffle rows into folds.
        model_path: Directory the per-fold models are written to.
    """

    def __init__(self, params_dict, n_splits, model_path, random_state):
        """Store the settings and make sure the model directory exists.

        Args:
            params_dict: XGBoost parameters passed to ``xgb.train``.
            n_splits: Number of folds for ``KFold``.
            model_path: Output directory for the pickled models; created
                (including parents) when missing.
            random_state: Seed for the shuffled ``KFold`` split.
        """
        self.params = params_dict
        self.n_splits = n_splits
        self.random_state = random_state
        self.model_path = model_path
        os.makedirs(model_path, exist_ok=True)

    def train(self, X, y):
        """Train one booster per fold, save each one and report ROC AUC.

        For every fold the model is fitted on the training part with early
        stopping on the validation part, the validation predictions are
        stored as out-of-fold predictions, and the booster is pickled to
        ``<model_path>/xgb_bl_<fold>_<fold AUC>.pkl``. Per-fold scores and the
        overall out-of-fold ROC AUC are printed; nothing is returned.

        Args:
            X: Feature DataFrame (indexed positionally via ``.iloc``).
            y: Target Series aligned with ``X``.
        """
        oof_predictions = np.zeros(X.shape[0])
        # Use the seed supplied at construction so the split follows the
        # notebook-wide seed instead of a hard-coded constant.
        kfold = KFold(n_splits=self.n_splits, random_state=self.random_state, shuffle=True)
        oof_scores = []

        for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
            print(f"\nFold - {fold}\n")
            x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
            dval = xgb.DMatrix(x_val, label=y_val, enable_categorical=True)

            model = xgb.train(params=self.params,
                              dtrain=dtrain,
                              num_boost_round=10000,
                              evals=[(dtrain, "dtrain"), (dval, "dval")],
                              verbose_eval=200,
                              early_stopping_rounds=50)

            # NOTE(review): with the native API the returned booster may include
            # the rounds trained after the best iteration — confirm whether
            # prediction should be limited to model.best_iteration.
            fold_preds = model.predict(dval)
            oof_score = roc_auc_score(y_val, fold_preds)
            print(f"\nAUC_ROC of fold {fold}: {oof_score}")

            # The score is part of the file name, so re-running into the same
            # directory adds new files rather than overwriting earlier ones.
            model_file = os.path.join(self.model_path, f"xgb_bl_{fold}_{oof_score}.pkl")
            with open(model_file, "wb") as fh:
                pickle.dump(model, fh)

            oof_scores.append(oof_score)
            oof_predictions[val_idx] = fold_preds

            # Drop per-fold objects before the next fold allocates its own.
            del x_train, x_val, y_train, y_val, model, fold_preds, dtrain, dval
            _ = gc.collect()

            # NOTE(review): presumably a pause to let memory be released
            # between folds — confirm it is still needed.
            time.sleep(10)

        print(f"\nOOF Scores: {oof_scores}\n")
        auc_roc_score = roc_auc_score(y, oof_predictions)
        print(f"OOF AUC_ROC: {auc_roc_score}")

In [None]:
def infer_xgb(test_data, model_dir):
    """Average the predictions of every pickled model found in ``model_dir``.

    Args:
        test_data: Feature table to predict on; wrapped in an ``xgb.DMatrix``.
        model_dir: Directory containing the pickled models (``*.pkl`` files).
            Other directory entries are ignored.

    Returns:
        1-D NumPy array with one averaged prediction per row of ``test_data``.

    Raises:
        FileNotFoundError: If ``model_dir`` contains no ``*.pkl`` file, so an
            all-zero prediction vector is never returned silently.
    """
    print("\n[INFO] XGB Inference...")

    # List the directory once and keep only model files, in a stable order.
    model_files = sorted(
        name for name in os.listdir(model_dir) if name.endswith(".pkl")
    )
    if not model_files:
        raise FileNotFoundError(f"No pickled models (*.pkl) found in {model_dir!r}")

    test_predictions = np.zeros(test_data.shape[0])
    dtest = xgb.DMatrix(test_data, enable_categorical=True)

    for model_file in tqdm(model_files):
        # SECURITY: unpickling runs arbitrary code — only load models from a
        # directory this notebook (or another trusted source) has written.
        with open(os.path.join(model_dir, model_file), "rb") as fh:
            model = pickle.load(fh)
        test_predictions += model.predict(dtest) / len(model_files)

    return test_predictions

In [None]:
if __name__ == "__main__":
    # End-to-end run: train the 5-fold XGBoost models, then average their
    # predictions on the test set and write the submission file.
    train_df = pd.read_csv(config.paths["train_csv"])

    # get_cols() returns the int64 columns first, then all remaining features.
    cat_feats, cont_feats = get_cols(train_df)
    feature_cols = cat_feats + cont_feats

    model = TrainFer(
        config.model_params["xgb"],
        n_splits=5,
        model_path=config.paths["xgb_paths"],
        random_state=config.random_state,
    )
    model.train(train_df[feature_cols], train_df["target"])

    # Release the training data before the test set is loaded.
    del train_df, model
    _ = gc.collect()

    test_df = pd.read_csv(config.paths["test_csv"])
    # Read the models back from the same configured directory they were saved to.
    test_predictions = infer_xgb(test_df[feature_cols], config.paths["xgb_paths"])

    submission = pd.DataFrame({"id": test_df["id"], "target": test_predictions})
    submission.to_csv("submission.csv", index=False)

EOF!