In addition to submission.csv, this notebook generates out-of-fold (OOF) predictions.

- train_pred_1.csv
- test_pred_1.csv (same as submission file)

# Versions

- V3:
  - Tried Standardization.  No effect. 
  - Increased n_estimators to 70,000 and early_stopping_rounds to 200.
  - Back to a lower learning_rate.
         'learning_rate': 0.007,
         #"learning_rate": 0.15525187869673937,
- V2: Try a different set of Optuna hyperparameters. Save OOF in train_pred_1.csv
- V1: Original

# Load Libraries

In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from pathlib import Path

# Configuration

In [None]:
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.seed``)."""

    # Identification of the competition this notebook targets.
    competition = "TPS_202111"

    # Debug switch.
    debug = False

    # Reproducibility and cross-validation layout.
    seed = 42
    NFOLDS = 5

    # NOTE(review): EPOCHS is not read by any visible cell -- confirm it is
    # still needed.
    EPOCHS = 10


In [None]:
# Folder holding this month's competition files; change the last component
# every month.
data_dir = Path("../input") / "tabular-playground-series-nov-2021"

# Load Train/Test Data

In [None]:
%%time
# Read the competition CSVs from `data_dir`.
# When Config.debug is set, only the first 100,000 training rows are loaded so
# that experiments run quickly; with nrows=None pandas reads the whole file.
train_df = pd.read_csv(
    data_dir / "train.csv",
    nrows=100_000 if Config.debug else None,
)

test_df = pd.read_csv(data_dir / "test.csv")
sample_submission = pd.read_csv(data_dir / "sample_submission.csv")

# Quick sanity check of what was loaded.
print(f"train data: Rows={train_df.shape[0]}, Columns={train_df.shape[1]}")
print(f"test data : Rows={test_df.shape[0]}, Columns={test_df.shape[1]}")


# Features

In [None]:
# Model inputs: every training column except the row identifier and the label,
# kept in their original column order.
non_feature_cols = {"id", "target"}
features = [name for name in train_df.columns if name not in non_feature_cols]

# Standardize/Normalize Data

Tree-based models such as XGBoost do not necessarily need scaled features, so this step is left disabled.

In [None]:
# scaler = StandardScaler()

# train_df[features] = scaler.fit_transform(train_df[features])
# test_df[features] = scaler.transform(test_df[features])

# Extract Target and Drop Unused Columns

In [None]:
# Split the training frame into the label vector and the feature matrix, and
# strip the identifier column from the test frame.
y = train_df["target"]
X = train_df.drop(columns=["id", "target"])
test_df = test_df.drop(columns=["id"])

# Last expression of the cell: preview the first rows of the feature matrix.
X.head()

# Model

In [None]:
# First hyperparameter set (V1). Kept for reference.
xgb_params_v1 = dict(
    # Tree shape and boosting schedule
    max_depth=6,
    learning_rate=0.007,
    n_estimators=10000,  # alternative noted by the author: 9500
    # Row / column sampling
    subsample=0.7,
    colsample_bytree=0.2,
    colsample_bylevel=0.6000000000000001,
    # Regularisation
    min_child_weight=56.41980735551558,
    reg_lambda=75.56651890088857,
    reg_alpha=0.11766857055687065,
    gamma=0.6407823221122686,
    # Booster, metric and GPU execution settings
    booster='gbtree',
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

In [None]:
# Hyperparameters passed to XGBClassifier in the cross-validation loop.
xgb_params = dict(
    objective="binary:logistic",
    use_label_encoder=False,
    # Many slow boosting rounds; an alternative learning_rate of
    # 0.15525187869673937 is noted by the author but not used.
    n_estimators=70000,
    learning_rate=0.007,
    # Row / column sampling and tree shape
    subsample=0.66,
    colsample_bytree=0.9500000000000001,
    max_depth=4,
    booster="gbtree",
    gamma=1.7000000000000002,
    # GPU histogram tree builder
    tree_method="gpu_hist",
    # Regularisation
    reg_lambda=0.9541035898656812,
    reg_alpha=2.3445012085324084,
    random_state=42,
    n_jobs=4,
    min_child_weight=256,
)

In [None]:
# Stratified K-fold training: fit one XGBoost model per fold, score it on the
# held-out rows, and collect out-of-fold (OOF) and test-set predictions.
final_test_predictions = []   # one array of test probabilities per fold
final_valid_predictions = {}  # train `id` -> OOF probability
scores = []                   # validation AUC of each fold

kf = StratifiedKFold(n_splits=Config.NFOLDS, shuffle=True, random_state=Config.seed)

# Real row identifiers, row-aligned with X, used to key the OOF predictions.
# `kf.split` returns positions, which are only equal to ids by coincidence.
train_ids = train_df["id"].to_numpy()

# Loop-invariant: the test feature matrix is the same for every fold.
test_features = test_df[features]

for fold, (train_idx, valid_idx) in enumerate(kf.split(X=X, y=y)):

    print(10*"=", f"Fold={fold}", 10*"=")
    start_time = time.time()

    # `kf.split` yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label.
    x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model = XGBClassifier(**xgb_params)

    # The held-out fold doubles as the early-stopping evaluation set.
    model.fit(x_train, y_train,
              early_stopping_rounds=200,
              eval_set=[(x_valid, y_valid)],
              eval_metric='auc',
              verbose=0)

    # Last column of predict_proba: probability of the last class.
    preds_valid = model.predict_proba(x_valid)[:, -1]
    final_valid_predictions.update(zip(train_ids[valid_idx], preds_valid))

    auc = roc_auc_score(y_valid, preds_valid)
    scores.append(auc)

    run_time = time.time() - start_time
    print(f"Fold={fold}, auc: {auc:.8f}, Run Time: {run_time:.2f}")

    test_preds = model.predict_proba(test_features)[:, -1]
    final_test_predictions.append(test_preds)


### Are we improving?

In [None]:
# Baseline to compare against (presumably the best mean OOF AUC of earlier
# runs -- update it by hand when a run beats it).
best_oof_mean = 0.74291098

# Mean validation AUC over the folds of the current run.
mean_auc = np.asarray(scores).mean()

# Positive means the current run beats the baseline.
mean_diff = mean_auc - best_oof_mean
print(f"OOF Mean Score difference: {mean_diff}")

```
Scores -> mean: 0.74291098, std: 0.00163617
Scores -> mean: 0.74291098, std: 0.00163617
Scores -> mean: 0.74291098, std: 0.00163617

```

In [None]:
# Summarise the per-fold validation AUCs of this run.
score_mean, score_std = np.mean(scores), np.std(scores)
print(f"Scores -> mean: {score_mean:.8f}, std: {score_std:.8f}")

# Save OOF Predictions

In [None]:
# Turn the {key: prediction} mapping collected during cross-validation into a
# two-column table (`id`, `pred_1`).
final_valid_predictions = (
    pd.DataFrame.from_dict(final_valid_predictions, orient="index")
    .reset_index()
)
final_valid_predictions.columns = ["id", "pred_1"]

# Predictions were gathered fold by fold from a shuffled split, so the rows are
# in fold order. Sort by id to give the file a stable order that does not
# depend on fold assignment.
final_valid_predictions = (
    final_valid_predictions.sort_values("id").reset_index(drop=True)
)
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Submission File

In [None]:
# Blend the folds: one column per fold, averaged row by row.
fold_matrix = np.column_stack(final_test_predictions)
sample_submission['target'] = fold_matrix.mean(axis=1)

# The same table is written under both names: test_pred_1.csv and
# submission.csv.
for out_name in ("test_pred_1.csv", "submission.csv"):
    sample_submission.to_csv(out_name, index=False)

# Last expression of the cell: display the submission table.
sample_submission