<a href="https://www.kaggle.com/code/sdysch/tps-aug-2022-xgboost?scriptVersionId=102683070" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
plt.style.use('seaborn')

# References
* https://www.kaggle.com/code/ambrosm/tpsaug22-eda-which-makes-sense/notebook
* https://www.kaggle.com/sdysch/tps-aug-2022/

# Reading in the data

In [3]:
df_train = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2022/train.csv')
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2022/test.csv')

In [4]:
df_train = df_train.drop(['id'], axis='columns')
df_test = df_test.drop(['id'], axis='columns')

# XGBoost model

In [5]:
X = df_train.drop(['failure', 'product_code'], axis='columns')
y = df_train['failure']

cat_cols = [v for v in X.columns if X[v].dtype in ['object', 'int' ]]
numerical_cols = [v for v in X.columns if v not in cat_cols]

int_cols = [
    # 'attribute_0',
    # 'attribute_1',
    'attribute_2',
    'attribute_3',
]

print(cat_cols)
print(numerical_cols)

xgb_weight = df_train[df_train['failure'] == 1].shape[0] / df_train[df_train['failure'] == 0].shape[0]
print(xgb_weight)

['attribute_0', 'attribute_1', 'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2']
['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']
0.27001577362458773


# Hyperparameter tuning

## Objective function

In [6]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import optuna

def objective(trial):
    # hyperparameter space
    n_estimators = trial.suggest_int('n_estimators', 10, 300)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    eta = trial.suggest_categorical('eta', [0.1, 0.15, 0.2, 0.3])
    gamma = trial.suggest_int('gamma', 0, 5)
    
    # pipeline setup
    # OneHotEncode categorical features
    category_transformer = Pipeline(
        [
            ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
        ],
    )

    # total preprocessing
    preproc = ColumnTransformer(
        transformers = [
            ('OneHotEncoder', category_transformer, cat_cols),
            # ('InteractionTerms', PolynomialFeatures(interaction_only=True), int_cols),
        ], remainder='passthrough'
    )

    # FIXME class weight balanced?
    model = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.05,
        max_depth=max_depth,
        n_estimators=n_estimators,
        eta=eta,
        gamma=gamma,
        scale_pos_weight=xgb_weight,
    )

    # final model
    _pipe = Pipeline(
        [
            ('Preprocessing', preproc),
            ('Scaler', StandardScaler()),
            ('XGBClassifer', model),
        ]
    )
    
    # scoring
    scoring = ['roc_auc']
    scores = list()

    # CV
    groups = df_train['product_code']
    cv = GroupKFold(len(groups.unique()))
    cv_results = cross_validate(_pipe, X, y, cv=cv, scoring=scoring, groups=groups)
    
    return np.mean(cv_results['test_roc_auc'].mean())

# Run trials

In [7]:
# optimise
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

In [8]:
# print(f'Best trial: {study.best_trial}')
# print(f'Best value: {study.best_value}')
# print(f'Best parameters: {study.best_params}')

# Best model

In [9]:
# pipeline setup
# OneHotEncode categorical features
category_transformer = Pipeline(
    [
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ],
)

# total preprocessing
preproc = ColumnTransformer(
    transformers = [
        ('OneHotEncoder', category_transformer, cat_cols),
        # ('InteractionTerms', PolynomialFeatures(interaction_only=True), int_cols),
    ], remainder='passthrough'
)

model = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.05,
    max_depth=2,
    n_estimators=68,
    eta=0.1,
    gamma=3,
    scale_pos_weight=xgb_weight,
)

# final model
pipe = Pipeline(
    [
        ('Preprocessing', preproc),
        ('Scaler', StandardScaler()),
        ('XGBClassifer', model),
    ]
)

# Submission

In [10]:
pipe.fit(X, y)
pred = pipe.predict_proba(df_test)
pred

array([[0.92554206, 0.07445793],
       [0.9272516 , 0.07274845],
       [0.93102753, 0.06897246],
       ...,
       [0.9371908 , 0.06280921],
       [0.92317045, 0.07682955],
       [0.94265664, 0.05734339]], dtype=float32)

In [11]:
submission = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2022/sample_submission.csv')
submission['failure'] = pred[:, 1]

In [12]:
submission.to_csv('submission.csv', index=False)

In [13]:
submission

Unnamed: 0,id,failure
0,26570,0.074458
1,26571,0.072748
2,26572,0.068972
3,26573,0.068972
4,26574,0.105928
...,...,...
20770,47340,0.086153
20771,47341,0.057343
20772,47342,0.062809
20773,47343,0.076830


# Feature importances

In [14]:
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.1,
              eval_metric=None, gamma=3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=2, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=68, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, ...)
