<a href="https://www.kaggle.com/code/sdysch/tps-aug-2022-xgboost?scriptVersionId=103224052" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [109]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the old names. Prefer the new name when
# this Matplotlib build provides it, otherwise fall back to the legacy one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# References
* https://www.kaggle.com/code/ambrosm/tpsaug22-eda-which-makes-sense/notebook
* https://www.kaggle.com/sdysch/tps-aug-2022/

# Reading in the data

In [110]:
# Load the competition's train and test tables from the Kaggle input mount.
train_csv = '/kaggle/input/tabular-playground-series-aug-2022/train.csv'
test_csv = '/kaggle/input/tabular-playground-series-aug-2022/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [111]:
# The row identifier is not used as a feature, so remove it from both frames.
# Note: re-running this cell without reloading the data raises a KeyError,
# because 'id' is already gone.
id_columns = ['id']
df_train = df_train.drop(columns=id_columns)
df_test = df_test.drop(columns=id_columns)

# XGBoost model

In [112]:
# Feature matrix and target: 'failure' is the label, and 'product_code' is
# kept out of the features.
# NOTE(review): X is a separate frame from this point on; changes made to
# df_train in later cells do not show up in X.
X = df_train.drop(['failure', 'product_code'], axis='columns')
y = df_train['failure']

# Object/string and integer-typed columns are treated as categorical (they are
# one-hot encoded downstream); every other column counts as numerical.
# The pandas dtype predicates are used instead of comparing against the strings
# 'object' / 'int': the string 'int' resolves to a platform-dependent width
# (int32 on Windows with NumPy < 2, so int64 columns would silently be missed),
# and newer pandas string dtypes are not 'object'.
_categorical_dtype_checks = (
    pd.api.types.is_object_dtype,
    pd.api.types.is_string_dtype,
    pd.api.types.is_integer_dtype,
)
cat_cols = [
    v for v in X.columns
    if any(is_kind(X[v].dtype) for is_kind in _categorical_dtype_checks)
]
numerical_cols = [v for v in X.columns if v not in cat_cols]

# Not referenced by any other cell visible in this notebook; kept in case it is
# used elsewhere.
int_cols = [
    # 'attribute_0',
    # 'attribute_1',
    'attribute_2',
    'attribute_3',
]

print(cat_cols)
print(numerical_cols)

# Ratio of failing rows to non-failing rows.
# NOTE(review): if this is meant for XGBoost's `scale_pos_weight`, the usual
# recommendation is negatives / positives, i.e. the inverse of this value.
# Confirm before enabling it in the model.
n_failed = int((y == 1).sum())
n_passed = int((y == 0).sum())
xgb_weight = n_failed / n_passed
print(xgb_weight)

['attribute_0', 'attribute_1', 'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2']
['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']
0.27001577362458773


# Variable transformations

In [113]:
# Express every measurement relative to the row's 'loading' value, in both the
# training and the test frame.
# NOTE(review): X was split off from df_train in an earlier cell, so this
# rescaling reaches df_test but NOT X. As written, the model is fitted on raw
# measurements and asked to predict on rescaled ones. Confirm the intended
# order. Simply rebuilding X here would also send float ratios for
# measurement_0..2 through the one-hot encoder, because they are listed in
# cat_cols.
# NOTE(review): the cell is not idempotent; running it twice divides twice.
cols = [f'measurement_{i}' for i in range(18)]
for frame in (df_train, df_test):
    frame[cols] = frame[cols].div(frame['loading'], axis='index')
# df_train = df_train.drop(['loading'], axis='columns')
# df_test = df_test.drop(['loading'], axis='columns')

# Hyperparameter tuning

## Objective function

In [114]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import optuna

def objective(trial):
    """Optuna objective: cross-validated ROC AUC of the preprocessing + XGBoost pipeline.

    Reads the notebook-level names ``X``, ``y``, ``df_train``,
    ``numerical_cols`` and ``cat_cols``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``eta``, ``max_depth`` and
        ``gamma``.

    Returns
    -------
    float
        ROC AUC averaged over the ``GroupKFold`` test folds, where the groups
        are the ``product_code`` values and there is one fold per distinct
        code.
    """

    # hyperparameter space
    # XGBoost

    # `step` is passed by keyword: Optuna has deprecated passing it positionally.
    n_estimators = trial.suggest_int('n_estimators', 10, 200, step=10)
    eta = trial.suggest_float('eta', 0.05, 0.3, step=0.05)
    max_depth = trial.suggest_int('max_depth', 2, 5)
    gamma = trial.suggest_int('gamma', 0, 5)

    # colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1, step=0.1)
    # colsample_bylevel = trial.suggest_float('colsample_bylevel', 0, 1, step=0.1)
    # colsample_bynode = trial.suggest_float('colsample_bynode', 0, 1, step=0.1)
    # subsample = trial.suggest_float('subsample', 0.5, 1.0, step=0.25)

    # other
    # add_indicator = trial.suggest_categorical('add_indicator', [True, False])
    add_indicator = True
    # class_weight = trial.suggest_categorical('class_weight', [xgb_weight, 1])

    # pipeline setup

    # impute missing values (optionally appending missing-value indicator columns)
    imputer = Pipeline(
        [
            ('knn_imputer', KNNImputer(add_indicator=add_indicator))
        ],
    )

    # OneHotEncode categorical features with dense output.
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
    # old keyword; try the new spelling first and fall back for older releases.
    try:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    category_transformer = Pipeline(
        [
            ('OneHotEncoder', one_hot_encoder),
        ],
    )

    # total preprocessing
    preproc = ColumnTransformer(
        transformers = [
            ('KNNImputer', imputer, numerical_cols),
            ('OneHotEncoder', category_transformer, cat_cols),
        ], remainder='passthrough'
    )

    # FIXME class weight balanced?
    model = XGBClassifier(
        objective='binary:logistic',
        max_depth=max_depth,
        n_estimators=n_estimators,
        eta=eta,
        gamma=gamma,
        # colsample_bytree=colsample_bytree,
        # colsample_bylevel=colsample_bylevel,
        # colsample_bynode=colsample_bynode,
        # subsample=subsample,
        # scale_pos_weight=class_weight,
    )

    # final model
    _pipe = Pipeline(
        [
            ('Preprocessing', preproc),
            ('XGBClassifer', model),
        ]
    )

    # scoring
    scoring = ['roc_auc']

    # CV: one fold per distinct product code, so each fold holds out rows
    # grouped by product_code.
    groups = df_train['product_code']
    cv = GroupKFold(len(groups.unique()))
    cv_results = cross_validate(_pipe, X, y, cv=cv, scoring=scoring, groups=groups)

    # A single mean over the per-fold scores is the value handed back to Optuna.
    return float(cv_results['test_roc_auc'].mean())

# Run trials

In [115]:
# The tuning run is switched off: the code below is wrapped in a string
# literal, so executing this cell only echoes the text and starts no study.
"""# optimise
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)"""

"# optimise\nstudy = optuna.create_study(direction='maximize')\nstudy.optimize(objective, n_trials=100)"

In [116]:
# Reporting of the study results is disabled in the same way: this is a plain
# string literal, so no `study` object is needed and nothing is printed.
"""print(f'Best trial: {study.best_trial}')
print(f'Best value: {study.best_value}')
print(f'Best parameters: {study.best_params}')"""

"print(f'Best trial: {study.best_trial}')\nprint(f'Best value: {study.best_value}')\nprint(f'Best parameters: {study.best_params}')"

# Best model

In [117]:
# pipeline setup

# impute missing values
# impute missing values in the numerical columns and append missing-value
# indicator columns
imputer = Pipeline(
    [
        ('knn_imputer', KNNImputer(add_indicator=True))
    ],
)

# OneHotEncode categorical features with dense output.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword; try the new spelling first and fall back for older releases so
# the cell runs on either.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
category_transformer = Pipeline(
    [
        ('OneHotEncoder', one_hot_encoder),
    ],
)

# total preprocessing: impute numerical columns, one-hot encode categorical
# ones, and pass any other column through unchanged
preproc = ColumnTransformer(
    transformers = [
        ('KNNImputer', imputer, numerical_cols),
        ('OneHotEncoder', category_transformer, cat_cols),
    ], remainder='passthrough'
)

# Hard-coded hyperparameters. Presumably the best values from an earlier
# Optuna run; TODO confirm, since the study cell is currently disabled.
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=2,
    n_estimators=70,
    eta=0.25,
    gamma=5,
)

# final model
pipe = Pipeline(
    [
        ('Preprocessing', preproc),
        ('XGBClassifer', model),
    ]
)

# Submission

In [118]:
# Fit the full pipeline on all training rows, then get class probabilities for
# the test rows (one column per class).
# NOTE(review): df_test had its measurement columns divided by 'loading' in the
# transformation cell, while X was created before that step and still holds
# the raw values. Confirm that train and test features are meant to match.
pred = pipe.fit(X, y).predict_proba(df_test)
pred

array([[0.84726524, 0.15273473],
       [0.84726524, 0.15273473],
       [0.85420334, 0.14579667],
       ...,
       [0.88274676, 0.11725324],
       [0.8445451 , 0.15545484],
       [0.88274676, 0.11725324]], dtype=float32)

In [119]:
# Start from the sample submission and overwrite its 'failure' column with the
# second probability column of `pred`.
sample_submission_csv = '/kaggle/input/tabular-playground-series-aug-2022/sample_submission.csv'
submission = pd.read_csv(sample_submission_csv).assign(failure=pred[:, 1])

In [120]:
# Write the predictions to 'submission.csv' in the working directory;
# index=False keeps the DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)

In [121]:
# Display the submission frame as a final visual check of the written values.
submission

Unnamed: 0,id,failure
0,26570,0.152735
1,26571,0.152735
2,26572,0.145797
3,26573,0.145797
4,26574,0.161972
...,...,...
20770,47340,0.252160
20771,47341,0.121849
20772,47342,0.117253
20773,47343,0.155455


# Feature importances

In [122]:
#print(model)