In [1]:
!pip install -q iterative-stratification

In [2]:
import warnings
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import ParameterSampler
from xgboost import XGBClassifier

# Global notebook settings.
# Silence every Python warning for the rest of the session (equivalent to
# filterwarnings("ignore") with its default arguments).
warnings.simplefilter("ignore")

# Single seed reused for parameter sampling, fold shuffling and the model.
seed = 27

In [3]:
# Load the competition train/test tables in one pass; the generator yields the
# frames in the same order as the paths below.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e3/train.csv",
        "/kaggle/input/playground-series-s4e3/test.csv",
    )
)

In [4]:
# Report (rows, columns) for both tables; one print call, one line per table.
print(
    f"Train shape: {train.shape}",
    f"Test shape:  {test.shape}",
    sep="\n",
)

Train shape: (19219, 35)
Test shape:  (12814, 28)


In [5]:
# Preview the first rows, transposed so each feature is a row and the wide
# table stays readable.
train.head().transpose()

Unnamed: 0,0,1,2,3,4
id,0.0,1.0,2.0,3.0,4.0
X_Minimum,584.0,808.0,39.0,781.0,1540.0
X_Maximum,590.0,816.0,192.0,789.0,1560.0
Y_Minimum,909972.0,728350.0,2212076.0,3353146.0,618457.0
Y_Maximum,909977.0,728372.0,2212144.0,3353173.0,618502.0
Pixels_Areas,16.0,433.0,11388.0,210.0,521.0
X_Perimeter,8.0,20.0,705.0,16.0,72.0
Y_Perimeter,5.0,54.0,420.0,29.0,67.0
Sum_of_Luminosity,2274.0,44478.0,1311391.0,3202.0,48231.0
Minimum_of_Luminosity,113.0,70.0,29.0,114.0,82.0


In [6]:
# Target columns, in the order used for the submission file.
# NOTE: "K_Scatch" is spelled exactly as the column header that appears in the data.
label_cols = [
    "Pastry",
    "Z_Scratch",
    "K_Scatch",
    "Stains",
    "Dirtiness",
    "Bumps",
    "Other_Faults",
]

In [7]:
# Targets: one column per label.
y = train[label_cols]

# Features: everything except the row identifier and the label columns.
X = train.drop(columns=["id", *label_cols])

# The test table carries no label columns, so only the identifier is removed.
X_test = test.drop(columns="id")

In [8]:
# Standardise the features. The scaler is fitted on the training features only
# and the same fitted statistics are then applied to the test features.
# NOTE: X and X_test are overwritten here, so re-running this cell would
# re-fit the scaler on already-scaled data.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [9]:
# Random search over XGBoost hyper-parameters, scored by mean ROC AUC across
# multilabel-stratified folds. The best mean score and its parameter set end up
# in `best_score` / `best_params`.

# Search space. Every entry is a list of candidate values.
params = {
    # Fixed: the original list contained 100 twice (..., 700, 100, 1500); the
    # second occurrence is taken to be a typo for 1000.
    "n_estimators": [100, 200, 300, 400, 500, 700, 1000, 1500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
    "min_child_weight": [1, 2, 3, 4, 5],
    "reg_alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "reg_lambda": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
}

n_iter = 400    # number of parameter combinations to evaluate
n_splits = 10   # cross-validation folds per combination
random_combinations = list(ParameterSampler(params, n_iter=n_iter, random_state=seed))

best_score = 0
best_params = None

# Progress bookkeeping: one model fit per (combination, fold) pair.
counter = 0
total_fits = len(random_combinations) * n_splits

mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
# The splitter is seeded with an int random_state, so split() is expected to
# return the same folds on every call; compute them once instead of once per
# parameter combination.
folds = list(mskf.split(X, y))

for param_combination in random_combinations:
    scores = []
    for train_index, val_index in folds:
        # X is positionally indexed (array after scaling); y is still a DataFrame.
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = XGBClassifier(
            **param_combination,
            random_state=seed,
            # `device_type` is not an XGBoost parameter. GPU training is
            # selected with device="cuda" plus tree_method="hist"
            # (requires XGBoost >= 2.0, where "gpu_hist" is deprecated).
            device="cuda",
            tree_method="hist",
        )
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_val)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)

        counter += 1
        # "Best Total Score" is the best mean over *completed* combinations,
        # so it stays at 0.0000 until the first combination has finished.
        print(f"{counter}/{total_fits} | Fold Score: {score:.4f} | Best Total Score: {best_score:.4f}")

    current_score = sum(scores) / len(scores)
    if current_score > best_score:
        best_score = current_score
        best_params = param_combination

print(f"\n\nBest Score: {best_score:.4f} | Best Params: {best_params}")

1/4000 | Fold Score: 0.8856 | Best Total Score: 0.0000
2/4000 | Fold Score: 0.8930 | Best Total Score: 0.0000
3/4000 | Fold Score: 0.8868 | Best Total Score: 0.0000
4/4000 | Fold Score: 0.8787 | Best Total Score: 0.0000
5/4000 | Fold Score: 0.8761 | Best Total Score: 0.0000
6/4000 | Fold Score: 0.8860 | Best Total Score: 0.0000
7/4000 | Fold Score: 0.8784 | Best Total Score: 0.0000
8/4000 | Fold Score: 0.8881 | Best Total Score: 0.0000
9/4000 | Fold Score: 0.8767 | Best Total Score: 0.0000
10/4000 | Fold Score: 0.8834 | Best Total Score: 0.0000
11/4000 | Fold Score: 0.8610 | Best Total Score: 0.8833
12/4000 | Fold Score: 0.8626 | Best Total Score: 0.8833
13/4000 | Fold Score: 0.8626 | Best Total Score: 0.8833
14/4000 | Fold Score: 0.8561 | Best Total Score: 0.8833
15/4000 | Fold Score: 0.8460 | Best Total Score: 0.8833
16/4000 | Fold Score: 0.8527 | Best Total Score: 0.8833
17/4000 | Fold Score: 0.8562 | Best Total Score: 0.8833
18/4000 | Fold Score: 0.8597 | Best Total Score: 0.8833
1

In [10]:
# Refit a single model on the full training set using the best parameter set
# found by the search, then predict probabilities for the test set.
if best_params is None:
    # Fail with a clear message instead of the TypeError that `**None` would raise.
    raise RuntimeError("best_params is not set; run the hyper-parameter search cell first")

model = XGBClassifier(
    **best_params,
    random_state=seed,
    # `device_type` is not an XGBoost parameter. GPU training is selected with
    # device="cuda" plus tree_method="hist" (requires XGBoost >= 2.0, where
    # "gpu_hist" is deprecated).
    device="cuda",
    tree_method="hist",
)

model.fit(X, y)
# Despite its name, y_test holds the model's predicted probabilities for the
# test features; it is what the submission table is built from.
y_test = model.predict_proba(X_test)

In [11]:
# Build the submission table: predicted probabilities per label, with the test
# ids attached and moved to the first column.
submission = (
    pd.DataFrame(y_test, columns=label_cols)
    .assign(id=test["id"])
    .loc[:, ["id", *label_cols]]
)

# The cross-validated score is embedded in the file name.
submission.to_csv(f"submission_{best_score:.4f}.csv", index=False)
submission.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.500787,0.003664,0.004593,0.001405,0.019016,0.174227,0.341923
1,19220,0.290382,0.019535,0.010414,0.001414,0.173815,0.213151,0.318569
2,19221,0.002617,0.045103,0.037649,0.001765,0.00808,0.309135,0.463494
3,19222,0.154136,0.006038,0.001618,0.002248,0.013064,0.366192,0.427223
4,19223,0.002979,0.005951,0.001742,0.002498,0.007379,0.614158,0.367447
