In [1]:
!pip install -q iterative-stratification

In [2]:
import warnings
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import ParameterSampler
from xgboost import XGBClassifier

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / unused-parameter warnings
# raised by the libraries used below.
warnings.filterwarnings(action='ignore')

# Single seed shared by the parameter sampler, the CV splitter and the models.
seed = 27

In [3]:
# Read the train and test tables from the /kaggle/input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e3/train.csv",
        "/kaggle/input/playground-series-s4e3/test.csv",
    )
)

In [4]:
# Report the (rows, columns) shape of each frame, one per line.
print(
    f"Train shape: {train.shape}",
    f"Test shape:  {test.shape}",
    sep="\n",
)

Train shape: (19219, 35)
Test shape:  (12814, 28)


In [5]:
# Show the first five rows transposed, so each feature is a row.
train.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,0.0,1.0,2.0,3.0,4.0
X_Minimum,584.0,808.0,39.0,781.0,1540.0
X_Maximum,590.0,816.0,192.0,789.0,1560.0
Y_Minimum,909972.0,728350.0,2212076.0,3353146.0,618457.0
Y_Maximum,909977.0,728372.0,2212144.0,3353173.0,618502.0
Pixels_Areas,16.0,433.0,11388.0,210.0,521.0
X_Perimeter,8.0,20.0,705.0,16.0,72.0
Y_Perimeter,5.0,54.0,420.0,29.0,67.0
Sum_of_Luminosity,2274.0,44478.0,1311391.0,3202.0,48231.0
Minimum_of_Luminosity,113.0,70.0,29.0,114.0,82.0


In [6]:
# Target columns, in the order they are written to the submission file.
# NOTE(review): "K_Scatch" is used verbatim to select columns from `train`,
# so do not "correct" the spelling without checking the CSV header.
label_cols = [
    "Pastry",
    "Z_Scratch",
    "K_Scatch",
    "Stains",
    "Dirtiness",
    "Bumps",
    "Other_Faults",
]

In [7]:
# Split the training frame into the multilabel target and the feature matrix
# (everything except the identifier and the labels); the test frame only has
# its identifier column removed.
y = train[label_cols]
X = train.drop(columns=["id", *label_cols])
X_test = test.drop(columns="id")

In [8]:
# Standardise features: statistics are fitted on the training features only
# and then re-used for the test features. Both names are rebound to the
# scaler's outputs (the right-hand side is evaluated left to right, so the
# fit happens before the test transform).
scaler = StandardScaler()
X, X_test = scaler.fit_transform(X), scaler.transform(X_test)

In [9]:
# Random search over XGBoost hyperparameters. Each sampled combination is
# scored by the mean ROC AUC over multilabel-stratified CV folds; the best
# mean score and its parameters end up in `best_score` / `best_params`.
params = {
    "n_estimators": [100, 200, 300, 400, 500, 700, 1000],
    "learning_rate": [0.005, 0.01, 0.05, 0.1, 0.2],
    "gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "max_depth": [3, 4, 5, 6, 7],
    "min_child_weight": [1, 2, 3, 4, 5],
    "reg_alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "reg_lambda": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
}

n_iter = 100    # number of sampled hyperparameter combinations
n_splits = 10   # CV folds evaluated per combination
random_combinations = list(ParameterSampler(params, n_iter=n_iter, random_state=seed))

best_score = 0
best_params = None

# Progress counter over every (combination, fold) fit.
counter = 0
total_fits = len(random_combinations) * n_splits

mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
for param_combination in random_combinations:
    scores = []
    for train_index, val_index in mskf.split(X, y):
        # X is positionally indexed directly; y is indexed via .iloc.
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # The sampled dict carries exactly the keys of `params`, so it can be
        # unpacked directly. `device_type` is not an XGBoost parameter and had
        # no effect; XGBoost >= 2.0 selects the GPU with `device="cuda"` plus
        # `tree_method="hist"`, which replaces the deprecated `gpu_hist`.
        model = XGBClassifier(
            **param_combination,
            random_state=seed,
            device="cuda",
            tree_method="hist",
        )
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_val)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)

        counter += 1
        print(f"{counter}/{total_fits} | Fold Score: {score:.4f} | Best Total Score: {best_score:.4f}")

    # Keep the combination with the highest mean fold score seen so far.
    current_score = sum(scores) / len(scores)
    if current_score > best_score:
        best_score = current_score
        best_params = param_combination

print(f"\n\nBest Score: {best_score:.4f} | Best Params: {best_params}")

1/1000 | Fold Score: 0.8873 | Best Total Score: 0.0000
2/1000 | Fold Score: 0.8916 | Best Total Score: 0.0000
3/1000 | Fold Score: 0.8875 | Best Total Score: 0.0000
4/1000 | Fold Score: 0.8806 | Best Total Score: 0.0000
5/1000 | Fold Score: 0.8781 | Best Total Score: 0.0000
6/1000 | Fold Score: 0.8822 | Best Total Score: 0.0000
7/1000 | Fold Score: 0.8797 | Best Total Score: 0.0000
8/1000 | Fold Score: 0.8875 | Best Total Score: 0.0000
9/1000 | Fold Score: 0.8790 | Best Total Score: 0.0000
10/1000 | Fold Score: 0.8886 | Best Total Score: 0.0000
11/1000 | Fold Score: 0.8825 | Best Total Score: 0.8842
12/1000 | Fold Score: 0.8899 | Best Total Score: 0.8842
13/1000 | Fold Score: 0.8844 | Best Total Score: 0.8842
14/1000 | Fold Score: 0.8773 | Best Total Score: 0.8842
15/1000 | Fold Score: 0.8682 | Best Total Score: 0.8842
16/1000 | Fold Score: 0.8814 | Best Total Score: 0.8842
17/1000 | Fold Score: 0.8743 | Best Total Score: 0.8842
18/1000 | Fold Score: 0.8822 | Best Total Score: 0.8842
1

In [10]:
# Refit on the whole training set with the best sampled hyperparameters, then
# predict per-label probabilities for the test set.
# `device_type` is not an XGBoost parameter and had no effect; XGBoost >= 2.0
# selects the GPU with `device="cuda"` plus `tree_method="hist"`, which
# replaces the deprecated `gpu_hist`.
model = XGBClassifier(
    **best_params,
    random_state=seed,
    device="cuda",
    tree_method="hist",
)

model.fit(X, y)
y_test = model.predict_proba(X_test)

In [11]:
# Assemble the submission table: one probability column per label, with the
# test ids placed in front as the first column.
submission = pd.DataFrame(y_test, columns=label_cols)
submission.insert(0, "id", test["id"])

# The CV score is embedded in the file name so runs can be told apart.
submission.to_csv(f"submission_{best_score:.4f}.csv", index=False)
submission.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.510682,0.001734,0.004037,0.000323,0.016948,0.166111,0.341583
1,19220,0.263171,0.017392,0.009015,0.00039,0.171924,0.174881,0.328107
2,19221,0.002158,0.040082,0.038508,0.000596,0.006404,0.309286,0.465312
3,19222,0.17992,0.001898,0.000845,0.001427,0.013149,0.393392,0.442395
4,19223,0.002975,0.002068,0.001127,0.002945,0.008383,0.604201,0.375651
