In [1]:
import warnings

warnings.filterwarnings('ignore')

In [43]:
import os
import random
from statistics import mode
import optuna

import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import (ConfusionMatrixDisplay, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

In [3]:
RANDOM_SEED=42
random.seed(RANDOM_SEED)
os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)

## Датасет

In [4]:
df = pd.read_csv('../data/Skyserver_SQL2_27_2018 6_51_39 PM.csv')

Удаляем неинформативные колонки

In [6]:
df.drop(columns=['objid', 'run', 'rerun', 'camcol', 'field', 'specobjid', 'fiberid', 'plate', 'mjd'], inplace=True)

In [7]:
df.head()

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,STAR,-9e-06
1,183.59837,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,STAR,-5.5e-05
2,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,GALAXY,0.123111
3,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,STAR,-0.000111
4,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,STAR,0.00059


## Обучение

У нас задача - предсказать класс, поэтому разобьем датасет на фичи и целевую метку

In [8]:
X = df.drop(columns=['class'])
y = df['class']

Разобьем данные на train и test подвыборки

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Перед обучением надо предварительно конвертировать целевую метку в формат, удобный для модели. Выберем LabelEncoder, потому что у нас только одна категориальная переменная, и тут на самом деле без особой разницы, как именно ее кодировать. Если бы у нас было их несколько, то можно было бы использовать OneHotEncoder, чтобы модель не подумала, что между данными после препроцессинга OneHotEncoder есть какая-то связь

In [19]:
enc = LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.fit_transform(y_test)

## Подбор гиперпараметров

In [27]:
xgb_clf = XGBClassifier(objective='multi:softmax', 
                            num_class=3, 
                            seed=42)

In [28]:
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

In [29]:
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=parameters,
    scoring = 'f1_weighted',
    n_jobs = 100,
    cv = 10,
    verbose=True
)

In [30]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [32]:
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100}

In [33]:
grid_search.best_score_

0.9899465080122567

## Optuna

In [50]:
def objective(trial):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)

    param = {
        "verbosity": 0,
        "objective": "multi:softmax",
        "num_class": 3,
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    f1 = f1_score(y_test, pred_labels, average='weighted')
    return f1

In [51]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-12-26 16:41:48,558] A new study created in memory with name: no-name-db7fcd8e-b4e1-4263-af40-a3d24789e66b
[I 2023-12-26 16:41:48,637] Trial 0 finished with value: 0.43387424207892855 and parameters: {'booster': 'gblinear', 'lambda': 0.021646907605599005, 'alpha': 0.0003930209500027381, 'subsample': 0.6089175512780751, 'colsample_bytree': 0.5161181113679294}. Best is trial 0 with value: 0.43387424207892855.
[I 2023-12-26 16:41:48,661] Trial 1 finished with value: 0.8823803018792294 and parameters: {'booster': 'gblinear', 'lambda': 7.511518061514997e-05, 'alpha': 0.009338564512491146, 'subsample': 0.8531376665931736, 'colsample_bytree': 0.667291839947506}. Best is trial 1 with value: 0.8823803018792294.
[I 2023-12-26 16:41:48,735] Trial 2 finished with value: 0.9885855076481319 and parameters: {'booster': 'gbtree', 'lambda': 0.7010135658160812, 'alpha': 4.68316637126489e-07, 'subsample': 0.5647309478144871, 'colsample_bytree': 0.9012036791910736, 'max_depth': 7, 'min_child_weight

Number of finished trials:  100
Best trial:
  Value: 0.9936430796420859
  Params: 
    booster: dart
    lambda: 1.960228257382407e-07
    alpha: 7.43949991679402e-06
    subsample: 0.8154880895527705
    colsample_bytree: 0.7907267043492003
    max_depth: 5
    min_child_weight: 2
    eta: 0.8646827155464566
    gamma: 0.000589744964364003
    grow_policy: lossguide
    sample_type: weighted
    normalize_type: forest
    rate_drop: 8.167529248189773e-05
    skip_drop: 5.945705496428493e-06
