In [1]:
import numpy as np
import optuna
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import pandas as pd

In [11]:
def get_data(path='../data/train.csv', label_col='Label'):
    """Load a labelled CSV and split it into a feature matrix and a label vector.

    Parameters
    ----------
    path : str or path-like, default '../data/train.csv'
        CSV file to read. The default is the training file the notebook
        originally hard-coded, so ``get_data()`` behaves as before.
    label_col : str, default 'Label'
        Name of the target column.

    Returns
    -------
    X : numpy.ndarray
        Values of every column except ``label_col``.
    y : numpy.ndarray
        Values of the ``label_col`` column.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of the file.
    """
    df = pd.read_csv(path)
    if label_col not in df.columns:
        # Same exception type pandas would raise, but with an actionable message.
        raise KeyError(
            "label column {!r} not found in {!r}; available columns: {}".format(
                label_col, str(path), list(df.columns)
            )
        )
    y = df[label_col].values
    X = df.drop(columns=[label_col]).values
    return X, y



array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [12]:
def objective(trial):
    """Optuna objective: train one XGBoost model and return its hold-out accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        Accuracy of the trained booster on a 25% validation split of the
        data returned by ``get_data()``.
    """
    # A fixed seed gives every trial the same train/validation partition, so trial
    # scores are comparable with each other and reproducible across runs. Without
    # it, each trial was evaluated on a different random split.
    split_seed = 42

    # NOTE: get_data() re-reads the CSV on every trial.
    (data, target) = get_data()
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.25, random_state=split_seed
    )
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-specific parameters are only sampled for the tree-based boosters.
    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # learning rate (step-size shrinkage).
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        # NOTE(review): the XGBoost docs describe grow_policy as supported only for
        # the hist/approx tree methods; with tree_method="exact" it presumably has
        # no effect -- confirm before interpreting this parameter's importance.
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # DART-only dropout settings.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # No num_boost_round is passed, so the library default number of rounds is used.
    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    # Round the predicted scores to the nearest integer to obtain 0/1 labels.
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
    return accuracy

In [13]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts; the sampler was previously left unseeded.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Bounded by both a trial count and a wall-clock budget (in seconds).
study.optimize(objective, n_trials=100, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
# `trial` is kept as a notebook-level name because a later cell displays it.
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-04-18 13:05:09,762] A new study created in memory with name: no-name-885d8efb-317f-491f-ab40-d3dd123a200d
[I 2024-04-18 13:05:09,798] Trial 0 finished with value: 0.80625 and parameters: {'booster': 'dart', 'lambda': 1.841094062633413e-05, 'alpha': 5.1176697731047006e-06, 'subsample': 0.6378483363471373, 'colsample_bytree': 0.5232792168415564, 'max_depth': 7, 'min_child_weight': 9, 'eta': 0.0041539266887273595, 'gamma': 0.00020497200990155515, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.0025190525089800286, 'skip_drop': 1.2508869663873379e-05}. Best is trial 0 with value: 0.80625.
[I 2024-04-18 13:05:09,828] Trial 1 finished with value: 0.83625 and parameters: {'booster': 'gbtree', 'lambda': 3.554081933979355e-05, 'alpha': 0.10888569298411195, 'subsample': 0.6596532351593277, 'colsample_bytree': 0.6669442981309006, 'max_depth': 9, 'min_child_weight': 2, 'eta': 0.30575550580260596, 'gamma': 3.25439035905628e-05, 'grow_policy': '

Number of finished trials:  100
Best trial:
  Value: 0.8775
  Params: 
    booster: gbtree
    lambda: 1.7530425343203377e-07
    alpha: 0.04124730293310747
    subsample: 0.8243216661774506
    colsample_bytree: 0.969203559750932
    max_depth: 9
    min_child_weight: 3
    eta: 0.05913197693988899
    gamma: 1.6977595149678425e-07
    grow_policy: lossguide


In [14]:
# Display the full record of the best trial (`trial` is bound to study.best_trial).
trial

FrozenTrial(number=14, state=TrialState.COMPLETE, values=[0.8775], datetime_start=datetime.datetime(2024, 4, 18, 13, 5, 10, 308874), datetime_complete=datetime.datetime(2024, 4, 18, 13, 5, 10, 380874), params={'booster': 'gbtree', 'lambda': 1.7530425343203377e-07, 'alpha': 0.04124730293310747, 'subsample': 0.8243216661774506, 'colsample_bytree': 0.969203559750932, 'max_depth': 9, 'min_child_weight': 3, 'eta': 0.05913197693988899, 'gamma': 1.6977595149678425e-07, 'grow_policy': 'lossguide'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear', 'dart')), 'lambda': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'alpha': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'max_depth': IntDistribution(high=9, log=False, low=3, step=2),

In [21]:
from xgboost import XGBClassifier
import pickle

# NOTE(security): unpickling executes arbitrary code stored in the file; only load
# 'model.pkl' when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the bare open() call previously leaked it).
with open('model.pkl', "rb") as model_file:
    a = pickle.load(model_file)
# Disabled alternative: load the file through XGBoost's own model loader.
# xgbc = XGBClassifier()
# xgbc.load_model('model.pkl')

In [34]:
# Display the data frame.
# NOTE(review): in the visible cells `df` is only a local variable inside get_data();
# no notebook-level `df` is assigned, so this cell presumably relies on state from an
# earlier kernel session and would raise NameError after Restart & Run All -- confirm.
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Label
0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590,1
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809,1
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0
3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723,1
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,1
...,...,...,...,...,...,...,...,...
3195,1.673063,-1.371783,-1.967652,0.111736,1.612901,-0.118002,-0.356564,1
3196,-2.156864,-1.655429,0.680630,1.499035,0.346765,0.507208,1.584665,0
3197,0.068512,-1.972440,-0.014678,-0.372102,2.473623,-2.943860,5.529861,0
3198,-1.355544,1.188881,-2.895093,-0.384314,-2.467256,1.681962,0.601350,0


In [35]:
# Attempt inference with the unpickled object `a`.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for a
# 'KNeighborsUnif_BAG_L1' model.pkl under an 'AutogluonModels' directory, so the pickled
# object presumably depends on sibling model files that are not present -- confirm.
# NOTE(review): the displayed `df` appears to still include the 'Label' column when
# it is passed here -- verify that this is intended.
a.predict(df)

FileNotFoundError: [Errno 2] No such file or directory: 'AutogluonModels\\ag-20240417_070035\\models\\XGBoost_r98_BAG_L2\\..\\..\\..\\..\\KNeighborsUnif_BAG_L1\\model.pkl'

In [16]:
# Fit a classifier on the full training set returned by get_data().
train_x, train_y = get_data()
# `xgbc` has no live assignment anywhere in the notebook (the only one is commented
# out), so this cell raised NameError on a fresh kernel. Instantiate it here with the
# library defaults, matching the commented-out constructor call.
xgbc = xgb.XGBClassifier()
xgbc.fit(train_x, train_y)

In [17]:
# Read the validation file once and derive both features and labels from the same
# frame (it was previously parsed twice).
validation_df = pd.read_csv('../data/validation.csv')
x_test = validation_df.drop(columns=['Label']).values
y_test = validation_df['Label'].values

In [18]:
# Score the fitted classifier on the validation features/labels loaded from
# '../data/validation.csv' (scikit-learn style `score`; for classifiers this is
# presumably mean accuracy -- confirm against the estimator's documentation).
xgbc.score(x_test, y_test)

0.8925