In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder

# 01 - Subset the data so it fits in RAM

# 02 - Search for the best hyperparameters with the subset

In [None]:
# Load the training subset: the feature matrix first, then its labels.
train_data_subset = np.load(file='j_model/01_X_subset.npy')
# allow_pickle=True lets NumPy read object arrays (presumably the labels are
# stored as Python objects such as strings — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
train_labels_subset = np.load(file='j_model/01_y_subset.npy', allow_pickle=True)

In [None]:
# Create the encoder that will map the raw class labels to integer codes.
# It is only instantiated here; it is fitted on the subset labels in the
# next cell.
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the subset labels and convert them to integer class
# codes in one step; these encoded labels are what the search is fitted on.
train_labels_encoded = label_encoder.fit_transform(train_labels_subset)

In [None]:
# Base estimator handed to the hyperparameter search; only the settings that
# are not tuned by the search are fixed here.
# NOTE(review): recent XGBoost releases deprecate `use_label_encoder` —
# confirm against the installed version.
model = XGBClassifier(
    n_jobs=4,                 # parallel threads used by each XGBoost fit
    eval_metric='logloss',
    use_label_encoder=False,  # labels are integer-encoded before fitting
)

In [None]:
# Sampling distributions for the randomized hyperparameter search.
# scipy conventions: randint(low, high) excludes `high`, and
# uniform(loc, scale) covers the interval [loc, loc + scale].
param_distributions = dict(
    n_estimators=randint(low=50, high=200),      # integers 50..199
    max_depth=randint(low=3, high=10),           # integers 3..9
    learning_rate=uniform(loc=0.01, scale=0.3),  # floats in [0.01, 0.31]
)

In [None]:
# Randomized hyperparameter search on the training subset.
# No `scoring` is passed, so candidates are ranked with the estimator's own
# score method.
# NOTE(review): 2 search workers, each fitting a model configured with
# n_jobs=4, can occupy up to 8 threads — confirm the machine has the cores.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=20,        # number of sampled parameter combinations
    cv=3,             # 3-fold cross-validation per combination
    n_jobs=2,         # parallel search workers
    random_state=42,  # makes the sampled combinations repeatable
)

random_search.fit(train_data_subset, train_labels_encoded)

In [None]:
# Persist the best estimator found by the search to disk.
# NOTE(review): the fitted label_encoder is not saved in the visible cells;
# decoding predictions later requires the same label mapping.
joblib.dump(value=random_search.best_estimator_, filename='best_xgb_model.pkl')


In [None]:
import json


def _to_builtin(value):
    """Convert a NumPy scalar to the equivalent built-in Python value.

    Passed as the ``default`` hook of ``json.dump``, so it is only called for
    objects the JSON encoder cannot serialize on its own (for example
    ``np.int64``). Values that are already serializable are written unchanged.

    Raises:
        TypeError: if ``value`` is not a NumPy scalar.
    """
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Get the best hyperparameters found by the random search
best_params = random_search.best_params_

# Save them to a JSON file. The `default` hook keeps the dump from failing
# with a TypeError if any sampled value is a NumPy scalar.
with open('best_xgb_params.json', 'w') as f:
    json.dump(best_params, f, indent=4, default=_to_builtin)

In [None]:
# Hard-coded hyperparameter values (presumably copied from a previous run of
# the random search — TODO confirm they match best_xgb_params.json).
best_n_estimators = 70
best_max_depth = 7
best_learning_rate = 0.22959818254342154

# 03 - Train the model with the best hyperparameters on the whole dataset

In [None]:
# Load the complete training set: the feature matrix first (presumably already
# scaled, judging by the file name — TODO confirm), then its labels.
train_data = np.load(file='X_train_scaled.npy')
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load
# this file from a trusted source.
train_labels = np.load(file='y_train.npy', allow_pickle=True)

In [None]:
# Create a new, unfitted encoder for the full-dataset labels. This rebinds
# `label_encoder`, replacing the encoder that was fitted on the subset.
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the full-dataset labels and convert them to integer
# class codes. This rebinds `train_labels_encoded`, replacing the subset
# encoding computed earlier.
train_labels_encoded = label_encoder.fit_transform(train_labels)

In [None]:
# Final classifier for the whole dataset. The tuned hyperparameters are read
# from the best_* variables defined earlier in the notebook instead of
# repeating the literal values, so they only need to be updated in one place.
# This rebinds `model`, replacing the base estimator used for the search.
model = XGBClassifier(
    learning_rate=best_learning_rate,
    max_depth=best_max_depth,
    n_estimators=best_n_estimators,
    use_label_encoder=False,  # labels are integer-encoded before fitting
    eval_metric='logloss',
    n_jobs=4,                 # parallel threads used by XGBoost
)

In [None]:
# Train the final classifier on the full training data using the
# integer-encoded labels.
model.fit(train_data, train_labels_encoded)