In [1]:
import os
import sys
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor

# Make the project importable (needed when `utils` sits in the project root).
# The root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Load and prepare the data.
# NOTE(review): this import presumably sits below the sys.path setup because
# `utils` is only importable once the project root is on sys.path — confirm.
from utils.prepare_data import load_and_prepare_data

data = load_and_prepare_data()
# Features and target; per the original author's note, 'y_ic50' is the
# log-transformed target variable.
X, y = data['X'], data['y_ic50']

# Split into train and test sets: 20% held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train LightGBM with the best hyperparameters found by FLAML.
# NOTE(review): the saved output of this cell ends with a joblib/loky
# "[WinError 2]" traceback raised in _count_physical_cores. Training and saving
# still completed, so it looks like a non-fatal warning, presumably triggered
# while the thread count was being resolved. Setting the LOKY_MAX_CPU_COUNT
# environment variable is the usual way to silence it — TODO confirm.
best_params = dict(
    n_estimators=113,
    num_leaves=9,
    min_child_samples=19,
    learning_rate=0.0663,
    max_bin=511,  # 2**9 - 1, derived from FLAML's log_max_bin=9
    colsample_bytree=0.7646,
    reg_alpha=0.3551,
    reg_lambda=0.0539,
    random_state=42,
)
model = LGBMRegressor(**best_params)

model.fit(X_train, y_train)

# Predictions and metrics: score the held-out split with RMSE and R².
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=preds))
r2 = r2_score(y_true=y_test, y_pred=preds)

# Report both metrics rounded to three decimals.
print("LightGBM (FLAML best config) Results:")
print(f"RMSE = {rmse:.3f}")
print(f"R²   = {r2:.3f}")

# Save the fitted model with joblib. The path is relative, so the file is
# written to the current working directory.
model_path = "best_lgbm_model.pkl"
joblib.dump(value=model, filename=model_path)
# The message below is Russian for "Model saved to: <path>".
print(f"Модель сохранена в: {model_path}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17633
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 169
[LightGBM] [Info] Start training from score 3.940505
LightGBM (FLAML best config) Results:
RMSE = 1.415
R²   = 0.480
Модель сохранена в: best_lgbm_model.pkl


[WinError 2] The system cannot find the file specified
  File "c:\Users\mikha\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\mikha\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mikha\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\mikha\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
