In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import xgboost as xgb
import joblib

# --- Environment setup ---
# Put the parent of the current working directory on sys.path (only once)
# before importing from `utils`.
# NOTE(review): presumably the notebook lives one level below the project
# root that contains the `utils` package - confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.prepare_data import load_and_prepare_data

# --- Data loading ---
# `load_and_prepare_data()` returns a mapping; only the 'X' and 'y_si'
# entries are used in this cell.
# NOTE(review): presumably 'X' is the full descriptor table and 'y_si' the
# SI target - confirm against utils.prepare_data.
data = load_and_prepare_data()
X_full, y = data['X'], data['y_si']

# --- Top-20 features by SHAP (taken from a previous analysis) ---
# The ranking itself is not computed here; the names are hard-coded.
# NOTE(review): if that SHAP ranking was derived from the same full dataset,
# the cross-validated scores below may be optimistic - confirm.
top_20_features = [
    'NHOHCount',
    'BCUT2D_MWLOW',
    'MolLogP',
    'PEOE_VSA6',
    'Kappa3',
    'Kappa1',
    'BCUT2D_MRLOW',
    'Kappa2',
    'VSA_EState8',
    'PEOE_VSA7',
    'EState_VSA3',
    'AvgIpc',
    'VSA_EState4',
    'fr_C_O',
    'SPS',
    'BCUT2D_CHGLO',
    'EState_VSA8',
    'VSA_EState3',
    'NumHDonors',
    'VSA_EState6',
]

# Restrict the feature table to the selected columns, in the order above.
X = X_full[top_20_features]

# --- Model definitions ---
# Only construction happens in this section; nothing is fitted yet.
# The hyperparameters are hard-coded; how they were chosen is not shown here.
rf_params = {
    "n_estimators": 300,
    "max_depth": 20,
    "min_samples_split": 10,
    "min_samples_leaf": 1,
    "max_features": 'log2',
    "random_state": 42,
    "n_jobs": -1,
}
rf_model = RandomForestRegressor(**rf_params)

xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.01,
    "max_depth": 6,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
    "reg_alpha": 0,
    "reg_lambda": 0.5,
    "min_child_weight": 5,
    "objective": 'reg:squarederror',
    "random_state": 42,
    "n_jobs": -1,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Voting ensemble over both regressors with no explicit weights.
# The estimator order (xgb first, then rf) is kept unchanged.
ensemble = VotingRegressor(
    estimators=[("xgb", xgb_model), ("rf", rf_model)],
    n_jobs=-1,
)

# --- Cross-validation ---
# 5-fold shuffled split with a fixed seed; cross_val_predict yields one
# out-of-fold prediction per sample, and both metrics are computed once over
# those pooled predictions (not averaged per fold).
cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(ensemble, X, y, cv=cv)

mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)

# Emit the three-line report with a single print call.
report_lines = (
    "\nVotingRegressor (RF + XGB):",
    f"  RMSE = {rmse:.3f}",
    f"  R²   = {r2:.3f}",
)
print(*report_lines, sep="\n")

# --- Saving the model ---
# Fit the ensemble on the full dataset, then persist it with joblib.
# The path is relative, so the file lands in the current working directory.
model_path = "final_voting_model_y_si.joblib"
ensemble.fit(X, y)
joblib.dump(ensemble, model_path)



VotingRegressor (RF + XGB):
  RMSE = 1.174
  R²   = 0.350


['final_voting_model_y_si.joblib']