In [2]:
import os
import sys

# Добавляем корень проекта в sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Импорт утилит
from utils.prepare_data import load_and_prepare_data

# Импорт библиотек
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor

# 1. Загрузка данных
data = load_and_prepare_data()
X = data['X']
y = data['y_cc50']  # логарифмированная цель

# 2. Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 3. Инициализация моделей
catboost = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.02,
    depth=8,
    early_stopping_rounds=50,
    verbose=False,
    random_state=42
)

lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=8,
    random_state=42,
    verbosity=-1
)

xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
    verbosity=0,
    n_jobs=-1
)

# 4. Кросс-валидация
cv = KFold(n_splits=3, shuffle=True, random_state=42)
oof_preds = np.zeros((X_train.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

# 5. Обучение моделей
for i, model in enumerate([catboost, lgbm, xgb]):
    print(f"Модель {i+1}: {model.__class__.__name__}")
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train)):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        if isinstance(model, LGBMRegressor):
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric='rmse',
                callbacks=[early_stopping(stopping_rounds=50)]
            )
        elif isinstance(model, XGBRegressor):
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False
            )
        else:
            model.fit(
                X_tr, y_tr,
                eval_set=(X_val, y_val),
                early_stopping_rounds=50,
                verbose=False
            )

        oof_preds[val_idx, i] = model.predict(X_val)

    # Финальное обучение
    if isinstance(model, LGBMRegressor):
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='rmse',
            callbacks=[early_stopping(stopping_rounds=50)]
        )
    elif isinstance(model, XGBRegressor):
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=False
        )
    else:
        model.fit(
            X_train, y_train,
            eval_set=(X_test, y_test),
            early_stopping_rounds=50,
            verbose=False
        )

    test_preds[:, i] = model.predict(X_test)

# 6. Финальный стек
stacker = LinearRegression()
stacker.fit(oof_preds, y_train)
final_preds = stacker.predict(test_preds)

# 7. Оценка
rmse = np.sqrt(mean_squared_error(y_test, final_preds))
r2 = r2_score(y_test, final_preds)

print(f"\n Итоговая стекинг-модель:")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")


Модель 1: CatBoostRegressor
Модель 2: LGBMRegressor
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[160]	valid_0's rmse: 1.14638	valid_0's l2: 1.31419
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[169]	valid_0's rmse: 1.27046	valid_0's l2: 1.61407
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[322]	valid_0's rmse: 1.28079	valid_0's l2: 1.64043
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[110]	valid_0's rmse: 1.14463	valid_0's l2: 1.31019
Модель 3: XGBRegressor

 Итоговая стекинг-модель:
RMSE: 1.115
R²: 0.445
