In [1]:
from pathlib import Path
from typing import List

import joblib
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory for persisted artifacts; created on demand so the dump below cannot
# fail on a missing folder.
RESOURCES_DIR = Path("../resources")
RESOURCES_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = RESOURCES_DIR.joinpath("ridge_best_model.joblib")

# Load the feature matrix and the target; the target is flattened to 1-D.
X = np.load("../data/x_data.npy")
y = np.load("../data/y_data.npy").ravel()

# Pipeline: standardize the features, then fit a ridge regressor.
# Scaling lives inside the pipeline, so it is re-fit on every CV training fold.
model = Pipeline([("scaler", StandardScaler()), ("regressor", Ridge())])

# Candidate regularization strengths: 0.1, 0.35, ... in steps of 0.25, below 15.
alphas = np.arange(0.1, 15, 0.25)
param_grid = dict(regressor__alpha=alphas)

# Shuffled 5-fold split with a fixed seed so the search is reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over alpha; the score is negated MSE (higher is better).
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
).fit(X, y)

# Winning configuration: best_estimator_ is the pipeline refit on all of X, y
# (GridSearchCV's default refit behaviour).
best_alpha = grid.best_params_["regressor__alpha"]
best_model = grid.best_estimator_
print("Best alpha:", best_alpha)

# Cross-validated error: flip the sign back to get MSE. The RMSE here is the
# square root of the mean fold MSE, not the mean of per-fold RMSEs.
best_mse_cv = -grid.best_score_
best_rmse_cv = best_mse_cv**0.5
print(f"CV MSE:  {best_mse_cv:.2f}")
print(f"CV RMSE: {best_rmse_cv:.2f}")

# In-sample metrics: predictions on the same data the model was fit on.
y_pred_train = best_model.predict(X)
train_mse = mean_squared_error(y, y_pred_train)
train_r2 = r2_score(y, y_pred_train)
train_rmse = train_mse**0.5
print("\nTrain metrics:")
print(f"Train MSE:  {train_mse:.2f}")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Train R2:   {train_r2:.4f}")

# Persist the refit pipeline (scaler + regressor) for later inference.
joblib.dump(best_model, MODEL_PATH)
print(f"\nBest model saved to: {MODEL_PATH}")

Best alpha: 14.349999999999998
CV MSE:  1597020173.32
CV RMSE: 39962.73

Train metrics:
Train MSE:  1595105636.17
Train RMSE: 39938.77
Train R2:   0.4245

Best model saved to: ..\resources\ridge_best_model.joblib


In [3]:
# Locations of the persisted model and of the input/output data files.
RESOURCES_DIR = Path("../resources")
MODEL_PATH = RESOURCES_DIR.joinpath("ridge_best_model.joblib")

DATA_DIR = Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

X_PATH = DATA_DIR.joinpath("x_data.npy")
OUTPUT_TXT = DATA_DIR.joinpath("predictions.txt")

# Restore the fitted pipeline from disk.
model_pipeline = joblib.load(MODEL_PATH)
print(f"Model loaded from: {MODEL_PATH}")

# Load the feature matrix to score.
X_new = np.load(X_PATH)
print(f"X_new loaded from: {X_PATH}, shape={X_new.shape}")

# Run inference with the restored pipeline.
predictions: np.ndarray = model_pipeline.predict(X_new)

# Write the predictions as text: one value per line, two decimal places.
with OUTPUT_TXT.open("w") as f:
    f.writelines(f"{p:.2f}\n" for p in predictions)

print(f"Predictions saved to: {OUTPUT_TXT}")

# Plain Python floats for use in an API or downstream code.
predictions_list: List[float] = predictions.tolist()
print("First 10 predictions:", predictions_list[:10])

Model loaded from: ..\resources\ridge_best_model.joblib
X_new loaded from: ..\data\x_data.npy, shape=(52131, 32)
Predictions saved to: ..\data\predictions.txt
First 10 predictions: [23240.805449277294, 80003.64292568414, 70853.4933939467, 110775.92996944673, 45902.01125861561, 61660.90189917663, 23103.38944264104, 138522.54444494136, 117618.75029949346, 77276.83565283743]
