In [151]:
# Базовые библиотеки для работы с данными и вычислений
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Инструменты для ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

# Визуализация
import seaborn as sns
import matplotlib.pyplot as plt

# MLflow и логирование
import mlflow
import mlflow.tensorflow
from mlflow.models.signature import infer_signature
import dagshub
from mlflow.exceptions import MlflowException

# Системные библиотеки
import time
import yaml
import os
import tempfile
import datetime

In [152]:
def load_config(file_name):
    """
    Load a YAML configuration file from the ``configs`` directory.

    The path is resolved as ``<current working directory>/configs/<file_name>``,
    so the result depends on the directory the kernel was started from.

    Args:
        file_name: Name of the YAML file inside the ``configs`` directory.

    Returns:
        The parsed document as returned by ``yaml.safe_load``.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    config_path = os.path.join(os.getcwd(), 'configs', file_name)
    # Decode explicitly as UTF-8 so non-ASCII config values are read the same
    # way on every platform instead of depending on the locale's default encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


In [153]:
def create_model(input_dim, model_config):
    """
    Build and compile a feed-forward Keras network described by a config dict.

    Args:
        input_dim: Number of input features; used as the input shape ``(input_dim,)``.
        model_config: Mapping with the keys read below:
            ``layers`` (sequence of dicts with ``units``, ``activation`` and an
            optional ``dropout`` rate), ``name`` and ``compile`` (a dict with
            ``optimizer``, ``loss`` and ``metrics``).

    Returns:
        The compiled ``keras.Model``. The last configured layer (or its
        dropout, if it declares one) is used as the model output.
    """
    # Functional API: thread a tensor through one Dense (+ optional Dropout) per spec.
    net_input = keras.Input(shape=(input_dim,))
    tensor = net_input
    for spec in model_config['layers']:
        dense = layers.Dense(spec['units'], activation=spec['activation'])
        tensor = dense(tensor)
        if 'dropout' in spec:
            tensor = layers.Dropout(spec['dropout'])(tensor)

    network = keras.Model(
        inputs=net_input,
        outputs=tensor,
        name=model_config['name'],
    )

    # Optimizer, loss and metrics are taken from the config as-is.
    network.compile(
        optimizer=model_config['compile']['optimizer'],
        loss=model_config['compile']['loss'],
        metrics=model_config['compile']['metrics'],
    )
    return network

In [154]:
def train_and_evaluate_model(X, y, model_config, hyperparams):
    """
    Train a model built by ``create_model`` and evaluate it on a held-out split.

    Args:
        X: Feature table; must expose ``.shape[1]`` and be accepted by
            ``train_test_split``.
        y: Target values aligned with ``X``.
            NOTE(review): the 0.5 threshold and ``roc_auc_score`` call below
            presume binary labels and a single probability output — confirm
            against the model config.
        model_config: Architecture/compile configuration forwarded to ``create_model``.
        hyperparams: Mapping with the keys read below: ``train_test_split``
            (``test_size``, ``random_state``), ``early_stopping`` (``monitor``,
            ``patience``, ``restore_best_weights``), ``epochs``, ``batch_size``
            and ``validation_split``.

    Returns:
        dict with:
            ``model``: the trained Keras model;
            ``scaler``: the ``StandardScaler`` fitted on the training split;
            ``history``: the object returned by ``model.fit``;
            ``metrics``: accuracy, weighted precision/recall/F1, ROC AUC and
                wall-clock ``training_time`` in seconds;
            ``predictions``: ``y_test``, thresholded ``y_pred`` and raw ``y_pred_proba``;
            ``data``: ``X_test_scaled``.
    """
    split_cfg = hyperparams['train_test_split']
    stopping_cfg = hyperparams['early_stopping']

    # Hold out a test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=split_cfg['test_size'],
        random_state=split_cfg['random_state']
    )

    # Standardize features; the scaler is fitted on the training split only
    # and then applied to the test split, so no test statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = create_model(input_dim=X.shape[1], model_config=model_config)

    # Early stopping to limit overfitting
    early_stopping = keras.callbacks.EarlyStopping(
        monitor=stopping_cfg['monitor'],
        patience=stopping_cfg['patience'],
        restore_best_weights=stopping_cfg['restore_best_weights']
    )

    # Time the training run
    start_time = time.time()
    history = model.fit(
        X_train_scaled, y_train,
        epochs=hyperparams['epochs'],
        batch_size=hyperparams['batch_size'],
        validation_split=hyperparams['validation_split'],
        callbacks=[early_stopping],
        verbose=2
    )
    training_time = time.time() - start_time

    # Run inference once and derive the hard labels from the same output,
    # instead of calling predict() twice on identical data.
    y_pred_proba = model.predict(X_test_scaled)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Quality metrics on the held-out split
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted', zero_division=1),
        "recall": recall_score(y_test, y_pred, average='weighted', zero_division=1),
        "f1": f1_score(y_test, y_pred, average='weighted', zero_division=1),
        "roc_auc": roc_auc_score(y_test, y_pred_proba),
        "training_time": training_time,
    }

    return {
        'model': model,
        'scaler': scaler,
        'history': history,
        'metrics': metrics,
        'predictions': {
            'y_test': y_test,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        },
        'data': {
            'X_test_scaled': X_test_scaled
        }
    }

In [155]:
def log_to_mlflow(evaluation_results, experiment_name, logging_config, run_name, save_model):
    """
    Log the outcome of a training run to MLflow.

    Args:
        evaluation_results: Dict with the keys read below: ``model``,
            ``metrics``, ``data['X_test_scaled']`` and ``predictions['y_pred']``.
        experiment_name: Experiment to log under when ``logging_config`` has
            no ``experiment_name`` key. The config value takes precedence.
        logging_config: Mapping with an optional ``experiment_name`` and an
            optional ``tags`` mapping of run tags.
        run_name: Name of the MLflow run; also used as the model artifact
            path, the registered model name and the local file stem.
        save_model: When truthy, log and register the model in MLflow and
            save a copy to ``../../models/<run_name>.keras``.

    Returns:
        None.
    """
    # The config value wins (this is what the function has always used);
    # the explicit argument is only a fallback when the config omits the key.
    target_experiment = logging_config.get('experiment_name', experiment_name)

    # Look the experiment up first and create it only when it is missing.
    # This avoids treating every MlflowException from create_experiment as
    # "already exists" and then dereferencing a None lookup result.
    experiment = mlflow.get_experiment_by_name(target_experiment)
    if experiment is None:
        experiment_id = mlflow.create_experiment(target_experiment)
    else:
        experiment_id = experiment.experiment_id

    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
        # Run tags (skipped when the config defines none)
        tags = logging_config.get('tags') or {}
        if tags:
            mlflow.set_tags(tags)

        model = evaluation_results['model']
        metrics = evaluation_results['metrics']
        X_test_scaled = evaluation_results['data']['X_test_scaled']
        y_pred = evaluation_results['predictions']['y_pred']

        # Model parameters
        mlflow.log_params({
            "input_dim": X_test_scaled.shape[1],
            "optimizer": model.optimizer.get_config()['name'],
            "loss": model.loss,
            "metrics": model.metrics_names,
        })

        # Evaluation metrics, sent as one batch
        mlflow.log_metrics(metrics)

        if save_model:
            # Log and register the model in MLflow; the signature is inferred
            # from the scaled test inputs and the predicted labels.
            signature = infer_signature(X_test_scaled, y_pred)
            mlflow.tensorflow.log_model(
                model,
                run_name,
                signature=signature,
                registered_model_name=run_name
            )

            # Local copy; make sure the target directory exists so saving
            # does not depend on it having been created beforehand.
            models_dir = os.path.join("..", "..", "models")
            os.makedirs(models_dir, exist_ok=True)
            local_model_path = os.path.join(models_dir, f"{run_name}.keras")
            model.save(local_model_path)

In [156]:
# Point MLflow at the DagsHub-hosted tracking server
dagshub.init(repo_owner='sever.cpa.general', repo_name='my-first-repo', mlflow=True)

run_name = "ml_baseline"
# Defined here so the cell runs on a fresh kernel: the name was previously
# referenced without being assigned anywhere in the notebook (NameError).
experiment_name = "Water Probability [RF]"

# log_to_mlflow resolves (or creates) the MLflow experiment itself, so no
# separate experiment creation step is needed in this cell.

# Load the data
file_path = os.path.join("..", "..", "data", "raw", "water_potability.csv")

df = pd.read_csv(file_path)
X = df.drop('Potability', axis=1)
y = df['Potability']
feature_names = X.columns.tolist()

# Load the configs (load_config is defined in an earlier cell)
hyperparams = load_config('hyperparameters.yml')['hyperparameters']
model_config = load_config('model_config.yml')['model']
logging_config = load_config('logging_config.yml')['logging']

# Train and evaluate the model
results = train_and_evaluate_model(
    X=X,
    y=y,
    model_config=model_config,
    hyperparams=hyperparams
)

# Log the results to MLflow
log_to_mlflow(
    evaluation_results=results,
    experiment_name=experiment_name,
    logging_config=logging_config,
    run_name=run_name,
    save_model=True
)


Epoch 1/50
66/66 - 2s - 34ms/step - accuracy: 0.5978 - loss: 0.6890 - val_accuracy: 0.6050 - val_loss: 0.6828
Epoch 2/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6763 - val_accuracy: 0.6050 - val_loss: 0.6719
Epoch 3/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6707 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 4/50
66/66 - 0s - 5ms/step - accuracy: 0.6054 - loss: 0.6715 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 5/50
66/66 - 0s - 5ms/step - accuracy: 0.6054 - loss: 0.6710 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 6/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6719 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 7/50
66/66 - 0s - 5ms/step - accuracy: 0.6054 - loss: 0.6720 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 8/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6712 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 9/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6715 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 10/50
66/66 

Registered model 'ml_baseline' already exists. Creating a new version of this model...
2024/12/08 03:26:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ml_baseline, version 2
Created version '2' of model 'ml_baseline'.


🏃 View run ml_baseline at: https://dagshub.com/sever.cpa.general/my-first-repo.mlflow/#/experiments/2/runs/b63f07808f054edc81f3aff8091d6335
🧪 View experiment at: https://dagshub.com/sever.cpa.general/my-first-repo.mlflow/#/experiments/2
