In [151]:
# Core libraries for data handling and numerical computation
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Machine-learning tooling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# MLflow and experiment logging
import mlflow
import mlflow.tensorflow
from mlflow.models.signature import infer_signature
import dagshub
from mlflow.exceptions import MlflowException

# System libraries
import time
import yaml
import os
import tempfile
import datetime

In [152]:
def load_config(file_name):
    """
    Load a YAML configuration file from the ``configs`` directory.

    The path is resolved against the current working directory, i.e.
    ``<cwd>/configs/<file_name>``.

    Args:
        file_name: Name of the YAML file inside ``configs``.

    Returns:
        The parsed document, exactly as produced by ``yaml.safe_load``.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be read
            (e.g. ``FileNotFoundError``).
    """
    config_path = os.path.join(os.getcwd(), 'configs', file_name)
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # garbles or rejects non-ASCII config values on non-UTF-8 locales.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


In [153]:
def create_model(input_dim, model_config):
    """
    Build and compile a Keras model described by a configuration mapping.

    Each entry of ``model_config['layers']`` becomes a ``Dense`` layer built
    from its ``units`` and ``activation`` values. When the entry also has a
    ``dropout`` key, a ``Dropout`` layer with that rate follows it. The
    output of the last configured layer is used as the model output.

    Args:
        input_dim: Number of input features (length of one input vector).
        model_config: Mapping with the keys ``layers``, ``name`` and
            ``compile`` (the latter holding ``optimizer``, ``loss`` and
            ``metrics``).

    Returns:
        The compiled ``keras.Model``.
    """
    # Functional API: thread a single tensor through the configured stack.
    model_input = keras.Input(shape=(input_dim,))
    tensor = model_input

    for spec in model_config['layers']:
        dense = layers.Dense(spec['units'], activation=spec['activation'])
        tensor = dense(tensor)
        if 'dropout' in spec:
            dropout = layers.Dropout(spec['dropout'])
            tensor = dropout(tensor)

    network = keras.Model(
        inputs=model_input,
        outputs=tensor,
        name=model_config['name'],
    )

    # Compilation settings come straight from the configuration.
    compile_cfg = model_config['compile']
    network.compile(
        optimizer=compile_cfg['optimizer'],
        loss=compile_cfg['loss'],
        metrics=compile_cfg['metrics'],
    )

    return network

In [154]:
def train_and_evaluate_model(X, y, model_config, hyperparams):
    """
    Train a model on ``X``/``y`` and evaluate it on a held-out test split.

    Steps: train/test split, standardization (scaler fitted on the training
    part only), model construction via ``create_model``, training with early
    stopping, and metric computation on the test part.

    Args:
        X: Feature table; ``X.shape[1]`` is used as the model input size.
        y: Target values aligned with ``X``.
        model_config: Model description forwarded to ``create_model``.
        hyperparams: Mapping with the keys ``train_test_split``
            (``test_size``, ``random_state``), ``early_stopping``
            (``monitor``, ``patience``, ``restore_best_weights``),
            ``epochs``, ``batch_size`` and ``validation_split``.

    Returns:
        A dict with the keys ``model``, ``scaler``, ``history``, ``metrics``
        (accuracy, precision, recall, f1, roc_auc, training_time),
        ``predictions`` (``y_test``, ``y_pred``, ``y_pred_proba``) and
        ``data`` (``X_test_scaled``).

    Note:
        The 0.5 threshold and the ``roc_auc_score`` call presumably rely on a
        binary target and a single probability output per sample -- confirm
        against the model configuration.
    """
    # Split the data into training and test parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=hyperparams['train_test_split']['test_size'],
        random_state=hyperparams['train_test_split']['random_state']
    )

    # Standardize features; the scaler only ever sees the training part.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Build the model.
    model = create_model(input_dim=X.shape[1], model_config=model_config)

    # Early stopping to limit overfitting.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor=hyperparams['early_stopping']['monitor'],
        patience=hyperparams['early_stopping']['patience'],
        restore_best_weights=hyperparams['early_stopping']['restore_best_weights']
    )

    # Measure training duration with a monotonic clock so that system clock
    # adjustments cannot distort the result.
    start_time = time.perf_counter()
    history = model.fit(
        X_train_scaled, y_train,
        epochs=hyperparams['epochs'],
        batch_size=hyperparams['batch_size'],
        validation_split=hyperparams['validation_split'],
        callbacks=[early_stopping],
        verbose=2
    )
    training_time = time.perf_counter() - start_time

    # Run inference once and derive the hard labels from the same
    # probabilities instead of predicting twice on identical data.
    y_pred_proba = model.predict(X_test_scaled)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Quality metrics on the test part.
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted', zero_division=1),
        "recall": recall_score(y_test, y_pred, average='weighted', zero_division=1),
        "f1": f1_score(y_test, y_pred, average='weighted', zero_division=1),
        "roc_auc": roc_auc_score(y_test, y_pred_proba),
        "training_time": training_time,
    }

    return {
        'model': model,
        'scaler': scaler,
        'history': history,
        'metrics': metrics,
        'predictions': {
            'y_test': y_test,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        },
        'data': {
            'X_test_scaled': X_test_scaled
        }
    }

In [155]:
def log_to_mlflow(evaluation_results, experiment_name, logging_config, run_name, save_model):
    """
    Log the results of one training run to MLflow.

    Args:
        evaluation_results: Result dict of a training run. The keys
            ``model``, ``metrics``, ``data['X_test_scaled']`` and
            ``predictions['y_pred']`` are read.
        experiment_name: Fallback experiment name, used only when
            ``logging_config`` has no ``experiment_name`` key. The configured
            name keeps priority so existing configurations log to the same
            experiment as before.
        logging_config: Mapping with an optional ``experiment_name`` and an
            optional ``tags`` mapping applied to the run.
        run_name: Name of the MLflow run. It is also used as the model
            artifact path, the registered model name and the stem of the
            local ``.keras`` file.
        save_model: When true, the model is logged and registered in MLflow
            and additionally saved under ``../../models``.
    """
    target_experiment = logging_config.get('experiment_name', experiment_name)

    # Look the experiment up first and create it only when it is missing.
    # This avoids a failing create call on every re-run and never
    # dereferences a ``None`` lookup result.
    experiment = mlflow.get_experiment_by_name(target_experiment)
    if experiment is not None:
        experiment_id = experiment.experiment_id
    else:
        experiment_id = mlflow.create_experiment(target_experiment)

    # Unpack before the run starts so that a malformed result dict does not
    # leave a half-populated run behind.
    model = evaluation_results['model']
    metrics = evaluation_results['metrics']
    X_test_scaled = evaluation_results['data']['X_test_scaled']
    y_pred = evaluation_results['predictions']['y_pred']

    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
        # Run tags; an absent or empty ``tags`` section simply logs nothing.
        mlflow.set_tags(logging_config.get('tags') or {})

        # Model parameters.
        mlflow.log_params({
            "input_dim": X_test_scaled.shape[1],
            "optimizer": model.optimizer.get_config()['name'],
            "loss": model.loss,
            "metrics": model.metrics_names,
        })

        # Metrics, sent as one batch and coerced to plain floats.
        mlflow.log_metrics({name: float(value) for name, value in metrics.items()})

        # Model and artifacts.
        if save_model:
            # Log and register the model in MLflow.
            signature = infer_signature(X_test_scaled, y_pred)
            mlflow.tensorflow.log_model(
                model,
                run_name,
                signature=signature,
                registered_model_name=run_name
            )
            # Local copy in the ``models`` directory; create the directory
            # first because ``model.save`` does not.
            local_model_dir = os.path.join("..", "..", "models")
            os.makedirs(local_model_dir, exist_ok=True)
            local_model_path = os.path.join(local_model_dir, f"{run_name}.keras")
            model.save(local_model_path)

In [156]:
# Entry cell: connect to the tracking server, load data and configs, train
# the baseline model and log the run to MLflow.
dagshub.init(repo_owner='sever.cpa.general', repo_name='my-first-repo', mlflow=True)

run_name = "ml_baseline"
# Defined here so the cell does not depend on leftover kernel state. The
# value is the one this cell passes to log_to_mlflow below.
# NOTE(review): confirm this matches the name an earlier session used.
experiment_name = "Water Probability [RF]"

# Resolve the experiment id: reuse the experiment when it exists, otherwise
# create it.
_experiment = mlflow.get_experiment_by_name(experiment_name)
if _experiment is not None:
    experiment_id = _experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(experiment_name)

# Load the data.
file_path = os.path.join("..", "..", "data", "raw", "water_potability.csv")

df = pd.read_csv(file_path)
X = df.drop('Potability', axis=1)
y = df['Potability']
feature_names = X.columns.tolist()

# Load the configs (load_config is defined in an earlier cell).
hyperparams = load_config('hyperparameters.yml')['hyperparameters']
model_config = load_config('model_config.yml')['model']
logging_config = load_config('logging_config.yml')['logging']

# Train and evaluate the model.
results = train_and_evaluate_model(
    X=X,
    y=y,
    model_config=model_config,
    hyperparams=hyperparams
)
# Log the results to MLflow.
log_to_mlflow(
    evaluation_results=results,
    experiment_name=experiment_name,  # same experiment name as above
    logging_config=logging_config,
    run_name=run_name,  # pass the run name explicitly
    save_model=True
)


Epoch 1/50
66/66 - 2s - 34ms/step - accuracy: 0.5978 - loss: 0.6890 - val_accuracy: 0.6050 - val_loss: 0.6828
Epoch 2/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6763 - val_accuracy: 0.6050 - val_loss: 0.6719
Epoch 3/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6707 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 4/50
66/66 - 0s - 5ms/step - accuracy: 0.6054 - loss: 0.6715 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 5/50
66/66 - 0s - 5ms/step - accuracy: 0.6054 - loss: 0.6710 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 6/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6719 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 7/50
66/66 - 0s - 5ms/step - accuracy: 0.6054 - loss: 0.6720 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 8/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6712 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 9/50
66/66 - 0s - 4ms/step - accuracy: 0.6054 - loss: 0.6715 - val_accuracy: 0.6050 - val_loss: 0.6710
Epoch 10/50
66/66 

Registered model 'ml_baseline' already exists. Creating a new version of this model...
2024/12/08 03:26:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ml_baseline, version 2
Created version '2' of model 'ml_baseline'.


🏃 View run ml_baseline at: https://dagshub.com/sever.cpa.general/my-first-repo.mlflow/#/experiments/2/runs/b63f07808f054edc81f3aff8091d6335
🧪 View experiment at: https://dagshub.com/sever.cpa.general/my-first-repo.mlflow/#/experiments/2
