In [53]:
# Core modules
import os
from io import StringIO
import json
from typing import Callable, List, Dict, Any, Tuple
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import numpy as np
import inspect
from datetime import datetime

# Data retrieval
import yfinance as yf

# Plotting modules
import matplotlib.pyplot as plt
import plotly.express as px
import hyperopt.plotting as hplt
from IPython.display import clear_output

# Sklearn ML
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error, median_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator

# Keras DL
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Hyperparameter optimisation
from hyperopt import STATUS_OK, fmin, tpe, Trials, hp
from functools import partial

# Cloud services
from azureml.core import Workspace
from azure.storage.blob import BlobServiceClient
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Tracking
import mlflow
import mlflow.keras
import mlflow.sklearn
import mlflow.pyfunc
from mlflow.deployments import get_deploy_client

# data drift
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

In [12]:
# Download price history for the "^FTSE" ticker via yfinance, from
# start="1999-12-01" up to end="2021-12-01"; the result is kept in `data`.
data = yf.download("^FTSE", start="1999-12-01", end="2021-12-01")

[*********************100%%**********************]  1 of 1 completed


In [None]:
# Preview the last ten rows of the downloaded frame.
data.tail(10)

In [None]:
# Drop the raw 'Close' column ('Adj Close' is the target used by the models
# below) and print basic sanity checks.
# The drop is a reassignment with errors='ignore' so that re-running this cell
# is harmless; the previous in-place drop raised KeyError on a second run.
data = data.drop(columns='Close', errors='ignore')
print(data.isnull().sum())  # missing values per column
print(data.eq(0).sum())  # exact zeros per column
print(data.index.diff().value_counts().sort_index())  # gaps between consecutive index entries

In [None]:
# Line chart of the 'Adj Close' column against the frame's index, with a
# centred title.
fig = px.line(data, x=data.index, y='Adj Close').update_layout(
    title=dict(text='Closing price over time', x=0.5)
)
fig.show()

In [None]:
class KerasLogger:
    """Decorator that wraps a Keras training function in an MLflow run.

    The decorated function must return ``(model, y_test, y_pred, losses)``.
    On every call the wrapper opens an MLflow run, enables Keras autologging,
    invokes the function, and then logs regression metrics, the model (under
    the ``models`` artifact path) and two diagnostic figures. The function's
    return value is handed back unchanged.
    """

    # Same strftime pattern as the data-drift run names used in this notebook.
    _TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self, run_name: str):
        """Store the base name; ``run_name`` holds the timestamped variant."""
        self._base_run_name = run_name
        self.run_name = self._timestamped(run_name)

    @classmethod
    def _timestamped(cls, run_name: str) -> str:
        """Return ``run_name`` suffixed with the current local date and time."""
        return f"{run_name}_{datetime.now().strftime(cls._TIMESTAMP_FORMAT)}"

    def __call__(self, func: Callable) -> Callable:
        """Return ``func`` wrapped so that each call is tracked as an MLflow run."""
        from functools import wraps  # local import: only `partial` is imported at file level

        @wraps(func)
        def wrapper(*args, **kwargs):
            # Re-stamp on every call; the decorator object is created once, so
            # a name fixed in __init__ would be shared by all training runs.
            self.run_name = self._timestamped(self._base_run_name)

            with mlflow.start_run(run_name=self.run_name):
                mlflow.keras.autolog()
                model, y_test, y_pred, losses = func(*args, **kwargs)

                metrics = self.get_metrics(y_pred, y_test)
                mlflow.log_metrics(metrics)

                mlflow.keras.log_model(model, artifact_path="models")

                lossplt = self.plot_true_value_vs_prediction(y_pred, y_test)
                mlflow.log_figure(lossplt, "true_value_vs_prediction.png")
                plt.close(lossplt)  # logged already; avoid piling up open figures

                epochplt = self.plot_loss_over_epoch(losses)
                mlflow.log_figure(epochplt, "loss_over_epochs.png")
                plt.close(epochplt)

                return model, y_test, y_pred, losses

        return wrapper

    @staticmethod
    def plot_true_value_vs_prediction(pred: np.ndarray, test: np.ndarray) -> "plt.Figure":
        """Plot the true values and the predictions on one set of axes."""
        fig, ax = plt.subplots()
        ax.plot(test, label='True Value')
        ax.plot(pred, label='LSTM Value')
        ax.set_title('Prediction by LSTM')
        ax.set_xlabel('Time Scale')
        ax.set_ylabel('Scaled USD')
        ax.legend()
        return fig

    @staticmethod
    def plot_loss_over_epoch(losses: List[float]) -> "plt.Figure":
        """Plot the recorded training losses in the order they were collected."""
        fig, ax = plt.subplots()
        ax.plot(losses)
        ax.set_title('Model Loss Over Epochs')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        return fig

    @staticmethod
    def get_metrics(pred: np.ndarray, test: np.ndarray) -> Dict[str, float]:
        """Compute regression metrics for ``pred`` against ``test``.

        Returns a dict with the keys ``mse``, ``rmse``, ``mae``, ``r2``,
        ``msle`` and ``medae``.

        NOTE: ``mean_squared_log_error`` rejects negative values, so this
        raises ``ValueError`` if either array contains negatives.
        """
        mse = mean_squared_error(test, pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(test, pred)
        r2 = r2_score(test, pred)
        msle = mean_squared_log_error(test, pred)
        medae = median_absolute_error(test, pred)
        return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "msle": msle, "medae": medae}

In [None]:
runname = "aalstm"

## Building LSTM model
@KerasLogger(run_name=runname)
def train_lstm_model(df: pd.DataFrame,
                test_size: int = 10,
                loss: str = 'mean_squared_error',
                activation: str = 'relu',
                optimiser: str = 'adam',
                n_epoch: int = 100,
                batch_size: int = 8,
                verbose: int = 1) -> Tuple[Sequential, np.ndarray, np.ndarray, List[float]]:
    """Train a single-layer LSTM on ``df`` using expanding time-series folds.

    Args:
        df: frame containing the target column 'Adj Close'; every other
            column is used as a feature.
        test_size: fold size as a percentage; the number of splits is
            ``100 // test_size - 1``.
        loss: Keras loss passed to ``compile``.
        activation: activation of the LSTM layer.
        optimiser: Keras optimiser passed to ``compile``.
        n_epoch: epochs per fold.
        batch_size: batch size per fold.
        verbose: Keras verbosity for ``fit``.

    Returns:
        ``(model, y_test, y_pred, losses)`` where ``y_test`` / ``y_pred``
        belong to the last fold and ``losses`` concatenates the per-epoch
        training loss of every fold.

    Raises:
        ValueError: if ``test_size`` yields fewer than the two splits that
            ``TimeSeriesSplit`` requires.

    NOTE(review): the scaler is fitted on the full frame (test rows included)
    and is not returned, so new data cannot be scaled consistently later.
    """
    if test_size < 1:
        raise ValueError("test_size must be a positive percentage")
    n_split = 100 // test_size - 1
    if n_split < 2:
        raise ValueError("test_size is too large: at least 2 splits are required (use test_size <= 33)")

    # define features and target (both taken from the frame that was passed in)
    X, y = df.drop('Adj Close', axis=1), df[['Adj Close']]

    # scale features
    X_scaled = MinMaxScaler().fit_transform(X)
    X = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)

    # Building the LSTM Model
    lstm = Sequential()
    lstm.add(LSTM(32, input_shape=(1, X.shape[1]), activation=activation, return_sequences=False))
    lstm.add(Dense(1))
    lstm.compile(loss=loss, optimizer=optimiser)

    timesplit = TimeSeriesSplit(n_splits=n_split)

    losses = []
    for i, (train_idx, test_idx) in enumerate(timesplit.split(X)):
        print(f"\n\n---Training model batch {i+1} out of {n_split}---")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y.iloc[train_idx].values.ravel()
        y_test = y.iloc[test_idx].values.ravel()

        # reshape to (samples, timesteps=1, features) as declared in input_shape
        X_train = np.array(X_train).reshape(X_train.shape[0], 1, X_train.shape[1])
        hist = lstm.fit(X_train, y_train, epochs=n_epoch, batch_size=batch_size, verbose=verbose)
        losses.extend(hist.history['loss'])

    # evaluate on the test slice of the final fold only
    X_test = np.array(X_test).reshape(X_test.shape[0], 1, X_test.shape[1])
    y_pred = lstm.predict(X_test)
    return lstm, y_test, y_pred, losses

In [None]:
# Point MLflow at the tracking URI of the Azure ML workspace and select the
# experiment used by the training cells.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())
mlflow_tracking_uri = ml_client.workspaces.get(
    ml_client.workspace_name
).mlflow_tracking_uri
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment("demo4")

In [None]:
# Train on the downloaded frame with 5 epochs per fold and test_size=25; the
# KerasLogger decorator on train_lstm_model records the call as an MLflow run.
train_lstm_model(data, n_epoch=5, test_size=25)

In [None]:
def train_sklearn_regressor(hyperparams: Dict[str, Any],
                X_train: pd.DataFrame,
                X_test: pd.DataFrame,
                y_train: pd.Series,
                y_test: pd.Series,
                model: Callable[..., BaseEstimator],
                dtypes: Dict[str, Callable]) -> Dict[str, Any]:
    """Fit one estimator inside a nested MLflow run and report its loss.

    Args:
        hyperparams: keyword arguments for the estimator; left unmodified.
        X_train, X_test, y_train, y_test: the train/test split to use.
        model: estimator class (or factory) called as ``model(**params)``.
        dtypes: optional per-parameter converters applied before construction,
            e.g. ``{"n_estimators": int}``.

    Returns:
        A hyperopt result dict: ``status``, ``loss`` (the test MSE) and an
        ``attachments`` entry carrying the MLflow ``run_id`` of the child run.

    NOTE: ``mean_squared_log_error`` rejects negative values, so a model that
    predicts negatives makes this raise ``ValueError``.
    """
    # Cast on a copy so the caller's dict is not changed behind its back.
    params = {
        name: dtypes[name](value) if name in dtypes else value
        for name, value in hyperparams.items()
    }

    estimator = model(**params)
    hypervals = '_'.join(f'{key}: {value}' for key, value in params.items())
    run_name = f"{type(estimator).__name__}_{hypervals}"
    with mlflow.start_run(nested=True, run_name=run_name) as child_run:

        mlflow.sklearn.autolog()

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        msle = mean_squared_log_error(y_test, y_pred)
        medae = median_absolute_error(y_test, y_pred)

        mlflow.log_metrics({"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "msle": msle, "medae": medae})

        return {'status': STATUS_OK, 'loss': mse, "attachments": {"run_id": child_run.info.run_id}}


In [None]:
from dataclasses import dataclass

@dataclass(frozen=True)
class ChildRunData:
    """Immutable snapshot of one MLflow run and its downloaded model artifacts."""

    params: Dict[str, Any]    # run.data.params
    metrics: Dict[str, float] # run.data.metrics
    model: str                # local path returned by download_artifacts, not an estimator
    run_id: str
    run_name: str
    experiment_id: str

    @classmethod
    def get(cls, run_id: str) -> "ChildRunData":
        """Fetch run ``run_id`` and download its ``model`` artifact directory.

        The ``model`` field of the result is the local path of the download.
        """
        client = mlflow.tracking.MlflowClient()
        run = mlflow.get_run(run_id)
        local_model_dir = client.download_artifacts(run_id, path="model")

        return cls(
            params=run.data.params,
            metrics=run.data.metrics,
            model=local_model_dir,
            run_id=run_id,
            run_name=run.info.run_name,
            experiment_id=run.info.experiment_id,
        )

In [None]:

def search(hyperparam_space: Dict[str, Any],
           df: pd.DataFrame,
           target: str,
           model: Callable[..., BaseEstimator],
           dtypes: Dict[str, Callable],
           max_evals: int = 100) -> Tuple[Trials, Dict[str, Any]]:
    """Run a TPE hyperparameter search, one nested MLflow run per trial.

    Args:
        hyperparam_space: hyperopt search space.
        df: frame holding the features and the ``target`` column.
        target: name of the column to predict.
        model: estimator class; its ``__name__`` names the parent run.
        dtypes: per-parameter converters forwarded to the objective.
        max_evals: number of trials.

    Returns:
        ``(trials, best_params)`` where ``best_params`` is the raw ``fmin``
        result (``hp.choice`` entries are option indices).

    The parent run receives the best trial's id, metrics and model artifacts,
    the resolved best hyperparameters, and a loss histogram.
    """
    from hyperopt import space_eval  # local import: not part of the file-level hyperopt import

    X, y = df.drop(target, axis=1), df[target]
    # NOTE(review): this split shuffles rows; for time-ordered data confirm
    # that a random hold-out is really intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    with mlflow.start_run(nested=False, run_name=model.__name__):
        trials = Trials()
        objective = partial(
            train_sklearn_regressor,
            X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
            model=model, dtypes=dtypes,
        )
        best_params = fmin(
            fn=objective,
            space=hyperparam_space,
            algo=tpe.suggest,
            trials=trials,
            max_evals=max_evals
        )

        best_run_id = trials.trial_attachments(trials.best_trial)["run_id"]
        best_run_data = ChildRunData.get(best_run_id)

        # fmin reports hp.choice parameters as option indices; resolve them to
        # the real values (and apply the same casts as the objective) so the
        # logged "best_*" params match what the winning estimator received.
        resolved = space_eval(hyperparam_space, best_params)
        resolved = {p: dtypes[p](v) if p in dtypes else v for p, v in resolved.items()}

        mlflow.log_param("best_run_id", best_run_id)
        mlflow.log_params({f"best_{p}": v for p, v in resolved.items()})
        mlflow.log_metric("best_mse", trials.best_trial["result"]["loss"])
        mlflow.log_metrics(best_run_data.metrics)
        mlflow.log_artifacts(local_dir=best_run_data.model, artifact_path="model")

        fig = plt.figure()
        fig.add_subplot()
        hplt.main_plot_histogram(trials, do_show=False)
        mlflow.log_figure(fig, "loss_histogram.png")
        plt.close(fig)  # logged already; do not leave the figure open

        return trials, best_params


In [None]:
# Search space for RandomForestRegressor.
rfspace = dict(
    n_estimators=hp.uniform("n_estimators", 200, 1000),
    max_depth=hp.quniform("max_depth", 10, 1200, 10),
    min_samples_split=hp.uniform("min_samples_split", 0.1, 1.0),
    min_samples_leaf=hp.uniform("min_samples_leaf", 0.1, 0.5),
    max_features=hp.choice("max_features", [None, 'sqrt', 'log2']),
    criterion=hp.choice("criterion", ['squared_error', 'poisson', 'absolute_error', 'friedman_mse']),
)


# Search space for GradientBoostingRegressor; max_depth is either None or a
# sampled value.
gbmspace = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 1),
    learning_rate=hp.loguniform('learning_rate', -5, 0),
    max_depth=hp.choice('max_depth', [None, hp.quniform('max_depth_val', 3, 10, 1)]),
    min_samples_split=hp.uniform('min_samples_split', 0.1, 1.0),
    min_samples_leaf=hp.uniform('min_samples_leaf', 0.1, 0.5),
    subsample=hp.uniform('subsample', 0.5, 1.0),
    max_features=hp.choice('max_features', ['sqrt', 'log2', None]),
)


# Converters applied to sampled values before the estimator is built
# (hyperopt samples these parameters as floats).
rfdtypes = {"max_depth": int, "n_estimators": int}
gbmdtypes = {
    "max_depth": lambda depth: None if depth is None else int(depth),
    "n_estimators": int,
}

data = data.astype(float)

# One 10-trial search per estimator class.
models = [RandomForestRegressor, GradientBoostingRegressor]
for model, grid, dtype in zip(models, (rfspace, gbmspace), (rfdtypes, gbmdtypes)):
    search(hyperparam_space=grid, df=data, target='Adj Close', model=model, dtypes=dtype, max_evals=10)


In [11]:
# Show the child runs of the most recently active MLflow run, lowest MAE first.
last_run = mlflow.last_active_run()
if last_run is None:
    # Explicit check instead of catching AttributeError, which would also
    # hide unrelated attribute errors raised while searching or sorting.
    print("No runs found")
else:
    child_runs = mlflow.search_runs(
        filter_string=f"tags.mlflow.parentRunId='{last_run.info.run_id}'"
    )
    if child_runs.empty:
        # an empty result has no 'metrics.mae' column to sort by
        print("No runs found")
    else:
        with pd.option_context("display.max_columns", None):
            print(child_runs.sort_values(by='metrics.mae'))

No runs found


In [4]:
# Connect to the blob container; the connection string and container name are
# read from the environment after loading the nearest .env file.
main_blob_name = 'ftse100data.csv'

load_dotenv(find_dotenv())
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
container = os.getenv('CONTAINER_NAME')

blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container)

In [5]:
def save_data_to_blob(blob_name: str, df: pd.DataFrame, overwrite: bool = True) -> None:
    """Upload ``df`` as CSV text to ``blob_name`` in the module-level container client."""
    csv_text = df.to_csv()
    container_client.upload_blob(name=blob_name, data=csv_text, overwrite=overwrite)

def read_data_from_blob(blob_name: str) -> pd.DataFrame:
    """Download ``blob_name`` as UTF-8 CSV and return it indexed by its parsed 'Date' column."""
    raw_bytes = container_client.get_blob_client(blob_name).download_blob().readall()
    csv_buffer = StringIO(raw_bytes.decode('utf-8'))
    frame = pd.read_csv(csv_buffer, parse_dates=['Date'])
    return frame.set_index('Date')

#data = read_data_from_blob(main_blob_name)
#data.tail()

Step 2: logging the model in the model registry

In [6]:
# Build the Azure ML client from the local config.json.
credential = DefaultAzureCredential()  # this will prompt a sign-in if necessary
mlclient = MLClient.from_config(credential=credential)

Found the config file in: .\config.json


In [8]:
# Pick the parent run named 'RandomForestRegressor' from experiment 'demo4'.
exp = mlflow.get_experiment_by_name('demo4')
last_run = mlflow.search_runs([exp.experiment_id])  # all runs of the experiment
matching_runs = last_run.loc[last_run['tags.mlflow.runName'] == 'RandomForestRegressor']
if matching_runs.empty:
    raise LookupError("No run named 'RandomForestRegressor' found in experiment 'demo4'")
# Several runs can share the name once the search has been repeated; take the
# most recent one so that `our_run` is always a single row (Series).
our_run = matching_runs.sort_values('start_time', ascending=False).iloc[0]
rid = our_run.run_id
with pd.option_context("display.max_rows", None): print(our_run)

run_id                                      5727663c-084e-4c18-a007-b8d69d67691d
experiment_id                               d2f4cd6e-b006-4000-9b48-2f1916d8f10a
status                                                                  FINISHED
artifact_uri                                                                    
start_time                                      2024-01-15 17:26:09.370000+00:00
end_time                                        2024-01-15 17:30:09.125000+00:00
metrics.medae                                                             487.75
metrics.msle                                                            0.010807
metrics.r2                                                               0.69373
metrics.mse                                                        321056.233024
metrics.loss                                                                 NaN
metrics.rmse                                                          566.618243
metrics.mae                 

In [114]:
# Register the "model" artifacts of run `rid` as a new version of the
# registered model named "lstm".
# NOTE(review): `rid` is taken from the run named 'RandomForestRegressor', so
# the registry name "lstm" looks misleading — confirm which model is intended.
artifact_path = "model"
mlflow.register_model(f"runs:/{rid}/{artifact_path}", "lstm")

Registered model 'lstm' already exists. Creating a new version of this model...
2024/01/16 14:46:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lstm, version 2
Created version '2' of model 'lstm'.


<ModelVersion: aliases=[], creation_timestamp=1705416370772, current_stage='None', description='', last_updated_timestamp=1705416370772, name='lstm', run_id='5727663c-084e-4c18-a007-b8d69d67691d', run_link='', source='azureml://uksouth.api.azureml.ms/mlflow/v2.0/subscriptions/b501a57e-71d5-4887-b72c-a0c961a0f281/resourceGroups/uk-environment/providers/Microsoft.MachineLearningServices/workspaces/daadspocs/experiments/d2f4cd6e-b006-4000-9b48-2f1916d8f10a/runs/5727663c-084e-4c18-a007-b8d69d67691d/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [21]:
# Print the name of every model returned by the registry search.
client = mlflow.tracking.MlflowClient()
for model in client.search_registered_models():
    print(model.name)

azureml_boring_cassava_smcw7bv63h_output_mlflow_log_model_548350309
azureml_boring_cassava_smcw7bv63h_output_mlflow_log_model_1700531275
credit_defaults_model
amlstudio-predict-auto-price
azureml_AutoML_cb7a19de-94bb-40e3-a109-6e2330f4023e_0_output_mlflow_log_model_805459951
AutoMLcb7a19de90
amlstudio-predict-diabetes
azureml_5a48cbc5-ac10-4512-bb9f-4d6ac06c4468_output_mlflow_log_model_503845975
azureml_loving_machine_3r8228hz83_output_mlflow_log_model_503845975
diabetes-mlflow
azureml_9f297f68-d9fa-4714-a98b-6546413102d5_output_mlflow_log_model_1899327965
azureml_9f297f68-d9fa-4714-a98b-6546413102d5_output_mlflow_log_model_1938168045
azureml_6651ba31-eeac-4b4d-a021-39659af5ba27_output_mlflow_log_model_613564256
azureml_e84dddfe-1b66-4fa8-a8a0-c1c720cceed4_output_mlflow_log_model_613564256
azureml_e0299110-ad86-4156-9a20-575cf5c8730d_output_mlflow_log_model_613564256
azureml_e01e8cb3-4cc6-4d0e-8afd-ecc644103688_output_mlflow_log_model_613564256
azureml_d947dee0-32e9-4bbf-bffc-4030fca49

In [22]:
# Load a registered model as a generic pyfunc model and list its bound methods.
diab_model = mlflow.pyfunc.load_model("models:/diabetes-mlflow/None")
inspect.getmembers(diab_model, inspect.ismethod)

  latest = client.get_latest_versions(name, None if stage is None else [stage])
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:01<00:00,  4.76it/s]
 - cloudpickle (current: 3.0.0, required: cloudpickle==2.2.0)
 - psutil (current: 5.9.7, required: psutil==5.8.0)
 - scikit-learn (current: 1.3.2, required: scikit-learn==0.24.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[('__eq__',
  <bound method PyFuncModel.__eq__ of mlflow.pyfunc.loaded_model:
    artifact_path: model
    flavor: mlflow.sklearn
    run_id: 36b85079-2e17-43da-b4b4-775944b9a2d3
  >),
 ('__init__',
  <bound method PyFuncModel.__init__ of mlflow.pyfunc.loaded_model:
    artifact_path: model
    flavor: mlflow.sklearn
    run_id: 36b85079-2e17-43da-b4b4-775944b9a2d3
  >),
 ('__repr__',
  <bound method PyFuncModel.__repr__ of mlflow.pyfunc.loaded_model:
    artifact_path: model
    flavor: mlflow.sklearn
    run_id: 36b85079-2e17-43da-b4b4-775944b9a2d3
  >),
 ('_predict_fn',
  <bound method _SklearnModelWrapper.predict of <mlflow.sklearn._SklearnModelWrapper object at 0x00000242DE17DFD0>>),
 ('predict',
  <bound method PyFuncModel.predict of mlflow.pyfunc.loaded_model:
    artifact_path: model
    flavor: mlflow.sklearn
    run_id: 36b85079-2e17-43da-b4b4-775944b9a2d3
  >),
 ('unwrap_python_model',
  <bound method PyFuncModel.unwrap_python_model of mlflow.pyfunc.loaded_model:
    artifac

In [117]:
# Registry entry for the model named 'lstm', including its latest versions.
client.search_registered_models(filter_string="name='lstm'")

[<RegisteredModel: aliases={}, creation_timestamp=1705340501921, description='', last_updated_timestamp=1705340501921, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1705341048135, current_stage='Staging', description='', last_updated_timestamp=1705398650887, name='lstm', run_id='5727663c-084e-4c18-a007-b8d69d67691d', run_link='', source='azureml://uksouth.api.azureml.ms/mlflow/v2.0/subscriptions/b501a57e-71d5-4887-b72c-a0c961a0f281/resourceGroups/uk-environment/providers/Microsoft.MachineLearningServices/workspaces/daadspocs/experiments/d2f4cd6e-b006-4000-9b48-2f1916d8f10a/runs/5727663c-084e-4c18-a007-b8d69d67691d/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>,
  <ModelVersion: aliases=[], creation_timestamp=1705416370772, current_stage='None', description='', last_updated_timestamp=1705416370772, name='lstm', run_id='5727663c-084e-4c18-a007-b8d69d67691d', run_link='', source='azureml://uksouth.api.azureml.ms/mlflow/v2.0/subscrip

In [118]:
# All registered versions of the model named 'lstm'.
client.search_model_versions(filter_string="name='lstm'")

[<ModelVersion: aliases=[], creation_timestamp=1705416370772, current_stage='None', description='', last_updated_timestamp=1705416370772, name='lstm', run_id='5727663c-084e-4c18-a007-b8d69d67691d', run_link='', source='azureml://uksouth.api.azureml.ms/mlflow/v2.0/subscriptions/b501a57e-71d5-4887-b72c-a0c961a0f281/resourceGroups/uk-environment/providers/Microsoft.MachineLearningServices/workspaces/daadspocs/experiments/d2f4cd6e-b006-4000-9b48-2f1916d8f10a/runs/5727663c-084e-4c18-a007-b8d69d67691d/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>,
 <ModelVersion: aliases=[], creation_timestamp=1705341048135, current_stage='Staging', description='', last_updated_timestamp=1705398650887, name='lstm', run_id='5727663c-084e-4c18-a007-b8d69d67691d', run_link='', source='azureml://uksouth.api.azureml.ms/mlflow/v2.0/subscriptions/b501a57e-71d5-4887-b72c-a0c961a0f281/resourceGroups/uk-environment/providers/Microsoft.MachineLearningServices/workspaces/daadspoc

In [None]:
"""
client = MlflowClient()
client.copy_model_version(
    src_model_uri="models:/regression-model-staging@candidate",
    dst_name="regression-model-production",
)
""" # look into this tomorrow

In [121]:
# List the stage names reported for the registered model 'lstm'.
# NOTE: the recorded output warns that this API is deprecated since MLflow 2.9.0.
client.get_model_version_stages('lstm', version="latest")


``mlflow.tracking.client.MlflowClient.get_model_version_stages`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/2.9.2/model-registry.html#migrating-from-stages



['None', 'Staging', 'Production', 'Archived']

In [123]:
# Move version 2 of 'lstm' to the 'Production' stage, archiving existing versions.
# NOTE: the recorded output warns that this API is deprecated since MLflow 2.9.0.
client.transition_model_version_stage('lstm', version=2, stage='Production', archive_existing_versions=True)


``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/2.9.2/model-registry.html#migrating-from-stages



<ModelVersion: aliases=[], creation_timestamp=1705416370772, current_stage='Production', description='', last_updated_timestamp=1705417109695, name='lstm', run_id='5727663c-084e-4c18-a007-b8d69d67691d', run_link='', source='azureml://uksouth.api.azureml.ms/mlflow/v2.0/subscriptions/b501a57e-71d5-4887-b72c-a0c961a0f281/resourceGroups/uk-environment/providers/Microsoft.MachineLearningServices/workspaces/daadspocs/experiments/d2f4cd6e-b006-4000-9b48-2f1916d8f10a/runs/5727663c-084e-4c18-a007-b8d69d67691d/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [20]:
# NOTE: the recorded output of this cell is a NameError — `client` is only
# defined by the earlier `client = mlflow.tracking.MlflowClient()` cell, which
# has to be run first in the same kernel.
client.get_model_version_stages("lstm", version=2)

NameError: name 'client' is not defined

Step 3: model deployment

In [139]:
# Endpoint name = fixed prefix + 20 random decimal digits.
random_arr = np.random.randint(low=0, high=10, size=20)
suffix = "".join(map(str, random_arr))
endpoint_name = "ftse100demo-" + suffix
endpoint_name

# add model deployment to endpoint here

'ftse100demo-12132267894507554523'

Hopefully everything from here (the endpoint and deployment cells)...

In [13]:
# Create an MLflow deployment client for the currently configured tracking URI.
deployment_client = get_deploy_client(mlflow.get_tracking_uri())

In [32]:
# Endpoint name comes from the environment; the deployment name is fixed.
deployment_name = 'ftse100demo'
ep_name = os.getenv('ENDPOINT_NAME')

In [40]:
# Create the endpoint named by `ep_name` and keep the returned value.
endpoint = deployment_client.create_endpoint(ep_name)

In [52]:
# Delete the endpoint named by `ep_name`.
# NOTE(review): run top-to-bottom, this removes the endpoint before the cells
# below query it and deploy to it — confirm the intended cell order.
deployment_client.delete_endpoint(ep_name)

In [50]:
# Ad-hoc check of the installed ctypes version.
# NOTE(review): ctypes is not used anywhere else in the visible notebook —
# this cell looks like a removal candidate.
import ctypes
ctypes.__version__

'1.1.0'

In [41]:
# Look up the endpoint and print the scoring URI from its properties.
scoring_uri = deployment_client.get_endpoint(endpoint=ep_name)["properties"]["scoringUri"]
print(scoring_uri)

https://ftse100demo-12132267894507554523.uksouth.inference.ml.azure.com/score


In [44]:
# Write the compute settings for the deployment to deployment_config.json.
deploy_config = dict(instance_type="Standard_DS2_v2", instance_count=1)
with open("deployment_config.json", "w") as j:
    json.dump(deploy_config, j)

In [43]:
# Write the traffic split (100 for `deployment_name`) to traffic_config.json.
traffic_config = {"traffic": {deployment_name: 100}}
with open("traffic_config.json", "w") as j:
    json.dump(traffic_config, j)

In [45]:
# Deploy the registered model at "models:/lstm/Staging" to the endpoint, using
# the two JSON config files written by the cells above.
# NOTE(review): an earlier cell transitions version 2 of 'lstm' to 'Production'
# with archive_existing_versions=True — confirm a 'Staging' version still exists.
dep = deployment_client.create_deployment(
    name=deployment_name,
    endpoint=ep_name,
    model_uri="models:/lstm/Staging",
    config={
        "deploy-config-file": "deployment_config.json",
        "endpoint-config-file": "traffic_config.json"
    },
)

  if model_stage_or_version in client.get_model_version_stages(None, None):
  model_version = client.get_latest_versions(model_name, [model_stage_or_version])[0].version


...................................................................................................

...down to here is not needed.

In [47]:
# Fetch the prices from 2021-12-02 onwards and shape them like `data`.
newdata = yf.download("^FTSE", start="2021-12-02")
newdata = (
    newdata
    .drop(index=newdata.index[-1:])  # as no volume for current day
    .drop(columns='Close')
)
newdata

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-02,7168.700195,7168.700195,7083.200195,7129.200195,724509800
2021-12-03,7129.200195,7196.100098,7105.299805,7122.299805,867111100
2021-12-06,7122.299805,7246.299805,7122.299805,7232.299805,637274000
2021-12-07,7232.299805,7344.700195,7232.299805,7339.899902,783615400
2021-12-08,7339.899902,7378.899902,7333.600098,7337.399902,776663000
...,...,...,...,...,...
2024-01-09,7694.200195,7717.500000,7675.100098,7684.000000,703141300
2024-01-10,7684.000000,7684.000000,7647.399902,7651.799805,668838800
2024-01-11,7651.799805,7693.899902,7576.600098,7576.600098,1306895000
2024-01-12,7576.600098,7655.200195,7576.600098,7624.899902,794125500


In [48]:
# Feature-only sample: first 30 rows of the new data without the target column.
sample = newdata.drop(columns='Adj Close').head(30)

In [25]:
# Load the 'Staging' version of the registered model 'lstm' as a pyfunc model.
model = mlflow.pyfunc.load_model("models:/lstm/Staging")

  latest = client.get_latest_versions(name, None if stage is None else [stage])
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00,  7.12it/s]


[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-02,7168.700195,7168.700195,7083.200195,7129.200195,7129.200195,724509800
2021-12-03,7129.200195,7196.100098,7105.299805,7122.299805,7122.299805,867111100
2021-12-06,7122.299805,7246.299805,7122.299805,7232.299805,7232.299805,637274000
2021-12-07,7232.299805,7344.700195,7232.299805,7339.899902,7339.899902,783615400
2021-12-08,7339.899902,7378.899902,7333.600098,7337.399902,7337.399902,776663000
...,...,...,...,...,...,...
2024-01-09,7694.200195,7717.500000,7675.100098,7684.000000,7684.000000,703141300
2024-01-10,7684.000000,7684.000000,7647.399902,7651.799805,7651.799805,668838800
2024-01-11,7651.799805,7693.899902,7576.600098,7576.600098,7576.600098,1306895000
2024-01-12,7576.600098,7655.200195,7576.600098,7624.899902,7624.899902,794125500


In [33]:
# Append the newly downloaded rows to the original frame and display the result.
data_update = pd.concat([data, newdata])
data_update
# save_data_to_blob(main_blob_name, data_update)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-12-01,6597.200195,6655.799805,6575.299805,6646.000000,6646.000000,642023000
1999-12-02,6646.000000,6694.200195,6623.000000,6653.700195,6653.700195,946100000
1999-12-03,6653.700195,6772.100098,6652.299805,6742.200195,6742.200195,954355000
1999-12-06,6742.200195,6769.399902,6657.100098,6694.000000,6694.000000,640360000
1999-12-07,6694.000000,6728.899902,6659.700195,6660.899902,6660.899902,729797000
...,...,...,...,...,...,...
2024-01-09,7694.200195,7717.500000,7675.100098,7684.000000,7684.000000,703141300
2024-01-10,7684.000000,7684.000000,7647.399902,7651.799805,7651.799805,668838800
2024-01-11,7651.799805,7693.899902,7576.600098,7576.600098,7576.600098,1306895000
2024-01-12,7576.600098,7655.200195,7576.600098,7624.899902,7624.899902,794125500


In [93]:
def eval_feature_drift(reference_data: pd.DataFrame,
                       new_data: pd.DataFrame,
                       col_mapping: ColumnMapping) -> Tuple[Dict[str, float], Report]:
    """Run a data-drift report and extract per-feature drift scores.

    Args:
        reference_data: baseline frame.
        new_data: frame compared against the baseline.
        col_mapping: column mapping naming the numerical / categorical features.

    Returns:
        ``(drifts, report)``: ``drifts`` maps every mapped feature to its
        ``drift_score`` plus a ``drift_share`` entry for the whole dataset;
        ``report`` is the report object that was run.
    """
    report = Report(metrics=[DataDriftPreset()])
    report.run(reference_data=reference_data, current_data=new_data, column_mapping=col_mapping)
    report_dict = report.as_dict()

    # Positions in the preset's output this function relies on:
    # [0] carries the dataset-level drift_share, [1] the per-column results.
    dataset_result = report_dict["metrics"][0]["result"]
    column_results = report_dict["metrics"][1]["result"]["drift_by_columns"]

    features = list(col_mapping.numerical_features or []) + list(col_mapping.categorical_features or [])

    drifts = {feature: column_results[feature]["drift_score"] for feature in features}
    drifts['drift_share'] = dataset_result["drift_share"]
    return drifts, report

In [96]:
def log_drifts(old_data: pd.DataFrame,
               new_data: pd.DataFrame,
               target: str,
               numerical_features: List[str] = [],
               categorical_features: List[str] = [],
               n_batch: int = 2) -> None:
    """Log data-drift reports for ``new_data`` against ``old_data`` to MLflow.

    A parent run receives the drift metrics and HTML report for the whole of
    ``new_data``; one nested run per time batch receives the same for its
    slice. ``new_data`` is cut into ``n_batch`` equal-width time windows
    between its first and last index value.

    Args:
        old_data: reference frame.
        new_data: frame to check; its index must be comparable with timestamps.
        target: name of the target column.
        numerical_features: numerical feature columns (never mutated).
        categorical_features: categorical feature columns (never mutated).
        n_batch: number of time batches.

    Raises:
        AttributeError: if no feature list is given.
        ValueError: if ``n_batch`` is smaller than 1.
    """
    if not numerical_features and not categorical_features:
        raise AttributeError("Features must be specified")
    if n_batch < 1:
        raise ValueError("n_batch must be at least 1")

    colmap = ColumnMapping()
    # copies, so the mapping never shares a list with the caller (or the defaults)
    if numerical_features: colmap.numerical_features = list(numerical_features)
    if categorical_features: colmap.categorical_features = list(categorical_features)
    colmap.target = target

    start, end = new_data.index.min(), new_data.index.max()
    batch_idx = pd.date_range(start=start, end=end, periods=n_batch+1)

    def _log_report(drifts: Dict[str, float], report: Report, html_name: str) -> None:
        """Log metrics plus the HTML report to the active run; always clean up the file."""
        report.save_html(html_name)
        try:
            mlflow.log_metrics(drifts)
            mlflow.log_artifact(html_name, artifact_path='reports')
        finally:
            os.remove(html_name)

    with mlflow.start_run(run_name=f"data_drift_{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"):

        mlflow.log_param("new_data_start", start)
        mlflow.log_param("new_data_end", end)

        for idx, (sdate, edate) in enumerate(zip(batch_idx, batch_idx[1:])):

            # Slice the frame that was passed in (not a notebook global). The
            # windows are half-open except the last one, which also includes
            # its right edge so the final row is not dropped.
            is_last = idx == n_batch - 1
            in_batch = new_data.index >= sdate
            if is_last:
                in_batch = in_batch & (new_data.index <= edate)
                batch_end = edate
            else:
                in_batch = in_batch & (new_data.index < edate)
                batch_end = edate - pd.Timedelta(days=1)

            batch = new_data[in_batch]
            if batch.empty:
                continue  # nothing to compare in this window

            with mlflow.start_run(nested=True, run_name=f"Batch {idx+1}"):
                mlflow.log_param("batch_start", sdate)
                mlflow.log_param("batch_end", batch_end)
                batch_drifts, batch_report = eval_feature_drift(old_data, batch, colmap)
                _log_report(batch_drifts, batch_report, f'test{idx}.html')

        drifts, report = eval_feature_drift(old_data, new_data, colmap)
        _log_report(drifts, report, 'main.html')

    # TODO: also log plots to go along with the reports
    return


# Switch to a separate experiment and log the drift of `newdata` against
# `data` for the four numerical price/volume features.
# NOTE(review): the experiment name looks like a placeholder — confirm.
mlflow.set_experiment(experiment_name="yyyeetttty")
log_drifts(data, newdata, 'Adj Close', numerical_features=['Open', 'High', 'Low', 'Volume'])

2024/01/16 14:18:31 INFO mlflow.tracking.fluent: Experiment with name 'yyyeetttty' does not exist. Creating a new experiment.


['Open', 'High', 'Low', 'Volume']
['Open', 'High', 'Low', 'Volume']
['Open', 'High', 'Low', 'Volume']


In [None]:
def trigger_retrain(mean_error_threshold: float) -> None:
    """Replay ``newdata`` day by day and retrain once the running MSE is too high.

    For every row of the notebook-level ``newdata`` frame, the notebook-level
    ``model`` predicts 'Adj Close' from the remaining columns. The squared
    error is appended to a running list whose mean is logged as
    ``rolling_mse`` in a nested MLflow run together with a plot. As soon as
    that mean exceeds ``mean_error_threshold`` the replay stops and
    ``train_lstm_model`` is called on ``data`` plus the rows seen so far.

    Args:
        mean_error_threshold: upper limit for the running mean squared error.

    Relies on the notebook globals ``newdata``, ``data``, ``model`` and
    ``train_lstm_model``.
    """
    avg_error = 0
    seen_dates = []
    squared_errors = []
    threshold_broken = False

    # Context manager so the parent run is closed even if a step below fails.
    with mlflow.start_run(run_name="lstm data drift check 3"):

        for date, row in newdata.iterrows():

            clear_output(wait=True)

            with mlflow.start_run(nested=True, run_name=date.strftime("%Y-%m-%d")):
                day_data = row.to_frame().transpose()
                X = day_data.drop('Adj Close', axis=1)
                actual = float(row['Adj Close'])
                # ravel: works whether predict returns an array or a frame
                predicted = float(np.ravel(model.predict(X))[0])

                seen_dates.append(date)
                squared_errors.append((actual - predicted) ** 2)

                avg_error = float(np.mean(squared_errors))

                fig, ax = plt.subplots()
                ax.plot(seen_dates, squared_errors)
                ax.set_title('MSE over time')
                ax.set_xlabel('Date')
                ax.set_ylabel('MSE')
                ax.axhline(mean_error_threshold, color='r', linestyle='--')

                plt.show()
                mlflow.log_figure(fig, "mse.png")
                plt.close(fig)  # one figure per day: close it to avoid piling them up

                mlflow.log_metric("rolling_mse", avg_error)

            if avg_error > mean_error_threshold:
                threshold_broken = True
                break

    clear_output(wait=True)

    if threshold_broken:
        print("Drift threshold exceeded. Retraining...")

        data_used = newdata[newdata.index <= date]
        all_data = pd.concat([data, data_used])
        # The training function in this notebook is train_lstm_model and its
        # epoch argument is n_epoch; its MLflow run name is fixed by its
        # decorator, so it cannot be overridden from here.
        train_lstm_model(all_data, n_epoch=1, test_size=25)
            

# Replay the new data with a running-MSE threshold of 1,000,000.
trigger_retrain(1000000)


