In [None]:
# dev/training.ipynb
# Minimal example (kept commented out for reference):
#  - Uses slightly different parameters for dev

#import mlflow
#import mlflow.sklearn
#from sklearn.datasets import load_iris
#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier

#print("DEV TRAINING NOTEBOOK")

# Ingest data
#iris = load_iris()
#X_train, X_test, y_train, y_test = train_test_split(
#    iris.data, iris.target, test_size=0.2, random_state=42
#)

# Dev might use fewer trees, etc.
#model = RandomForestClassifier(n_estimators=5, random_state=42)
#model.fit(X_train, y_train)

#mlflow.set_experiment("/Users/<YOUR_USER>/dev_experiment")  # e.g. Dev-specific MLflow experiment path

#with mlflow.start_run():
#    mlflow.log_param("n_estimators", 5)
#    accuracy = model.score(X_test, y_test)
#    mlflow.log_metric("accuracy", accuracy)
#    mlflow.sklearn.log_model(model, artifact_path="model")
#    print(f"Dev training complete. Accuracy={accuracy}")


In [None]:
%pip install mlflow>=2.0.0 scikit-learn>=1.0.0 pandas>=1.3.0


In [None]:
# Restart the Python process so the packages installed by the %pip cell are the
# ones imported below. NOTE(review): this presumably also clears any Python
# state defined before this point — confirm against the Databricks docs.
dbutils.library.restartPython()

In [None]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from mlflow.models.signature import infer_signature

In [None]:
# Read the notebook's parameters from Databricks widgets. Each widget is looked
# up by the same name as the variable it populates, in the order listed.
(
    catalog_name,
    schema_name,
    model_name,
    exp_name,
    model_description,
    model_category,
    owner,
    project,
    team,
) = [
    dbutils.widgets.get(widget_name)
    for widget_name in (
        "catalog_name",
        "schema_name",
        "model_name",
        "exp_name",
        "model_description",
        "model_category",
        "owner",
        "project",
        "team",
    )
]

In [None]:
# Tag set built from the widget values; the model description is carried under
# the "mlflow.note.content" key. Both names refer to the same dict object.
MODEL_TAGS = model_tags = dict(
    [
        ("Owner", owner),
        ("Project", project),
        ("Category", model_category),
        ("mlflow.note.content", model_description),
    ]
)

# Upper-case aliases of the widget-supplied model name and description.
MODEL_NAME = model_name
MODEL_DESCRIPTION = model_description


In [None]:
# Load the iris dataset and assemble a single DataFrame: the feature columns
# first, followed by a trailing "target" column.
iris = load_iris()
data = pd.DataFrame(
    np.column_stack((iris.data, iris.target)),
    columns=[*iris.feature_names, "target"],
)

In [None]:
# Draw 150 rows with a fixed seed, then separate the feature columns from the
# "target" label column.
train_data = data.sample(n=150, random_state=42)
X_train = train_data.loc[:, iris.feature_names]
y_train = train_data.loc[:, "target"]

In [None]:
# Alias the widget-supplied `exp_name` under the name the MLflow cells use.
experiment_name = exp_name

In [None]:
# Resolve the experiment id: reuse the experiment when one with this name
# already exists, otherwise create it and keep the id that is returned.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    experiment.experiment_id
    if experiment is not None
    else mlflow.create_experiment(experiment_name)
)

In [None]:
# Make the experiment the active one and direct model-registry calls to the
# "databricks-uc" registry URI.
mlflow.set_experiment(experiment_name=experiment_name)
mlflow.set_registry_uri(uri="databricks-uc")


In [None]:
# Build the random forest and fit it in one expression; fit() returns the
# estimator itself, so `model` ends up bound to the trained classifier.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

In [None]:
# Three-part registered model name: <catalog>.<schema>.<model>
registered_model_name = ".".join((catalog_name, schema_name, model_name))

In [None]:
# Log the trained model to MLflow, register it under `registered_model_name`,
# and point the "production" alias at the version registered by this run.
with mlflow.start_run(experiment_id=experiment_id, tags=MODEL_TAGS):

    # Read the hyperparameters back from the estimator so the logged values
    # cannot drift from the ones it was constructed with.
    mlflow.log_params({
        "n_estimators": model.n_estimators,
        "max_depth": model.max_depth,
        "random_state": model.random_state,
    })

    # Accuracy is measured on the same rows the model was fitted on.
    mlflow.log_metric("training_accuracy", model.score(X_train, y_train))

    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.iloc[:5]

    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="iris-model",
        signature=signature,
        input_example=input_example,
        registered_model_name=registered_model_name,
    )

    client = mlflow.MlflowClient()

    # The registry URI is "databricks-uc"; the stage-based
    # get_latest_versions() call is not supported there, so it must not be
    # used to discover the new version. Prefer the version reported by
    # log_model() itself (exposed on ModelInfo by newer MLflow releases) and
    # otherwise fall back to the highest version number in the registry.
    latest_version = getattr(model_info, "registered_model_version", None)
    if latest_version is None:
        versions = client.search_model_versions(f"name='{registered_model_name}'")
        if not versions:
            raise RuntimeError(
                f"No versions found for registered model '{registered_model_name}'"
            )
        # Versions are returned as strings; compare numerically so "10" > "9".
        latest_version = max(int(mv.version) for mv in versions)

    client.set_registered_model_alias(
        registered_model_name,
        "production",
        str(latest_version),
    )