In [1]:
import os
import json
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
from pathlib import Path

# MLflow is optional at runtime, but we want it for tracking.
# MLFLOW_AVAILABLE gates every tracking call made further down in the notebook.
try:
    import mlflow
    import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except ImportError:
    print("MLflow not found. Proceeding without tracking.")
    MLFLOW_AVAILABLE = False
except Exception as exc:
    # Installed but failing to import (e.g. an incompatible dependency): stay
    # best-effort, but report the real cause instead of claiming it is missing.
    print(f"MLflow import failed ({exc!r}). Proceeding without tracking.")
    MLFLOW_AVAILABLE = False

# Paths
# If the kernel's working directory is named "notebooks", use its parent as the
# base directory; otherwise use the working directory itself.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "..")) if os.path.basename(os.getcwd())=="notebooks" else os.getcwd()
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
os.makedirs(ARTIFACT_DIR, exist_ok=True)

if MLFLOW_AVAILABLE:
    # Log to local folder repo/mlruns so it's portable to CI.
    # Path.as_uri() builds a well-formed file URI on every OS (drive letters,
    # percent-encoded spaces) instead of gluing "file://" onto a raw path.
    mlflow.set_tracking_uri(Path(BASE_DIR, "mlruns").as_uri())
    mlflow.set_experiment("baseline-iris")


2025/09/15 21:54:26 INFO mlflow.tracking.fluent: Experiment with name 'baseline-iris' does not exist. Creating a new experiment.


In [3]:
def load_data(test_size=0.2, random_state=42):
    """Load the iris dataset and split it into stratified train/test parts.

    Parameters
    ----------
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test), feature_names, target_names)``
        where the last two items come straight from the loaded dataset.
    """
    bunch = datasets.load_iris()
    features, labels = bunch.data, bunch.target

    # Stratify on the labels so the class proportions match in both parts.
    train_X, test_X, train_y, test_y = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    train_part = (train_X, train_y)
    test_part = (test_X, test_y)
    return train_part, test_part, bunch.feature_names, bunch.target_names

# Build the split once with the default arguments; later cells read these
# names as notebook-level globals.
(
    (X_train, y_train),
    (X_test, y_test),
    feature_names,
    target_names,
) = load_data()
X_train.shape, X_test.shape


((120, 4), (30, 4))

In [4]:
def _utc_now_naive():
    """Return the current UTC time as a naive ``datetime``.

    Stands in for ``datetime.utcnow()`` (deprecated since Python 3.12) while
    producing the same naive value, so ``isoformat()`` / ``strftime`` output
    keeps its previous shape.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)


def _fit_and_score(model):
    """Fit ``model`` on the global train split and score it on the global test split.

    Returns ``(accuracy, macro_f1)``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds), f1_score(y_test, preds, average="macro")


def _save_run_artifacts(model, n_estimators, max_depth, acc, f1, created, run_id=None):
    """Write model.joblib, model_info.json and metrics.json; return the folder path.

    The folder is ``ARTIFACT_DIR/model_<stamp>`` where ``<stamp>`` is ``created``
    formatted as ``%Y%m%dT%H%M%S``. ``mlflow_run_id`` is added to the model card
    only when ``run_id`` is given.
    """
    import joblib

    stamp = created.strftime("%Y%m%dT%H%M%S")
    run_art_dir = os.path.join(ARTIFACT_DIR, f"model_{stamp}")
    os.makedirs(run_art_dir, exist_ok=True)

    # Persist model
    joblib.dump(model, os.path.join(run_art_dir, "model.joblib"))

    # Minimal model card and metrics
    info = {
        "model_type": "RandomForestClassifier",
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "feature_names": feature_names,
        "target_names": target_names.tolist(),
        "created_utc": created.isoformat() + "Z",
    }
    if run_id is not None:
        info["mlflow_run_id"] = run_id

    with open(os.path.join(run_art_dir, "model_info.json"), "w") as f:
        json.dump(info, f, indent=2)

    with open(os.path.join(run_art_dir, "metrics.json"), "w") as f:
        json.dump({"accuracy": acc, "f1_macro": f1}, f, indent=2)

    return run_art_dir


def train_and_log(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    run_name=None
):
    """Train a random forest on the notebook's split, save artifacts, and track with MLflow if available.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``feature_names``, ``target_names``, ``ARTIFACT_DIR`` and ``MLFLOW_AVAILABLE``.

    Parameters
    ----------
    n_estimators, max_depth, random_state
        Passed to ``RandomForestClassifier``.
    run_name
        MLflow run name; when falsy, ``rf-<UTC ISO timestamp>`` is used.
        Ignored when MLflow is unavailable.

    Returns
    -------
    dict
        ``{"acc", "f1", "artifact_dir", "run_id"}``; ``run_id`` is ``None``
        when MLflow is unavailable.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,
    )

    # One timestamp per call, so the default run name, the artifact folder stamp
    # and `created_utc` in the model card all agree.
    started = _utc_now_naive()

    if not MLFLOW_AVAILABLE:
        # Fallback: no MLflow, still train and save artifacts
        acc, f1 = _fit_and_score(model)
        run_art_dir = _save_run_artifacts(model, n_estimators, max_depth, acc, f1, started)
        print("Saved locally to:", run_art_dir)
        return {"acc": acc, "f1": f1, "artifact_dir": run_art_dir, "run_id": None}

    mlflow.sklearn.autolog(log_models=False)  # we will log model manually with a clean artifact path

    with mlflow.start_run(run_name=run_name or f"rf-{started.isoformat()}") as run:
        # Fit inside the run context, as the original did.
        acc, f1 = _fit_and_score(model)

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_macro", f1)

        # Save model to artifacts with a versioned stamp
        run_art_dir = _save_run_artifacts(
            model, n_estimators, max_depth, acc, f1, started, run_id=run.info.run_id
        )

        # Log directory to MLflow
        mlflow.log_artifacts(run_art_dir, artifact_path="model_artifacts")

        # Also log a MLflow model for easy loading later (optional for Lambda size)
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sk_model",
            registered_model_name=None
        )

        print("Saved locally to:", run_art_dir)
        print("MLflow run id:", run.info.run_id)
        return {"acc": acc, "f1": f1, "artifact_dir": run_art_dir, "run_id": run.info.run_id}

# Run with 300 trees and no depth limit; the returned summary dict is the
# cell's displayed output.
results = train_and_log(
    n_estimators=300,
    max_depth=None,
)
results




Saved locally to: /Users/saikamat/Documents/Python Scripts/FDE-Deployments/02-ml-deployment-pipeline/artifacts/model_20250915T195522
MLflow run id: 1db77263faeb498db68543aed72e2ce2


{'acc': 0.9,
 'f1': 0.899749373433584,
 'artifact_dir': '/Users/saikamat/Documents/Python Scripts/FDE-Deployments/02-ml-deployment-pipeline/artifacts/model_20250915T195522',
 'run_id': '1db77263faeb498db68543aed72e2ce2'}

In [5]:
# Load the most recent model artifact and run a simple prediction
import glob, joblib

# Artifact folders are named model_<%Y%m%dT%H%M%S>, so a reverse lexicographic
# sort puts the newest run first. Only directories are kept so a stray file
# matching model_* cannot be selected.
all_models = sorted(
    (path for path in glob.glob(os.path.join(ARTIFACT_DIR, "model_*")) if os.path.isdir(path)),
    reverse=True,
)
if not all_models:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No model_* directories found under {ARTIFACT_DIR}; run the training cell first."
    )
latest = all_models[0]
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that this project produced.
clf = joblib.load(os.path.join(latest, "model.joblib"))

# Compare predictions with the true labels for the first five test rows.
sample = X_test[:5]
preds = clf.predict(sample)
pd.DataFrame({
    "pred": preds,
    "target": y_test[:5]
})


Unnamed: 0,pred,target
0,0,0
1,2,2
2,1,1
3,1,1
4,0,0
