In [1]:
import pandas as pd
import joblib
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Model, Data
from azure.identity import DefaultAzureCredential
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from azure.ai.ml.entities import Job
import mlflow

# Read the German credit CSV and discard the 'Sno' column so it is never fed
# to the model.
df = pd.read_csv("./data/german_credit_dataset.csv").drop(columns=['Sno'])

# Separate the 'Risk' target column from the remaining predictor columns.
y_raw = df['Risk']
X_raw = df.drop(columns=['Risk'])

# Group predictor columns by dtype; each group is preprocessed differently
# further down (scaling for numeric columns, one-hot encoding for object columns).
numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns
categorical_features = X_raw.select_dtypes(include=['object']).columns

# Dense one-hot encoder for the categorical columns.
# - The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
#   removed in 1.4. Try the current spelling first and fall back to the legacy
#   one, so the notebook runs on either side of that change.
# - handle_unknown='ignore' encodes a category that was not present at fit time
#   as an all-zero row instead of raising when the pipeline predicts on
#   held-out or new data.
try:
    onehot_encoder = OneHotEncoder(categories='auto', sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn (< 1.2) only knows the `sparse` keyword.
    onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')

# Categorical branch: fill missing values with the literal "missing" so they
# become their own category, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('onehotencoder', onehot_encoder)])

# Numeric branch: standardise only (no imputation step is applied here).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Route each column group to its branch; columns in neither group are dropped.
feature_engineering_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ], remainder="drop")

# Encode the 'Risk' labels as integer class ids.
le = LabelEncoder()
encoded_y = le.fit_transform(y_raw)

# Hold out 25% of the rows for evaluation. Stratifying on the encoded target
# keeps the class proportions the same in both partitions; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    encoded_y,
    test_size=0.25,
    stratify=encoded_y,
    random_state=42,
)

# End-to-end model: column preprocessing followed by an L2-penalised
# logistic regression.
lr_clf = Pipeline(
    steps=[
        ('preprocessor', feature_engineering_pipeline),
        ('classifier', LogisticRegression(penalty='l2', solver="lbfgs", random_state=23)),
    ]
)

In [2]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Workspace coordinates are read from the environment so that no subscription
# details are hard-coded in the notebook.
subscription_id = os.getenv("SUBSCRIPTION_ID")
resource_group = os.getenv("RESOURCE_GROUP")
workspace_name = os.getenv("WORKSPACE_NAME")

# Fail fast, naming exactly what is missing, instead of handing None (or an
# empty string) to MLClient and surfacing the problem in a later cell.
missing_settings = [
    env_name
    for env_name, env_value in (
        ("SUBSCRIPTION_ID", subscription_id),
        ("RESOURCE_GROUP", resource_group),
        ("WORKSPACE_NAME", workspace_name),
    )
    if not env_value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Define them in the environment or in a .env file."
    )

# Authenticate with DefaultAzureCredential.
credential = DefaultAzureCredential()

# Client handle for the Azure ML workspace identified above.
ml_client = MLClient(credential, subscription_id, resource_group, workspace_name)

In [3]:
# Look up the workspace record and point MLflow at its tracking endpoint, so
# runs started from this notebook are recorded in Azure ML.
aml_workspace = ml_client.workspaces.get(ml_client.workspace_name)
mlflow_tracking_uri = aml_workspace.mlflow_tracking_uri
mlflow.set_tracking_uri(mlflow_tracking_uri)

# Select the experiment that subsequent runs are grouped under. This is left
# as the final bare expression so the cell displays the Experiment object.
experiment_name = 'german_credit_card_hsg'
mlflow.set_experiment(experiment_name)

  from google.protobuf import service as _service
2024/09/26 21:31:31 INFO mlflow.tracking.fluent: Experiment with name 'german_credit_card_hsg' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1727379094897, experiment_id='139e2307-e993-48d1-b596-ab3ce4f737f8', last_update_time=None, lifecycle_stage='active', name='german_credit_card_hsg', tags={}>

In [4]:
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Model

with mlflow.start_run() as run:
    # Fit the preprocessing + logistic-regression pipeline on the training split.
    lr_clf.fit(X_train, y_train)

    # Evaluate: accuracy on both splits, plus recall for encoded class 0.
    # NOTE(review): pos_label=0 presumes the LabelEncoder assigned id 0 to the
    # 'Bad' risk class — confirm against le.classes_.
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    recall = recall_score(y_test, lr_clf.predict(X_test), pos_label=0)

    print("Training accuracy: %.3f" % train_acc)
    print("Test data accuracy: %.3f" % test_acc)
    print("Recall for class 'Bad': ", recall)

    # Record the metrics on the active MLflow run.
    mlflow.log_metric('Train accuracy', train_acc)
    mlflow.log_metric('Test accuracy', test_acc)
    mlflow.log_metric('Recall', recall)

    # Serialise the fitted pipeline locally, then attach the file to the run.
    joblib.dump(lr_clf, 'model.pkl')
    mlflow.log_artifact('model.pkl')

    # Register the model in the workspace, tagged with the data asset it was
    # trained against.
    ds = ml_client.data.get(name='german_credit_card_hsg', version='1')
    model = Model(
        name='german-credit-card-hsg',
        path="model.pkl",
        description='Model for German Credit data',
        tags={
            "use": "demo",
            # Tag values are sent as strings, so the float metric is
            # stringified explicitly rather than relying on implicit coercion.
            'recall': str(recall),
            "dataset_name": ds.name,
            "dataset_version": ds.version,
            # Link the registered model back to the MLflow run that produced it.
            "mlflow_run_id": run.info.run_id,
            },
        type=AssetTypes.CUSTOM_MODEL
    )
    ml_client.models.create_or_update(model)



Training accuracy: 0.743
Test data accuracy: 0.752
Recall for class 'Bad':  0.38666666666666666


[32mUploading model.pkl[32m (< 1 MB): 100%|##########| 6.52k/6.52k [00:00<00:00, 144kB/s]
[39m

2024/09/26 21:34:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run eager_shelf_86zs1g53 at: https://switzerlandnorth.api.azureml.ms/mlflow/v2.0/subscriptions/b5b33273-782e-496a-964f-72acfc982719/resourceGroups/hsg-lesson-rg/providers/Microsoft.MachineLearningServices/workspaces/hsg-lesson-aml/#/experiments/139e2307-e993-48d1-b596-ab3ce4f737f8/runs/49912416-80bf-4e5e-94d1-978eb94e6ccc.
2024/09/26 21:34:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://switzerlandnorth.api.azureml.ms/mlflow/v2.0/subscriptions/b5b33273-782e-496a-964f-72acfc982719/resourceGroups/hsg-lesson-rg/providers/Microsoft.MachineLearningServices/workspaces/hsg-lesson-aml/#/experiments/139e2307-e993-48d1-b596-ab3ce4f737f8.
