### Setting up the environment

In [None]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

In [None]:
import os
# Workspace coordinates. Each value is read from an environment variable and
# falls back to the literal default (second argument) when the variable is unset.
# NOTE(review): the fallbacks point at one specific subscription/workspace; set
# WORKSPACE, SUBSCRIPTION_ID and RESOURCE_GROUP to target a different one.
workspace_name = os.getenv('WORKSPACE', 'ml-workspace')
subscription_id = os.getenv('SUBSCRIPTION_ID', '72b19ebb-8177-4d32-a321-da2a8bd06ccb')
resource_group = os.getenv('RESOURCE_GROUP', 'mlops-project')

In [None]:
# Build the credential object that is passed to MLClient in the next cell.
# NOTE(review): per the azure-identity documentation, DefaultAzureCredential tries
# several sources in turn (environment variables, managed identity, Azure CLI
# login, ...) — confirm which one is picked up in your environment.
credential = DefaultAzureCredential()

In [None]:
# Client handle for the Azure ML workspace identified by the settings above.
# Keyword arguments make it explicit which value goes where.
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

In [None]:
import os

# Where the conda specification for the data-prep environment is stored.
folder_path = "./components/dataprep"
file_path = os.path.join(folder_path, "conda.yaml")

# Conda environment definition: Python 3.8 with the scientific stack from
# conda-forge, plus pip-installed packages (including Pillow).
conda_yaml_content = """
name: aml-Pillow
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - mlflow==1.26.1
    - azureml-mlflow==1.42.0
    - Pillow==10.0.1
"""

# Ensure the target directory exists, then (over)write the file on every run.
os.makedirs(folder_path, exist_ok=True)
with open(file_path, "w") as yaml_file:
    yaml_file.write(conda_yaml_content)

print(" conda.yaml written to", file_path)

### Pillow lets you work with images inside your Azure ML (AML) projects.

### 

In [None]:
from azure.ai.ml.entities import Environment
import os

# Name under which the environment is registered in the workspace.
custom_env_name = "aml-Pillow"

# The environment combines a base Docker image with the conda specification at
# components/dataprep/conda.yaml; create_or_update returns the registered entity.
pipeline_job_env = ml_client.environments.create_or_update(
    Environment(
        name=custom_env_name,
        description="Custom environment for Image Processing (with Pillow)",
        tags={"Pillow": "10.0.1"},
        conda_file=os.path.join("components", "dataprep", "conda.yaml"),
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    )
)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

### Creating components

### Data preparation

In [None]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output
import os

# Command component that runs dataprep.py on one input folder and writes the
# result to one output folder. The script is read from components/dataprep/code.
data_prep_command = command(
    name="data_prep_png_to_jpg",
    display_name="Data Preparation - PNG to JPG (28x28)",
    description="Converts 28x28 PNG images to JPG format.",
    inputs=dict(data=Input(type="uri_folder")),
    outputs=dict(output_data=Output(type="uri_folder", mode="rw_mount")),
    code=os.path.join("components", "dataprep", "code"),  # Must contain dataprep.py
    command="""
        python dataprep.py \
        --data ${{inputs.data}} \
        --output_data ${{outputs.output_data}}
    """,
    environment="aml-Pillow@latest",
)

# Register the component; later cells call the registered component returned here.
data_prep_component = ml_client.create_or_update(data_prep_command.component)

print(
    f"✅ Component {data_prep_component.name} (v{data_prep_component.version}) registered"
)


In [None]:
import os
from azure.ai.ml import dsl, Input, Output

# Pipeline that runs the data-prep component once per digit (0-9).
# Each run reads the registered data asset for that digit and writes its output
# to processed_mnist/<digit> on the workspace blob datastore, registering it as
# the data asset "mnist_<digit>_jpg". The pipeline exposes one output per digit,
# named "label_<digit>".
@dsl.pipeline(
    compute="compute-sadikshasapkota",
    description="Data preprocessing for MNIST folders"
)
def mnist_preprocessing_pipeline():
    # Version of the input data assets; hard-coded instead of being a pipeline parameter.
    input_version = "1"

    # Workspace coordinates are needed to build the full datastore URI of each output.
    workspace_name = os.environ.get('WORKSPACE', 'ml-workspace')
    subscription_id = os.environ.get('SUBSCRIPTION_ID', '72b19ebb-8177-4d32-a321-da2a8bd06ccb')
    resource_group = os.environ.get('RESOURCE_GROUP', 'mlops-project')

    pipeline_outputs = {}

    for i in range(10):
        label = str(i)
        # NOTE(review): the asset name is spelled "minst-<digit>" (not "mnist") —
        # confirm this matches the data assets registered in the workspace.
        folder_name = f"minst-{i}"

        data_prep_job = data_prep_component(
            data=Input(
                type="uri_folder",
                path=f"azureml:{folder_name}:{input_version}"
            )
        )

        output_path = (
            f"azureml://subscriptions/{subscription_id}/resourcegroups/{resource_group}/"
            f"workspaces/{workspace_name}/datastores/workspaceblobstore/paths/processed_mnist/{label}"
        )

        data_prep_job.outputs.output_data = Output(
            type="uri_folder",
            path=output_path,
            name=f"mnist_{label}_jpg",
            mode="rw_mount"
        )

        # Key by "label_<digit>" exactly once; the previous version prefixed the
        # already-prefixed key again and produced "label_label_<digit>".
        pipeline_outputs[f"label_{label}"] = data_prep_job.outputs.output_data

    return pipeline_outputs


In [None]:
# Build the preprocessing pipeline job object; submission happens in the next cell.
# NOTE(review): this binds the name `pipeline` to a job object — do not confuse it
# with the `dsl.pipeline` decorator.
pipeline = mnist_preprocessing_pipeline()


In [None]:
import webbrowser
# Send the preprocessing pipeline to the workspace under its own experiment name.
pipeline_job = ml_client.jobs.create_or_update(pipeline, experiment_name="mnist_preprocessing_pipeline")

# Open the submitted job's Studio page in the default web browser.
webbrowser.open(pipeline_job.studio_url)

### Train test split

In [None]:
from azure.ai.ml import command, Input, Output
from azure.ai.ml.entities import Environment
import os

# Command component that runs traintestsplit.py over ten digit folders
# (inputs mnist_0 .. mnist_9) with a split factor, and writes a training folder
# and a testing folder.
data_split_component = command(
    name="mnist_data_split",
    display_name="Split MNIST Datasets",
    description="Splits 10 MNIST datasets into training and testing sets.",
    inputs={
        # One uri_folder input per digit: mnist_0, mnist_1, ..., mnist_9.
        **{f"mnist_{digit}": Input(type="uri_folder") for digit in range(10)},
        "train_test_split_factor": Input(type="number"),
    },
    outputs={
        "training_data": Output(type="uri_folder", mode="rw_mount"),
        "testing_data": Output(type="uri_folder", mode="rw_mount"),
    },
    # Lower-case "components", matching every other path in this notebook; the
    # previous "Components/..." spelling fails on case-sensitive filesystems.
    code=os.path.join("components", "dataprep", "code"),  # must contain traintestsplit.py
    command="""python traintestsplit.py \
        --datasets ${{inputs.mnist_0}} ${{inputs.mnist_1}} ${{inputs.mnist_2}} \
                  ${{inputs.mnist_3}} ${{inputs.mnist_4}} ${{inputs.mnist_5}} \
                  ${{inputs.mnist_6}} ${{inputs.mnist_7}} ${{inputs.mnist_8}} \
                  ${{inputs.mnist_9}} \
        --split_size ${{inputs.train_test_split_factor}} \
        --training_data_output ${{outputs.training_data}} \
        --testing_data_output ${{outputs.testing_data}} \
    """,
    environment="aml-Pillow@latest",
)

# Register the component; later cells call the registered component returned here.
data_split_component = ml_client.create_or_update(data_split_component.component)
print(f"✅ Component registered: {data_split_component.name}")


In [None]:
from azure.ai.ml import dsl, Input, Output

# Pipeline with a single step: the MNIST split component.
# It forwards the ten digit folders and the split factor to the component and
# exposes the component's two outputs as "training_data" and "testing_data".
@dsl.pipeline(
    description="Custom data_split pipeline for MNIST 0-9",
    compute="compute-sadikshasapkota",
)
def mnist_images_traintest_split_pipeline(
    train_test_split: int,
    mnist_0: Input,
    mnist_1: Input,
    mnist_2: Input,
    mnist_3: Input,
    mnist_4: Input,
    mnist_5: Input,
    mnist_6: Input,
    mnist_7: Input,
    mnist_8: Input,
    mnist_9: Input,
):
    # Digit folders keyed by the component's input names.
    digit_folders = {
        "mnist_0": mnist_0,
        "mnist_1": mnist_1,
        "mnist_2": mnist_2,
        "mnist_3": mnist_3,
        "mnist_4": mnist_4,
        "mnist_5": mnist_5,
        "mnist_6": mnist_6,
        "mnist_7": mnist_7,
        "mnist_8": mnist_8,
        "mnist_9": mnist_9,
    }

    data_split_job = data_split_component(
        train_test_split_factor=train_test_split,
        **digit_folders,
    )

    # Give both outputs an explicit name and mount mode.
    training_output = Output(type="uri_folder", name="training_data", mode="rw_mount")
    testing_output = Output(type="uri_folder", name="testing_data", mode="rw_mount")
    data_split_job.outputs.training_data = training_output
    data_split_job.outputs.testing_data = testing_output

    return dict(
        training_data=data_split_job.outputs.training_data,
        testing_data=data_split_job.outputs.testing_data,
    )


In [None]:
# Version of the "mnist_<digit>_jpg" data assets to read.
version = "1"

# One uri_folder input per digit, keyed by the pipeline's parameter names.
mnist_inputs = dict(
    (f"mnist_{digit}", Input(type="uri_folder", path=f"azureml:mnist_{digit}_jpg:{version}"))
    for digit in range(10)
)

# Instantiate the split pipeline with a split factor of 20.
train_test_pipeline = mnist_images_traintest_split_pipeline(train_test_split=20, **mnist_inputs)


In [None]:
import webbrowser

# Submit the train/test split pipeline and open its Studio page in the browser.
pipeline_job = ml_client.jobs.create_or_update(train_test_pipeline, experiment_name="mnist_split_experiment")
webbrowser.open(pipeline_job.studio_url)


### Training the model

#### Registering the environment, components, and pipeline

In [None]:
from azure.ai.ml.entities import Environment
import os

# Name under which the training environment is registered in the workspace.
custom_env_name = "aml-Tensorflow-Pillow"

# The environment combines a base Docker image with the conda specification at
# components/training/conda.yaml.
# NOTE(review): this notebook does not create that file — it must already exist.
pipeline_job_env = ml_client.environments.create_or_update(
    Environment(
        name=custom_env_name,
        description="Custom environment for AI Training (with Pillow)",
        tags={"Pillow": "10.0.1", "Tensorflow": "2.4.1"},
        conda_file=os.path.join("components", "training", "conda.yaml"),
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    )
)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

In [None]:
import os
from azure.ai.ml import command, Input, Output

# Command component that runs train.py with a training folder, a testing folder
# and an epoch count, and writes its results to one output folder.
training_command = command(
    name="mnist_training",
    display_name="Training MNIST CNN",
    description="Trains a CNN on MNIST JPG images",
    inputs=dict(
        training_folder=Input(type="uri_folder"),
        testing_folder=Input(type="uri_folder"),
        epochs=Input(type="number"),
    ),
    outputs=dict(output_folder=Output(type="uri_folder", mode="rw_mount")),
    code=os.path.join("components", "training", "code"),  # must contain train.py and utils.py
    command="""python train.py \
        --training_folder ${{inputs.training_folder}} \
        --testing_folder ${{inputs.testing_folder}} \
        --output_folder ${{outputs.output_folder}} \
        --epochs ${{inputs.epochs}}""",
    environment="aml-Tensorflow-Pillow@latest",
)

# Register the component; later cells call the registered component returned here.
training_component = ml_client.create_or_update(training_command.component)
print(f"✅ Registered: {training_component.name} (v{training_component.version})")


In [None]:
from azure.ai.ml import dsl, Input, Output

# Pipeline with a single step: the MNIST training component.
# It forwards both data folders and the epoch count to the component and exposes
# the component's output folder as the pipeline output "output_data".
@dsl.pipeline(
    description="MNIST Training Pipeline",
    compute="compute-sadikshasapkota",
)
def mnist_training_pipeline(
    training_folder: Input,
    testing_folder: Input,
    epochs: int,
):
    training_job = training_component(
        epochs=epochs,
        training_folder=training_folder,
        testing_folder=testing_folder,
    )

    # Give the output an explicit name and mount mode.
    named_output = Output(type="uri_folder", name="output_data", mode="rw_mount")
    training_job.outputs.output_folder = named_output

    return dict(output_data=training_job.outputs.output_folder)


In [None]:
import webbrowser

# Inputs are version 1 of the "training_data" and "testing_data" data assets.
training_inputs = dict(
    training_folder=Input(type="uri_folder", path="azureml:training_data:1"),
    testing_folder=Input(type="uri_folder", path="azureml:testing_data:1"),
)
training_pipeline = mnist_training_pipeline(epochs=5, **training_inputs)

# Submit the training pipeline under its own experiment name.
training_pipeline_job = ml_client.jobs.create_or_update(training_pipeline, experiment_name="mnist_training_pipeline")

# Open the submitted job's Studio page in the default web browser.
webbrowser.open(training_pipeline_job.studio_url)


### Chaining the whole pipeline together

In [None]:
# Fetch version "1" of the three registered components, in pipeline order:
# data preparation, train/test split, training.
prep, split, train = (
    ml_client.components.get(name=component_name, version="1")
    for component_name in ("data_prep_png_to_jpg", "mnist_data_split", "mnist_training")
)


In [None]:
# End-to-end pipeline: data preparation -> train/test split -> training.
# Takes one raw input folder, a split ratio and an epoch count, and exposes the
# training step's output folder as the pipeline output "trained_model".
#
# The decorator must be `dsl.pipeline`: a bare `pipeline` decorator is never
# imported in this notebook, and the name `pipeline` is bound to a pipeline job
# object by an earlier cell, so `@pipeline(...)` fails when this cell runs.
@dsl.pipeline(default_compute="compute-sadikshasapkota")
def full_mnist_pipeline(
    raw_input: Input,
    split_ratio: float = 20.0,
    epochs: int = 5
):
    # The data-prep component takes a single `data` input; its output is
    # read below as `outputs.output_data`.
    prep_step = prep(
        data=raw_input
    )

    # NOTE(review): the split component defined in this notebook declares the
    # inputs mnist_0 .. mnist_9 and train_test_split_factor, not `datasets` /
    # `split_size`. Unless version "1" in the workspace has a different
    # interface, this call will be rejected — TODO confirm and rewire.
    split_step = split(
        datasets=[prep_step.outputs.output_data],
        split_size=split_ratio,
    )

    train_step = train(
        training_folder=split_step.outputs.training_data,
        testing_folder=split_step.outputs.testing_data,
        epochs=epochs
    )

    return {
        "trained_model": train_step.outputs.output_folder
    }


In [None]:
# Instantiate the end-to-end pipeline on version 1 of the "mnist_raw_dataset" asset.
raw_dataset = Input(type="uri_folder", path="azureml:mnist_raw_dataset:1")
pipeline_job = full_mnist_pipeline(raw_input=raw_dataset, split_ratio=20.0, epochs=5)

# Submit it; the name is rebound to the job returned by the service.
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="mnist_full_pipeline")

# Last expression of the cell: display the submitted job.
pipeline_job
