In [1]:
from diagrams import Diagram, Cluster, Node
from diagrams.generic.compute import Rack
from diagrams.generic.storage import Storage
from diagrams.generic.database import SQL
from diagrams.aws.analytics import Glue

In [4]:
# First version of the workflow diagram: generic rack/storage/SQL icons (plus
# the AWS Glue icon for the EDA step), laid out left-to-right and rendered
# as a JPG. The Diagram object is kept as `diag` so it can be shown inline
# once the context manager has finished rendering.
with Diagram('ML Data Processing Workflow', direction="LR", outformat="jpg") as diag:

    # Step 1: Raw dataset input
    raw_data = Storage("Raw Dataset")

    # Step 2: Initial EDA
    eda = Glue("Preliminary EDA")

    # Step 3: Split into group datasets
    split = Rack("Split into 4 Datasets")

    # Step 4: Parallel feature engineering, one cluster per feature group.
    # Each cluster holds a "Feature Eng." node feeding a "Feature Sel." node.
    with Cluster("Demographic"):
        demo_fe = Rack("Feature Eng.")
        demo_fs = Rack("Feature Sel.")
        demo_fe >> demo_fs

    with Cluster("Clinical"):
        clinical_fe = Rack("Feature Eng.")
        clinical_fs = Rack("Feature Sel.")
        clinical_fe >> clinical_fs

    with Cluster("Social"):
        social_fe = Rack("Feature Eng.")
        social_fs = Rack("Feature Sel.")
        social_fe >> social_fs

    with Cluster("Insurance"):
        ins_fe = Rack("Feature Eng.")
        ins_fs = Rack("Feature Sel.")
        ins_fe >> ins_fs

    # Step 5: Merge and output
    merge = Rack("Merge Selected Features")
    final_data = SQL("Final Processed Dataset")

    # Connections
    # Linear head of the pipeline.
    raw_data >> eda >> split

    # Fan-out: the splitter feeds the first stage of every group.
    split >> demo_fe
    split >> clinical_fe
    split >> social_fe
    split >> ins_fe

    # Fan-in: the last stage of every group feeds the merge step.
    demo_fs >> merge
    clinical_fs >> merge
    social_fs >> merge
    ins_fs >> merge

    merge >> final_data

# Last expression of the cell: evaluating the Diagram object makes Jupyter
# display the rendered image inline (previously `diag` was bound but unused).
diag







In [6]:
from diagrams import Diagram, Cluster
from diagrams.aws.storage import S3
from diagrams.aws.ml import Sagemaker
from diagrams.aws.analytics import Glue
from diagrams.aws.compute import Lambda
from diagrams.aws.database import Redshift

# AWS-icon version of the workflow diagram, laid out left-to-right.
with Diagram("ML Data Processing Workflow", direction="LR"):

    # --- Pipeline head: raw data, exploratory pass, splitter -------------
    raw_data = S3("Raw Dataset")
    eda = Glue("Preliminary EDA")
    split = Lambda("Split into 4 Datasets")

    # --- Four parallel branches, one nested cluster per feature group ----
    # `a >> b` evaluates to `b`, so every selection node is created and
    # wired to its engineering node in a single statement.
    with Cluster("Grouped Datasets (EDA + Feature Engineering + Selection)"):
        with Cluster("Demographic"):
            demo_fe = Sagemaker("Feature Engineering")
            demo_fs = demo_fe >> Sagemaker("Feature Selection")

        with Cluster("Clinical"):
            clinical_fe = Sagemaker("Feature Engineering")
            clinical_fs = clinical_fe >> Sagemaker("Feature Selection")

        with Cluster("Social"):
            social_fe = Sagemaker("Feature Engineering")
            social_fs = social_fe >> Sagemaker("Feature Selection")

        with Cluster("Insurance"):
            ins_fe = Sagemaker("Feature Engineering")
            ins_fs = ins_fe >> Sagemaker("Feature Selection")

    # --- Pipeline tail: merge step and final store ------------------------
    merge = Lambda("Merge Selected Features")
    final_dataset = Redshift("Final Processed Dataset")

    # --- Wiring -----------------------------------------------------------
    # Head of the pipeline, then fan out from the splitter to every branch.
    raw_data >> eda >> split >> [demo_fe, clinical_fe, social_fe, ins_fe]

    # Fan in from every branch to the merge step, then on to the final store.
    [demo_fs, clinical_fs, social_fs, ins_fs] >> merge >> final_dataset

In [8]:
from diagrams import Diagram, Cluster, Node
from diagrams.aws.iot import IotAnalyticsDataSet
from diagrams.aws.ml import Sagemaker
from diagrams.aws.analytics import Glue
from diagrams.aws.compute import Lambda
from diagrams.aws.database import Redshift
from diagrams.generic.compute import Rack
from diagrams.generic.storage import Storage
from diagrams.generic.database import SQL

In [None]:
# Third version of the workflow diagram: the whole pipeline sits inside one
# outer cluster and every dataset stage uses the IoT Analytics data set icon.
#
# An explicit `filename` is passed because this diagram has the same title
# and the same (default) output format as the AWS-icon diagram in the earlier
# cell. Left to the default, both cells would render to the same output file
# and this one would silently overwrite the other.
with Diagram(
    "ML Data Processing Workflow",
    filename="ml_data_processing_workflow_grouped",
    direction="LR",
):

    with Cluster("EDA + Data Preprocessing"):
        # Initial raw dataset
        raw_data = IotAnalyticsDataSet("Raw Dataset")

        # Step 1: Preliminary EDA
        eda = Glue("Preliminary EDA")

        # Step 2: Split (node is left unlabeled; the cluster title below
        # describes the split)
        split = IotAnalyticsDataSet("")

        # Step 3: Parallel feature engineering + selection
        with Cluster("Dataset split into 4 groups: Feature Engineering + Feature Selection"):
            with Cluster("Demographic group"):
                demo_fe = IotAnalyticsDataSet("Feature Eng.")
                demo_fs = IotAnalyticsDataSet("Feature Sel.")
                demo_fe >> demo_fs

            with Cluster("Clinical group"):
                clinical_fe = IotAnalyticsDataSet("Feature Eng.")
                clinical_fs = IotAnalyticsDataSet("Feature Sel.")
                clinical_fe >> clinical_fs

            with Cluster("Social group"):
                social_fe = IotAnalyticsDataSet("Feature Eng.")
                social_fs = IotAnalyticsDataSet("Feature Sel.")
                social_fe >> social_fs

            with Cluster("Insurance group"):
                ins_fe = IotAnalyticsDataSet("Feature Eng.")
                ins_fs = IotAnalyticsDataSet("Feature Sel.")
                ins_fe >> ins_fs

        # Step 4: Final dataset. This version has no separate merge node;
        # the four selection stages feed the final dataset directly.
        final_dataset = IotAnalyticsDataSet("Final Processed Dataset")

        # Connections
        # Linear head of the pipeline.
        raw_data >> eda >> split

        # Fan-out: the split node feeds the first stage of every group.
        split >> demo_fe
        split >> clinical_fe
        split >> social_fe
        split >> ins_fe

        # Fan-in: the last stage of every group feeds the final dataset.
        demo_fs >> final_dataset
        clinical_fs >> final_dataset
        social_fs >> final_dataset
        ins_fs >> final_dataset

