In [5]:
# LOGIN
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# Try the default credential first and check up front that it can actually
# obtain a token for the Azure management scope.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Report why the default credential failed instead of discarding the
    # error, then fall back to an interactive browser login.
    print(
        f"DefaultAzureCredential failed ({type(ex).__name__}: {ex}); "
        "falling back to InteractiveBrowserCredential"
    )
    credential = InteractiveBrowserCredential()

In [6]:
# Get a handle to the workspace, using the `credential` created in the login cell.
# NOTE(review): the subscription, resource group and workspace names are
# hard-coded; consider reading them from a config file or environment
# variables so the notebook can target another workspace without code edits.
ml_client = MLClient(
    credential=credential,
    subscription_id="56539498-d3d8-4a3b-92f4-f3b098a11d1e",
    resource_group_name="continuous_review_ms_and_ucl",
    workspace_name="EPPI_DEV",
)

In [7]:
# Retrieve an existing environment from the workspace.
# `pipeline_job_env` supplies the "name:version" string used as `environment=`
# by the component definitions further down. A later cell rebinds the same
# variable when it (re)creates the environment with the same name and version.
env_name = "aml-eppi-text-classification"
env_version = "0.1.3"  # Specify the version of the environment
pipeline_job_env = ml_client.environments.get(name=env_name, version=env_version)

In [3]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Describe the raw review dataset as a single-file data asset. The path is
# relative, so it is resolved against the notebook's working directory.
data_path = "../data/raw/debunking_review.tsv"

# This only builds the local asset description; the registration call is in
# the next cell.
debunking_data = Data(
    name="debunking_review_data",
    path=data_path,
    type=AssetTypes.URI_FILE,
    description="Dataset for testing sams text classification pipeline",
    version="1.0.0",
)

In [None]:
# Register the dataset in the workspace; `debunking_data` is rebound to the
# object returned by the service, whose name and version are printed below.
debunking_data = ml_client.data.create_or_update(debunking_data)
print(
    f"Dataset with name {debunking_data.name} was registered to workspace, the dataset version is {debunking_data.version}"
)

In [47]:
import os

# Folder that receives the conda YAML files written by the %%writefile cells
# below and referenced through `conda_file=` when environments are created.
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [26]:
%%writefile {dependencies_dir}/conda.yaml
# Conda specification for the environment used by the pipeline components.
name: eppi-text-classification-env 
channels:
  - conda-forge 
dependencies:
  - python=3.11.8
  - pip=24.0
  - pip:
    # NOTE(review): this installs from the moving `dev` branch, so rebuilding
    # the environment can pull different code; consider pinning a tag or commit.
    - git+https://github.com/samjmolyneux/eppi-text-classification.git@dev
    # spaCy English model wheel, pinned to version 3.7.1.
    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

Overwriting ./dependencies/conda.yaml


In [28]:
from azure.ai.ml.entities import Environment

# Build and register the environment used by the pipeline components from the
# conda file written in the previous cell.
custom_env_name = "aml-eppi-text-classification"

# NOTE(review): name and version match the environment fetched near the top
# of the notebook, and the base image uses the floating `:latest` tag.
pipeline_job_env = Environment(
    name=custom_env_name,
    description="Custom environment for eppi classifier workbench pipeline",
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.1.3",
)
# Rebind to the object returned by the workspace so name/version reflect what
# was registered.
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

Environment with name aml-eppi-text-classification is registered to workspace, the environment version is 0.1.3


## MLflow env

In [48]:
%%writefile {dependencies_dir}/display_image_env.yaml
# Conda specification for the environment used by the view_html_image
# component, which imports `mlflow` to log an artifact.
name: display-image-env 
channels:
  - conda-forge 
dependencies:
  - python=3.11.8
  - pip=24.0
  - pip:
    - azureml-mlflow==1.42.0  

Writing ./dependencies/display_image_env.yaml


In [51]:
from azure.ai.ml.entities import Environment

# Build and register the environment for the image-display component.
# Note that `custom_env_name` is reused (rebound) from the earlier
# environment cell.
custom_env_name = "display-image-env"

# NOTE(review): the base image uses the floating `:latest` tag.
display_image_env = Environment(
    name=custom_env_name,
    description="Environment for displaying images in azure ml",
    conda_file=os.path.join(dependencies_dir, "display_image_env.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.1.0",
)
# Rebind to the object returned by the workspace.
display_image_env = ml_client.environments.create_or_update(display_image_env)

print(
    f"Environment with name {display_image_env.name} is registered to workspace, the environment version is {display_image_env.version}"
)

Environment with name display-image-env is registered to workspace, the environment version is 0.1.0


## Process data component


In [98]:
import os

# Source folder for the data-preparation component: data_prep.py is written
# here and the folder is passed as `code=` in the component definition.
process_data = "./components/process_data"
os.makedirs(process_data, exist_ok=True)

In [99]:
%%writefile {process_data}/data_prep.py
import argparse
import os

import numpy as np
import pandas as pd
from scipy.sparse import save_npz

from eppi_text_classification import (
    get_features_and_labels,
    get_tfidf_and_names,
)


def main():
    """Convert a tab-separated dataset into labels and TF-IDF features.

    Command-line arguments (all required):
        --data: path to the input TSV file.
        --labels: output folder that receives ``labels.npy``.
        --tfidf_scores: output folder that receives ``tfidf_scores.npz``.
        --feature_names: output folder that receives ``feature_names.npy``.
    """
    # Every path is mandatory: without `required=True` a missing argument is
    # parsed as None and only fails later with an opaque error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data", type=str, required=True, help="path to input dataframe"
    )
    parser.add_argument(
        "--labels", type=str, required=True, help="path to ordered list of labels"
    )
    parser.add_argument(
        "--tfidf_scores",
        type=str,
        required=True,
        help="path to tfidf scores for data",
    )
    parser.add_argument(
        "--feature_names",
        type=str,
        required=True,
        help="path to ordered list of feature names",
    )
    args = parser.parse_args()

    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    print("input data:", args.data)

    df = pd.read_csv(args.data, sep="\t")

    word_features, labels = get_features_and_labels(df)
    tfidf_scores, feature_names = get_tfidf_and_names(word_features)

    print(f"labels: {args.labels}")
    print(f"feature_names: {args.feature_names}")
    print(f"tfidf_scores: {args.tfidf_scores}")

    # Create the output folders if needed so the script also works when it is
    # run outside a pre-created output mount.
    for output_dir in (args.labels, args.feature_names, args.tfidf_scores):
        os.makedirs(output_dir, exist_ok=True)

    np.save(os.path.join(args.labels, "labels.npy"), labels)
    np.save(os.path.join(args.feature_names, "feature_names.npy"), feature_names)
    save_npz(os.path.join(args.tfidf_scores, "tfidf_scores.npz"), tfidf_scores)


if __name__ == "__main__":
    main()

Overwriting ./components/process_data/data_prep.py


In [100]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs data_prep.py on a single input file and writes
# labels, feature names and TF-IDF scores to three separate output folders.
data_prep_component = command(
    # NOTE(review): "prepocessing" is misspelled, but this is the registered
    # component identifier, so it is left unchanged to keep existing
    # references working.
    name="data_prepocessing_for_classifier_workbench",
    display_name="Data preprocessing for eppi classifier workbench",
    description="Tokenizes and processes text data using spaCy then generates tfidf",
    inputs={
        "data": Input(type="uri_file"),
    },
    outputs={
        "labels": Output(type="uri_folder", mode="rw_mount"),
        "feature_names": Output(type="uri_folder", mode="rw_mount"),
        "tfidf_scores": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=process_data,
    # The trailing backslashes join the lines into a single shell command.
    command="""python data_prep.py \
            --data ${{inputs.data}} \
            --labels ${{outputs.labels}} \
            --tfidf_scores ${{outputs.tfidf_scores}} \
            --feature_names ${{outputs.feature_names}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [101]:
# Register the component in the workspace.
# NOTE(review): this rebinds `data_prep_component` from the object returned by
# command(...) to the registered component, so re-running this cell on its own
# presumably fails at `.component` - re-run the defining cell first.
data_prep_component = ml_client.create_or_update(data_prep_component.component)

# Report the registered component's name and version.
print(
    f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered"
)

Component data_prepocessing_for_classifier_workbench with Version 2024-09-20-14-12-52-2262068 is registered


## Search Parameters Data


In [102]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Register the hyperparameter-search settings file as a data asset.
# The JSON file itself is written by a cell in the "HyperParamSearch Data
# Input" section further down, so that cell has to run first on a fresh setup.
data_path = "user_inputs/hyperparam_search_input.json"

search_params = Data(
    name="hyperparameter_search_parameter_placeholder",
    path=data_path,
    type=AssetTypes.URI_FILE,
    description=(
        "Place holder for the hyperparameter search parameters for eppi classifier"
        " workbench"
    ),
    version="1.0.0",
)

# NOTE(review): the saved output of this cell is an HttpResponseError
# (DataVersionPropertyImmutable): version 1.0.0 already existed and the data
# uri of an existing version cannot be changed. Bump `version` when the file
# content changes.
search_params = ml_client.data.create_or_update(search_params)
print(
    f"Dataset with name {search_params.name} was registered to workspace, the dataset version is {search_params.version}"
)

HttpResponseError: (UserError) A data version with this name and version already exists. If you are trying to create a new data version, use a different name or version. If you are trying to update an existing data version, the existing asset's data uri cannot be changed. Only tags, description, and isArchived can be updated.
Code: UserError
Message: A data version with this name and version already exists. If you are trying to create a new data version, use a different name or version. If you are trying to update an existing data version, the existing asset's data uri cannot be changed. Only tags, description, and isArchived can be updated.
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {
    "value": {
        "operation": "4fe2c76f8e3571dfa1b7fcf245b92593",
        "request": "806c5f3af3e9b52d"
    }
}Type: Environment
Info: {
    "value": "westeurope"
}Type: Location
Info: {
    "value": "westeurope"
}Type: Time
Info: {
    "value": "2024-09-20T14:12:56.3993302+00:00"
}Type: InnerError
Info: {
    "value": {
        "code": "Immutable",
        "innerError": {
            "code": "DataVersionPropertyImmutable",
            "innerError": null
        }
    }
}Type: MessageFormat
Info: {
    "value": "A data version with this name and version already exists. If you are trying to create a new data version, use a different name or version. If you are trying to update an existing data version, the existing asset's {property} cannot be changed. Only tags, description, and isArchived can be updated."
}Type: MessageParameters
Info: {
    "value": {
        "property": "data uri"
    }
}

## Hyperparameter search component

In [84]:
import os

# Source folder for the hyperparameter-search component: optuna_search.py is
# written here and the folder is passed as `code=` in the component definition.
hyperparameter_search = "./components/hyperparameter_search"
os.makedirs(hyperparameter_search, exist_ok=True)

In [103]:
%%writefile {hyperparameter_search}/optuna_search.py
import argparse
import json
import os
import time

import jsonpickle

from eppi_text_classification import OptunaHyperparameterOptimisation
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_np_array_at_directory,
)


def main():
    """Run the Optuna hyperparameter search and save the best parameters.

    Command-line arguments (all required):
        --labels: folder read with ``load_np_array_at_directory``.
        --tfidf_scores: folder read with ``load_csr_at_directory``.
        --search_parameters: jsonpickle-encoded settings file. It must contain
            ``model_name`` and ``num_trials_per_job``; ``n_folds`` (default 3)
            and ``num_cv_repeats`` (default 1) are optional.
        --best_params: output folder that receives ``model_params.json``.
        --search_db: output folder that receives the ``optuna.db`` SQLite file.
    """
    # Every path is mandatory: without `required=True` a missing argument is
    # parsed as None and only fails later with an opaque error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--labels",
        type=str,
        required=True,
        help="path to ordered list of labels",
    )
    parser.add_argument(
        "--tfidf_scores",
        type=str,
        required=True,
        help="path to tfidf scores for data",
    )
    parser.add_argument(
        "--search_parameters",
        type=str,
        required=True,
        help="path to search parameters for the optuna search",
    )
    parser.add_argument(
        "--best_params",
        type=str,
        required=True,
        help="path to best hypereparameters found by the search",
    )
    parser.add_argument(
        "--search_db",
        type=str,
        required=True,
        help="path to optuna search database",
    )
    args = parser.parse_args()

    tfidf_scores = load_csr_at_directory(args.tfidf_scores)
    labels = load_np_array_at_directory(args.labels)
    with open(args.search_parameters, "r") as file:
        json_search_parameters = file.read()
    # SECURITY: jsonpickle.decode can construct arbitrary Python objects, so
    # the search-parameters file must come from a trusted source.
    search_settings = jsonpickle.decode(json_search_parameters)

    os.makedirs(args.search_db, exist_ok=True)
    optuna_db_path = os.path.join(args.search_db, "optuna.db")
    print(f"optuna_db_path: {optuna_db_path}")

    model_name = search_settings["model_name"]
    num_trials_per_job = search_settings["num_trials_per_job"]
    n_folds = search_settings.get("n_folds", 3)
    num_cv_repeats = search_settings.get("num_cv_repeats", 1)
    print(f"model_name: {model_name}")
    print(f"num_trials_per_job: {num_trials_per_job}")
    print(f"n_folds: {n_folds}")
    print(f"num_cv_repeats: {num_cv_repeats}")

    # Perform the search, recording the history in the SQLite file above.
    optimiser = OptunaHyperparameterOptimisation(
        tfidf_scores,
        labels,
        model_name,
        n_trials_per_job=num_trials_per_job,
        n_jobs=-1,
        nfolds=n_folds,
        num_cv_repeats=num_cv_repeats,
        db_url=f"sqlite:///{optuna_db_path}",
    )

    start = time.time()
    best_params = optimiser.optimise_hyperparameters(study_name="hyperparam_search")
    print(f"Time taken: {time.time() - start}")

    # Save the best parameters together with the model they belong to.
    best_params["model_name"] = model_name
    encoded_best_params = jsonpickle.encode(best_params, keys=True)
    os.makedirs(args.best_params, exist_ok=True)
    best_param_path = os.path.join(args.best_params, "model_params.json")
    with open(best_param_path, "w") as f:
        # NOTE(review): jsonpickle.encode already returns a JSON string, so
        # json.dump wraps it in a second layer of JSON quoting. Kept as-is
        # because the reader of this file is not visible here and may rely on
        # it - confirm before simplifying.
        json.dump(encoded_best_params, f)


if __name__ == "__main__":
    main()

Overwriting ./components/hyperparameter_search/optuna_search.py


In [104]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs optuna_search.py. It reads the labels and TF-IDF
# folders plus a search-settings file, and writes the best parameters and the
# Optuna SQLite database to two output folders.
hyperparameter_search_component = command(
    name="hyperparameter_search_for_classifier_workbench",
    display_name="Hyperparameter search for eppi classifier workbench",
    description=(
        "Uses parallel optuna to search for best hyperparameters for a given "
        "model, storing the history on a sqlite database"
    ),
    inputs={
        "labels": Input(type="uri_folder"),
        "tfidf_scores": Input(type="uri_folder"),
        "search_parameters": Input(type="uri_file"),
    },
    outputs={
        "best_params": Output(type="uri_folder", mode="rw_mount"),
        "search_db": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=hyperparameter_search,
    # The trailing backslashes join the lines into a single shell command.
    command="""python optuna_search.py \
            --labels ${{inputs.labels}} \
            --tfidf_scores ${{inputs.tfidf_scores}} \
            --search_parameters ${{inputs.search_parameters}} \
            --best_params ${{outputs.best_params}} \
            --search_db ${{outputs.search_db}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [105]:
# Register the hyperparameter search component in the workspace and keep the
# returned (registered) component under the same name.
hyperparameter_search_component = ml_client.create_or_update(
    hyperparameter_search_component.component
)

# Report the registered component's name and version.
print(
    f"Component {hyperparameter_search_component.name} "
    f"with Version {hyperparameter_search_component.version} is registered"
)

Component hyperparameter_search_for_classifier_workbench with Version 2024-09-20-14-13-02-9270807 is registered


## HyperParamSearch Data Input

In [None]:
# Example of the search-settings payload. The next cell builds the same dict
# and writes it to user_inputs/hyperparam_search_input.json.
{
    "model_name": "LGBMClassifier",
    "num_trials_per_job": 3,
    "n_folds": 3,
    "num_cv_repeats": 1,
}

In [70]:
import json
import os

import jsonpickle

# Settings consumed by optuna_search.py: `model_name` and `num_trials_per_job`
# are looked up unconditionally there, while `n_folds` and `num_cv_repeats`
# fall back to defaults when absent.
hyperparam_search_input = {
    "model_name": "LGBMClassifier",
    "num_trials_per_job": 3,
    "n_folds": 3,
    "num_cv_repeats": 1,
}

# optuna_search.py decodes the file with jsonpickle.decode, so encode it with
# jsonpickle here.
serialized_input = jsonpickle.encode(hyperparam_search_input, keys=True)

# No visible cell creates this folder, so create it here; otherwise open()
# raises FileNotFoundError on a fresh checkout.
os.makedirs("user_inputs", exist_ok=True)
with open("user_inputs/hyperparam_search_input.json", "w") as file:
    file.write(serialized_input)

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# NOTE(review): this cell repeats the `debunking_review_data` definition and
# registration from the cells near the top of the notebook (same name, path
# and version); consider removing one of the two copies.
data_path = "../data/raw/debunking_review.tsv"

debunking_data = Data(
    name="debunking_review_data",
    path=data_path,
    type=AssetTypes.URI_FILE,
    description="Dataset for testing sams text classification pipeline",
    version="1.0.0",
)
debunking_data = ml_client.data.create_or_update(debunking_data)
print(
    f"Dataset with name {debunking_data.name} was registered to workspace, the dataset version is {debunking_data.version}"
)

## Test size data input

In [106]:
import json
import os

# No visible cell creates this folder, so create it here; otherwise open()
# raises FileNotFoundError on a fresh checkout.
os.makedirs("user_inputs", exist_ok=True)

# The sizes are stored as JSON strings; split_data.py converts them back with
# float(json.load(file)).
with open("user_inputs/test_size_025.json", "w") as file:
    json.dump("0.25", file)
with open("user_inputs/test_size_05.json", "w") as file:
    json.dump("0.5", file)

In [202]:
import json
import os

# No visible cell creates this folder, so create it here; otherwise open()
# raises FileNotFoundError on a fresh checkout.
os.makedirs("user_inputs", exist_ok=True)

# Placeholder file holding the JSON boolean `false`.
with open("user_inputs/false.json", "w") as file:
    json.dump(False, file)

In [203]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Register the `false.json` placeholder written in the previous cell as a
# single-file data asset.
data_path = "user_inputs/false.json"

user_input_false = Data(
    name="user_input_false",
    path=data_path,
    type=AssetTypes.URI_FILE,
    description="A place holder for user input of bool False",
    version="1.0.0",
)
# NOTE(review): the saved output of this cell is an HttpResponseError
# (DataVersionPropertyImmutable): version 1.0.0 already existed and the data
# uri of an existing version cannot be changed. Bump `version` when the file
# content changes.
user_input_false = ml_client.data.create_or_update(user_input_false)
print(
    f"Dataset with name {user_input_false.name} was registered to workspace, the dataset version is {user_input_false.version}"
)

# data_path = "user_inputs/test_size_05.json"

# test_size = Data(
#     name="user_input_test_size_05",
#     path=data_path,
#     type=AssetTypes.URI_FILE,
#     description="A placeholder for user input for the test size",
#     version="1.0.0",
# )
# test_size = ml_client.data.create_or_update(test_size)
# print(
#     f"Dataset with name {test_size.name} was registered to workspace, the dataset version is {test_size.version}"
# )

HttpResponseError: (UserError) A data version with this name and version already exists. If you are trying to create a new data version, use a different name or version. If you are trying to update an existing data version, the existing asset's data uri cannot be changed. Only tags, description, and isArchived can be updated.
Code: UserError
Message: A data version with this name and version already exists. If you are trying to create a new data version, use a different name or version. If you are trying to update an existing data version, the existing asset's data uri cannot be changed. Only tags, description, and isArchived can be updated.
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {
    "value": {
        "operation": "ea4f1f264fe3c5659d23bf043eeeca7a",
        "request": "359973ed8d4efa75"
    }
}Type: Environment
Info: {
    "value": "westeurope"
}Type: Location
Info: {
    "value": "westeurope"
}Type: Time
Info: {
    "value": "2024-09-20T07:53:59.3062296+00:00"
}Type: InnerError
Info: {
    "value": {
        "code": "Immutable",
        "innerError": {
            "code": "DataVersionPropertyImmutable",
            "innerError": null
        }
    }
}Type: MessageFormat
Info: {
    "value": "A data version with this name and version already exists. If you are trying to create a new data version, use a different name or version. If you are trying to update an existing data version, the existing asset's {property} cannot be changed. Only tags, description, and isArchived can be updated."
}Type: MessageParameters
Info: {
    "value": {
        "property": "data uri"
    }
}

## Split Training Data

In [107]:
import os

# Source folder for the split-data component: split_data.py is written here
# and the folder is passed as `code=` in the component definition.
split_data = "./components/split_data"
os.makedirs(split_data, exist_ok=True)

In [108]:
%%writefile {split_data}/split_data.py
import argparse
import json
import os

import jsonpickle
import numpy as np
from scipy.sparse import load_npz, save_npz
from sklearn.model_selection import train_test_split

from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_np_array_at_directory,
)


def main():
    """Split TF-IDF scores and labels into stratified train and test sets.

    Command-line arguments (all required):
        --labels: folder read with ``load_np_array_at_directory``.
        --tfidf_scores: folder read with ``load_csr_at_directory``.
        --test_size: JSON file holding the test proportion (converted with
            ``float``).
        --X_train / --X_test: output folders for ``X_train.npz`` / ``X_test.npz``.
        --y_train / --y_test: output folders for ``y_train.npy`` / ``y_test.npy``.
    """
    # Every path is mandatory: without `required=True` a missing argument is
    # parsed as None and only fails later with an opaque error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--labels",
        type=str,
        required=True,
        help="path to ordered list of labels",
    )
    parser.add_argument(
        "--tfidf_scores",
        type=str,
        required=True,
        help="path to tfidf scores for data",
    )
    parser.add_argument(
        "--test_size",
        type=str,
        required=True,
        help="path to the test size as a proportion of the data",
    )
    parser.add_argument(
        "--X_train",
        type=str,
        required=True,
        help="path to X_train",
    )
    parser.add_argument(
        "--X_test",
        type=str,
        required=True,
        help="path to X_test",
    )
    parser.add_argument(
        "--y_train",
        type=str,
        required=True,
        help="path to y_train",
    )
    parser.add_argument(
        "--y_test",
        type=str,
        required=True,
        help="path to y_test",
    )
    args = parser.parse_args()

    tfidf_scores = load_csr_at_directory(args.tfidf_scores)
    labels = load_np_array_at_directory(args.labels)
    with open(args.test_size, "r") as file:
        test_size = float(json.load(file))

    print(f"test_size: {test_size}")
    # Stratify on the labels and use a fixed seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_scores, labels, test_size=test_size, stratify=labels, random_state=8
    )

    # Create the output folders if needed so the script also works when it is
    # run outside a pre-created output mount.
    for output_dir in (args.X_train, args.X_test, args.y_train, args.y_test):
        os.makedirs(output_dir, exist_ok=True)

    save_npz(os.path.join(args.X_train, "X_train.npz"), X_train)
    save_npz(os.path.join(args.X_test, "X_test.npz"), X_test)
    np.save(os.path.join(args.y_train, "y_train.npy"), y_train)
    np.save(os.path.join(args.y_test, "y_test.npy"), y_test)


if __name__ == "__main__":
    main()

Overwriting ./components/split_data/split_data.py


In [109]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs split_data.py. It reads the labels and TF-IDF
# folders plus a test-size file, and writes the four train/test pieces to
# separate output folders.
split_data_component = command(
    name="split_data_for_classifier_workbench",
    display_name="Split data into two sets",
    description=(
        "Uses train_test_split to split the data into two sets, storing the split data"
    ),
    inputs={
        "labels": Input(type="uri_folder"),
        "tfidf_scores": Input(type="uri_folder"),
        # A file, because split_data.py opens it directly with open().
        "test_size": Input(type="uri_file"),
    },
    outputs={
        "X_train": Output(type="uri_folder", mode="rw_mount"),
        "X_test": Output(type="uri_folder", mode="rw_mount"),
        "y_train": Output(type="uri_folder", mode="rw_mount"),
        "y_test": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=split_data,
    # The trailing backslashes join the lines into a single shell command.
    command="""python split_data.py \
            --labels ${{inputs.labels}} \
            --tfidf_scores ${{inputs.tfidf_scores}} \
            --test_size ${{inputs.test_size}} \
            --X_train ${{outputs.X_train}} \
            --X_test ${{outputs.X_test}} \
            --y_train ${{outputs.y_train}} \
            --y_test ${{outputs.y_test}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [110]:
# Register the component in the workspace.
# NOTE(review): this rebinds `split_data_component` from the object returned
# by command(...) to the registered component, so re-running this cell on its
# own presumably fails at `.component` - re-run the defining cell first.
split_data_component = ml_client.create_or_update(split_data_component.component)

# Report the registered component's name and version.
print(
    f"Component {split_data_component.name} with Version {split_data_component.version} is registered"
)

Component split_data_for_classifier_workbench with Version 2024-09-20-14-13-16-7094096 is registered


## Train Model


In [135]:
import os

# Source folder for the train-model component: train_model.py is written here
# and the folder is passed as `code=` in the component definition.
train_model = "./components/train_model"
os.makedirs(train_model, exist_ok=True)

In [136]:
%%writefile {train_model}/train_model.py

import argparse
import json
import os

import joblib
import jsonpickle
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_json_at_directory,
    load_np_array_at_directory,
)

mname_to_mclass = {
    "SVC": SVC,
    "LGBMClassifier": LGBMClassifier,
    "RandomForestClassifier": RandomForestClassifier,
    "XGBClassifier": XGBClassifier,
}


def main():
    """Fit the requested classifier and save it with joblib.

    Command-line arguments (all required):
        --X_train: folder read with ``load_csr_at_directory``.
        --y_train: folder read with ``load_np_array_at_directory``.
        --model_parameters: location read with ``load_json_at_directory``; the
            loaded mapping must contain ``model_name`` (a key of
            ``mname_to_mclass``). All remaining entries are passed to the
            model constructor as keyword arguments.
        --model: output folder that receives ``model.joblib``.

    Raises:
        ValueError: if ``model_name`` is missing or not a supported model.
    """
    # Every path is mandatory: without `required=True` a missing argument is
    # parsed as None and only fails later with an opaque error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--X_train",
        type=str,
        required=True,
        help="path to training data",
    )
    parser.add_argument(
        "--y_train",
        type=str,
        required=True,
        help="path to training labels",
    )
    parser.add_argument(
        "--model_parameters",
        type=str,
        required=True,
        help="path to model training parameters",
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="path to trained model",
    )
    args = parser.parse_args()

    X_train = load_csr_at_directory(args.X_train)
    y_train = load_np_array_at_directory(args.y_train)
    model_parameters = load_json_at_directory(args.model_parameters)

    # Remove the model name from the parameters so only constructor keyword
    # arguments remain, and fail with a readable message for unknown models.
    model_name = model_parameters.pop("model_name", None)
    if model_name not in mname_to_mclass:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; "
            f"expected one of {sorted(mname_to_mclass)}"
        )
    model = mname_to_mclass[model_name](**model_parameters)

    model.fit(X_train, y_train)

    # Create the output folder if needed so the script also works when it is
    # run outside a pre-created output mount.
    os.makedirs(args.model, exist_ok=True)
    joblib.dump(model, os.path.join(args.model, "model.joblib"))


if __name__ == "__main__":
    main()

Overwriting ./components/train_model/train_model.py


In [137]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs train_model.py. It reads the training data,
# training labels and model parameters, and writes the fitted model to an
# output folder.
train_model_component = command(
    name="fit_model_for_classifier_workbench",
    display_name="fit_model_for_classifier_workbench",
    description=(
        "Trains a model for classifier workbench using given data and model parameters"
    ),
    inputs={
        "X_train": Input(type="uri_folder"),
        "y_train": Input(type="uri_folder"),
        # Declared as a folder: train_model.py reads it with
        # load_json_at_directory, and the hyperparameter search component
        # emits its best_params output as a uri_folder.
        "model_parameters": Input(type="uri_folder"),
    },
    outputs={
        "model": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=train_model,
    # The trailing backslashes join the lines into a single shell command.
    command="""python train_model.py \
            --X_train ${{inputs.X_train}} \
            --y_train ${{inputs.y_train}} \
            --model_parameters ${{inputs.model_parameters}} \
            --model ${{outputs.model}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [138]:
# Register the component in the workspace.
# NOTE(review): this rebinds `train_model_component` from the object returned
# by command(...) to the registered component, so re-running this cell on its
# own presumably fails at `.component` - re-run the defining cell first.
train_model_component = ml_client.create_or_update(train_model_component.component)

# Report the registered component's name and version.
print(
    f"Component {train_model_component.name} with Version {train_model_component.version} is registered"
)

Component fit_model_for_classifier_workbench with Version 2024-09-20-14-33-18-8525116 is registered


## Predict Scores component

In [115]:
import os

# Source folder for the predict-scores component: predict_scores.py is written
# here and the folder is passed as `code=` in the component definition.
predict_scores = "./components/predict_scores"
os.makedirs(predict_scores, exist_ok=True)

In [116]:
%%writefile {predict_scores}/predict_scores.py

import argparse
import os

import numpy as np

from eppi_text_classification import predict_scores
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
)


def main():
    """Score a dataset with a saved model and store the result.

    Command-line arguments (all required):
        --X: folder read with ``load_csr_at_directory``.
        --model: folder read with ``load_joblib_model_at_directory``.
        --y_pred_probs: output folder that receives ``y_pred_probs.npy``.
    """
    # Every path is mandatory: without `required=True` a missing argument is
    # parsed as None and only fails later with an opaque error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--X",
        type=str,
        required=True,
        help="path to prediction data",
    )
    parser.add_argument(
        "--y_pred_probs",
        type=str,
        required=True,
        help="path to the predicted probabilities",
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="path to trained model",
    )
    args = parser.parse_args()

    X = load_csr_at_directory(args.X)
    model = load_joblib_model_at_directory(args.model)

    y_pred_probabilities = predict_scores(model, X)

    # Create the output folder if needed so the script also works when it is
    # run outside a pre-created output mount.
    os.makedirs(args.y_pred_probs, exist_ok=True)
    np.save(os.path.join(args.y_pred_probs, "y_pred_probs.npy"), y_pred_probabilities)


if __name__ == "__main__":
    main()

Overwriting ./components/predict_scores/predict_scores.py


In [117]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs predict_scores.py. It reads a data folder and a
# model folder, and writes the predictions to an output folder.
predict_probabilities_component = command(
    name="predict_probabilities_for_eppi_classifier_workbench",
    display_name="predict_probabilities_for_eppi_classifier_workbench",
    description=(
        "Takes a model from the eppi classifier workbench and uses it to predict "
        "probabilities"
    ),
    inputs={
        "X": Input(type="uri_folder"),
        "model": Input(type="uri_folder"),
    },
    outputs={
        "y_pred_probs": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=predict_scores,
    # The trailing backslashes join the lines into a single shell command.
    command="""python predict_scores.py \
            --X ${{inputs.X}} \
            --model ${{inputs.model}} \
            --y_pred_probs ${{outputs.y_pred_probs}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [118]:
# Register the component in the workspace.
# NOTE(review): this rebinds `predict_probabilities_component` from the object
# returned by command(...) to the registered component, so re-running this
# cell on its own presumably fails at `.component` - re-run the defining cell
# first.
predict_probabilities_component = ml_client.create_or_update(
    predict_probabilities_component.component
)

# Report the registered component's name and version.
print(
    f"Component {predict_probabilities_component.name} with Version {predict_probabilities_component.version} is registered"
)

Component predict_probabilities_for_eppi_classifier_workbench with Version 2024-09-20-14-13-32-7063933 is registered


## Plotly ROC component

In [119]:
import os

# Source folder for the ROC-plot component: plotly_roc.py is written here and
# the folder is passed as `code=` in the component definition.
plotly_roc = "./components/plotly_roc"
os.makedirs(plotly_roc, exist_ok=True)

In [120]:
%%writefile {plotly_roc}/plotly_roc.py

import argparse
import os
from pathlib import Path

from eppi_text_classification import plotly_roc
from eppi_text_classification.utils import load_np_array_at_directory


def main():
    """Create a ROC plot from labels and predictions and save it as HTML.

    Command-line arguments (all required):
        --y: folder read with ``load_np_array_at_directory``.
        --y_pred_probs: folder read with ``load_np_array_at_directory``.
        --roc_plot: output folder that receives ``roc_plot.html``.
    """
    # Every path is mandatory: without `required=True` a missing argument is
    # parsed as None and only fails later with an opaque error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--y",
        type=str,
        required=True,
        help="path to labels",
    )
    parser.add_argument(
        "--y_pred_probs",
        type=str,
        required=True,
        help="path to the predicted probabilities",
    )
    parser.add_argument(
        "--roc_plot",
        type=str,
        required=True,
        help="path to the roc plot",
    )
    args = parser.parse_args()

    y = load_np_array_at_directory(args.y)
    y_pred_probs = load_np_array_at_directory(args.y_pred_probs)

    # Create the output folder if needed so the script also works when it is
    # run outside a pre-created output mount.
    roc_plot_dir = Path(args.roc_plot)
    roc_plot_dir.mkdir(parents=True, exist_ok=True)
    roc_plot_path = roc_plot_dir / "roc_plot.html"
    plotly_roc(y, y_pred_probs, save_path=roc_plot_path)


if __name__ == "__main__":
    main()

Overwriting ./components/plotly_roc/plotly_roc.py


In [121]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs plotly_roc.py. It reads the labels and the
# predictions, and writes the ROC plot to an output folder.
plotly_roc_component = command(
    name="roc_plot_for_eppi_classifier_workbench",
    display_name="ROC plot for eppi classifier workbench",
    description=("Plots ROC curve for given labels and predicted probabilities"),
    inputs={
        "y": Input(type="uri_folder"),
        "y_pred_probs": Input(type="uri_folder"),
    },
    outputs={
        "roc_plot": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=plotly_roc,
    # The trailing backslashes join the lines into a single shell command.
    command="""python plotly_roc.py \
            --y ${{inputs.y}} \
            --y_pred_probs ${{inputs.y_pred_probs}} \
            --roc_plot ${{outputs.roc_plot}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [122]:
# Register the component in the workspace.
# NOTE(review): this rebinds `plotly_roc_component` from the object returned
# by command(...) to the registered component, so re-running this cell on its
# own presumably fails at `.component` - re-run the defining cell first.
plotly_roc_component = ml_client.create_or_update(plotly_roc_component.component)

# Report the registered component's name and version.
print(
    f"Component {plotly_roc_component.name} with Version {plotly_roc_component.version} is registered"
)

Component roc_plot_for_eppi_classifier_workbench with Version 2024-09-20-14-13-40-5414609 is registered


## View HTML image component


In [58]:
import os

# Source folder for the view-image component: view_html_image.py is written
# here and the folder is passed as `code=` in the component definition.
view_html_image = "./components/view_html_image"
os.makedirs(view_html_image, exist_ok=True)

In [63]:
%%writefile {view_html_image}/view_html_image.py

import argparse
import os
import mlflow


def main():
    """Log the first file of an image folder as an MLflow artifact.

    Command-line arguments:
        --image (required): folder containing the file to log.

    Raises:
        FileNotFoundError: if the folder contains no entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--image",
        type=str,
        required=True,
        help="path to image",
    )
    args = parser.parse_args()

    # os.listdir returns entries in arbitrary order, so sort them to make the
    # choice deterministic, and fail clearly when the folder is empty instead
    # of raising a bare IndexError.
    image_files = sorted(os.listdir(args.image))
    if not image_files:
        raise FileNotFoundError(f"No files found in image folder: {args.image}")
    image_path = os.path.join(args.image, image_files[0])

    mlflow.log_artifact(image_path)


if __name__ == "__main__":
    main()

Overwriting ./components/view_html_image/view_html_image.py


In [62]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs view_html_image.py on an input folder. It has no
# declared outputs and uses the display-image environment rather than the
# pipeline environment.
view_html_image_component = command(
    name="view_html_image",
    display_name="Display image from html file in logs",
    description=("Display image from html file in logs"),
    inputs={
        "image": Input(type="uri_folder"),
    },
    # The source folder of the component
    code=view_html_image,
    # The trailing backslashes join the lines into a single shell command.
    command="""python view_html_image.py \
            --image ${{inputs.image}} \
            """,
    environment=f"{display_image_env.name}:{display_image_env.version}",
)

In [64]:
# Register the component in the workspace.
# NOTE(review): this rebinds `view_html_image_component` from the object
# returned by command(...) to the registered component, so re-running this
# cell on its own presumably fails at `.component` - re-run the defining cell
# first.
view_html_image_component = ml_client.create_or_update(
    view_html_image_component.component
)

# Report the registered component's name and version.
print(
    f"Component {view_html_image_component.name} with Version {view_html_image_component.version} is registered"
)

Component view_html_image with Version 2024-09-19-14-53-48-5180406 is registered


## Get raw threshold


In [123]:
import os

# Source folder for the threshold component: get_threshold.py is written here
# and the folder is passed as `code=` in the component definition.
get_threshold = "./components/get_threshold"
os.makedirs(get_threshold, exist_ok=True)

In [124]:
%%writefile {get_threshold}/get_threshold.py

import argparse
import json
import os

from eppi_text_classification import get_raw_threshold
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
    load_np_array_at_directory,
)


def main():
    """Compute the classification threshold for a target TPR and save it as JSON.

    Loads the model, X and y from their folders, reads the target true
    positive rate from the JSON file at ``--target_tpr``, calls
    ``get_raw_threshold`` and writes the result to ``threshold.json`` inside
    the ``--threshold`` folder.

    NOTE(review): ``--target_tpr`` is opened directly as a file, so it must
    resolve to a JSON file rather than a folder; confirm against how the
    pipeline binds this input.
    """
    # Every argument is a plain string path.
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--X", "path to X data"),
        ("--y", "path to y data"),
        ("--model", "path to model"),
        ("--target_tpr", "path to target true positive rate"),
        ("--threshold", "path to threshold"),
    ):
        parser.add_argument(flag, type=str, help=description)
    cli = parser.parse_args()

    classifier = load_joblib_model_at_directory(cli.model)
    features = load_csr_at_directory(cli.X)
    labels = load_np_array_at_directory(cli.y)
    with open(cli.target_tpr) as tpr_file:
        desired_tpr = float(json.load(tpr_file))

    threshold = get_raw_threshold(classifier, features, labels, desired_tpr)

    print(f"threshold: {threshold}")
    destination = os.path.join(cli.threshold, "threshold.json")
    with open(destination, "w") as out_file:
        json.dump(threshold, out_file)


if __name__ == "__main__":
    main()

Overwriting ./components/get_threshold/get_threshold.py


In [125]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs get_threshold.py; every input and the single
# output are folder URIs.
get_threshold_component = command(
    # Folder uploaded as the component's source code.
    code=get_threshold,
    command="""python get_threshold.py \
            --y ${{inputs.y}} \
            --X ${{inputs.X}} \
            --model ${{inputs.model}} \
            --target_tpr ${{inputs.target_tpr}} \
            --threshold ${{outputs.threshold}} \
            """,
    # Environment reference in "name:version" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        y=Input(type="uri_folder"),
        X=Input(type="uri_folder"),
        model=Input(type="uri_folder"),
        target_tpr=Input(type="uri_folder"),
    ),
    outputs=dict(threshold=Output(type="uri_folder", mode="rw_mount")),
    name="get_classification_threshold_for_classifier_workbench",
    display_name="Get the classification threshold for a given TPR",
    description="For a given desired true positive rate, get the classification threshold",
)

In [126]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
get_threshold_component = ml_client.create_or_update(get_threshold_component.component)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        get_threshold_component
    )
)

Component get_classification_threshold_for_classifier_workbench with Version 2024-09-20-14-13-55-4120226 is registered


## Threshold Predict component



In [127]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
# NOTE: later cells rebind the name `threshold_predict` to the component object.
threshold_predict = "./components/threshold_predict"
os.makedirs(name=threshold_predict, exist_ok=True)

In [128]:
%%writefile {threshold_predict}/threshold_predict.py

import argparse
import json
import os

import numpy as np

from eppi_text_classification import raw_threshold_predict
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
    load_value_from_json_at_directory,
)


def main():
    """Predict labels for X with a fixed decision threshold and save them.

    Loads the model and X from their folders, reads the threshold through
    ``load_value_from_json_at_directory``, calls ``raw_threshold_predict`` and
    stores the result as ``y_pred.npy`` inside the ``--y_pred`` folder.
    """
    # Every argument is a plain string path.
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--X", "path to X data"),
        ("--model", "path to model"),
        ("--threshold", "path to threshold"),
        ("--y_pred", "path to y predictions"),
    ):
        parser.add_argument(flag, type=str, help=description)
    cli = parser.parse_args()

    classifier = load_joblib_model_at_directory(cli.model)
    features = load_csr_at_directory(cli.X)
    cutoff = float(load_value_from_json_at_directory(cli.threshold))

    predictions = raw_threshold_predict(classifier, features, cutoff)

    destination = os.path.join(cli.y_pred, "y_pred.npy")
    np.save(destination, predictions)


if __name__ == "__main__":
    main()

Overwriting ./components/threshold_predict/threshold_predict.py


In [129]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs threshold_predict.py.
#
# NOTE: the name `threshold_predict` is first bound to the source-folder path
# and is rebound by this statement (and again by the registration cell) to a
# component object. The code folder is therefore written out literally, so
# re-running this cell never passes a component object as `code`.
threshold_predict = command(
    name="predict_given_threshold_for_classifier_workbench",
    display_name="Predict given a threshold for classifier workbench model",
    description=("Predict given a threshold for classifier workbench model"),
    inputs={
        "X": Input(type="uri_folder"),
        "model": Input(type="uri_folder"),
        "threshold": Input(type="uri_folder"),
    },
    outputs={
        "y_pred": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component (same path the folder cell creates)
    code="./components/threshold_predict",
    command="""python threshold_predict.py \
            --X ${{inputs.X}} \
            --model ${{inputs.model}} \
            --threshold ${{inputs.threshold}} \
            --y_pred ${{outputs.y_pred}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [130]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
threshold_predict = ml_client.create_or_update(threshold_predict.component)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        threshold_predict
    )
)

Component predict_given_threshold_for_classifier_workbench with Version 2024-09-20-14-14-02-6063587 is registered


## Plotly confusion plots


In [172]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
plotly_confusion = "./components/plotly_confusion"
os.makedirs(name=plotly_confusion, exist_ok=True)

In [173]:
%%writefile {plotly_confusion}/plotly_confusion.py

import argparse
import os
from pathlib import Path

from eppi_text_classification import (
    binary_train_valid_confusion_plotly,
    binary_train_valid_test_confusion_plotly,
)
from eppi_text_classification.utils import load_np_array_at_directory


def main():
    """Create the confusion plot for train/validation, plus test when supplied.

    Train and validation labels and predictions are always loaded. When
    ``--y_test`` is given, the three-set plotting function is used and
    ``--y_test_pred`` is required too; otherwise the two-set function is used.
    The plot is saved as ``confusion_plot.html`` in the ``--confusion_plot``
    folder.
    """
    # input and output arguments (all plain string paths)
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--y_train", "path to y_train"),
        ("--y_train_pred", "path to y_train_pred"),
        ("--y_val", "path to y_val"),
        ("--y_val_pred", "path to y_val_pred"),
        ("--y_test", "path to y_test"),
        ("--y_test_pred", "path to y_test_pred"),
        ("--confusion_plot", "path to confusion plot"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    # Test labels cannot be plotted without their predictions: report it as a
    # usage error instead of handing None to the loader.
    if args.y_test and not args.y_test_pred:
        parser.error("--y_test_pred is required when --y_test is provided")

    y_train = load_np_array_at_directory(args.y_train)
    y_train_pred = load_np_array_at_directory(args.y_train_pred)
    y_val = load_np_array_at_directory(args.y_val)
    y_val_pred = load_np_array_at_directory(args.y_val_pred)

    save_path = Path(args.confusion_plot) / "confusion_plot.html"

    # NOTE(review): `postive_label` is kept exactly as written because it is
    # presumably the library's keyword name; confirm before "correcting" it.
    if not args.y_test:
        binary_train_valid_confusion_plotly(
            y_train,
            y_train_pred,
            y_val,
            y_val_pred,
            postive_label="Included",
            negative_label="Excluded",
            save_path=save_path,
        )
    else:
        y_test = load_np_array_at_directory(args.y_test)
        y_test_pred = load_np_array_at_directory(args.y_test_pred)
        binary_train_valid_test_confusion_plotly(
            y_train,
            y_train_pred,
            y_val,
            y_val_pred,
            y_test,
            y_test_pred,
            postive_label="Included",
            negative_label="Excluded",
            save_path=save_path,
        )


if __name__ == "__main__":
    main()

Overwriting ./components/plotly_confusion/plotly_confusion.py


In [174]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs plotly_confusion.py. The test-set inputs are
# optional and are only placed on the command line when supplied ($[[...]]).
plotly_confusion_component = command(
    name="confusion_plot_for_classifier_workbench",
    display_name="Confusion Matrix Plot",
    # Adjacent string literals are concatenated verbatim, so the first piece
    # must end with a space to avoid "whethertest".
    description=(
        "Confusion matrix that plots three or two confusion plots based on whether "
        "test data is provided"
    ),
    inputs={
        "y_train": Input(type="uri_folder"),
        "y_train_pred": Input(type="uri_folder"),
        "y_val": Input(type="uri_folder"),
        "y_val_pred": Input(type="uri_folder"),
        "y_test": Input(type="uri_folder", optional=True),
        "y_test_pred": Input(type="uri_folder", optional=True),
    },
    outputs={
        "confusion_plot": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=plotly_confusion,
    command="""python plotly_confusion.py \
            --y_train ${{inputs.y_train}} \
            --y_train_pred ${{inputs.y_train_pred}} \
            --y_val ${{inputs.y_val}} \
            --y_val_pred ${{inputs.y_val_pred}} \
            $[[--y_test ${{inputs.y_test}}]] \
            $[[--y_test_pred ${{inputs.y_test_pred}}]] \
            --confusion_plot ${{outputs.confusion_plot}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [175]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
plotly_confusion_component = ml_client.create_or_update(
    plotly_confusion_component.component
)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        plotly_confusion_component
    )
)

[32mUploading plotly_confusion (0.0 MBs): 100%|██████████| 2172/2172 [00:00<00:00, 81333.00it/s]
[39m



Component confusion_plot_for_classifier_workbench with Version 2024-09-19-18-19-18-2843572 is registered


## Create Shap Plotter

In [64]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
calculate_shap_values = "./components/calculate_shap_values"
os.makedirs(name=calculate_shap_values, exist_ok=True)

In [65]:
%%writefile {calculate_shap_values}/calculate_shap_values.py

import argparse
import json
import os
from pathlib import Path

from scipy.sparse import save_npz

from eppi_text_classification import ShapPlotter
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
    load_np_array_at_directory,
)


def main():
    """Build a ``ShapPlotter`` for the model and X, then persist its SHAP outputs.

    Writes ``shap_values.npz`` (via ``scipy.sparse.save_npz``) into the
    ``--shap_values`` folder and ``shap_expected_value.json`` into the
    ``--shap_expected_value`` folder.
    """
    # input and output arguments (all plain string paths)
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--model", "path to model"),
        # The help text previously repeated "path to model" for X.
        ("--X", "path to X data"),
        ("--feature_names", "path to feature_names"),
        ("--shap_values", "path to shap_values"),
        ("--shap_expected_value", "path to shap expected_value"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    X = load_csr_at_directory(args.X)
    feature_names = load_np_array_at_directory(args.feature_names, allow_pickle=True)
    model = load_joblib_model_at_directory(args.model)

    shap_plotter = ShapPlotter(
        model,
        X,
        feature_names,
    )

    shap_values_file = Path(args.shap_values) / "shap_values.npz"
    save_npz(shap_values_file, shap_plotter.shap_values)

    # NOTE(review): assumes expected_value is JSON-serialisable as-is - confirm
    # against ShapPlotter.
    with open(
        os.path.join(args.shap_expected_value, "shap_expected_value.json"), "w"
    ) as file:
        json.dump(shap_plotter.expected_value, file)


if __name__ == "__main__":
    main()

Overwriting ./components/calculate_shap_values/calculate_shap_values.py


In [66]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs calculate_shap_values.py; it takes three folder
# inputs and produces two folder outputs.
calculate_shap_values_component = command(
    # Folder uploaded as the component's source code.
    code=calculate_shap_values,
    command="""python calculate_shap_values.py \
            --model ${{inputs.model}} \
            --X ${{inputs.X}} \
            --feature_names ${{inputs.feature_names}} \
            --shap_values ${{outputs.shap_values}} \
            --shap_expected_value ${{outputs.shap_expected_value}} \
            """,
    # Environment reference in "name:version" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        model=Input(type="uri_folder"),
        X=Input(type="uri_folder"),
        feature_names=Input(type="uri_folder"),
    ),
    outputs=dict(
        shap_values=Output(type="uri_folder", mode="rw_mount"),
        shap_expected_value=Output(type="uri_folder", mode="rw_mount"),
    ),
    name="calculate_shap_values_for_classifier_workbench",
    display_name="Calculate SHAP values for classifier workbench",
    description="Creates a shap plotter object and calculates the shap values",
)

In [67]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
calculate_shap_values_component = ml_client.create_or_update(
    calculate_shap_values_component.component
)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        calculate_shap_values_component
    )
)

[32mUploading calculate_shap_values (0.0 MBs): 100%|██████████| 1545/1545 [00:00<00:00, 55245.14it/s]
[39m



Component calculate_shap_values_for_classifier_workbench with Version 2024-09-20-12-54-16-6439259 is registered


## Limit number of data rows component


In [9]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
splice_data = "./components/splice_data"
os.makedirs(name=splice_data, exist_ok=True)

In [10]:
%%writefile {splice_data}/splice_data.py

import argparse
import json
import os
from pathlib import Path

from scipy.sparse import save_npz

from eppi_text_classification.utils import (
    load_csr_at_directory,
)


def main():
    """Keep the first ``num_rows`` rows of a sparse matrix and save the result.

    The matrix is loaded from the ``--data`` folder, the row count is read from
    the JSON file at ``--num_rows``, and the sliced matrix is written as
    ``splice_data.npz`` inside the ``--spliced_data`` folder.

    Raises:
        ValueError: if the row count is negative (a negative slice bound would
            silently drop rows from the end instead of keeping the first N).
    """
    # input and output arguments (all plain string paths)
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--data", "path to data to be spliced"),
        ("--num_rows", "path to number of rows to keep"),
        ("--spliced_data", "path to spliced data"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    data = load_csr_at_directory(args.data)
    with open(args.num_rows, "r") as file:
        num_rows = int(json.load(file))
    if num_rows < 0:
        raise ValueError(f"num_rows must be non-negative, got {num_rows}")

    spliced_data = data[:num_rows]

    # The file name is kept as "splice_data.npz" to match the existing output.
    spliced_data_save_path = Path(args.spliced_data) / "splice_data.npz"
    save_npz(spliced_data_save_path, spliced_data)


if __name__ == "__main__":
    main()

Writing ./components/splice_data/splice_data.py


In [11]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs splice_data.py on a data folder and a row-count input.
splice_data_component = command(
    # Folder uploaded as the component's source code.
    code=splice_data,
    command="""python splice_data.py \
            --data ${{inputs.data}} \
            --num_rows ${{inputs.num_rows}} \
            --spliced_data ${{outputs.spliced_data}} \
            """,
    # Environment reference in "name:version" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        data=Input(type="uri_folder"),
        num_rows=Input(type="uri_folder"),
    ),
    outputs=dict(spliced_data=Output(type="uri_folder", mode="rw_mount")),
    name="splice_csr_data_for_classifier_workbench",
    display_name="Splice data for csr matrix for classifier workbench",
    description="Splice data for csr matrix for classifier workbench",
)

In [12]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
splice_data_component = ml_client.create_or_update(splice_data_component.component)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        splice_data_component
    )
)

[32mUploading splice_data (0.0 MBs): 100%|██████████| 957/957 [00:00<00:00, 35782.28it/s]
[39m



Component splice_csr_data_for_classifier_workbench with Version 1 is registered


## Dot plot


In [18]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
create_dot_plot = "./components/create_dot_plot"
os.makedirs(name=create_dot_plot, exist_ok=True)

In [255]:
%%writefile {create_dot_plot}/create_dot_plot.py

import argparse
import json
import os
from pathlib import Path

from eppi_text_classification.shap_plotter import DotPlot
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_np_array_at_directory,
)


def _parse_bool(value):
    """Interpret a decoded JSON value as a boolean flag.

    ``bool("false")`` is True in Python, so textual flags are matched
    explicitly; non-string values fall back to ordinary truthiness.

    Raises:
        ValueError: if a string value is not a recognised true/false spelling.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ("true", "1", "yes", "y", "on"):
            return True
        if text in ("false", "0", "no", "n", "off", ""):
            return False
        raise ValueError(f"Cannot interpret {value!r} as a boolean")
    return bool(value)


def main():
    """Create a SHAP dot plot and save it as ``dot_plot.png``.

    SHAP values, X and feature names are loaded from their folders; the number
    of features to display and the two boolean switches are each read from a
    JSON file. The plot is written into the ``--dot_plot`` folder.
    """
    # input and output arguments (all plain string paths)
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--shap_values", "path to shap values"),
        ("--X", "path to X data to explain model on"),
        ("--feature_names", "path to features names"),
        ("--num_display", "path to the number of features to display on plot"),
        ("--log_scale", "path to bool of whether to display plot along log scale"),
        ("--plot_zero", "path to bool of whether to plot zero shap values"),
        ("--dot_plot", "path to dot plot"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    shap_values = load_csr_at_directory(args.shap_values)
    X = load_csr_at_directory(args.X)
    feature_names = load_np_array_at_directory(args.feature_names, allow_pickle=True)
    with open(args.num_display, "r") as file:
        num_display = int(json.load(file))
    # Flags go through _parse_bool so a JSON string such as "false" is not
    # mistaken for True.
    with open(args.log_scale, "r") as file:
        log_scale = _parse_bool(json.load(file))
    with open(args.plot_zero, "r") as file:
        plot_zero = _parse_bool(json.load(file))

    dot_plot = DotPlot(
        shap_values=shap_values,
        X_test=X,
        feature_names=feature_names,
        num_display=num_display,
        log_scale=log_scale,
        plot_zero=plot_zero,
    )

    print(f"feature_names : {feature_names.shape}")
    print(f"shap_values : {shap_values.shape}")
    print(f"X : {X.shape}")
    dot_plot_path = Path(args.dot_plot) / "dot_plot.png"
    dot_plot.save(dot_plot_path)


if __name__ == "__main__":
    main()

Overwriting ./components/create_dot_plot/create_dot_plot.py


In [256]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs create_dot_plot.py; every input and the single
# output are folder URIs.
create_dot_plot_component = command(
    # Folder uploaded as the component's source code.
    code=create_dot_plot,
    command="""python create_dot_plot.py \
            --shap_values ${{inputs.shap_values}} \
            --X ${{inputs.X}} \
            --feature_names ${{inputs.feature_names}} \
            --num_display ${{inputs.num_display}} \
            --log_scale ${{inputs.log_scale}} \
            --plot_zero ${{inputs.plot_zero}} \
            --dot_plot ${{outputs.dot_plot}} \
            """,
    # Environment reference in "name:version" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        shap_values=Input(type="uri_folder"),
        X=Input(type="uri_folder"),
        feature_names=Input(type="uri_folder"),
        num_display=Input(type="uri_folder"),
        log_scale=Input(type="uri_folder"),
        plot_zero=Input(type="uri_folder"),
    ),
    outputs=dict(dot_plot=Output(type="uri_folder", mode="rw_mount")),
    name="shap_dot_plot_for_classifier_workbench",
    display_name="SHAP dot plot for classifier workbench",
    description="Create a SHAP dot plot for classifier workbench",
)

In [257]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
create_dot_plot_component = ml_client.create_or_update(
    create_dot_plot_component.component
)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        create_dot_plot_component
    )
)

[32mUploading create_dot_plot (0.0 MBs): 100%|██████████| 2125/2125 [00:00<00:00, 74883.81it/s]
[39m



Component shap_dot_plot_for_classifier_workbench with Version 2024-09-20-09-53-59-2570414 is registered


## Shap Bar Plot

In [13]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
create_bar_plot = "./components/create_bar_plot"
os.makedirs(name=create_bar_plot, exist_ok=True)


In [14]:
%%writefile {create_bar_plot}/create_bar_plot.py

import argparse
import json
import os
from pathlib import Path

from eppi_text_classification.shap_plotter import BarPlot
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_np_array_at_directory,
)


def main():
    """Create a SHAP bar plot and save it as ``bar_plot.png``.

    SHAP values, X and feature names are loaded from their folders; the number
    of features to display is read from a JSON file. The plot is written into
    the ``--bar_plot`` folder.
    """
    # input and output arguments (all plain string paths)
    parser = argparse.ArgumentParser()
    for flag, description in (
        # Help text previously said "shap_plotter", but this is the SHAP values folder.
        ("--shap_values", "path to shap values"),
        ("--X", "path to X data to explain model on"),
        ("--feature_names", "path to features names"),
        ("--num_display", "path to the number of features to display on plot"),
        ("--bar_plot", "path to bar plot"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    shap_values = load_csr_at_directory(args.shap_values)
    X = load_csr_at_directory(args.X)
    feature_names = load_np_array_at_directory(args.feature_names, allow_pickle=True)
    with open(args.num_display, "r") as file:
        num_display = int(json.load(file))

    bar_plot = BarPlot(
        shap_values=shap_values,
        X_test=X,
        feature_names=feature_names,
        num_display=num_display,
    )

    print(f"feature_names : {feature_names.shape}")
    print(f"shap_values : {shap_values.shape}")
    print(f"X : {X.shape}")
    bar_plot_path = Path(args.bar_plot) / "bar_plot.png"
    bar_plot.save(bar_plot_path)


if __name__ == "__main__":
    main()


Writing ./components/create_bar_plot/create_bar_plot.py


In [16]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs create_bar_plot.py; every input and the single
# output are folder URIs.
create_bar_plot_component = command(
    # Folder uploaded as the component's source code.
    code=create_bar_plot,
    command="""python create_bar_plot.py \
            --shap_values ${{inputs.shap_values}} \
            --X ${{inputs.X}} \
            --feature_names ${{inputs.feature_names}} \
            --num_display ${{inputs.num_display}} \
            --bar_plot ${{outputs.bar_plot}} \
            """,
    # Environment reference in "name:version" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        shap_values=Input(type="uri_folder"),
        X=Input(type="uri_folder"),
        feature_names=Input(type="uri_folder"),
        num_display=Input(type="uri_folder"),
    ),
    outputs=dict(bar_plot=Output(type="uri_folder", mode="rw_mount")),
    name="create_shap_bar_plot_for_classifier_workbench",
    display_name="SHAP bar plot for classifier workbench",
    description="Create a SHAP bar plot for classifier workbench",
)


In [17]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
create_bar_plot_component = ml_client.create_or_update(
    create_bar_plot_component.component
)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        create_bar_plot_component
    )
)

[32mUploading create_bar_plot (0.0 MBs): 100%|██████████| 1614/1614 [00:00<00:00, 64902.03it/s]
[39m



Component create_shap_bar_plot_for_classifier_workbench with Version 1 is registered


## Decision Plot 

In [77]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
create_decision_plot = "./components/create_decision_plot"
os.makedirs(name=create_decision_plot, exist_ok=True)


In [78]:
%%writefile {create_decision_plot}/create_decision_plot.py

import argparse
import json
import os
from pathlib import Path

from eppi_text_classification.shap_plotter import DecisionPlot
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_np_array_at_directory,
    load_value_from_json_at_directory,
)


def _parse_bool(value):
    """Interpret a decoded JSON value as a boolean flag.

    ``bool("false")`` is True in Python, so textual flags are matched
    explicitly; non-string values fall back to ordinary truthiness.

    Raises:
        ValueError: if a string value is not a recognised true/false spelling.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ("true", "1", "yes", "y", "on"):
            return True
        if text in ("false", "0", "no", "n", "off", ""):
            return False
        raise ValueError(f"Cannot interpret {value!r} as a boolean")
    return bool(value)


def main():
    """Create a SHAP decision plot and save it as ``decision_plot.png``.

    The expected SHAP value and decision threshold are read through
    ``load_value_from_json_at_directory``; SHAP values, X and feature names are
    loaded from their folders; the display count and log-scale switch are each
    read from a JSON file. The plot is written into the ``--decision_plot``
    folder.
    """
    # input and output arguments (all plain string paths)
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--expected_shap_value", "path to expected shap value"),
        ("--threshold", "path to decision threshold for your model"),
        ("--shap_values", "path to shap values"),
        ("--X", "path to X data to explain model on"),
        ("--feature_names", "path to features names"),
        ("--num_display", "path to the number of features to display on plot"),
        ("--log_scale", "path to bool of whether to display plot along log scale"),
        ("--decision_plot", "path to decision plot"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    expected_shap_value = float(
        load_value_from_json_at_directory(args.expected_shap_value)
    )
    threshold = float(load_value_from_json_at_directory(args.threshold))
    shap_values = load_csr_at_directory(args.shap_values)
    X = load_csr_at_directory(args.X)
    feature_names = load_np_array_at_directory(args.feature_names, allow_pickle=True)
    with open(args.num_display, "r") as file:
        num_display = int(json.load(file))
    # The flag goes through _parse_bool so a JSON string such as "false" is
    # not mistaken for True.
    with open(args.log_scale, "r") as file:
        log_scale = _parse_bool(json.load(file))

    decision_plot = DecisionPlot(
        expected_value=expected_shap_value,
        threshold=threshold,
        shap_values=shap_values,
        X_test=X,
        feature_names=feature_names,
        num_display=num_display,
        log_scale=log_scale,
    )

    print(f"expected_shap_value : {expected_shap_value}")
    print(f"threshold : {threshold}")
    print(f"feature_names : {feature_names.shape}")
    print(f"shap_values : {shap_values.shape}")
    print(f"X : {X.shape}")
    decision_plot_path = Path(args.decision_plot) / "decision_plot.png"
    decision_plot.save(decision_plot_path)


if __name__ == "__main__":
    main()

Overwriting ./components/create_decision_plot/create_decision_plot.py


In [79]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs create_decision_plot.py; every input and the
# single output are folder URIs.
create_decision_plot_component = command(
    # Folder uploaded as the component's source code.
    code=create_decision_plot,
    command="""python create_decision_plot.py \
            --expected_shap_value ${{inputs.expected_shap_value}} \
            --threshold ${{inputs.threshold}} \
            --shap_values ${{inputs.shap_values}} \
            --X ${{inputs.X}} \
            --feature_names ${{inputs.feature_names}} \
            --num_display ${{inputs.num_display}} \
            --log_scale ${{inputs.log_scale}} \
            --decision_plot ${{outputs.decision_plot}} \
            """,
    # Environment reference in "name:version" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        expected_shap_value=Input(type="uri_folder"),
        threshold=Input(type="uri_folder"),
        shap_values=Input(type="uri_folder"),
        X=Input(type="uri_folder"),
        feature_names=Input(type="uri_folder"),
        num_display=Input(type="uri_folder"),
        log_scale=Input(type="uri_folder"),
    ),
    outputs=dict(decision_plot=Output(type="uri_folder", mode="rw_mount")),
    name="create_shap_decision_plot_for_classifier_workbench",
    display_name="SHAP decision plot for classifier workbench",
    description="Create a SHAP decision plot for classifier workbench",
)


In [80]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
create_decision_plot_component = ml_client.create_or_update(
    create_decision_plot_component.component
)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        create_decision_plot_component
    )
)

[32mUploading create_decision_plot (0.0 MBs): 100%|██████████| 2568/2568 [00:00<00:00, 95362.23it/s]
[39m



Component create_shap_decision_plot_for_classifier_workbench with Version 2024-09-20-13-21-26-3990053 is registered


## Second split attempt: pass `test_size` as a primitive input


In [2]:
import os

# Local folder that holds this component's script; an already-existing folder is accepted.
split_with_primitive = "./components/split_with_primitive"
os.makedirs(name=split_with_primitive, exist_ok=True)


In [3]:
%%writefile {split_with_primitive}/split_with_primitive.py
import argparse
import json
import os

import jsonpickle
import numpy as np
from scipy.sparse import load_npz, save_npz
from sklearn.model_selection import train_test_split

from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_np_array_at_directory,
)


def main():
    """Split TF-IDF features and labels into stratified train/test sets and save them.

    ``--test_size`` arrives on the command line as a plain number, so argparse
    converts it to a float and it is used directly. It is not a path and must
    not be opened as a file. The four outputs are written as ``X_train.npz``,
    ``X_test.npz``, ``y_train.npy`` and ``y_test.npy`` in their respective
    output folders.
    """
    # input and output arguments
    print("before parse")
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--labels",
        type=str,
        help="path to ordered list of labels",
    )
    parser.add_argument(
        "--tfidf_scores",
        type=str,
        help="path to tfidf scores for data",
    )
    parser.add_argument(
        "--test_size",
        type=float,
        help="test size as a proportion of the data",
    )
    for flag, description in (
        ("--X_train", "path to X_train"),
        ("--X_test", "path to X_test"),
        ("--y_train", "path to y_train"),
        ("--y_test", "path to y_test"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    tfidf_scores = load_csr_at_directory(args.tfidf_scores)
    labels = load_np_array_at_directory(args.labels)
    # Already a float thanks to type=float above; calling open() on it raised TypeError.
    test_size = args.test_size

    print(f"test_size: {test_size}")
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_scores, labels, test_size=test_size, stratify=labels, random_state=8
    )

    save_npz(os.path.join(args.X_train, "X_train.npz"), X_train)
    save_npz(os.path.join(args.X_test, "X_test.npz"), X_test)
    np.save(os.path.join(args.y_train, "y_train.npy"), y_train)
    np.save(os.path.join(args.y_test, "y_test.npy"), y_test)


if __name__ == "__main__":
    main()

Writing ./components/split_with_primitive/split_with_primitive.py


In [10]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs split_with_primitive.py. `test_size` is declared
# as a primitive number input, so its value is substituted directly into the
# command line rather than being passed as a path.
split_with_primitive_component = command(
    name="split_with_primitive_for_classifier_workbench",
    display_name="Split data with primitive into two sets",
    description=(
        "Uses train_test_split to split the data into two sets, storing the split data"
    ),
    inputs={
        "labels": Input(type="uri_folder"),
        "tfidf_scores": Input(type="uri_folder"),
        "test_size": Input(type="number"),
    },
    outputs={
        "X_train": Output(type="uri_folder", mode="rw_mount"),
        "X_test": Output(type="uri_folder", mode="rw_mount"),
        "y_train": Output(type="uri_folder", mode="rw_mount"),
        "y_test": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=split_with_primitive,
    # The script name must match the file written into the code folder
    # (split_with_primitive.py); the folder contains no split_data.py.
    command="""python split_with_primitive.py \
            --labels ${{inputs.labels}} \
            --tfidf_scores ${{inputs.tfidf_scores}} \
            --test_size ${{inputs.test_size}} \
            --X_train ${{outputs.X_train}} \
            --X_test ${{outputs.X_test}} \
            --y_train ${{outputs.y_train}} \
            --y_test ${{outputs.y_test}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [11]:
# Register the component definition in the workspace; the name is rebound to
# the object returned by the service.
split_with_primitive_component = ml_client.create_or_update(
    split_with_primitive_component.component
)

# Report the registered name and version.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        split_with_primitive_component
    )
)

Component split_with_primitive_for_classifier_workbench with Version 2024-09-20-16-49-45-0478564 is registered
