In [139]:
# LOGIN
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# Try the default credential chain first and verify it by requesting a
# management-plane token; on any failure switch to an interactive browser login.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

In [140]:
# Get a handle to the workspace, authenticated with `credential` from the
# login cell.
# NOTE(review): subscription, resource group and workspace are hard-coded
# here, so every later cell targets exactly this workspace; consider moving
# them to a single configuration cell.
ml_client = MLClient(
    credential=credential,
    subscription_id="56539498-d3d8-4a3b-92f4-f3b098a11d1e",
    resource_group_name="continuous_review_ms_and_ucl",
    workspace_name="EPPI_DEV",
)

In [141]:
# Retrieve an existing environment from the workspace.
# `pipeline_job_env` is what the component cells use to build their
# "<name>:<version>" environment reference. A later cell in this notebook
# rebinds the same variable when it (re)registers the environment.
env_name = "aml-eppi-text-classification"
env_version = "0.1.3"  # Specify the version of the environment
pipeline_job_env = ml_client.environments.get(name=env_name, version=env_version)

In [3]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Local TSV file that will be uploaded as a single-file (URI_FILE) data asset.
data_path = "../data/raw/debunking_review.tsv"

# Asset definition only; nothing is sent to the workspace in this cell.
debunking_data = Data(
    path=data_path,
    type=AssetTypes.URI_FILE,
    name="debunking_review_data",
    version="1.0.0",
    description="Dataset for testing sams text classification pipeline",
)

In [None]:
# Upload/register the asset and keep the object returned by the workspace.
debunking_data = ml_client.data.create_or_update(debunking_data)
print(
    "Dataset with name {} was registered to workspace, the dataset version is {}".format(
        debunking_data.name, debunking_data.version
    )
)

In [47]:
import os

# Folder that receives the conda YAML files written by the %%writefile cells;
# created up front so those cells do not fail on a missing directory.
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [26]:
%%writefile {dependencies_dir}/conda.yaml
# Conda specification referenced by the "aml-eppi-text-classification"
# Azure ML environment defined in this notebook.
name: eppi-text-classification-env 
channels:
  - conda-forge 
dependencies:
  - python=3.11.8
  - pip=24.0
  - pip:
    # NOTE(review): "@dev" looks like a moving ref rather than a tag or commit
    # hash - confirm, and pin it if environment builds must be reproducible.
    - git+https://github.com/samjmolyneux/eppi-text-classification.git@dev
    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

Overwriting ./dependencies/conda.yaml


In [28]:
from azure.ai.ml.entities import Environment

# Register the pipeline environment: base image plus the conda spec written
# to <dependencies_dir>/conda.yaml.
custom_env_name = "aml-eppi-text-classification"

pipeline_job_env = ml_client.environments.create_or_update(
    Environment(
        name=custom_env_name,
        version="0.1.3",
        description="Custom environment for eppi classifier workbench pipeline",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    )
)

print(
    "Environment with name {} is registered to workspace, the environment version is {}".format(
        pipeline_job_env.name, pipeline_job_env.version
    )
)

Environment with name aml-eppi-text-classification is registered to workspace, the environment version is 0.1.3


## MLflow env

In [48]:
%%writefile {dependencies_dir}/display_image_env.yaml
# Conda specification referenced by the "display-image-env" Azure ML
# environment defined in this notebook.
name: display-image-env 
channels:
  - conda-forge 
dependencies:
  - python=3.11.8
  - pip=24.0
  - pip:
    - azureml-mlflow==1.42.0  

Writing ./dependencies/display_image_env.yaml


In [51]:
from azure.ai.ml.entities import Environment

# Register the small environment used by the image-display component: base
# image plus the conda spec written to <dependencies_dir>/display_image_env.yaml.
custom_env_name = "display-image-env"

display_image_env = ml_client.environments.create_or_update(
    Environment(
        name=custom_env_name,
        version="0.1.0",
        description="Environment for displaying images in azure ml",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        conda_file=os.path.join(dependencies_dir, "display_image_env.yaml"),
    )
)

print(
    "Environment with name {} is registered to workspace, the environment version is {}".format(
        display_image_env.name, display_image_env.version
    )
)

Environment with name display-image-env is registered to workspace, the environment version is 0.1.0


## Process data component


In [162]:
import os

# Source folder for the data-preparation component; the %%writefile cell
# below writes data_prep.py into it and command(code=...) uploads it.
process_data = "./components/process_data"
os.makedirs(process_data, exist_ok=True)

In [163]:
%%writefile {process_data}/data_prep.py
import argparse
import os

import numpy as np
import pandas as pd
from scipy.sparse import save_npz

from eppi_text_classification import (
    get_features_and_labels,
    get_tfidf_and_names,
)


def main():
    """Turn a tab-separated dataset into labels, TF-IDF scores and feature names.

    Command-line arguments:
        --data: path of the input file, read as tab-separated text.
        --labels: folder that receives ``labels.npy``.
        --tfidf_scores: folder that receives ``tfidf_scores.npz``.
        --feature_names: folder that receives ``feature_names.npy``.
    """
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--data", "path to input dataframe"),
        ("--labels", "path to ordered list of labels"),
        ("--tfidf_scores", "path to tfidf scores for data"),
        ("--feature_names", "path to ordered list of feature names"),
    ):
        cli.add_argument(flag, type=str, help=description)
    args = cli.parse_args()

    # Echo every parsed argument as key=value on one line for the job log.
    print(" ".join(f"{key}={value}" for key, value in vars(args).items()))

    print("input data:", args.data)

    frame = pd.read_csv(args.data, sep="\t")

    word_features, labels = get_features_and_labels(frame)
    tfidf_scores, feature_names = get_tfidf_and_names(word_features)

    print(f"labels: {args.labels}")
    print(f"feature_names: {args.feature_names}")
    print(f"tfidf_scores: {args.tfidf_scores}")

    # Each output argument is a folder; one fixed-name file is written per folder.
    labels_file = os.path.join(args.labels, "labels.npy")
    np.save(labels_file, labels)
    feature_names_file = os.path.join(args.feature_names, "feature_names.npy")
    np.save(feature_names_file, feature_names)
    tfidf_file = os.path.join(args.tfidf_scores, "tfidf_scores.npz")
    save_npz(tfidf_file, tfidf_scores)


if __name__ == "__main__":
    main()

Overwriting ./components/process_data/data_prep.py


In [164]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping data_prep.py: one TSV file in, three folders out.
# NOTE: `name` keeps its existing spelling ("prepocessing") because it is the
# identifier the component is registered under; only the free-text
# description had its typo ("tfirf") corrected.
data_prep_component = command(
    name="data_prepocessing_for_classifier_workbench",
    display_name="Data preprocessing for eppi classifier workbench",
    description="Tokenizes and processes text data using spaCy then generates tfidf",
    inputs={
        "data": Input(type="uri_file"),
    },
    outputs={
        "labels": Output(type="uri_folder", mode="rw_mount"),
        "feature_names": Output(type="uri_folder", mode="rw_mount"),
        "tfidf_scores": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=process_data,
    command="""python data_prep.py \
            --data ${{inputs.data}} \
            --labels ${{outputs.labels}} \
            --tfidf_scores ${{outputs.tfidf_scores}} \
            --feature_names ${{outputs.feature_names}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [165]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
data_prep_component = ml_client.create_or_update(data_prep_component.component)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        data_prep_component.name, data_prep_component.version
    )
)

Component data_prepocessing_for_classifier_workbench with Version 2024-09-19-08-41-39-5264346 is registered


## Search Parameters Data


In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Register the search-parameter JSON as a versioned single-file data asset.
# NOTE: this path is written by a later cell of this notebook (the
# "HyperParamSearch Data Input" section), so unless the file already exists
# on disk that cell has to be run before this one.
data_path = "user_inputs/hyperparam_search_input.json"

search_params = ml_client.data.create_or_update(
    Data(
        path=data_path,
        type=AssetTypes.URI_FILE,
        name="hyperparameter_search_parameter_placeholder",
        version="1.0.0",
        description=(
            "Place holder for the hyperparameter search parameters for eppi classifier"
            " workbench"
        ),
    )
)
print(
    "Dataset with name {} was registered to workspace, the dataset version is {}".format(
        search_params.name, search_params.version
    )
)

## Hyperparameter search component

In [205]:
import os

# Source folder for the hyperparameter-search component; the %%writefile cell
# below writes optuna_search.py into it and command(code=...) uploads it.
hyperparameter_search = "./components/hyperparameter_search"
os.makedirs(hyperparameter_search, exist_ok=True)

In [206]:
%%writefile {hyperparameter_search}/optuna_search.py
import argparse
import os
from dataclasses import asdict

import json
import jsonpickle
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
import time
from eppi_text_classification import OptunaHyperparameterOptimisation


def main():
    """Run an Optuna hyperparameter search and store the best parameters.

    Command-line arguments:
        --labels: folder containing ``labels.npy``.
        --tfidf_scores: folder containing ``tfidf_scores.npz``.
        --search_parameters: file with jsonpickle-encoded search settings.
            ``model_name`` and ``num_trials_per_job`` are mandatory keys;
            ``n_folds`` (default 3) and ``num_cv_repeats`` (default 1) are
            optional.
        --best_params: folder that receives ``model_params.json``.
        --search_db: folder in which the ``optuna.db`` sqlite file is placed.
    """
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--labels", "path to ordered list of labels"),
        ("--tfidf_scores", "path to tfidf scores for data"),
        ("--search_parameters", "path to search parameters for the optuna search"),
        ("--best_params", "path to best hypereparameters found by the search"),
        ("--search_db", "path to optuna search database"),
    ):
        cli.add_argument(flag, type=str, help=description)
    args = cli.parse_args()

    tfidf_scores = load_npz(os.path.join(args.tfidf_scores, "tfidf_scores.npz"))
    labels = np.load(os.path.join(args.labels, "labels.npy"))
    with open(args.search_parameters, "r") as file:
        kwargs = jsonpickle.decode(file.read())

    optuna_db_path = os.path.join(args.search_db, "optuna.db")
    print(f"optuna_db_path: {optuna_db_path}")

    # Mandatory settings are indexed directly (KeyError if absent); the
    # optional ones fall back to their defaults.
    model_name = kwargs["model_name"]
    num_trials_per_job = kwargs["num_trials_per_job"]
    n_folds = kwargs.get("n_folds", 3)
    num_cv_repeats = kwargs.get("num_cv_repeats", 1)
    print(f"model_name: {model_name}")
    print(f"num_trials_per_job: {num_trials_per_job}")
    print(f"n_folds: {n_folds}")
    print(f"num_cv_repeats: {num_cv_repeats}")

    # The study history is kept in a sqlite file inside the search_db folder.
    # (An earlier variant of this script pointed at sqlite:////mnt/optuna.db.)
    optimiser = OptunaHyperparameterOptimisation(
        tfidf_scores,
        labels,
        model_name,
        n_trials_per_job=num_trials_per_job,
        n_jobs=-1,
        nfolds=n_folds,
        num_cv_repeats=num_cv_repeats,
        db_url=f"sqlite:///{optuna_db_path}",
    )

    started_at = time.time()
    best_params = optimiser.optimise_hyperparameters(study_name="hyperparam_search")
    print(f"Time taken: {time.time() - started_at}")

    # The model name travels with the parameters. The dict is jsonpickle-encoded
    # to a string first and that string is then written with json.dump, so a
    # reader has to json.load the file and jsonpickle.decode the result.
    best_params["model_name"] = model_name
    encoded_params = jsonpickle.encode(best_params, keys=True)
    best_param_path = os.path.join(args.best_params, "model_params.json")
    with open(best_param_path, "w") as f:
        json.dump(encoded_params, f)


if __name__ == "__main__":
    main()

Overwriting ./components/hyperparameter_search/optuna_search.py


In [207]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping optuna_search.py: two feature folders and a
# settings file in; best-parameter folder and search-database folder out.
hyperparameter_search_component = command(
    name="hyperparameter_search_for_classifier_workbench",
    display_name="Hyperparameter search for eppi classifier workbench",
    description=(
        "Uses parallel optuna to search for best hyperparameters for a given "
        "model, storing the history on a sqlite database"
    ),
    inputs=dict(
        labels=Input(type="uri_folder"),
        tfidf_scores=Input(type="uri_folder"),
        search_parameters=Input(type="uri_file"),
    ),
    outputs=dict(
        best_params=Output(type="uri_folder", mode="rw_mount"),
        search_db=Output(type="uri_folder", mode="rw_mount"),
    ),
    # Folder uploaded as the component's source code.
    code=hyperparameter_search,
    command="""python optuna_search.py \
            --labels ${{inputs.labels}} \
            --tfidf_scores ${{inputs.tfidf_scores}} \
            --search_parameters ${{inputs.search_parameters}} \
            --best_params ${{outputs.best_params}} \
            --search_db ${{outputs.search_db}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
)

In [208]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
hyperparameter_search_component = ml_client.create_or_update(
    hyperparameter_search_component.component
)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        hyperparameter_search_component.name,
        hyperparameter_search_component.version,
    )
)

[32mUploading hyperparameter_search (0.0 MBs): 100%|██████████| 2817/2817 [00:00<00:00, 66313.57it/s]
[39m



Component hyperparameter_search_for_classifier_workbench with Version 2024-09-19-09-50-22-9031301 is registered


## HyperParamSearch Data Input

In [None]:
# Example search-parameter payload. It holds the same values as the dict that
# the next cell serialises to user_inputs/hyperparam_search_input.json;
# running this cell only displays the dict.
{
    "model_name": "LGBMClassifier",
    "num_trials_per_job": 3,
    "n_folds": 3,
    "num_cv_repeats": 1,
}

In [70]:
import json
import os

import jsonpickle

# Search settings in the shape optuna_search.py reads: "model_name" and
# "num_trials_per_job" are looked up unconditionally there, while "n_folds"
# and "num_cv_repeats" are optional.
hyperparam_search_input = {
    "model_name": "LGBMClassifier",
    "num_trials_per_job": 3,
    "n_folds": 3,
    "num_cv_repeats": 1,
}

serialized_input = jsonpickle.encode(hyperparam_search_input, keys=True)

# Create the target folder first (as is done for every other output folder in
# this notebook) so the write does not fail when "user_inputs" is missing.
os.makedirs("user_inputs", exist_ok=True)

# The jsonpickle string is written as-is; the reader decodes it with
# jsonpickle.decode on the raw file contents.
with open("user_inputs/hyperparam_search_input.json", "w") as file:
    file.write(serialized_input)

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# NOTE: this repeats the "debunking_review_data" registration performed near
# the top of the notebook (same name, path and version).
data_path = "../data/raw/debunking_review.tsv"

debunking_data = ml_client.data.create_or_update(
    Data(
        path=data_path,
        type=AssetTypes.URI_FILE,
        name="debunking_review_data",
        version="1.0.0",
        description="Dataset for testing sams text classification pipeline",
    )
)
print(
    "Dataset with name {} was registered to workspace, the dataset version is {}".format(
        debunking_data.name, debunking_data.version
    )
)

## Test size data input

In [3]:
import json
import os

# Create the target folder first (as is done for every other output folder in
# this notebook) so the writes do not fail when "user_inputs" is missing.
os.makedirs("user_inputs", exist_ok=True)

# The values are stored as JSON strings; split_data.py reads them back with
# float(json.load(file)).
with open("user_inputs/test_size_025.json", "w") as file:
    json.dump("0.25", file)
with open("user_inputs/test_size_05.json", "w") as file:
    json.dump("0.5", file)

In [74]:
import json
import os

# Create the target folder first (as is done for every other output folder in
# this notebook) so the write does not fail when "user_inputs" is missing.
os.makedirs("user_inputs", exist_ok=True)

# Stored as the JSON string "1"; the component scripts in this notebook that
# read such files convert the loaded value with float(...).
with open("user_inputs/float_1.json", "w") as file:
    json.dump("1", file)

In [75]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Register the float placeholder file as a versioned single-file data asset.
data_path = "user_inputs/float_1.json"

user_input_float = ml_client.data.create_or_update(
    Data(
        path=data_path,
        type=AssetTypes.URI_FILE,
        name="user_input_float_1",
        version="1.0.0",
        description="A placeholder for user input of float 1",
    )
)
print(
    "Dataset with name {} was registered to workspace, the dataset version is {}".format(
        user_input_float.name, user_input_float.version
    )
)

# data_path = "user_inputs/test_size_05.json"

# test_size = Data(
#     name="user_input_test_size_05",
#     path=data_path,
#     type=AssetTypes.URI_FILE,
#     description="A placeholder for user input for the test size",
#     version="1.0.0",
# )
# test_size = ml_client.data.create_or_update(test_size)
# print(
#     f"Dataset with name {test_size.name} was registered to workspace, the dataset version is {test_size.version}"
# )

[32mUploading float_1.json[32m (< 1 MB): 100%|██████████| 3.00/3.00 [00:00<00:00, 120B/s]
[39m



Dataset with name user_input_float_1 was registered to workspace, the dataset version is 1.0.0


## Split Training Data

In [170]:
import os

# Source folder for the split-data component; the %%writefile cell below
# writes split_data.py into it and command(code=...) uploads it.
split_data = "./components/split_data"
os.makedirs(split_data, exist_ok=True)

In [171]:
%%writefile {split_data}/split_data.py
import argparse
import os
import json
import jsonpickle
import numpy as np
from scipy.sparse import load_npz, save_npz
from sklearn.model_selection import train_test_split


def main():
    """Split the TF-IDF matrix and labels into stratified train/test sets.

    Command-line arguments (all required):
        --labels: folder containing ``labels.npy``.
        --tfidf_scores: folder containing ``tfidf_scores.npz``.
        --test_size: JSON file whose value converts to a float; passed to
            ``train_test_split`` as ``test_size``.
        --X_train / --X_test: folders that receive ``X_train.npz`` and
            ``X_test.npz``.
        --y_train / --y_test: folders that receive ``y_train.npy`` and
            ``y_test.npy``.
    """
    parser = argparse.ArgumentParser()
    # Every argument is used unconditionally below, so a missing one is
    # reported by argparse instead of surfacing later as a TypeError on None.
    for flag, help_text in (
        ("--labels", "path to ordered list of labels"),
        ("--tfidf_scores", "path to tfidf scores for data"),
        ("--test_size", "path to the test size as a proportion of the data"),
        ("--X_train", "path to X_train"),
        ("--X_test", "path to X_test"),
        ("--y_train", "path to y_train"),
        ("--y_test", "path to y_test"),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()

    tfidf_scores = load_npz(os.path.join(args.tfidf_scores, "tfidf_scores.npz"))
    labels = np.load(os.path.join(args.labels, "labels.npy"))
    with open(args.test_size, "r") as file:
        test_size = float(json.load(file))

    print(f"test_size: {test_size}")
    # Stratify on the labels and fix the seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_scores, labels, test_size=test_size, stratify=labels, random_state=8
    )

    # Each output argument is a folder; one fixed-name file is written per folder.
    save_npz(os.path.join(args.X_train, "X_train.npz"), X_train)
    save_npz(os.path.join(args.X_test, "X_test.npz"), X_test)
    np.save(os.path.join(args.y_train, "y_train.npy"), y_train)
    np.save(os.path.join(args.y_test, "y_test.npy"), y_test)


if __name__ == "__main__":
    main()

Overwriting ./components/split_data/split_data.py


In [172]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping split_data.py: feature folders and a test-size
# file in; four folders (train/test features and labels) out.
split_data_component = command(
    name="split_data_for_classifier_workbench",
    display_name="Split data into two sets",
    description=(
        "Uses train_test_split to split the data into two sets, storing the split data"
    ),
    inputs=dict(
        labels=Input(type="uri_folder"),
        tfidf_scores=Input(type="uri_folder"),
        test_size=Input(type="uri_file"),
    ),
    outputs=dict(
        X_train=Output(type="uri_folder", mode="rw_mount"),
        X_test=Output(type="uri_folder", mode="rw_mount"),
        y_train=Output(type="uri_folder", mode="rw_mount"),
        y_test=Output(type="uri_folder", mode="rw_mount"),
    ),
    # Folder uploaded as the component's source code.
    code=split_data,
    command="""python split_data.py \
            --labels ${{inputs.labels}} \
            --tfidf_scores ${{inputs.tfidf_scores}} \
            --test_size ${{inputs.test_size}} \
            --X_train ${{outputs.X_train}} \
            --X_test ${{outputs.X_test}} \
            --y_train ${{outputs.y_train}} \
            --y_test ${{outputs.y_test}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
)

In [173]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
split_data_component = ml_client.create_or_update(split_data_component.component)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        split_data_component.name, split_data_component.version
    )
)

Component split_data_for_classifier_workbench with Version 2024-09-19-08-42-09-3319537 is registered


## Train Model


In [9]:
import os

# Source folder for the model-training component; the %%writefile cell below
# writes train_model.py into it and command(code=...) uploads it.
train_model = "./components/train_model"
os.makedirs(train_model, exist_ok=True)

In [10]:
%%writefile {train_model}/train_model.py

import argparse
import os

import json
import jsonpickle
import numpy as np
from lightgbm import LGBMClassifier
from scipy.sparse import load_npz
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import joblib

# Lookup from the "model_name" stored with the parameters to the estimator
# class to instantiate; each key is the class's own name.
mname_to_mclass = {
    model_class.__name__: model_class
    for model_class in (
        SVC,
        LGBMClassifier,
        RandomForestClassifier,
        XGBClassifier,
    )
}


def main():
    """Fit the configured classifier on the training split and save it.

    Command-line arguments (all required):
        --X_train: folder containing ``X_train.npz``.
        --y_train: folder containing ``y_train.npy``.
        --model_parameters: folder containing ``model_params.json``, or the
            path of the JSON file itself.
        --model: folder that receives ``model.joblib``.

    The parameter file must hold a ``model_name`` entry that is a key of
    ``mname_to_mclass``; all remaining entries are passed to the model
    constructor as keyword arguments.
    """
    parser = argparse.ArgumentParser()
    # Every argument is used unconditionally below, so a missing one is
    # reported by argparse instead of surfacing later as a TypeError on None.
    for flag, help_text in (
        ("--X_train", "path to training data"),
        ("--y_train", "path to training labels"),
        ("--model_parameters", "path to model training parameters"),
        ("--model", "path to trained model"),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()

    X_train = load_npz(os.path.join(args.X_train, "X_train.npz"))
    y_train = np.load(os.path.join(args.y_train, "y_train.npy"))

    # The component declares this input as a single file, while the parameters
    # are produced as "model_params.json" inside a folder. Accept both: only
    # append the file name when a directory was handed in.
    model_params_path = args.model_parameters
    if os.path.isdir(model_params_path):
        model_params_path = os.path.join(model_params_path, "model_params.json")
    with open(model_params_path, "r") as file:
        json_model_parameters = json.load(file)
    # The file holds a JSON string that itself is jsonpickle-encoded, hence
    # the second decoding step.
    model_parameters = jsonpickle.decode(json_model_parameters)

    # "model_name" selects the estimator class and must not be forwarded to it.
    model_class = mname_to_mclass[model_parameters.pop("model_name")]
    model = model_class(**model_parameters)

    model.fit(X_train, y_train)

    joblib.dump(model, os.path.join(args.model, "model.joblib"))


if __name__ == "__main__":
    main()

Overwriting ./components/train_model/train_model.py


In [11]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping train_model.py: training data, labels and model
# parameters in; one folder holding the fitted model out.
train_model_component = command(
    name="fit_model_for_classifier_workbench",
    display_name="fit_model_for_classifier_workbench",
    description=(
        "Trains a model for classifier workbench using given data and model parameters"
    ),
    inputs={
        "X_train": Input(type="uri_folder"),
        "y_train": Input(type="uri_folder"),
        # train_model.py looks for "model_params.json" inside this path, and
        # the hyperparameter-search component emits its best parameters as a
        # folder, so the input is declared as a folder (it was "uri_file").
        "model_parameters": Input(type="uri_folder"),
    },
    outputs={
        "model": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=train_model,
    command="""python train_model.py \
            --X_train ${{inputs.X_train}} \
            --y_train ${{inputs.y_train}} \
            --model_parameters ${{inputs.model_parameters}} \
            --model ${{outputs.model}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [12]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
train_model_component = ml_client.create_or_update(train_model_component.component)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        train_model_component.name, train_model_component.version
    )
)

Component fit_model_for_classifier_workbench with Version 2024-09-19-12-09-40-4125287 is registered


## Predict Scores component

In [17]:
import os

# Source folder for the score-prediction component; the %%writefile cell
# below writes predict_scores.py into it and command(code=...) uploads it.
predict_scores = "./components/predict_scores"
os.makedirs(predict_scores, exist_ok=True)

In [19]:
%%writefile {predict_scores}/predict_scores.py

import argparse
import os

import numpy as np

from eppi_text_classification import predict_scores
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
)


def main():
    """Score a feature matrix with a saved model and store the result.

    Command-line arguments:
        --X: folder holding the feature matrix (read with
            ``load_csr_at_directory``).
        --model: folder holding the model (read with
            ``load_joblib_model_at_directory``).
        --y_pred_probs: folder that receives ``y_pred_probs.npy``.
    """
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--X", "path to prediction data"),
        ("--y_pred_probs", "path to the predicted probabilities"),
        ("--model", "path to trained model"),
    ):
        cli.add_argument(flag, type=str, help=description)
    args = cli.parse_args()

    features = load_csr_at_directory(args.X)
    fitted_model = load_joblib_model_at_directory(args.model)

    scores = predict_scores(fitted_model, features)

    output_file = os.path.join(args.y_pred_probs, "y_pred_probs.npy")
    np.save(output_file, scores)


if __name__ == "__main__":
    main()

Overwriting ./components/predict_scores/predict_scores.py


In [20]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping predict_scores.py: feature folder and model
# folder in; one folder with the predicted scores out.
predict_probabilities_component = command(
    name="predict_probabilities_for_eppi_classifier_workbench",
    display_name="predict_probabilities_for_eppi_classifier_workbench",
    description=(
        "Takes a model from the eppi classifier workbench and uses it to predict "
        "probabilities"
    ),
    inputs=dict(
        X=Input(type="uri_folder"),
        model=Input(type="uri_folder"),
    ),
    outputs=dict(
        y_pred_probs=Output(type="uri_folder", mode="rw_mount"),
    ),
    # Folder uploaded as the component's source code.
    code=predict_scores,
    command="""python predict_scores.py \
            --X ${{inputs.X}} \
            --model ${{inputs.model}} \
            --y_pred_probs ${{outputs.y_pred_probs}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
)

In [21]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
predict_probabilities_component = ml_client.create_or_update(
    predict_probabilities_component.component
)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        predict_probabilities_component.name,
        predict_probabilities_component.version,
    )
)

[32mUploading predict_scores (0.0 MBs): 100%|██████████| 939/939 [00:00<00:00, 48449.40it/s]
[39m



Component predict_probabilities_for_eppi_classifier_workbench with Version 2024-09-19-13-09-53-6822957 is registered


## Plotly ROC component

In [34]:
import os

# Source folder for the ROC-plot component; the %%writefile cell below writes
# plotly_roc.py into it and command(code=...) uploads it.
plotly_roc = "./components/plotly_roc"
os.makedirs(plotly_roc, exist_ok=True)

In [37]:
%%writefile {plotly_roc}/plotly_roc.py

import argparse
import os
from pathlib import Path

from eppi_text_classification import plotly_roc
from eppi_text_classification.utils import load_np_array_at_directory


def main():
    """Draw a ROC plot from labels and predicted scores and save it as HTML.

    Command-line arguments:
        --y: folder holding the labels array.
        --y_pred_probs: folder holding the predicted-probabilities array.
        --roc_plot: folder that receives ``roc_plot.html``.
    """
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--y", "path to labels"),
        ("--y_pred_probs", "path to the predicted probabilities"),
        ("--roc_plot", "path to the roc plot"),
    ):
        cli.add_argument(flag, type=str, help=description)
    args = cli.parse_args()

    y = load_np_array_at_directory(args.y)
    y_pred_probs = load_np_array_at_directory(args.y_pred_probs)

    # The plot is always written under a fixed file name inside the output folder.
    destination = Path(args.roc_plot).joinpath("roc_plot.html")
    plotly_roc(y, y_pred_probs, save_path=destination)


if __name__ == "__main__":
    main()

Overwriting ./components/plotly_roc/plotly_roc.py


In [38]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping plotly_roc.py: label folder and score folder in;
# one folder with the rendered ROC plot out.
plotly_roc_component = command(
    name="roc_plot_for_eppi_classifier_workbench",
    display_name="ROC plot for eppi classifier workbench",
    description="Plots ROC curve for given labels and predicted probabilities",
    inputs=dict(
        y=Input(type="uri_folder"),
        y_pred_probs=Input(type="uri_folder"),
    ),
    outputs=dict(
        roc_plot=Output(type="uri_folder", mode="rw_mount"),
    ),
    # Folder uploaded as the component's source code.
    code=plotly_roc,
    command="""python plotly_roc.py \
            --y ${{inputs.y}} \
            --y_pred_probs ${{inputs.y_pred_probs}} \
            --roc_plot ${{outputs.roc_plot}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
)

In [39]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
plotly_roc_component = ml_client.create_or_update(plotly_roc_component.component)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        plotly_roc_component.name, plotly_roc_component.version
    )
)

[32mUploading plotly_roc (0.0 MBs): 100%|██████████| 882/882 [00:00<00:00, 30044.72it/s]
[39m



Component roc_plot_for_eppi_classifier_workbench with Version 1 is registered


## View HTML image component


In [58]:
import os

# Source folder for the image-display component; the %%writefile cell below
# writes view_html_image.py into it and command(code=...) uploads it.
view_html_image = "./components/view_html_image"
os.makedirs(view_html_image, exist_ok=True)

In [63]:
%%writefile {view_html_image}/view_html_image.py

import argparse
import os
import mlflow


def main():
    """Log one entry of the ``--image`` folder as an MLflow artifact.

    Command-line arguments:
        --image: folder containing the file to log.

    Raises:
        FileNotFoundError: if the folder contains no entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--image",
        type=str,
        help="path to image",
    )
    args = parser.parse_args()

    # os.listdir returns entries in arbitrary order, so sort them to make the
    # choice deterministic when the folder holds more than one entry.
    entries = sorted(os.listdir(args.image))
    if not entries:
        # Fail with an explicit message instead of a bare IndexError.
        raise FileNotFoundError(f"No files found in image folder: {args.image}")

    image_path = os.path.join(args.image, entries[0])

    mlflow.log_artifact(image_path)


if __name__ == "__main__":
    main()

Overwriting ./components/view_html_image/view_html_image.py


In [62]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping view_html_image.py: one folder in, no declared
# outputs. Runs in the display-image environment rather than the pipeline one.
view_html_image_component = command(
    name="view_html_image",
    display_name="Display image from html file in logs",
    description="Display image from html file in logs",
    inputs=dict(
        image=Input(type="uri_folder"),
    ),
    # Folder uploaded as the component's source code.
    code=view_html_image,
    command="""python view_html_image.py \
            --image ${{inputs.image}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment="{}:{}".format(display_image_env.name, display_image_env.version),
)

In [64]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
view_html_image_component = ml_client.create_or_update(
    view_html_image_component.component
)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        view_html_image_component.name, view_html_image_component.version
    )
)

Component view_html_image with Version 2024-09-19-14-53-48-5180406 is registered


## Get raw threshold


In [69]:
import os

# Source folder for the threshold component; the %%writefile cell below
# writes get_threshold.py into it and command(code=...) uploads it.
get_threshold = "./components/get_threshold"
os.makedirs(get_threshold, exist_ok=True)

In [85]:
%%writefile {get_threshold}/get_threshold.py

import argparse
import json
import os

from eppi_text_classification import get_raw_threshold
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
    load_np_array_at_directory,
)


def main():
    """Compute the raw classification threshold for a target true positive rate.

    Command-line arguments:
        --X: folder holding the feature matrix.
        --y: folder holding the labels array.
        --model: folder holding the saved model.
        --target_tpr: JSON file whose value converts to a float.
        --threshold: folder that receives ``threshold.json``.
    """
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--X", "path to X data"),
        ("--y", "path to y data"),
        ("--model", "path to model"),
        ("--target_tpr", "path to target true positive rate"),
        ("--threshold", "path to threshold"),
    ):
        cli.add_argument(flag, type=str, help=description)
    args = cli.parse_args()

    model = load_joblib_model_at_directory(args.model)
    X = load_csr_at_directory(args.X)
    y = load_np_array_at_directory(args.y)
    # Unlike the other inputs, the target TPR path is opened directly as a file.
    with open(args.target_tpr) as tpr_file:
        target_tpr = float(json.load(tpr_file))

    threshold = get_raw_threshold(model, X, y, target_tpr)

    print(f"threshold: {threshold}")
    threshold_file = os.path.join(args.threshold, "threshold.json")
    with open(threshold_file, "w") as out_file:
        json.dump(threshold, out_file)


if __name__ == "__main__":
    main()

Overwriting ./components/get_threshold/get_threshold.py


In [86]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component wrapping get_threshold.py: features, labels, model and a
# target-TPR file in; one folder holding the computed threshold out.
get_threshold_component = command(
    name="get_classification_threshold_for_classifier_workbench",
    display_name="Get the classification threshold for a given TPR",
    description=(
        "For a given desired true positive rate, get the classification threshold"
    ),
    inputs={
        "y": Input(type="uri_folder"),
        "X": Input(type="uri_folder"),
        "model": Input(type="uri_folder"),
        # get_threshold.py opens this path directly as a JSON file, and the
        # float placeholder asset in this notebook is registered as URI_FILE,
        # so the input is declared as a file (it was "uri_folder").
        "target_tpr": Input(type="uri_file"),
    },
    outputs={
        "threshold": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=get_threshold,
    command="""python get_threshold.py \
            --y ${{inputs.y}} \
            --X ${{inputs.X}} \
            --model ${{inputs.model}} \
            --target_tpr ${{inputs.target_tpr}} \
            --threshold ${{outputs.threshold}} \
            """,
    # Environment reference in "<name>:<version>" form.
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [87]:
# Register the component definition in the workspace and keep the returned
# (registered) component under the same name.
get_threshold_component = ml_client.create_or_update(get_threshold_component.component)

# Report the name and version of the registered component.
print(
    "Component {} with Version {} is registered".format(
        get_threshold_component.name, get_threshold_component.version
    )
)

[32mUploading get_threshold (0.0 MBs): 100%|██████████| 1341/1341 [00:00<00:00, 41354.33it/s]
[39m



Component get_classification_threshold_for_classifier_workbench with Version 2024-09-19-16-14-31-9634317 is registered


## Threshold Predict component



In [108]:
import os

# Source folder for the threshold-prediction component; the %%writefile cell
# below writes threshold_predict.py into it and command(code=...) uploads it.
# NOTE(review): a later cell assigns the result of command(...) to this same
# name while passing `code=threshold_predict`. After that cell has run, the
# name no longer holds this path, so re-running it without re-running this
# cell would pass the component object as `code`.
threshold_predict = "./components/threshold_predict"
os.makedirs(threshold_predict, exist_ok=True)

In [109]:
%%writefile {threshold_predict}/threshold_predict.py

import argparse
import json
import os

import numpy as np

from eppi_text_classification import raw_threshold_predict
from eppi_text_classification.utils import (
    load_csr_at_directory,
    load_joblib_model_at_directory,
    load_np_array_at_directory,
    load_value_from_json_at_directory,
)


def main():
    """Apply a stored threshold to a model's predictions and save the result.

    Reads the model, the feature matrix X and the threshold from the folders
    named on the command line, passes them to ``raw_threshold_predict`` and
    writes the returned array to ``y_pred.npy`` inside the ``--y_pred`` folder.
    """
    # Every flag takes a path string; each is paired with its help text.
    path_flags = (
        ("--X", "path to X data"),
        ("--model", "path to model"),
        ("--threshold", "path to threshold"),
        ("--y_pred", "path to y predictions"),
    )
    cli = argparse.ArgumentParser()
    for flag, help_text in path_flags:
        cli.add_argument(flag, type=str, help=help_text)
    opts = cli.parse_args()

    classifier = load_joblib_model_at_directory(opts.model)
    features = load_csr_at_directory(opts.X)
    # The value read from JSON is coerced to float before it is used.
    cutoff = float(load_value_from_json_at_directory(opts.threshold))

    predictions = raw_threshold_predict(classifier, features, cutoff)

    np.save(os.path.join(opts.y_pred, "y_pred.npy"), predictions)


# Run the component only when executed as a script.
if __name__ == "__main__":
    main()

Overwriting ./components/threshold_predict/threshold_predict.py


In [110]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs threshold_predict.py.
#
# This assignment rebinds `threshold_predict`, which until now held the source
# folder path, to the command object. The folder is therefore passed to `code`
# as a literal rather than through the name being overwritten: on a fresh run
# the value is the same, and re-running this cell no longer hands the
# previously built command (or registered component) object to `code`.
threshold_predict = command(
    name="predict_given_threshold_for_classifier_workbench",
    display_name="Predict given a threshold for classifier workbench model",
    description=("Predict given a threshold for classifier workbench model"),
    inputs={
        "X": Input(type="uri_folder"),
        "model": Input(type="uri_folder"),
        "threshold": Input(type="uri_folder"),
    },
    outputs={
        "y_pred": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component; must stay equal to the folder that
    # threshold_predict.py is written to.
    code="./components/threshold_predict",
    # Each backslash-newline inside the literal is a line continuation, so the
    # command is passed on as one single line.
    command="""python threshold_predict.py \
            --X ${{inputs.X}} \
            --model ${{inputs.model}} \
            --threshold ${{inputs.threshold}} \
            --y_pred ${{outputs.y_pred}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [111]:
# Register the command's underlying component in the workspace and rebind the
# name to the object returned by the registration call.
threshold_predict = ml_client.create_or_update(
    threshold_predict.component
)

# Report the name and version of the registered component.
print(
    f"Component {threshold_predict.name} "
    f"with Version {threshold_predict.version} is registered"
)

Component predict_given_threshold_for_classifier_workbench with Version 2024-09-19-17-29-28-1618210 is registered


## Plotly confusion plots


In [143]:
import os

# Folder for the confusion-plot component's source; created if missing and
# left untouched when it already exists.
plotly_confusion = "./components/plotly_confusion"
os.makedirs(name=plotly_confusion, exist_ok=True)

In [157]:
%%writefile {plotly_confusion}/plotly_confusion.py

import argparse
import os
from pathlib import Path

from eppi_text_classification import (
    binary_train_valid_confusion_plotly,
    binary_train_valid_test_confusion_plotly,
)
from eppi_text_classification.utils import load_np_array_at_directory


def main():
    """Write train/validation (and optionally test) confusion plots to HTML.

    Each ``--y_*`` flag is a path handed to ``load_np_array_at_directory``.
    ``--y_test`` and ``--y_test_pred`` are optional and must be given
    together: when both are present the three-panel plot function is called,
    otherwise the two-panel one. The plot is saved as ``confusion_plot.html``
    inside the ``--confusion_plot`` folder.

    Exits through ``parser.error`` when only one of the two test flags is
    supplied.
    """
    # input and output arguments
    parser = argparse.ArgumentParser()
    path_flags = (
        ("--y_train", "path to y_train"),
        ("--y_train_pred", "path to y_train_pred"),
        ("--y_val", "path to y_val"),
        ("--y_val_pred", "path to y_val_pred"),
        ("--y_test", "path to y_test"),
        ("--y_test_pred", "path to y_test_pred"),
        ("--confusion_plot", "path to confusion plot"),
    )
    for flag, help_text in path_flags:
        parser.add_argument(flag, type=str, help=help_text)
    args = parser.parse_args()

    # The test inputs are optional, so decide whether they were supplied
    # before touching them; a half-specified pair cannot be plotted.
    has_test = bool(args.y_test)
    if has_test != bool(args.y_test_pred):
        parser.error("--y_test and --y_test_pred must be provided together")

    y_train = load_np_array_at_directory(args.y_train)
    y_train_pred = load_np_array_at_directory(args.y_train_pred)
    y_val = load_np_array_at_directory(args.y_val)
    y_val_pred = load_np_array_at_directory(args.y_val_pred)

    save_path = Path(args.confusion_plot) / "confusion_plot.html"

    # NOTE(review): `postive_label` is spelled exactly as in the existing
    # calls; presumably it matches the library's parameter name. Confirm
    # before renaming.
    if not has_test:
        binary_train_valid_confusion_plotly(
            y_train,
            y_train_pred,
            y_val,
            y_val_pred,
            postive_label="Included",
            negative_label="Excluded",
            save_path=save_path,
        )
        return

    # Only load the test arrays once both paths are known to be present.
    y_test = load_np_array_at_directory(args.y_test)
    y_test_pred = load_np_array_at_directory(args.y_test_pred)
    binary_train_valid_test_confusion_plotly(
        y_train,
        y_train_pred,
        y_val,
        y_val_pred,
        y_test,
        y_test_pred,
        postive_label="Included",
        negative_label="Excluded",
        save_path=save_path,
    )


# Run the component only when executed as a script.
if __name__ == "__main__":
    main()

Overwriting ./components/plotly_confusion/plotly_confusion.py


In [161]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs plotly_confusion.py. The train and validation
# inputs are required; the two test inputs are declared optional.
plotly_confusion_component = command(
    name="confusion_plot_for_classifier_workbench",
    display_name="Confusion Matrix Plot",
    # Adjacent string literals are joined with nothing in between, so the
    # first piece ends with a space to keep "whether test" as two words.
    description=(
        "Confusion matrix that plots three or two confusion plots based on whether "
        "test data is provided"
    ),
    inputs={
        "y_train": Input(type="uri_folder"),
        "y_train_pred": Input(type="uri_folder"),
        "y_val": Input(type="uri_folder"),
        "y_val_pred": Input(type="uri_folder"),
        "y_test": Input(type="uri_folder", optional=True),
        "y_test_pred": Input(type="uri_folder", optional=True),
    },
    outputs={
        "confusion_plot": Output(type="uri_folder", mode="rw_mount"),
    },
    # The source folder of the component
    code=plotly_confusion,
    # The optional inputs are wrapped in $[[...]]; per the Azure ML command
    # syntax the wrapped fragment is left out when the input is not supplied.
    command="""python plotly_confusion.py \
            --y_train ${{inputs.y_train}} \
            --y_train_pred ${{inputs.y_train_pred}} \
            --y_val ${{inputs.y_val}} \
            --y_val_pred ${{inputs.y_val_pred}} \
            $[[--y_test ${{inputs.y_test}}]] \
            $[[--y_test_pred ${{inputs.y_test_pred}}]] \
            --confusion_plot ${{outputs.confusion_plot}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [162]:
# Register the command's underlying component in the workspace and rebind the
# name to the object returned by the registration call.
registered_source = plotly_confusion_component.component
plotly_confusion_component = ml_client.create_or_update(registered_source)
del registered_source  # keep the notebook namespace unchanged

# Report the name and version of the registered component.
print(
    f"Component {plotly_confusion_component.name} "
    f"with Version {plotly_confusion_component.version} is registered"
)

Component confusion_plot_for_classifier_workbench with Version 2024-09-19-17-47-20-7625953 is registered
