In [1]:
# import packages
import numpy as np
from si4automl import (
    construct_pipelines,
    cook_distance,
    definite_regression_imputation,
    extract_features,
    initialize_dataset,
    intersection,
    lasso,
    marginal_screening,
    mean_value_imputation,
    remove_outliers,
    soft_ipod,
    stepwise_feature_selection,
    union,
    PipelineManager,
)


In [2]:
# define the pipeline manager with only one pipeline

def option1() -> PipelineManager:
    """Build a manager holding one pipeline: impute, remove outliers, select features.

    Stages, in the order they are wired:
      1. mean-value imputation of the response,
      2. outlier detection with ``soft_ipod`` (param 0.02) and outlier removal,
      3. marginal screening (param 5) and feature extraction,
      4. stepwise feature selection (param 3) and lasso (param 0.08), both
         applied to the same screened data and merged with ``union``.

    Returns:
        The PipelineManager that ``construct_pipelines`` builds for the
        final ``union`` node.
    """
    features, response = initialize_dataset()
    response = mean_value_imputation(features, response)

    # in this option, outliers are removed before any feature screening
    outliers = soft_ipod(features, response, 0.02)
    features, response = remove_outliers(features, response, outliers)

    screened = marginal_screening(features, response, 5)
    features = extract_features(features, screened)

    # two selectors run on the same inputs; their results are merged with `union`
    by_stepwise = stepwise_feature_selection(features, response, 3)
    by_lasso = lasso(features, response, 0.08)
    return construct_pipelines(output=union(by_stepwise, by_lasso))


def option2() -> PipelineManager:
    """Build a manager holding one pipeline: impute, screen features, remove outliers, select.

    Stages, in the order they are wired:
      1. definite regression imputation of the response,
      2. marginal screening (param 5) and feature extraction,
      3. outlier detection with ``cook_distance`` (param 3.0) and outlier removal,
      4. stepwise feature selection (param 3) and lasso (param 0.08), both
         applied to the same data and merged with ``intersection``.

    Returns:
        The PipelineManager that ``construct_pipelines`` builds for the
        final ``intersection`` node.
    """
    features, response = initialize_dataset()
    response = definite_regression_imputation(features, response)

    # in this option, screening comes first and outliers are detected on the screened features
    screened = marginal_screening(features, response, 5)
    features = extract_features(features, screened)

    outliers = cook_distance(features, response, 3.0)
    features, response = remove_outliers(features, response, outliers)

    # two selectors run on the same inputs; their results are merged with `intersection`
    by_stepwise = stepwise_feature_selection(features, response, 3)
    by_lasso = lasso(features, response, 0.08)
    return construct_pipelines(output=intersection(by_stepwise, by_lasso))


In [3]:
# build each single-pipeline manager and print its structure
# NOTE: `manager` is rebound below, so after this cell it refers to the option2 manager

manager = option1()
print(manager, end="\n\n")  # trailing blank line separates the two printouts

manager = option2()
print(manager)

PipelineManager with 1 Pipelines
Representing Pipeline (index: 0)
start -> mean_value_imputation_0
mean_value_imputation_0 -> soft_ipod_0(param:0.02)
soft_ipod_0(param:0.02) -> outlier_removal_0
outlier_removal_0 -> marginal_screening_0(param:5)
marginal_screening_0(param:5) -> feature_extraction_0
feature_extraction_0 -> lasso_0(param:0.08)
feature_extraction_0 -> stepwise_feature_selection_0(param:3)
stepwise_feature_selection_0(param:3) -> union_features_0
lasso_0(param:0.08) -> union_features_0
union_features_0 -> end

PipelineManager with 1 Pipelines
Representing Pipeline (index: 0)
start -> definite_regression_imputation_1
definite_regression_imputation_1 -> marginal_screening_1(param:5)
marginal_screening_1(param:5) -> feature_extraction_1
feature_extraction_1 -> cook_distance_0(param:3.0)
cook_distance_0(param:3.0) -> outlier_removal_1
outlier_removal_1 -> lasso_1(param:0.08)
outlier_removal_1 -> stepwise_feature_selection_1(param:3)
stepwise_feature_selection_1(param:3) -> int

In [4]:
# apply the single-pipeline manager currently bound to `manager` to an actual dataset
# NOTE: `manager` was last assigned from option2() in the printing cell above,
# so this cell runs the option2 pipeline (not option1)

# n samples, d features; sigma is passed to manager.inference below
# (presumably the noise standard deviation - y is drawn with unit variance; TODO confirm)
n, d, sigma = 100, 10, 1.0

# fixed seed so the synthetic data is the same on every run
rng = np.random.default_rng(0)
X, y = rng.normal(size=(n, d)), rng.normal(size=n)
# choose rng.binomial(n, 0.03) distinct positions of y and mark them as missing
nan_mask = rng.choice(n, rng.binomial(n, 0.03), replace=False)
y[nan_mask] = np.nan

M, O = manager(X, y) # manager is callable
print(f"selected features: {M}")
print(f"detected outliers: {O}")
print()

# inference for each selected feature
M, p_list = manager.inference(X, y, sigma)
for feature, p_value in zip(M, p_list):
    print(f"feature:{feature}, p-value:{p_value:.3f}")

selected features: [0, 2]
detected outliers: [19, 25, 48, 59, 64, 74, 77, 90, 94, 95]

feature:0, p-value:0.139
feature:2, p-value:0.358


In [5]:
# define the pipeline manager with multiple pipelines

def option1_multi() -> PipelineManager:
    """Build a manager holding several pipelines with the option1 stage order.

    The stage order is: mean-value imputation, ``soft_ipod`` outlier
    detection and removal, marginal screening and feature extraction, then
    stepwise feature selection and lasso merged with ``union``. Every
    parameterised stage is given a list of two candidate values instead of a
    single value; in this notebook the resulting manager prints as
    "PipelineManager with 16 Pipelines".

    Returns:
        The PipelineManager that ``construct_pipelines`` builds for the
        final ``union`` node.
    """
    features, response = initialize_dataset()
    response = mean_value_imputation(features, response)

    # candidate params for outlier detection
    outliers = soft_ipod(features, response, [0.02, 0.018])
    features, response = remove_outliers(features, response, outliers)

    # candidate params for screening
    screened = marginal_screening(features, response, [3, 5])
    features = extract_features(features, screened)

    # candidate params for each of the two selectors, merged with `union`
    by_stepwise = stepwise_feature_selection(features, response, [2, 3])
    by_lasso = lasso(features, response, [0.08, 0.12])
    return construct_pipelines(output=union(by_stepwise, by_lasso))


def option2_multi() -> PipelineManager:
    """Build a manager holding several pipelines with the option2 stage order.

    The stage order is: definite regression imputation, marginal screening
    and feature extraction, ``cook_distance`` outlier detection and removal,
    then stepwise feature selection and lasso merged with ``intersection``.
    Every parameterised stage is given a list of two candidate values
    instead of a single value; in this notebook the resulting manager prints
    as "PipelineManager with 16 Pipelines".

    Returns:
        The PipelineManager that ``construct_pipelines`` builds for the
        final ``intersection`` node.
    """
    features, response = initialize_dataset()
    response = definite_regression_imputation(features, response)

    # candidate params for screening
    screened = marginal_screening(features, response, [3, 5])
    features = extract_features(features, screened)

    # candidate params for outlier detection
    outliers = cook_distance(features, response, [2.0, 3.0])
    features, response = remove_outliers(features, response, outliers)

    # candidate params for each of the two selectors, merged with `intersection`
    by_stepwise = stepwise_feature_selection(features, response, [2, 3])
    by_lasso = lasso(features, response, [0.08, 0.12])
    return construct_pipelines(output=intersection(by_stepwise, by_lasso))


In [6]:
# build each multi-pipeline manager and print it
# (the printout shows the pipeline count and the current representing pipeline)

manager_op1_mul = option1_multi()
print(manager_op1_mul, end="\n\n\n")  # two blank lines separate the printouts

manager_op2_mul = option2_multi()
print(manager_op2_mul)


PipelineManager with 16 Pipelines
Representing Pipeline (index: 0)
start -> mean_value_imputation_2
mean_value_imputation_2 -> soft_ipod_1(param:0.02)
soft_ipod_1(param:0.02) -> outlier_removal_2
outlier_removal_2 -> marginal_screening_2(param:3)
marginal_screening_2(param:3) -> feature_extraction_2
feature_extraction_2 -> stepwise_feature_selection_2(param:2)
feature_extraction_2 -> lasso_2(param:0.08)
stepwise_feature_selection_2(param:2) -> union_features_1
lasso_2(param:0.08) -> union_features_1
union_features_1 -> end


PipelineManager with 16 Pipelines
Representing Pipeline (index: 0)
start -> definite_regression_imputation_3
definite_regression_imputation_3 -> marginal_screening_3(param:3)
marginal_screening_3(param:3) -> feature_extraction_3
feature_extraction_3 -> cook_distance_1(param:2.0)
cook_distance_1(param:2.0) -> outlier_removal_3
outlier_removal_3 -> stepwise_feature_selection_3(param:2)
outlier_removal_3 -> lasso_3(param:0.08)
stepwise_feature_selection_3(param:2) -> 

In [7]:
# the pipeline manager can select the best pipeline using cross-validation (2 folds here)
manager_op1_mul.tune(X, y, num_folds=2)

# print the best pipeline (note that the representing index has changed)
print(manager_op1_mul, end="\n\n")

# after tuning, calling the pipeline manager applies the best pipeline
M, O = manager_op1_mul(X, y)
print(f"selected features: {M}", f"detected outliers: {O}", sep="\n", end="\n\n")

# inference for each selected feature, taking the cross-validation process into account
M, p_list = manager_op1_mul.inference(X, y, sigma)
for feature, p_value in zip(M, p_list):
    print(f"feature:{feature}, p-value:{p_value:.3f}")

PipelineManager with 16 Pipelines
Representing Pipeline (index: 1)
start -> mean_value_imputation_2
mean_value_imputation_2 -> soft_ipod_1(param:0.02)
soft_ipod_1(param:0.02) -> outlier_removal_2
outlier_removal_2 -> marginal_screening_2(param:3)
marginal_screening_2(param:3) -> feature_extraction_2
feature_extraction_2 -> stepwise_feature_selection_2(param:2)
feature_extraction_2 -> lasso_2(param:0.12)
stepwise_feature_selection_2(param:2) -> union_features_1
lasso_2(param:0.12) -> union_features_1
union_features_1 -> end

selected features: [0, 2, 9]
detected outliers: [45, 51, 59, 77, 94]

feature:0, p-value:0.741
feature:2, p-value:0.462
feature:9, p-value:0.229


In [8]:
# multiple pipeline managers can be concatenated into one pipeline manager with `|`
# NOTE: this rebinds `manager`, replacing the single-pipeline manager used earlier
manager = manager_op1_mul | manager_op2_mul

# print the concatenated manager; the number of pipelines is 32 (= 16 + 16)
print(manager, end="\n\n")

# the concatenated manager is itself a pipeline manager, so it can also be tuned
manager.tune(X, y, num_folds=2)
# the attribute really is spelled `representeing_index` (the recorded run resolved it) - do not "fix" it
print(f"Tuned Index: {manager.representeing_index} (previously 0)", end="\n\n")

# the concatenated manager can also be called like any other manager
M, O = manager(X, y)
print(f"selected features: {M}", f"detected outliers: {O}", sep="\n", end="\n\n")

# inference for each selected feature, taking the cross-validation process into account
M, p_list = manager.inference(X, y, sigma)
for feature, p_value in zip(M, p_list):
    print(f"feature:{feature}, p-value:{p_value:.3f}")


PipelineManager with 32 Pipelines
Representing Pipeline (index: 0)
start -> mean_value_imputation_2
mean_value_imputation_2 -> soft_ipod_1(param:0.02)
soft_ipod_1(param:0.02) -> outlier_removal_2
outlier_removal_2 -> marginal_screening_2(param:3)
marginal_screening_2(param:3) -> feature_extraction_2
feature_extraction_2 -> stepwise_feature_selection_2(param:2)
feature_extraction_2 -> lasso_2(param:0.08)
stepwise_feature_selection_2(param:2) -> union_features_1
lasso_2(param:0.08) -> union_features_1
union_features_1 -> end

Tuned Index: 18 (previously 0)

selected features: [0, 1, 2]
detected outliers: [19, 22, 29, 36, 46, 48, 59, 74, 77, 78, 90, 94, 95]

feature:0, p-value:0.563
feature:1, p-value:0.943
feature:2, p-value:0.671
