In [None]:

!pip install ibm-watson-machine-learning | tail -n 1
!pip install -U autoai-libs==1.12.9 | tail -n 1
!pip install -U scikit-learn==0.23.2 | tail -n 1
!pip install -U snapml==1.7.4 | tail -n 1

In [2]:
experiment_metadata = dict(
    prediction_type='binary',
    prediction_column='Label',
    holdout_size=0.1,
    scoring='accuracy',
    csv_separator=',',
    random_state=33,
    max_number_of_estimators=4,
    positive_label=1,
    drop_duplicates=True
)

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
df=pd.read_csv('OUT.1.csv')
#df.drop_duplicates(inplace=True)
X = df.drop([experiment_metadata['prediction_column']], axis=1).values
y = df[experiment_metadata['prediction_column']].values

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=experiment_metadata['holdout_size'],
                                                    stratify=y, random_state=experiment_metadata['random_state'])

In [4]:
from autoai_libs.transformers.exportable import NumpyColumnSelector
from autoai_libs.transformers.exportable import CompressStrings
from autoai_libs.transformers.exportable import NumpyReplaceMissingValues
from autoai_libs.transformers.exportable import NumpyReplaceUnknownValues
from autoai_libs.transformers.exportable import boolean2float
from autoai_libs.transformers.exportable import CatImputer
from autoai_libs.transformers.exportable import CatEncoder
import numpy as np
from autoai_libs.transformers.exportable import float32_transform
from sklearn.pipeline import make_pipeline
from autoai_libs.transformers.exportable import FloatStr2Float
from autoai_libs.transformers.exportable import NumImputer
from autoai_libs.transformers.exportable import OptStandardScaler
from sklearn.pipeline import make_union
from autoai_libs.transformers.exportable import NumpyPermuteArray
from snapml import SnapSVMClassifier

numpy_column_selector_0 = NumpyColumnSelector(columns=[2, 10, 14, 22])
compress_strings = CompressStrings(
    compress_type="hash",
    dtypes_list=[
        "float_int_num", "float_int_num", "float_int_num", "float_int_num",
    ],
    missing_values_reference_list=["", "-", "?", float("nan")],
    misslist_list=[[], [], [float("nan")], [float("nan")]],
)
numpy_replace_missing_values_0 = NumpyReplaceMissingValues(
    missing_values=[float("nan")], filling_values=100001
)
numpy_replace_unknown_values = NumpyReplaceUnknownValues(
    filling_values=100001,
    filling_values_list=[100001, 100001, 100001, 100001],
    missing_values_reference_list=["", "-", "?", float("nan")],
)
cat_imputer = CatImputer(
    strategy="most_frequent",
    missing_values=100001,
    sklearn_version_family="23",
)
cat_encoder = CatEncoder(
    encoding="ordinal",
    categories="auto",
    dtype=np.float64,
    handle_unknown="error",
    sklearn_version_family="23",
)
pipeline_0 = make_pipeline(
    numpy_column_selector_0,
    compress_strings,
    numpy_replace_missing_values_0,
    numpy_replace_unknown_values,
    boolean2float(),
    cat_imputer,
    cat_encoder,
    float32_transform(),
)
numpy_column_selector_1 = NumpyColumnSelector(
    columns=[
        0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23,
        24, 25, 26, 27, 28,
    ]
)
float_str2_float = FloatStr2Float(
    dtypes_list=[
        "float_int_num", "float_int_num", "float_num", "float_num",
        "float_num", "float_num", "float_num", "float_num", "float_int_num",
        "float_num", "float_int_num", "float_int_num", "float_num",
        "float_num", "float_num", "float_num", "float_num", "float_num",
        "float_int_num", "float_num", "float_num", "float_num", "float_num",
        "float_num", "float_num",
    ],
    missing_values_reference_list=[float("nan")],
)
numpy_replace_missing_values_1 = NumpyReplaceMissingValues(
    missing_values=[float("nan")], filling_values=float("nan")
)
num_imputer = NumImputer(strategy="median", missing_values=float("nan"))
opt_standard_scaler = OptStandardScaler(
    num_scaler_copy=None,
    num_scaler_with_mean=None,
    num_scaler_with_std=None,
    use_scaler_flag=False,
)
pipeline_1 = make_pipeline(
    numpy_column_selector_1,
    float_str2_float,
    numpy_replace_missing_values_1,
    num_imputer,
    opt_standard_scaler,
    float32_transform(),
)
union = make_union(pipeline_0, pipeline_1)
numpy_permute_array = NumpyPermuteArray(
    axis=0,
    permutation_indices=[
        2, 10, 14, 22, 0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18,
        19, 20, 21, 23, 24, 25, 26, 27, 28,
    ],
)
snap_svm_classifier = SnapSVMClassifier(
    class_weight="balanced",
    device_ids=np.array([0], dtype=np.uint32),
    random_state=33,
)

In [5]:

pipeline = make_pipeline(union, numpy_permute_array, snap_svm_classifier)

In [6]:
from sklearn.metrics import get_scorer

scorer = get_scorer(experiment_metadata['scoring'])


In [7]:
pipeline.fit(train_X,train_y)



Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('numpycolumnselector',
                                                                  NumpyColumnSelector(columns=[2,
                                                                                               10,
                                                                                               14,
                                                                                               22])),
                                                                 ('compressstrings',
                                                                  CompressStrings(compress_type='hash',
                                                                                  dtypes_list=['float_int_num',
                                                                                               'float_int_num',
    

In [8]:
score = scorer(pipeline, test_X, test_y)
print(score)

0.9473684210526315
