# Entire Pipeline
data-preprocessing >> [train-test-split, train-eval-model] >> element-mlflow-model-registry, update-thresholds

In [7]:
#!/usr/bin/env python
# coding: utf-8
import sys
import os
from pathlib import Path

# Imports for vertex pipeline
from google.cloud import aiplatform
import google_cloud_pipeline_components
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
from kfp.v2 import compiler
from kfp.v2.dsl import (
    Artifact,
    component,
    pipeline,
    Input,
    Output,
    Model,
    Dataset,
    InputPath,
    OutputPath,
)
import kfp.components as comp
import kfp.dsl as dsl
# import c_utils
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime

# Make project-local modules importable: the parent directory, its utils/
# sub-folder, and the two directories above it are appended in that order.
sys.path.extend(
    [
        str(Path(".").absolute().parent),
        f'{Path(".").absolute().parent}/utils',
        str(Path(".").absolute().parent.parent),
        str(Path(".").absolute().parent.parent.parent),
    ]
)

import pipeline_utils
import argparse


ModuleNotFoundError: No module named 'pipeline_utils'

In [10]:
# Resolve the run arguments. pipeline_utils.get_args() is tried first; when it
# is unavailable or fails, fall back to a fixed set of "test" arguments so the
# notebook can still be executed interactively.
try:
    args = pipeline_utils.get_args()
except (Exception, SystemExit):
    # SystemExit is caught explicitly because argparse exits the interpreter on
    # a parse error (presumably what get_args() does inside a kernel -- confirm).
    # Unlike a bare `except:`, KeyboardInterrupt is no longer swallowed.
    parser = argparse.ArgumentParser()
    parser.add_argument("--MODE", required=True, type=str)
    parser.add_argument("--STAGE1_FLAG", required=True, type=str)
    parser.add_argument("--ENSEMBLE_FLAG", required=True, type=str)
    parser.add_argument("--RF_CLF_MODEL_PATH", required=True, type=str)
    parser.add_argument("--LOGISTIC_CLF_MODEL_PATH", required=True, type=str)
    parser.add_argument("--STAGE1_NN_MODEL_PATH", required=True, type=str)
    parser.add_argument("--GNB_MODEL_PATH", required=True, type=str)
    parser.add_argument("--STG1_FEATURE_SELECTOR_MODEL_PATH", required=True, type=str)
    parser.add_argument("--NOSALES_MODEL_PATH", required=True, type=str)
    # Default argv for interactive runs; kept in a plain variable instead of
    # being attached to the `sys` module as a made-up attribute.
    default_argv = [
        "--MODE", "test",
        "--STAGE1_FLAG", "train",
        "--ENSEMBLE_FLAG", "train",
        "--RF_CLF_MODEL_PATH", "",
        "--LOGISTIC_CLF_MODEL_PATH", "",
        "--STAGE1_NN_MODEL_PATH", "",
        "--GNB_MODEL_PATH", "",
        "--STG1_FEATURE_SELECTOR_MODEL_PATH", "",
        "--NOSALES_MODEL_PATH", "",
    ]
    args = parser.parse_args(default_argv)


usage: ipykernel_launcher.py [-h] --MODE MODE --STAGE1_FLAG STAGE1_FLAG
                             --ENSEMBLE_FLAG ENSEMBLE_FLAG --RF_CLF_MODEL_PATH
                             RF_CLF_MODEL_PATH --LOGISTIC_CLF_MODEL_PATH
                             LOGISTIC_CLF_MODEL_PATH --STAGE1_NN_MODEL_PATH
                             STAGE1_NN_MODEL_PATH --GNB_MODEL_PATH
                             GNB_MODEL_PATH --STG1_FEATURE_SELECTOR_MODEL_PATH
                             STG1_FEATURE_SELECTOR_MODEL_PATH
                             --NOSALES_MODEL_PATH NOSALES_MODEL_PATH
ipykernel_launcher.py: error: the following arguments are required: --MODE, --STAGE1_FLAG, --ENSEMBLE_FLAG, --RF_CLF_MODEL_PATH, --LOGISTIC_CLF_MODEL_PATH, --STAGE1_NN_MODEL_PATH, --GNB_MODEL_PATH, --STG1_FEATURE_SELECTOR_MODEL_PATH, --NOSALES_MODEL_PATH


In [11]:
# Load the pipeline settings from settings.yml. Every constant below is read
# from the section of the file selected by its `env_flag` entry.
PARAMS = pipeline_utils.yaml_import('settings.yml')

ENV = PARAMS['env_flag']

PROJECT_ID = PARAMS['envs'][ENV]['PROJECT_ID']
REGION = PARAMS['envs'][ENV]['REGION']
BASE_IMAGE = PARAMS['envs'][ENV]['BASE_IMAGE']
MLFLOW_IMAGE = PARAMS['envs'][ENV]['MLFLOW_IMAGE']

PIPELINE_ROOT = PARAMS['envs'][ENV]['PIPELINE_ROOT']
PIPELINE_NAME = PARAMS['envs'][ENV]['PIPELINE_NAME']
PIPELINE_JSON = PARAMS['envs'][ENV]['PIPELINE_JSON']
# Local path under /tmp derived from the configured JSON file name.
TMP_PIPELINE_JSON = os.path.join("/tmp", PIPELINE_JSON)


TRAINING_TABLE_NAME = PARAMS['envs'][ENV]['TRAINING_TABLE_NAME']
# NOTE(review): the query is capped at 10000 rows; the uncapped variant is kept
# in the trailing comment -- confirm which one is intended before a real run.
TRAINING_DATA_BQ_QUERY = f'select * from {TRAINING_TABLE_NAME} LIMIT 10000' #f'select * from {TRAINING_TABLE_NAME}'  

MLFLOW_EXP_NAME = PARAMS['envs'][ENV]['MLFLOW_EXP_NAME']
MODEL_REGISTRY_NAME = PARAMS['envs'][ENV]['MODEL_REGISTRY_NAME']

SERVICE_ACCOUNT = PARAMS['envs'][ENV]['SERVICE_ACCOUNT']

# Captured once when this cell runs (local time, second resolution).
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
 
# Matches on non-word, non-regular-punctuation characters.
# The hyphen is escaped so it is a literal "-": unescaped, ".-=" inside the
# class is a RANGE from "." to "=", which silently kept "/" and stripped "-".
MATCHER = r"""[^a-zA-Z0-9'"!@#$%\^&*()\[\]{}:;<>?,.\-=_+ ]+""" 

CLUB_THRESH_PATH = PARAMS['envs'][ENV]['CLUB_THRESH_PATH']
LATEST_NOSALES_MODEL_PATH = PARAMS['envs'][ENV]['LATEST_NOSALES_MODEL_PATH']
LATEST_PIPELINE_PATH = PARAMS['envs'][ENV]['LATEST_PIPELINE_PATH']
RUN_PIPELINE = PARAMS['envs'][ENV]['RUN_PIPELINE']
print(f"ENV: {ENV}, \nPROJECT_ID: {PROJECT_ID}, \nBASE_IMAGE: {BASE_IMAGE}, \nMLFLOW_IMAGE: {MLFLOW_IMAGE}, \nPIPELINE_NAME: {PIPELINE_NAME}, \nPIPELINE_JSON: {PIPELINE_JSON}")



ENV: dev, 
PROJECT_ID: wmt-mlp-p-oyi-ds-or-oyi-dsns, 
BASE_IMAGE: gcr.io/wmt-mlp-p-oyi-ds-or-oyi-dsns/oyi-vertex-pipeline-dev:latest, 
MLFLOW_IMAGE: gcr.io/wmt-mlp-p-oyi-ds-or-oyi-dsns/mlflow-image-dev:latest, 
PIPELINE_NAME: oyi-nosales-model-pipeline-dev, 
PIPELINE_JSON: oyi-nosales-model-pipeline-dev.json


In [12]:
# Display the /tmp path derived from PIPELINE_JSON.
TMP_PIPELINE_JSON

'/tmp/oyi-nosales-model-pipeline-dev.json'

In [13]:
# Print the configured "latest" model and pipeline locations.
print(
    f"LATEST_NOSALES_MODEL_PATH: {LATEST_NOSALES_MODEL_PATH}, \n"
    f"LATEST_PIPELINE_PATH: {LATEST_PIPELINE_PATH}"
)

LATEST_NOSALES_MODEL_PATH: gs://oyi-ds-vertex-pipeline-bucket-nonprod/latest_nosales_model_output_dev, 
LATEST_PIPELINE_PATH: gs://oyi-ds-vertex-pipeline-bucket-nonprod/latest_training_pipeline_dev.json


In [None]:
# Display the configured LATEST_PIPELINE_PATH value.
LATEST_PIPELINE_PATH

In [14]:
@component(base_image=BASE_IMAGE)
def data_preprocessing(
    training_data_bq_query_input: str,
    matcher: str,
    project_id: str,
    env: str,
    pipeline_root: str,
    training_data_output: Output[Dataset]):
    """Query the training data from BigQuery, clean it and write it out as CSV.

    Args:
        training_data_bq_query_input: SQL text executed against BigQuery.
        matcher: Regular expression; every match is removed from `item_desc`.
        project_id: Project used to create the BigQuery client.
        env: Passed through to `utils.calculate_all_level_tpr`.
        pipeline_root: Passed through to `utils.calculate_all_level_tpr`.
        training_data_output: Output artifact; the processed frame is written
            as CSV (without the index) to its `path`.

    Raises:
        ValueError: If no rows remain after the initial filter.
    """
    import pandas as pd
    from datetime import timedelta
    import utils
    from google.cloud import bigquery

    client = bigquery.Client(project=project_id)
    data = client.query(training_data_bq_query_input).to_dataframe()

    # Keep non-'C' reports for displayed items with non-negative on-hand qty.
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of `data` (SettingWithCopyWarning).
    nosales_data = data[
        (data.report_type != 'C') &
        (data.display_ind == "Display") &
        (data.oh_qty >= 0)].copy()
    if nosales_data.empty:
        # Without this guard the NaT max date below fails inside strftime
        # with a message that does not point at the real cause.
        raise ValueError("No rows left after filtering the training data; cannot derive the cutoff date.")

    nosales_data["item_desc"] = nosales_data['item_desc'].str.replace(matcher, "", regex=True)
    nosales_data['run_date'] = pd.to_datetime(nosales_data['run_date'])

    # Only keep rows from the 182 days preceding the most recent run_date.
    max_date = nosales_data['run_date'].max()
    cutoff_date = (max_date - timedelta(days=182)).strftime('%Y-%m-%d')
    nosales_data = nosales_data[nosales_data.run_date > cutoff_date].copy()

    # Fold two alternative labels into the canonical "already out for sale"
    # one. replace() is frame-wide: any cell equal to these strings is rewritten.
    nosales_data = nosales_data.replace("No Action Taken, already OFS", "No Action Taken, already out for sale")
    nosales_data = nosales_data.replace('Updated the NOSALES type with scrubber event', "No Action Taken, already out for sale")

    # After sorting, keep the first row per (old_nbr, club_nbr, run_date).
    nosales_data = nosales_data.sort_values(by=['run_date', 'club_nbr', 'item_nbr', 'event_ts'])
    nosales_data = nosales_data.drop_duplicates(['old_nbr', 'club_nbr', 'run_date'], keep='first')

    nosales_ext = utils.calculate_all_level_tpr(df=nosales_data, env=env, pipeline_root=pipeline_root, path='', save=False) #calculate_all_level_tpr(nosales_data, env, pipeline_root, save=True) 
    nosales_ext = nosales_ext.fillna(0)
    nosales_ext.to_csv(training_data_output.path, index=False)
    

In [15]:
@component(base_image=BASE_IMAGE)
def train_test_split(
    nosales_ext_input: Input[Dataset],
    nosales_train_ext_output: Output[Dataset],
    nosales_test_ext_output: Output[Dataset],
    nosales_train_usampled_output: Output[Dataset]
    
):
    """Split the processed data by date and build an undersampled training set.

    Rows dated before (max run_date - 50 days) form the training split; the
    remaining rows form the test split. The undersampled set keeps, per club,
    1/11 (rounded down) of the "No Action Taken, already out for sale" training
    rows plus every other training row, shuffled with a fixed seed.

    Args:
        nosales_ext_input: CSV artifact read from its `path`.
        nosales_train_ext_output: Receives the training split as CSV.
        nosales_test_ext_output: Receives the test split as CSV.
        nosales_train_usampled_output: Receives the undersampled training set as CSV.
    """
    import pandas as pd
    from datetime import timedelta
    
    nosales_ext = pd.read_csv(nosales_ext_input.path)
    nosales_ext['run_date'] = pd.to_datetime(nosales_ext['run_date'])
    split_date = (nosales_ext.run_date.max() - timedelta(days=50)).strftime('%Y-%m-%d')
    # run_date is already datetime here, so it is compared directly.
    nosales_train_ext = nosales_ext[nosales_ext.run_date < split_date].copy()
    nosales_test_ext = nosales_ext[nosales_ext.run_date >= split_date].copy()

    train_rows = nosales_train_ext.shape[0]
    test_rows = nosales_test_ext.shape[0]
    print(f"split_date is {split_date}.")
    # Printed value is the training share of all rows, in percent.
    print("Train/Test ratio:", train_rows*100/(train_rows+test_rows))

    seed = 2019
    undersample_divisor = 11
    no_action_label = "No Action Taken, already out for sale"

    # Per club, keep floor(n / 11) of the majority-label rows.
    grouped = nosales_train_ext[nosales_train_ext.event_note == no_action_label].groupby('club_nbr')
    u1 = grouped.apply(
        lambda club_rows: club_rows.sample(n=int(club_rows.shape[0]/undersample_divisor), random_state=seed)
    ).reset_index(drop=True)

    u2 = nosales_train_ext[nosales_train_ext.event_note != no_action_label]

    nosales_train_usampled = pd.concat([u1, u2])
    # Seed the shuffle too; it was previously unseeded, so row order (and any
    # order-sensitive training downstream) changed from run to run.
    nosales_train_usampled = nosales_train_usampled.sample(frac=1, random_state=seed)
    print(nosales_train_usampled.shape)
    # The counts were computed but discarded before; print them to the job log.
    print(nosales_train_usampled.event_note.value_counts())
    
    nosales_train_ext.to_csv(nosales_train_ext_output.path, index=False)
    nosales_test_ext.to_csv(nosales_test_ext_output.path, index=False)
    nosales_train_usampled.to_csv(nosales_train_usampled_output.path, index=False)
    

In [None]:
# def check1(
#     training_data_bq_query_input: str,
#     matcher: str,
#     project_id: str,
#     env: str,
#     pipeline_root: str):
    
#     import pandas as pd
#     from datetime import timedelta
#     import utils
#     from google.cloud import bigquery

#     client = bigquery.Client(project=project_id)
#     data = client.query(training_data_bq_query_input).to_dataframe()
#     nosales_data = data[
#       (data.report_type!='C') &
#       (data.display_ind == "Display") &
#       (data.oh_qty>=0)]
#     nosales_data["item_desc"] = nosales_data['item_desc'].str.replace(matcher, "", regex=True)
#     nosales_data['run_date'] = pd.to_datetime(nosales_data['run_date'])
#     max_date = nosales_data['run_date'].max()
#     cutoff_date = (max_date - timedelta(days=182)).strftime('%Y-%m-%d')
#     nosales_data = nosales_data[nosales_data.run_date > cutoff_date]
    
#     nosales_data.replace("No Action Taken, already OFS", "No Action Taken, already out for sale", inplace=True)
#     nosales_data.replace('Updated the NOSALES type with scrubber event', "No Action Taken, already out for sale", inplace=True)
#     nosales_data.sort_values(by = ['run_date','club_nbr','item_nbr','event_ts'],inplace = True)
#     nosales_data.drop_duplicates(['old_nbr','club_nbr','run_date'], keep='first',inplace = True)
    
#     nosales_ext = c_utils.calculate_all_level_tpr(df=nosales_data, env=env, pipeline_root=pipeline_root, path='', save=True) #calculate_all_level_tpr(nosales_data, env, pipeline_root, save=True) 
#     nosales_ext.fillna(0, inplace=True)
#     print(nosales_ext)
#     # nosales_ext.to_csv(training_data_output.path, index=False)
    
# check1(training_data_bq_query_input = f'select * from {TRAINING_TABLE_NAME} LIMIT 10000',
#        matcher=MATCHER,
#        project_id=PROJECT_ID, 
#        env=ENV, 
#        pipeline_root=PIPELINE_ROOT)
       
# # def check(nosales_ext_input: Input[Dataset],
# #           _training_data_bq_query_input=TRAINING_DATA_BQ_QUERY: str,
# #           _matcher=MATCHER: str,
# #           _project_id=PROJECT_ID: str,
# #           _env=ENV: str,
# #           _pipeline_root=PIPELINE_ROOT: str):
    
    
# #     data = data_preprocessing(_training_data_bq_query_input,
# #                               _matcher,
# #                               _project_id, 
# #                               _env, 
# #                               _pipeline_root)
# #     train_test_data = train_test_split(nosales_ext_input=data.outputs['training_data_output'])
    
    

    
# # nosales_train_ext[nosales_train_ext.event_note == "No Action Taken, already out for sale"].groupby('club_nbr')
# # train_test_data = train_test_split(nosales_ext_input=data.outputs['training_data_output'])

In [16]:
@component(base_image=BASE_IMAGE)
def train_eval_model(
    nosales_ext_input: Input[Dataset],
    nosales_train_ext_input: Input[Dataset],
    nosales_test_ext_input: Input[Dataset],
    nosales_train_usampled_input: Input[Dataset],
    mode: str,
    stage1_flag: str,
    ensemble_flag: str,
    rf_clf_model_path_input: str,
    logistic_clf_model_path_input: str,
    stage1_nn_model_path_input: str,
    gnb_model_path_input: str,
    stg1_feature_selector_model_path_input: str,
    nosales_model_path_input: str,
    latest_nosales_model_path_input: str,
    project_id: str,
    region: str,
    timestamp: str,
    rf_clf_model_output: Output[Model],
    logistic_clf_model_output: Output[Model],
    stage1_nn_model_output: Output[Model],
    gnb_model_output: Output[Model],
    stg1_feature_selector_model_output: Output[Model],
    nosales_model_output: Output[Model],
    nosales_test_ext_output: Output[Dataset]
):
    """Train or load the stage-1 classifiers and the stacked ensemble, then write the test split.

    Args:
        nosales_ext_input: Full processed data set (CSV); ensemble training data in 'prod' mode.
        nosales_train_ext_input: Training split (CSV); ensemble training data in 'test' mode.
        nosales_test_ext_input: Test split (CSV); passed through `diagnosis_utils.model_diag`
            whenever models are evaluated and written to `nosales_test_ext_output`.
        nosales_train_usampled_input: Undersampled training set (CSV) used to fit stage-1 models.
        mode: 'test' or 'prod'. In 'prod' both flags are forced to 'train' and the
            stage-1 models are not saved.
        stage1_flag: 'train' fits the stage-1 models; any other value loads them
            from the `*_model_path_input` pickles.
        ensemble_flag: 'train' fits the ensemble; any other value loads it from
            `nosales_model_path_input`.
        rf_clf_model_path_input, logistic_clf_model_path_input,
        stage1_nn_model_path_input, gnb_model_path_input,
        stg1_feature_selector_model_path_input, nosales_model_path_input:
            Pickle paths read only when the corresponding flag is not 'train'.
        latest_nosales_model_path_input: Blob URI the final ensemble is uploaded to.
        project_id, region, timestamp: Not used in this body.
        rf_clf_model_output, logistic_clf_model_output, stage1_nn_model_output,
        gnb_model_output, stg1_feature_selector_model_output, nosales_model_output:
            Model artifacts; written when trained, re-pointed at the input paths when loaded.
        nosales_test_ext_output: Receives the (possibly diagnosed) test split as CSV.

    Raises:
        ValueError: If `mode` is neither 'test' nor 'prod'.
    """
    import os 
    import pandas as pd
    from sklearn.pipeline import Pipeline, make_pipeline
    import utils
    import diagnosis_utils
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from keras.wrappers.scikit_learn import KerasClassifier
    from sklearn.cluster import KMeans
    import pickle
    from google.cloud import storage, aiplatform

    # Fail fast: any other mode previously ran the whole stage-1 step and then
    # died with a NameError on the undefined ensemble training data.
    if mode not in ('test', 'prod'):
        raise ValueError(f"mode must be 'test' or 'prod', got {mode!r}")

    def _publish_latest_model(fitted_pipeline):
        """Pickle the ensemble to a local file and upload it to the 'latest' blob."""
        with open('latest_nosales_model_output', 'wb') as file:
            pickle.dump(fitted_pipeline, file)
        blob = storage.blob.Blob.from_string(latest_nosales_model_path_input, client=storage.Client())
        blob.upload_from_filename('latest_nosales_model_output')
    
    nosales_ext = pd.read_csv(nosales_ext_input.path)
    nosales_train_ext = pd.read_csv(nosales_train_ext_input.path)
    nosales_test_ext = pd.read_csv(nosales_test_ext_input.path)
    nosales_train_usampled = pd.read_csv(nosales_train_usampled_input.path)
    
    nosales_ext['run_date'] = pd.to_datetime(nosales_ext['run_date'])
    nosales_train_ext['run_date'] = pd.to_datetime(nosales_train_ext['run_date'])
    nosales_test_ext['run_date'] = pd.to_datetime(nosales_test_ext['run_date'])
    nosales_train_usampled['run_date'] = pd.to_datetime(nosales_train_usampled['run_date'])
    
    tpr_features = [col for col in nosales_train_ext.columns if '_tpr' in col]  # len(tpr_features) : 45

    numerical_features= ['gap_days','exp_scn_in_nosale_period', 'unit_retail','oh_qty','avg_sales_interval']
    numerical_features.extend(tpr_features)
    categorical_features = ['club_nbr','state','cat']

    all_features = numerical_features + categorical_features
    target = ['event_note']

    top_features = list(['oh_qty_log',  'club_nbr_cat_update_loc_tpr_log',  'club_nbr_cat_new_price_sign_tpr_log',  'club_nbr_update_loc_tpr_log',
    'club_nbr_new_price_sign_tpr_log',  'club_nbr_cat_add_to_picklist_tpr_log',  'item_nbr_update_ohq_tpr_log',
    'item_nbr_add_to_picklist_tpr_log',  'club_nbr_add_to_picklist_tpr_log',  'avg_sales_interval_log', 
    'club_nbr_cat_no_action_taken_tpr_log',  'club_nbr_no_action_taken_tpr_log',  'item_nbr_no_action_taken_tpr_log',
    'cat_add_to_picklist_tpr_log',  'unit_retail_log',  'exp_scn_in_nosale_period_log',  'club_nbr_cat_update_ohq_tpr_log', 
    'cat_update_ohq_tpr_log',  'club_nbr_update_ohq_tpr_log',  'state_cat_add_to_picklist_tpr_log',  'reg_cat_update_ohq_tpr_log',
    'state_cat_new_price_sign_tpr_log',  'mkt_cat_new_price_sign_tpr_log',  'mkt_cat_update_ohq_tpr_log', 
    'reg_cat_add_to_picklist_tpr_log',  'state_cat_update_ohq_tpr_log',  'cat_new_price_sign_tpr_log', 
    'mkt_cat_update_loc_tpr_log',  'mkt_update_loc_tpr_log',  'mkt_new_price_sign_tpr_log', 
    'mkt_cat_add_to_picklist_tpr_log',  'mkt_no_action_taken_tpr_log',  'reg_no_action_taken_tpr_log', 
    'cat_no_action_taken_tpr_log',  'mkt_cat_no_action_taken_tpr_log',  'state_cat_update_loc_tpr_log', 
    'gap_days_log',  'reg_new_price_sign_tpr_log',  'mkt_update_ohq_tpr_log',  'state_cat_no_action_taken_tpr_log'])

    # Verbose model output only in test mode.
    verbose_flag = mode == 'test'


    feature_flags = {'kmeans_clustering': False}

    # Weight each class by (largest class count / its own count). value_counts()
    # sorts descending, so the first entry is the largest class; .iloc makes the
    # positional lookup explicit instead of relying on the deprecated integer
    # fallback of Series[...] on a label index.
    event_note_counts = nosales_train_usampled.event_note.value_counts()
    class_weights = dict(event_note_counts.iloc[0] / event_note_counts)


    # pipeline: location-feat
    location_features_tf= Pipeline([
        ('select_loc', utils.DataFrameSelector(['sales_floor_location']))
    ])

    # pipeline: time-feat
    time_features_tf= Pipeline([
        ('select_rundate', utils.DataFrameSelector(['run_date'])),
        ('time_featurize', utils.TimeExtractor())
    ])


    # pipeline: other-catg-feat
    add_cat_tf= Pipeline([
        ('select_other_cat', utils.DataFrameSelector(['club_nbr','cat','state']))
    ])  


    # pipeline: K-means clustering
    kmeans_tf = make_pipeline(
        utils.DataFrameSelector(numerical_features),
        utils.MinMaxScalerTransformer(),
        utils.ModelTransformer(KMeans(2))
    )

    ######################################## Assembling 'Catg' n 'Numeric' Features  #####################################

    # list(catg pipelines)
    list_of_pipelines_for_catg_feat = [
        ('loc_features',location_features_tf),
        ('time_features',time_features_tf),
        ('other_cat_features', add_cat_tf)
    ]
    if feature_flags['kmeans_clustering']:
        list_of_pipelines_for_catg_feat.append(('clusters', kmeans_tf))


    # pipeline: encoding the catg features.
    # NOTE(review): cat_tf / add_all_tf are built but never fitted or used below.
    cat_tf = Pipeline([
        ('combine_cats', utils.ColumnMerge(transformer_list=list_of_pipelines_for_catg_feat)),
        ('cat_featurize', utils.CategoryFeaturizer())
    ])


    # pipeline: numeric_features + log-transformation   
    num_features_tf= Pipeline([
        ('select_num', utils.DataFrameSelector(numerical_features)),
        ('log', utils.LogFeaturizer()),
        ('select_top_features', utils.DataFrameSelector(top_features))
    ])

    # Stage 2 only sees the first 20 entries of top_features.
    stage2_init_feature_num = 20
    num_features_tf2= Pipeline([
        ('select_num', utils.DataFrameSelector(numerical_features)),
        ('log', utils.LogFeaturizer()),
        ('select_top_features', utils.DataFrameSelector(top_features[:stage2_init_feature_num]))
    ])


    # all_feat => catg_feat + numerical_feat
    add_all_tf= utils.ColumnMerge([
        ('num_features',num_features_tf),
        ('cat_features',cat_tf)
    ])

    ############################################################## Final pipelines ######################################################################

    # Lone classifier-pipelines and pre-processors

    #1
    rf_clf = RandomForestClassifier(n_jobs=-1, criterion='gini',n_estimators=50, max_depth=7,max_features='sqrt',
                                    class_weight = class_weights )

    #2
    logistic_clf = LogisticRegression(n_jobs=-1, multi_class='multinomial', solver='lbfgs', max_iter=1000, penalty='l2', class_weight=class_weights)
    

    #3
    gnb = utils.CustomizedGaussianNB()

    #4
    stage1_nn = utils.Stage1_NeuralNetwork(num_classes=5, batch_size=128, epochs=25, verbose=verbose_flag)


    stage1_classifiers = {'rf_clf':rf_clf, 'logistic_clf':logistic_clf, 'stage1_nn':stage1_nn, 'gnb':gnb}

    # 5 matches num_classes above; presumably each stage-1 model contributes one
    # probability per class to the stage-2 input -- confirm against utils.
    stage2_nn_input_dimen = stage2_init_feature_num + len(stage1_classifiers)*5
    stage2_estimator = KerasClassifier(build_fn=utils.stage2_nn, input_dimen=stage2_nn_input_dimen, epochs=5, batch_size=128, verbose=verbose_flag)
    
    ##set flags when in mode: 'test'#####
    # True: if you want to save stage1 models during test. Will automatically set to False when in prod
    s1_save_flag = True

    # Stage 1 models
    ####################################
    # Force flag to be 'train' during prod
    if mode == 'prod':
        s1_save_flag = False
        stage1_flag = 'train'

    stg1_feature_selector = num_features_tf



    if stage1_flag == 'train': 
        print("Training and saving models...")
        X_train = stg1_feature_selector.fit_transform(nosales_train_usampled, nosales_train_usampled.event_note)
        # NOTE(review): 'float128' is not available on every platform/NumPy build.
        X_train = X_train.astype('float128')
        y_train = nosales_train_usampled.event_note
        if s1_save_flag:
            with open(stg1_feature_selector_model_output.path, 'wb') as file:  
                pickle.dump(stg1_feature_selector, file)
    
        # NOTE(review): X_test is not used in this branch (no stage-1 diagnosis
        # is run after training).
        X_test= stg1_feature_selector.transform(nosales_test_ext)
        stage1_model_output_paths = {'rf_clf':rf_clf_model_output.path, 'logistic_clf':logistic_clf_model_output.path,
                               'stage1_nn':stage1_nn_model_output.path, 'gnb':gnb_model_output.path}
        for clf_name, model in stage1_classifiers.items():
            print(clf_name)

            model.fit(X_train, y_train)

            print("\n")
            if s1_save_flag:
                save_path = stage1_model_output_paths[clf_name]
                with open(save_path, 'wb') as file:  
                    pickle.dump(model, file)

    else:
        print("Loading models...")

        # NOTE(security): pickle.load executes code embedded in the file; these
        # paths must only ever point at trusted, pipeline-produced artifacts.
        with open(rf_clf_model_path_input, "rb") as handler:
            rf_clf = pickle.load(handler)
       
        with open(logistic_clf_model_path_input, "rb") as handler:
            logistic_clf = pickle.load(handler)
        
        with open(stage1_nn_model_path_input, "rb") as handler:
            stage1_nn = pickle.load(handler)
        
        with open(gnb_model_path_input, "rb") as handler:
            gnb = pickle.load(handler)
       
        stage1_classifiers = {'rf_clf':rf_clf, 'logistic_clf':logistic_clf, 'stage1_nn':stage1_nn, 'gnb':gnb}
        
        with open(stg1_feature_selector_model_path_input, "rb") as handler:
            stg1_feature_selector = pickle.load(handler)
        X_test= stg1_feature_selector.transform(nosales_test_ext)
        for clf_name, model in stage1_classifiers.items():
            print(clf_name)
            nosales_test_ext = diagnosis_utils.model_diag(nosales_test_ext, model.predict_proba(X_test), model.classes_)
            print("\n")
        
        # Point the output artifacts at the models that were loaded.
        rf_clf_model_output.path = rf_clf_model_path_input
        logistic_clf_model_output.path = logistic_clf_model_path_input
        stage1_nn_model_output.path = stage1_nn_model_path_input
        gnb_model_output.path = gnb_model_path_input
        stg1_feature_selector_model_output.path = stg1_feature_selector_model_path_input
        


    # ensemble model
    #################################################################### 
    if mode == 'test':
        train_x = nosales_train_ext
        train_y = nosales_train_ext.event_note
    else:
        # mode == 'prod' (validated above): always retrain, on the full data set.
        ensemble_flag = 'train'
        train_x = nosales_ext
        train_y = nosales_ext.event_note


    print(mode, ensemble_flag, train_x.shape[0])  

    stg2_feture_selector = num_features_tf2

    if ensemble_flag == 'train': 
        print("Training and saving ensemble...")
        stack_pipeline = Pipeline([
            ('ensemble_classifier', utils.EnsembleClassifier(stg1_feature_selector, list(stage1_classifiers.values()),
                                                     stg2_feture_selector, stage2_estimator)) ])
        stack_pipeline.fit(train_x, train_y)
        with open(nosales_model_output.path, 'wb') as file:  
            pickle.dump(stack_pipeline, file)
        
        _publish_latest_model(stack_pipeline)
        print("Saved the final model")
        
        # The test split is only scored in test mode; in prod it is written
        # out unchanged.
        if mode == 'test':
            nosales_test_ext = diagnosis_utils.model_diag(nosales_test_ext, stack_pipeline.predict_proba(nosales_test_ext), stack_pipeline.classes_)
        

    else:
        print("Loading ensemble...")
        # NOTE(security): see the pickle.load note above -- trusted artifacts only.
        with open(nosales_model_path_input, "rb") as handler:
            stack_pipeline = pickle.load(handler)
        nosales_test_ext = diagnosis_utils.model_diag(nosales_test_ext, stack_pipeline.predict_proba(nosales_test_ext), stack_pipeline.classes_)
        
        nosales_model_output.path = nosales_model_path_input
        _publish_latest_model(stack_pipeline)
       
        
    nosales_test_ext.to_csv(nosales_test_ext_output.path, index = False)


In [1]:
# !gsutil -m cp -r {nosales_model_input_path} {nosales_model_input_path}

CommandException: No URLs matched: {nosales_model_input_path}
CommandException: 1 file/object could not be transferred.


In [2]:
!gsutil ls gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/

gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/executor_output.json
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/gnb_model_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/logistic_clf_model_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_model_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_test_ext_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/

In [2]:
def read_model(
    bucket_name="oyi-ds-vertex-pipeline-bucket-nonprod",
    model_blob="335163835346/oyi-nosales-model-pipeline-dev-20221212232521/train-eval-model_-7056643361955905536/nosales_model_output",
):
    """Download a serialized model from Cloud Storage and deserialize it with joblib.

    Args:
        bucket_name: Name of the bucket holding the model. Defaults to the
            bucket this notebook has always read from.
        model_blob: Object name of the model inside the bucket. Defaults to the
            20221212232521 run; earlier runs used
            "335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/nosales_model_output"
            and
            "335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_model_output".

    Returns:
        The object produced by `joblib.load` on the downloaded bytes.
    """
    # Presumably imported so the unpickler can resolve project classes stored
    # in the model -- confirm before removing.
    import utils
    from google.cloud import storage
    from tempfile import TemporaryFile
    import joblib

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(model_blob)
    with TemporaryFile() as temp_file:
        # Download into the temp file, then rewind so joblib reads from the start.
        blob.download_to_file(temp_file)
        temp_file.seek(0)
        # NOTE(security): deserializing executes code embedded in the file;
        # only point this at trusted, pipeline-produced artifacts.
        model = joblib.load(temp_file)
    return model


In [1]:
import pandas as pd
import pickle
import os
import joblib
from google.cloud import storage
from tempfile import TemporaryFile
# Location of the test-split CSV written by the 20221029205839 pipeline run.
nosales_test_ext_path = "gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/nosales_test_ext_output"
# "gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_test_ext_output"

# Read the CSV and parse run_date into datetimes in a single expression.
nosales_test_ext = pd.read_csv(nosales_test_ext_path).assign(
    run_date=lambda frame: pd.to_datetime(frame['run_date'])
)



In [23]:
# Preview the first three rows of the loaded test split.
nosales_test_ext.head(3)

Unnamed: 0,club_nbr,item_nbr,old_nbr,days_no_sale,cat,subcat,state,unit_retail,oh_qty,avg_sales_interval,...,mkt_cat_update_ohq_tpr,mkt_cat_update_loc_tpr,mkt_cat_new_price_sign_tpr,reg_cat_no_action_taken_tpr,reg_cat_add_to_picklist_tpr,reg_cat_update_ohq_tpr,reg_cat_update_loc_tpr,reg_cat_new_price_sign_tpr,act_bool,rank
0,4718,56815108,980380431,4.0,23,33,UT,14.98,3,2.885714,...,0.263,0.053,0.368,0.538,0.0,0.368,0.028,0.066,0,0.320038
1,4768,43078151,84408,5.0,46,13,NV,10.98,1,4.643548,...,0.333,0.0,0.0,0.375,0.0,0.625,0.0,0.0,1,0.892883
2,4831,56815116,980380434,15.0,23,33,NC,14.98,2,4.809524,...,0.44,0.04,0.04,0.519,0.006,0.394,0.025,0.056,0,0.565246


In [6]:
import utils_1
# Load the model via read_model() and compute thresholds from its class
# probabilities on the test split, using the utils_1 implementation.
model = read_model()
class_probabilities = model.predict_proba(X=nosales_test_ext)
nosales_thresh = utils_1.gen_thresholds(
    df=nosales_test_ext,
    predictions=class_probabilities,
    classes=model.classes_,
)

  extract= X.loc[:,col].str.extract('(\w+)-.*').iloc[:,0]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(resu

{4712: 0.9934,
 4743: 0.0761,
 4770: 0.3251,
 4785: 0.6331,
 4822: 0.9861,
 4857: 0.0552,
 4872: 0.2899,
 4914: 0.9492,
 4969: 0.6791,
 6219: 0.0606,
 6261: 0.109,
 6310: 0.3089,
 6321: 0.1739,
 6331: 0.2995,
 6359: 0.9904,
 6366: 0.4227,
 6380: 0.1984,
 6405: 0.2825,
 6427: 0.2063,
 6461: 0.4435,
 6464: 0.245,
 6540: 0.7946,
 6621: 0.9951,
 6626: 0.407,
 6664: 0.4194,
 6674: 0.4966,
 6686: 0.3632,
 6781: 0.2549,
 8112: 0.2029,
 8158: 0.3059,
 8165: 0.9825,
 8167: 0.6808,
 8174: 0.9895,
 8181: 0.5718,
 8185: 0.177,
 8202: 0.1292,
 8250: 0.4138,
 8288: 0.9796,
 8291: 0.0581,
 8294: 0.3559,
 4721: 0.1275,
 4735: 0.5751,
 4780: 0.4246,
 4802: 0.7121,
 4808: 0.9944,
 4817: 0.3784,
 4836: 0.5111,
 4879: 0.3192,
 4963: 0.424,
 6262: 0.3118,
 6304: 0.5143,
 6328: 0.4408,
 6330: 0.2603,
 6375: 0.2986,
 6432: 0.3478,
 6457: 0.0868,
 6522: 0.0389,
 6524: 0.5413,
 6527: 0.7198,
 6535: 0.5645,
 6571: 0.989,
 6635: 0.9788,
 6979: 0.1187,
 8106: 0.6211,
 8120: 0.6002,
 8130: 0.334,
 8139: 0.1425,
 8

In [25]:
!gsutil ls gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/*

gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/executor_output.json
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/gnb_model_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/logistic_clf_model_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/nosales_model_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/nosales_test_ext_output
gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/

In [None]:
# from google.cloud import storage
# import joblib
# from tempfile import TemporaryFile
# import utils

In [26]:


# Inline version of the steps in read_model(), pointed at the 20221029205839
# run. Relies on `storage`, `joblib` and `TemporaryFile` having been imported
# by an earlier cell.
storage_client = storage.Client()
bucket_name = "oyi-ds-vertex-pipeline-bucket-nonprod"
model_bucket = "335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/nosales_model_output"
# Object names used for earlier runs:
# "335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_model_output"
#"oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_model_output" #'model.joblib'

bucket = storage_client.get_bucket(bucket_name)
# Select the model object inside the bucket (despite its name, `model_bucket`
# holds the object name, not a bucket).
blob = bucket.blob(model_bucket)
with TemporaryFile() as temp_file:
    # Download the blob into the temp file.
    blob.download_to_file(temp_file)
    # Rewind so joblib reads from the beginning of the file.
    temp_file.seek(0)
    # Deserialize with joblib; `model` is used by the cells that follow.
    model=joblib.load(temp_file)

In [None]:
# Select the model object of the 20221028164657 run without downloading it;
# the resulting `blob` is consumed by a later cell.
storage_client = storage.Client()
bucket_name = "oyi-ds-vertex-pipeline-bucket-nonprod"
model_bucket = "335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_model_output"
#"oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_model_output" #'model.joblib'

bucket = storage_client.get_bucket(bucket_name)
# Select the model object inside the bucket.
blob = bucket.blob(model_bucket)


In [None]:
# Display the blob handle selected in the previous cell.
blob

In [None]:
# Download whatever `blob` currently points at and load it as `model`.
# NOTE(review): depends on `blob`, `TemporaryFile` and `joblib` already
# existing in the kernel from earlier cells.
with TemporaryFile() as temp_file:
    # Download the blob into the temp file.
    blob.download_to_file(temp_file)
    # Rewind so the loader reads from the start of the file.
    temp_file.seek(0)
    # Deserialize the model with joblib.
    model=joblib.load(temp_file)
    
# Disabled sanity check: score the test set with the loaded model.
# model.predict_proba(X=nosales_test_ext)

In [28]:
import utils

# Score the test set with the loaded model and let utils.gen_thresholds turn
# the class probabilities into the `nosales_thresh` mapping shown below.
nosales_thresh = utils.gen_thresholds(
    df=nosales_test_ext,
    predictions=model.predict_proba(X=nosales_test_ext),
    classes=model.classes_,
)

In [29]:
# Display the threshold mapping computed in the previous cell.
nosales_thresh

{4712: 0.9157,
 4743: 0.3395,
 4770: 0.3761,
 4785: 0.6456,
 4822: 0.9228,
 4857: 0.2585,
 4872: 0.4438,
 4914: 0.8228,
 4969: 0.6622,
 6219: 0.2129,
 6261: 0.2896,
 6310: 0.4781,
 6321: 0.1958,
 6331: 0.2647,
 6359: 0.9066,
 6366: 0.8305,
 6380: 0.551,
 6405: 0.5337,
 6427: 0.3526,
 6461: 0.5945,
 6464: 0.4866,
 6540: 0.739,
 6621: 0.5936,
 6626: 0.5528,
 6664: 0.8952,
 6674: 0.7282,
 6686: 0.5227,
 6781: 0.8106,
 8112: 0.35,
 8158: 0.392,
 8165: 0.8668,
 8167: 0.6936,
 8174: 0.9065,
 8181: 0.5713,
 8185: 0.3777,
 8202: 0.4114,
 8250: 0.6538,
 8288: 0.8751,
 8291: 0.1884,
 8294: 0.9174,
 4721: 0.2645,
 4735: 0.774,
 4780: 0.5201,
 4802: 0.718,
 4808: 0.9183,
 4817: 0.5413,
 4836: 0.5616,
 4879: 0.5926,
 4963: 0.6249,
 6262: 0.5184,
 6304: 0.478,
 6328: 0.6288,
 6330: 0.7313,
 6375: 0.2456,
 6432: 0.4209,
 6457: 0.3717,
 6522: 0.209,
 6524: 0.5684,
 6527: 0.5282,
 6535: 0.5831,
 6571: 0.8682,
 6635: 0.9064,
 6979: 0.2056,
 8106: 0.849,
 8120: 0.6166,
 8130: 0.6581,
 8139: 0.2887,
 8180

In [None]:
def check3(
    nosales_test_ext_input: Input[Dataset],
    club_thresh_path_input: str,
    club_threshold_output: Output[Dataset]
):
    """Debug copy of ``update_thresholds`` that reads fixed GCS artifacts.

    Recomputes the no-sales thresholds from a pickled model and a test-set CSV
    stored under hard-coded ``gs://`` URIs, then rewrites
    ``club_thresh_chain.csv`` with a refreshed ``nosales_club_thresh`` column.

    Args:
        nosales_test_ext_input: Unused; the test set is read from the
            hard-coded URI below instead.
        club_thresh_path_input: Directory or URI prefix that holds
            ``club_thresh_chain.csv``.
        club_threshold_output: Output artifact; its ``path`` is pointed at the
            rewritten threshold CSV.
    """
    import pandas as pd
    import pickle
    import os
    import utils
    from google.cloud import storage
    from tempfile import TemporaryFile

    # NOTE(review): the test set comes from run 20221028164657 while the model
    # below comes from run 20221029205839 -- confirm that mix is intended.
    nosales_test_ext_path = "gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221028164657/train-eval-model_7275718579889111040/nosales_test_ext_output"
    nosales_test_ext = pd.read_csv(nosales_test_ext_path)
    nosales_test_ext['run_date'] = pd.to_datetime(nosales_test_ext['run_date'])

    nosales_model_input_path = "gs://oyi-ds-vertex-pipeline-bucket-nonprod/335163835346/oyi-nosales-model-pipeline-dev-20221029205839/train-eval-model_4943346554120437760/nosales_model_output"
    # The built-in open() cannot read gs:// URIs, so fetch the object through
    # the storage client into a temp file before unpickling.
    bucket_name, _, blob_name = nosales_model_input_path[len("gs://"):].partition("/")
    model_blob = storage.Client().get_bucket(bucket_name).blob(blob_name)
    with TemporaryFile() as temp_file:
        model_blob.download_to_file(temp_file)
        temp_file.seek(0)  # rewind before reading back
        # SECURITY: pickle.load executes arbitrary code; only load trusted artifacts.
        stack_pipeline = pickle.load(temp_file)

    # `utils` is the module the component and the notebook cells use;
    # `c_utils` is never imported in this notebook.
    nosales_thresh = utils.gen_thresholds(
        df=nosales_test_ext,
        predictions=stack_pipeline.predict_proba(X=nosales_test_ext),
        classes=stack_pipeline.classes_,
    )
    df_nosales_thresh = pd.DataFrame(list(nosales_thresh.items()), columns=['club_nbr', 'nosales_club_thresh'])

    club_threshold_file_path = os.path.join(club_thresh_path_input, "club_thresh_chain.csv")
    # errors='ignore' lets this also work when the CSV has no no-sales column yet.
    df_cancelled_thresh = pd.read_csv(club_threshold_file_path).drop(columns='nosales_club_thresh', errors='ignore')
    # Left merge keeps every existing club row; clubs without a new threshold get NaN.
    all_thresh = df_cancelled_thresh.merge(df_nosales_thresh, how='left', on='club_nbr')
    club_threshold_output.path = club_threshold_file_path
    # Overwrites the threshold CSV in place.
    all_thresh.to_csv(club_threshold_file_path, index=False)

In [None]:
@component(base_image=BASE_IMAGE)
def update_thresholds(
    nosales_test_ext_input: Input[Dataset],
    club_thresh_path_input: str,
    nosales_model_input: Input[Model],
    club_threshold_output: Output[Dataset]
):
    """Refresh the no-sales thresholds stored in ``club_thresh_chain.csv``.

    Scores the test set with the trained model, derives thresholds via
    ``utils.gen_thresholds``, and replaces the ``nosales_club_thresh`` column
    of the threshold CSV while keeping its other columns.

    Args:
        nosales_test_ext_input: Test-set CSV artifact; must contain ``run_date``.
        club_thresh_path_input: Directory or URI prefix that holds
            ``club_thresh_chain.csv``.
        nosales_model_input: Pickled model artifact exposing ``predict_proba``
            and ``classes_``.
        club_threshold_output: Output artifact; its ``path`` is pointed at the
            rewritten threshold CSV.
    """
    import utils
    import pandas as pd
    import pickle
    import os

    nosales_test_ext = pd.read_csv(nosales_test_ext_input.path)
    nosales_test_ext['run_date'] = pd.to_datetime(nosales_test_ext['run_date'])

    with open(nosales_model_input.path, "rb") as handler:
        # SECURITY: pickle.load executes arbitrary code; the artifact must come
        # from a trusted upstream step.
        stack_pipeline = pickle.load(handler)

    nosales_thresh = utils.gen_thresholds(
        df=nosales_test_ext,
        predictions=stack_pipeline.predict_proba(X=nosales_test_ext),
        classes=stack_pipeline.classes_,
    )
    # One row per (club_nbr, threshold) pair of the returned mapping.
    df_nosales_thresh = pd.DataFrame(list(nosales_thresh.items()), columns=['club_nbr', 'nosales_club_thresh'])

    club_threshold_file_path = os.path.join(club_thresh_path_input, "club_thresh_chain.csv")
    # errors='ignore' lets this also work when the CSV has no no-sales column yet.
    df_cancelled_thresh = pd.read_csv(club_threshold_file_path).drop(columns='nosales_club_thresh', errors='ignore')
    # Left merge keeps every existing club row; clubs without a new threshold get NaN.
    all_thresh = df_cancelled_thresh.merge(df_nosales_thresh, how='left', on='club_nbr')
    club_threshold_output.path = club_threshold_file_path
    # Overwrites the threshold CSV in place.
    all_thresh.to_csv(club_threshold_file_path, index=False)

In [18]:
import pandas as pd

# Load the current club threshold table and display it. The stray bare
# `CLUB_THRESH_PATH` expression was dropped: only a cell's last expression is
# rendered, so it had no effect.
df_cancelled_test = pd.read_csv(os.path.join(CLUB_THRESH_PATH, "club_thresh_chain.csv"))
df_cancelled_test

Unnamed: 0,club_nbr,cancelled_club_thresh,nosales_club_thresh
0,4041,0.5448,0.5220
1,4109,0.5617,0.5857
2,4702,0.5333,0.4565
3,4703,0.3857,0.3308
4,4704,0.3947,0.3207
...,...,...,...
594,8295,0.3840,0.4089
595,8296,0.5001,0.5485
596,8297,0.2706,0.4746
597,8298,0.4145,0.4570


In [19]:
# Drop the no-sales column, mirroring what update_thresholds does before the merge.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_cancelled_test = df_cancelled_test.drop(columns = 'nosales_club_thresh', errors = 'ignore')

In [20]:
# Display the table after the no-sales column was dropped.
df_cancelled_test

Unnamed: 0,club_nbr,cancelled_club_thresh
0,4041,0.5448
1,4109,0.5617
2,4702,0.5333
3,4703,0.3857
4,4704,0.3947
...,...,...
594,8295,0.3840
595,8296,0.5001
596,8297,0.2706
597,8298,0.4145


In [None]:
# Ad-hoc call of the thresholds component with the same arguments pipeline() passes.
# NOTE(review): in the visible code `train_eval_data` is only assigned inside
# pipeline(); confirm it exists at notebook scope before running this cell.
update_thresholds(
    nosales_test_ext_input=train_eval_data.outputs['nosales_test_ext_output'],
    club_thresh_path_input=CLUB_THRESH_PATH,
    nosales_model_input=train_eval_data.outputs['nosales_model_output'],
)

In [None]:
@dsl.pipeline(pipeline_root=PIPELINE_ROOT, name=PIPELINE_NAME)
def pipeline():
    # Graph: data_preprocessing -> train_test_split -> train_eval_model, then
    # update_thresholds (data dependency) and the MLflow registry job
    # (explicit .after) both follow train_eval_model.

    # Step 1: build the training data set.
    preprocess_task = data_preprocessing(
        training_data_bq_query_input=TRAINING_DATA_BQ_QUERY,
        matcher=MATCHER,
        project_id=PROJECT_ID,
        env=ENV,
        pipeline_root=PIPELINE_ROOT,
    )
    training_data = preprocess_task.outputs['training_data_output']

    # Step 2: split it into train / test / undersampled-train sets.
    split_task = train_test_split(nosales_ext_input=training_data)
    split_outputs = split_task.outputs

    # Step 3: train and evaluate; run mode, flags and model paths come from the parsed args.
    train_eval_task = train_eval_model(
        nosales_ext_input=training_data,
        nosales_train_ext_input=split_outputs['nosales_train_ext_output'],
        nosales_test_ext_input=split_outputs['nosales_test_ext_output'],
        nosales_train_usampled_input=split_outputs['nosales_train_usampled_output'],
        mode=args.MODE,
        stage1_flag=args.STAGE1_FLAG,
        ensemble_flag=args.ENSEMBLE_FLAG,
        rf_clf_model_path_input=args.RF_CLF_MODEL_PATH,
        logistic_clf_model_path_input=args.LOGISTIC_CLF_MODEL_PATH,
        stage1_nn_model_path_input=args.STAGE1_NN_MODEL_PATH,
        gnb_model_path_input=args.GNB_MODEL_PATH,
        stg1_feature_selector_model_path_input=args.STG1_FEATURE_SELECTOR_MODEL_PATH,
        nosales_model_path_input=args.NOSALES_MODEL_PATH,
        latest_nosales_model_path_input=LATEST_NOSALES_MODEL_PATH,
        project_id=PROJECT_ID,
        region=REGION,
        timestamp=TIMESTAMP,
    )

    # Step 4: refresh the club thresholds from the new model and its test set.
    update_thresholds(
        nosales_test_ext_input=train_eval_task.outputs['nosales_test_ext_output'],
        club_thresh_path_input=CLUB_THRESH_PATH,
        nosales_model_input=train_eval_task.outputs['nosales_model_output'],
    )

    # Step 5: register the model in MLflow through a custom job.
    # The container spec names:
    #   1. the custom image to run (built from
    #      ../../custom_image_builds/model_registry_image_build.ipynb),
    #   2. the command executed inside that image,
    #   3. the arguments handed to that command.
    registry_container_spec = {
        "image_uri": MLFLOW_IMAGE,
        "command": ["python3", "nosales_model_registry.py"],
        "args": [
            "--GCS_MODEL_PATH", LATEST_NOSALES_MODEL_PATH,
            "--MLFLOW_EXP_NAME", MLFLOW_EXP_NAME,
            "--MODEL_REGISTRY_NAME", MODEL_REGISTRY_NAME,
        ],
    }
    registry_worker_pool = {
        "replica_count": 1,
        "machine_spec": {"machine_type": "n1-standard-4", "accelerator_count": 0},
        "container_spec": registry_container_spec,
    }
    # A previous variant used network vpcnet-shared-prod-01 together with
    # reserved_ip_ranges=["vpcnet-shared-prod-01-datafusion-01"].
    registry_task = CustomTrainingJobOp(
        project=PROJECT_ID,
        location=REGION,
        service_account=SERVICE_ACCOUNT,
        network="projects/12856960411/global/networks/vpcnet-private-svc-access-usc1",
        display_name="mlflow-model-registry",
        worker_pool_specs=[registry_worker_pool],
    ).set_display_name("element-mlflow-model-registry")
    # No data dependency links the two steps, so order the job explicitly.
    registry_task.after(train_eval_task)
    
    
#     element_model_registry = CustomTrainingJobOp(
#         project=PROJECT_ID,
#         location=REGION,
#         service_account=SERVICE_ACCOUNT,
#         network="projects/12856960411/global/networks/vpcnet-shared-prod-01",
#         reserved_ip_ranges=["vpcnet-shared-prod-01-datafusion-01"],

#         display_name="mlflow-model-registry",

#         worker_pool_specs=[{
#             "replica_count": 1,
#             "machine_spec": {
#                 "machine_type": "n1-standard-4",
#                 "accelerator_count": 0,
#             },
#             # The below dictionary specifies:
#             #   1. The URI of the custom image to run this CustomTrainingJobOp against
#             #      - this image is built from ../../custom_image_builds/model_registry_image_build.ipynb
#             #   2. The command to run against that image
#             #   3. The arguments to supply to that custom image 
#             "container_spec": {
#                 "image_uri": MLFLOW_IMAGE,
#                 "command": [
#                     "python3", "cancelled_model_registry.py"
#                 ],
#                 "args": [
#                     "--GCS_MODEL_PATH", CANCELLED_MODEL_PATH,
#                     "--MLFLOW_EXP_NAME", MLFLOW_EXP_NAME,
#                     "--MODEL_REGISTRY_NAME", MODEL_REGISTRY_NAME,
#                 ],
#             },
#         }],

#     ).set_display_name("element-mlflow-model-registry")
    
    

In [None]:
# Compile the pipeline function into a pipeline package written to TMP_PIPELINE_JSON.
compiler.Compiler().compile(package_path=TMP_PIPELINE_JSON, pipeline_func=pipeline)

In [None]:
# Create (but do not submit yet) the Vertex AI pipeline job from the compiled
# template; step caching is turned off and no runtime parameters are passed.
job_display_name = f"{PIPELINE_NAME}-{TIMESTAMP}"
pipeline_job = aiplatform.PipelineJob(
    display_name=job_display_name,
    template_path=TMP_PIPELINE_JSON,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
    parameter_values={},
)


In [None]:
# Hand the compiled pipeline file to the project helper together with the
# "latest pipeline" location.
# NOTE(review): presumably this uploads the file to LATEST_PIPELINE_PATH -- confirm in pipeline_utils.
pipeline_utils.store_pipeline(filename=TMP_PIPELINE_JSON, storage_path=LATEST_PIPELINE_PATH)

In [None]:
# Submit the job under the pipeline service account on the private
# service-access network (the same network the registry job uses).
pipeline_job.submit(
    service_account=SERVICE_ACCOUNT,
    network='projects/12856960411/global/networks/vpcnet-private-svc-access-usc1',
)

In [None]:
# Display the pipeline root passed to the pipeline decorator and the job.
PIPELINE_ROOT

In [None]:
# Display the path the compiled pipeline package was written to.
TMP_PIPELINE_JSON

In [None]:
# Display the storage path handed to pipeline_utils.store_pipeline.
LATEST_PIPELINE_PATH

In [None]:
# nosales_thresh = utils.gen_thresholds(df = nosales_test_ext,  predictions = stack_pipeline.predict_proba(X=nosales_test_ext), classes = stack_pipeline.classes_)