# [1] Import Packages

### [1.1] Install Necessary Packages

In [1]:
# %pip install azureml-dataset-runtime
# %pip install -U azureml-fsspec
# %pip install azureml.pipeline
# %pip install azureml-explain-model
# %pip install azureml-interpret
# %pip install azureml.fsspec

### [1.2] Import Packages

In [2]:
%%time
# Import required Azure Packages
from azureml.core import Workspace, Dataset, Datastore, Environment, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter, TrainingOutput
from azureml.pipeline.steps import PythonScriptStep, AutoMLStep
from azureml.data import OutputFileDatasetConfig
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule

CPU times: user 4.46 ms, sys: 4.01 ms, total: 8.47 ms
Wall time: 29.2 ms


In [3]:
import os
import pandas as pd
import datetime as dt
from dateutil.relativedelta import relativedelta
from pathlib import Path

# [2] Global Variables

In [4]:
# Read the clock once so the day stamp and the full timestamp always agree,
# even when the cell happens to run across midnight.
_run_started_at = dt.datetime.now()

# Day stamp, formatted dd_mm_YYYY
date_today = _run_started_at.strftime('%d_%m_%Y')
# Second-resolution stamp, formatted dd_mm_YYYY_HH_MM_SS (embedded in output paths and the pipeline name)
datetime_now = _run_started_at.strftime('%d_%m_%Y_%H_%M_%S')

print('Print Current Date:', date_today)
print('Experiment folder will be created with name:', datetime_now)
# print(f'For production we will run model on previous month i.e. month: {dt.date.today().month}, to predict results for next 2 months.')

Print Current Date: 20_09_2023
Experiment folder will be created with name: 20_09_2023_07_55_00


### [2.1] Arguments for Pipeline

In [5]:
# User name - specify your user name. It is used as a prefix/path component for the per-user
# blob directories, the data asset names, the model name, the experiment name and the cluster name.
# Make sure to keep this unique & short
user_name = 'master'

In [6]:
# Model name for deployment of model in azure. It will auto-increment the version every time it runs.
# This value is also the default of the "model_name" pipeline parameter.
model_name = f'{user_name}_ChurnPrediction_Model'

In [7]:
# new_data_flag separates the model training pipeline (mtp) & new data pipeline (ndp).
# new_data_flag is False: Run the pipeline for training the model on train dataset:  model training pipeline (mtp)
# new_data_flag is True: Run the pipeline for new dataset: new data pipeline (ndp).
new_data_flag = False 

# flag only when new_data_flag = True - for NDP
# schedule_pipeline_flag = True, to trigger the pipeline when new data is added to data_path_to_monitor path
schedule_pipeline_flag = False

# flag only when new_data_flag = True - for NDP
# If latest_data_ndp_flag = True => Run the NDP pipeline on the latest month data, this will ignore variables observation_month_number & observation_year.
# If latest_data_ndp_flag = False => Run the NDP pipeline on specific month data by setting observation_month_number & observation_year.
latest_data_ndp_flag = False

# flag only when new_data_flag = True - for NDP
# Specify the model version to use for predicting on the new data set. To use the latest trained model, use model_version_ndp = -1
model_version_ndp = -1
# model_version_ndp = 18

In [8]:
# Use monitoring_flag = True to monitor the model's performance by comparing actual vs predicted churn flag.
# The value is forwarded to the fetch-latest-data step as --monitoring_flag.
monitoring_flag = True

In [9]:
# Observation Month: Month for which we will run the model and it will do the prediction for next 2 months
# Define the observation month for data preparation and modeling. Historical features will be created for this observation month.
# Use month number for: January: 1, February: 2, March: 3, April: 4, May: 5, June: 6, 
# July: 7, August: 8, September: 9, October: 10, November: 11, December: 12
observation_month_number = 10
observation_year = 2022

# To run model on previous month - for production
# NOTE(review): the two commented lines below evaluate to the *current* month/year, not the previous month - confirm intent before enabling
# observation_month_number = dt.date.today().month 
# observation_year = dt.date.today().year

# Define the number of months to be considered for historical feature creation. 
# Historical features will be created from last n months till observation month. 
# Ex: If observation month is July(7), then it will create last 6 months till Feb(2) (observation month is included).
historical_months = 6

In [10]:
# Note: For the first time, set reuse_sample_flag flag as False

# Use sample_flag = True for testing, to run on a sample of num_sample_customer_ids customer ids;
# if sample_flag is False then it will run on the entire data
num_sample_customer_ids = 10000
sample_flag = False

# For the first time, make reuse_sample_flag flag as False
# Set reuse_sample_flag to True for re-using the last created sample (if it is False, it will overwrite the last sample created)
reuse_sample_flag = False

In [11]:
# Parameters for Train Test Split Step
random_state = 42
test_size = 0.1

# Parameters for outlier treatment step
outliers_method = 'iForest'
outliers_threshold = 0.05

# Parameters for multiCollinearity treatment step
multiCollinearity_method = 'pearson'
multiCollinearity_threshold = 0.95

# Parameters for Feature selection step
# feature selection method
featureSelection_method = 'lightGBM'
# Select features with score greater than or equal to featureSelection_threshold
# featureSelection_threshold = 0 means select features based on featureSelection_percentage
featureSelection_threshold = 0 #0.001
# Select the top x percentage of features, where x/100 = featureSelection_percentage
# featureSelection_threshold will eliminate some features first, and then
# featureSelection_percentage is applied to the features that remain
# featureSelection_percentage = 1 means select features based on featureSelection_threshold
featureSelection_percentage = 1

# Parameters for normalization step
# normalization_method = 'standard'
normalization_method = 'skip'

In [12]:
# AutoML Step Pipeline Parameters (hyperparameter tuning parameters):
# primary_metric: The metric that Automated Machine Learning will optimize for model selection. 
# Automated Machine Learning collects more metrics than it can optimize. 
# You can use get_primary_metrics to get a list of valid metrics for your given task. 
# For more information on how metrics are calculated, 
# see https://docs.microsoft.com/azure/machine-learning/how-to-configure-auto-train#primary-metric.
# The primary metric has to be one of the metrics that Azure offers; see the link above for the list
primary_metric = 'norm_macro_recall'

# If user_selected_metric = '', best model is selected based on primary_metric and if user_selected_metric = 'f1_score_macro' 
# then azure will run the automlstep with primary_metric first and then we select the best model based on the user_selected_metric.
# For list of user_selected_metric refer link 
# https://learn.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml?view=azureml-api-2
# Average precision summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, 
# with the increase in recall from the previous threshold used as the weight.
# user_selected_metric = 'f1_score_macro'
# user_selected_metric = 'average_precision_score_macro'
user_selected_metric = ''

# iterations: The total number of different algorithm and parameter combinations to test during an automated ML experiment. 
# If not specified, the default is 1000 iterations.
# More iterations => more different model algorithms and more different hyperparameters to be tried.
model_iterations = 40

# Maximum amount of time in hours that all iterations combined can take before the experiment terminates. 
# Can be a decimal value like 0.25 representing 15 minutes. If not specified, the default experiment timeout is 6 days. 
# To specify a timeout less than or equal to 1 hour, make sure your dataset's size is not greater than 10,000,000 (rows times column) 
# or an error results.
experiment_timeout_hours = 3

In [13]:
# Pipeline step will use previously stored run details if no change is made in the data if allow_reuse_step = True
# You can specify allow_reuse=False as a parameter of the Step. When allow_reuse is set to False, 
# the step run won’t be reused, and a new run will always be generated for the step during pipeline execution. 
# Default behavior of Pipelines is to set allow_reuse=True for steps.
allow_reuse_step = True

# Pipeline steps may reuse previously stored outputs (when nothing changed) only if regenerate_outputs = False
# If regenerate_outputs is set to True for the Experiment.Submit() call, a new submit will always force generation of all step outputs, 
# and disallow data reuse for any step of this run. Once this run is complete, however, subsequent runs may reuse the results of this run. 
# Default behavior of Pipelines is to set regenerate_outputs=False for experiment submit calls.
regenerate_outputs = False

In [14]:
# # Python scripts folder (.py files folder location)
# script_data_transformation_folder = '../DataPreparation/DataTransformation/'
# script_data_preprocess_folder = '../DataPreparation/DataPreprocess/'
# script_modeling_folder = '../Modeling/'

# # Input data raw files (path inside the blob container)
# # input_full_raw_data_path = 'InputData/FullRawFiles'
# input_raw_data_path = 'InputData/InputRawFiles'

# # Input mapping files (path inside the blob container)
# input_mapping_data_path = 'InputData/MappingFiles'

# # For model training pipeline (MTP)
# if not new_data_flag:
#     # Sample file path
#     input_sample_data_path = f'Users/{user_name}/MTP/InputData/SampleFiles'

#     # Intermediate output files (path inside the blob container)
#     output_data_intermediate_path = f'Users/{user_name}/MTP/Outputs/OutputDataset/{datetime_now}'

#     # Output path for model info - latest run
#     output_data_other_file_path = f'Users/{user_name}/MTP/Outputs/OutputOtherFiles'

# # For new data pipeline (NDP)
# if new_data_flag:
#     # Sample file path
#     input_sample_data_path = f'Users/{user_name}/NDP/InputData/SampleFiles'

#     # Intermediate output files (path inside the blob container)
#     output_data_intermediate_path = f'Users/{user_name}/NDP/Outputs/OutputDataset/{datetime_now}'

#     # Output path for model info - latest run
#     output_data_other_file_path = f'Users/{user_name}/NDP/Outputs/OutputOtherFiles'

# # Path to monitor for changes if schedule_pipeline_flag = True
# data_path_to_monitor = f'{input_raw_data_path}/transaction_data/'

# # Config files - for scheduler
# # This file keep tracks of the pipeline parameters. Modify the parameters to alter pipeline output.
# scheduler_config_file='../Configurations/schedule_ndp_config.yaml'

In [15]:
# Local folders that hold the .py scripts used by the pipeline steps
script_data_transformation_folder = '../DataPreparation/DataTransformation/'
script_data_preprocess_folder = '../DataPreparation/DataPreprocess/'
script_modeling_folder = '../Modeling/'

# Raw input files (path inside the blob container); the new data pipeline reads from its own folder
# input_full_raw_data_path = 'InputData/FullRawFiles'
input_raw_data_path = 'InputData/InputRawFiles_VM' if new_data_flag else 'InputData/InputRawFiles'

# Mapping files (path inside the blob container)
input_mapping_data_path = 'InputData/MappingFiles'

# Every pipeline flavour keeps its files under its own per-user root:
# NDP for the new data pipeline, MTP for the model training pipeline
_pipeline_folder = 'NDP' if new_data_flag else 'MTP'
_user_pipeline_root = f'Users/{user_name}/{_pipeline_folder}'

# Sample files
input_sample_data_path = f'{_user_pipeline_root}/InputData/SampleFiles'
# Intermediate output files of this run (timestamped folder)
output_data_intermediate_path = f'{_user_pipeline_root}/Outputs/OutputDataset/{datetime_now}'
# Model info of the latest run
output_data_other_file_path = f'{_user_pipeline_root}/Outputs/OutputOtherFiles'

# Path to monitor for changes if schedule_pipeline_flag = True
data_path_to_monitor = f'{input_raw_data_path}/transaction_data/'

# Scheduler config file: keeps track of the pipeline parameters; edit it to alter the pipeline output
scheduler_config_file = '../Configurations/schedule_ndp_config.yaml'

In [16]:
# Name of the CPU cluster (created automatically later on when it does not exist yet);
# the training and new-data pipelines each get their own cluster
_cluster_kind = 'ndp' if new_data_flag else 'mtp'
amlcompute_cluster_name = f"{user_name}-cpu-cluster-{_cluster_kind}"

# Autoscaling bounds of the cluster
num_min_nodes = 0
num_max_nodes = 10

# Number of parallel processes used by the parallel functions
njobs = 2 * num_max_nodes - 1

In [17]:
# Experiment name - this will be visible in the Azure UI
# environment_name = f'{user_name}_env'

# The name encodes which pipeline runs (model training vs new data) and
# gets a '_test' suffix when the run works on sampled data
_pipeline_label = 'new_data_pipeline' if new_data_flag else 'model_training_pipeline'
_sample_suffix = '_test' if sample_flag else ''
experiment_name = f'{user_name}_{_pipeline_label}{_sample_suffix}'

print('Experiment Name:', experiment_name)

Experiment Name: master_model_training_pipeline


In [18]:
# Suffix appended to every data asset name created by this notebook.
# Ex: the transaction_data asset is registered as <user_name>_transaction_data<suffix>.
# '_mtp' marks assets of the model training pipeline, e.g. username_transaction_data_mtp
# '_ndp' marks assets of the new data pipeline,       e.g. username_transaction_data_ndp
dataAssetName_suffix = '_ndp' if new_data_flag else '_mtp'

In [19]:
# schedule_pipeline_flag and latest_data_ndp_flag are NDP-only switches,
# so both are forced off whenever the model training pipeline is selected
if not new_data_flag:
    schedule_pipeline_flag = latest_data_ndp_flag = False

### [2.2] Azure ML Configurations

In [20]:
# Loads the workspace info from the workspace_config.json file located in ../Configurations/
# `ws` is the workspace handle used by the datastore, dataset, compute and step cells below
ws = Workspace.from_config(path='../Configurations/', _file_name='workspace_config.json')

### [2.3] Azure Blob Storage Configuration

In [21]:
# Blob storage configuration
blob_storage_name = 'azureconstorage3630ce4c4'
blob_container = 'azureml'

# The storage account key is a secret and must never be stored in the notebook
# (notebooks are shared and committed together with their contents).
# It is read from the AZURE_BLOB_ACCOUNT_KEY environment variable instead.
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be rotated in the Azure portal.
blob_key = os.environ.get('AZURE_BLOB_ACCOUNT_KEY')
if not blob_key:
    raise EnvironmentError(
        'AZURE_BLOB_ACCOUNT_KEY is not set. Export the storage account key of '
        f'"{blob_storage_name}" in that environment variable before running this notebook.'
    )

# Datastore name: We can see dataAssets created in this, this will also refer to the default blob storage specified
# datastore_name = f"{user_name}_blob_datastore"
datastore_name = "bidv_blob_datastore"

### [2.4] Other Variables

In [22]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", 50)
# pd.options.display.float_format = '{:.3f}'.format

# [3] Reference Data

### [3.1] Create and Register DataStore

In [23]:
%%time
# Register the blob container as a datastore of the workspace
blob_data_store = Datastore.register_azure_blob_container(
    workspace = ws, 
    datastore_name = datastore_name,
    account_name = blob_storage_name,
    container_name = blob_container,
    account_key = blob_key
)

# Make it the workspace default, then read the default back from the workspace
# so that default_store is what the workspace actually reports
ws.set_default_datastore(name = datastore_name)
default_store = ws.get_default_datastore()

# Display the name of the default datastore; read the attribute directly
# instead of eval()-ing the object's string representation
default_store.name

CPU times: user 80.1 ms, sys: 14.5 ms, total: 94.6 ms
Wall time: 811 ms


'bidv_blob_datastore'

### [3.2] Create and Register Datasets

#### [3.2.1] Input Mapping Files

In [24]:
%%time
# Mapping csv files to expose as data assets: notebook variable name -> csv file name (csv files only)
map_dataAsset_mappingFile = {
    #'mapping_data_mstatus' : 'mapping_mstatus.csv',
    'mapping_data_occupation' : 'mapping_occupation.csv',
    'mapping_data_translation' : 'mapping_vn_en.csv',
    'mapping_data_region' : 'mapping_region.csv'
}

# Reference each data asset when it is already registered, otherwise create and register it.
# Every dataset is published as a notebook-level variable named after the dictionary key,
# because the step cells below refer to mapping_data_region, mapping_data_occupation, ... by name.
for dataAssetName, mappingFileName in map_dataAsset_mappingFile.items():
    dataAssetNameOnAzure = f'{user_name}_{dataAssetName}{dataAssetName_suffix}'
    csv_path = [(blob_data_store, f"{input_mapping_data_path}/{mappingFileName}")]
    try:
        mapping_dataset = Dataset.get_by_name(ws, dataAssetNameOnAzure)
        print(f'Referencing {dataAssetNameOnAzure} data aset on azure. Use {dataAssetName} in notebook to reference it.')
    except Exception as lookup_error:
        # `except Exception` (not a bare except) so that KeyboardInterrupt / SystemExit still
        # stop the cell; the failure type is reported instead of being swallowed silently.
        print(f'{dataAssetNameOnAzure} could not be loaded ({type(lookup_error).__name__}); registering it from {mappingFileName}.')
        mapping_dataset = Dataset.Tabular.from_delimited_files(path=csv_path).register(
            ws, dataAssetNameOnAzure, create_new_version=True, 
            tags={'step':'input', 'data': 'mapping', 'sample_data':'false' ,'temporary_data': 'False'}
        )
        print(f'Created {dataAssetNameOnAzure} data aset on azure. Use {dataAssetName} here to reference it.')
    globals()[dataAssetName] = mapping_dataset

Referencing master_mapping_data_occupation_mtp data aset on azure. Use mapping_data_occupation in notebook to reference it.
Referencing master_mapping_data_translation_mtp data aset on azure. Use mapping_data_translation in notebook to reference it.
Referencing master_mapping_data_region_mtp data aset on azure. Use mapping_data_region in notebook to reference it.
CPU times: user 262 ms, sys: 18.1 ms, total: 280 ms
Wall time: 2.6 s


# [4] Setup Environment & Cluster

### [4.1] Create & Register Custom Environment

In [25]:
# # Create new enviroment
# myenv = Environment(environment_name)

# # Use conda_dependencies.yml to create a conda environment in the docker image
# myenv.python.user_managed_dependencies = False

# # Specify CondaDependencies
# myenv_dependencies = CondaDependencies.create(
#     conda_packages=['pandas', 'scikit-learn'],
#     pip_packages= ['azureml-sdk[automl]', 'pyarrow', 'ipywidgets']
# )
# myenv.python.conda_dependencies = myenv_dependencies

# # Register enviroment to workspace
# myenv.register(ws)

# print('Environment Created & Registerd to workspace.')

### [4.2] Create new or use an existing compute cluster

In [26]:
# Reuse the compute cluster when it already exists, otherwise provision it
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print(f'Found existing cluster {amlcompute_cluster_name}, will use that for pipeline.')
except ComputeTargetException:
    # Provision with the autoscaling bounds configured in the global variables
    # (instead of a hard-coded single node), matching the update applied in the run-config cell
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        min_nodes=num_min_nodes,
        max_nodes=num_max_nodes,
        idle_seconds_before_scaledown=1800
    )
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
    print('Compute cluster created with name:', amlcompute_cluster_name)
    
# Block until the cluster reports its provisioning status
aml_compute.wait_for_completion(show_output=True)

Found existing cluster master-cpu-cluster-mtp, will use that for pipeline.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### [4.3] Define Run Config

In [27]:
# Run configuration shared by the pipeline steps, targeting the aml_compute cluster
aml_run_config = RunConfiguration()
aml_run_config.target = aml_compute

# Apply the autoscaling bounds to the cluster
aml_compute.update(
    min_nodes=num_min_nodes,
    max_nodes=num_max_nodes,
    idle_seconds_before_scaledown=1800
)

# Use myenv environment
# aml_compute.environment = myenv

# Run the steps inside Docker
docker_config = DockerConfiguration(use_docker=True)
aml_run_config.docker = docker_config

# Let Azure ML build the conda environment in the docker image from the dependencies below
aml_run_config.environment.python.user_managed_dependencies = False

# Packages installed into that environment
_conda_packages = ['pandas', 'scikit-learn']
_pip_packages = [
    'azureml-sdk[automl]==1.52.0', 'azureml-fsspec', 'pyarrow>=0.16.0', 'ipywidgets', 'xgboost', 'joblib'
]
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=_conda_packages,
    pip_packages=_pip_packages
)

print('Run configuration created.')

Run configuration created.


# [5] Pipeline

Define Default Pipeline Input Parameters

In [28]:
# Dynamic pipeline parameters - used when the pipeline is scheduled.
# The values below are only the defaults; presumably they are overridden at run time through
# the scheduler config file (see scheduler_config_file) - verify against the scheduling cell.

# Defaults shared by both pipeline flavours
_shared_parameter_defaults = {
    "observation_year": observation_year,
    "observation_month_number" : observation_month_number,
    "historical_months" : historical_months,
}

# Model training pipeline: latest_data_ndp_flag is NDP-only, so it is always False here
pipeline_parameters_mtp = {
    **_shared_parameter_defaults,
    "latest_data_ndp_flag" : False,
}

# New data pipeline: additionally carries the model version used for scoring
pipeline_parameters_ndp = {
    "model_version_ndp" : model_version_ndp,
    **_shared_parameter_defaults,
    "latest_data_ndp_flag" : latest_data_ndp_flag,
}

pipeline_parameters = pipeline_parameters_ndp if new_data_flag else pipeline_parameters_mtp

# Expose one PipelineParameter per entry as a notebook variable named pp_<parameter name>
for key, value in pipeline_parameters.items():
    globals()[f'pp_{key}'] = PipelineParameter(name=key, default_value=value)

pp_model_name = PipelineParameter("model_name", default_value=model_name)

In [29]:
# Pipeline name & tags.
# Only three values depend on the pipeline flavour; every other tag is shared.
if new_data_flag:
    # NDP - new data pipeline
    pipeline_name = f'{user_name}_Pipeline-DataPrep-NewData_{datetime_now}'
    _pipeline_type_tag = 'New Data Pipeline (NDP)'
    # pp_model_version_ndp only exists when the NDP parameters were created
    _model_version_tag = pp_model_version_ndp
else:
    # MTP - model training pipeline
    pipeline_name = f'{user_name}_Pipeline-DataPrep-ModelTrainTest_{datetime_now}'
    _pipeline_type_tag = 'Model Training Pipeline (MTP)'
    _model_version_tag = model_version_ndp

# Snapshot of the run settings, attached as tags to the data assets registered by the steps
pipeline_tags = {
    'pipeline type' : _pipeline_type_tag,
    'model_version' : _model_version_tag,
    'pipeline_name' : pipeline_name,
    'username' : user_name,
    'output_data_intermediate_path' : output_data_intermediate_path,
    'new_data_flag' : new_data_flag,
    'schedule_pipeline_flag' : schedule_pipeline_flag,
    'latest_data_ndp_flag' : latest_data_ndp_flag,
    'data_path_to_monitor' : data_path_to_monitor,
    'sample_flag' : sample_flag,
    'sample_size' : num_sample_customer_ids,
    'reuse_sample_flag' : reuse_sample_flag,
    'observation_month_number' : pp_observation_month_number,
    'observation_year' : pp_observation_year,
    'historical_months' : pp_historical_months,
    'allow_reuse_step' : allow_reuse_step,
    'regenerate_outputs' : regenerate_outputs,
    'amlcompute_cluster_name' : amlcompute_cluster_name,
    'num_min_nodes' : num_min_nodes,
    'num_max_nodes' : num_max_nodes,
    'test_size' : test_size,
    'outliers_method' : outliers_method,
    'outliers_threshold' : outliers_threshold,
    'multiCollinearity_method' : multiCollinearity_method,
    'multiCollinearity_threshold' : multiCollinearity_threshold,
    'featureSelection_method' : featureSelection_method,
    'featureSelection_threshold' : featureSelection_threshold,
    'featureSelection_percentage' : featureSelection_percentage,
    'normalization_method' : normalization_method,
    'model_name' : model_name,
    'primary_metric' : primary_metric,
    'user_selected_metric' : user_selected_metric,
}

## [5.1] Data Transformation Pipeline Steps

### [5.1.1] Fetch Latest Data Step

In [30]:
# Step Overview:
# Step will fetch data from input_raw_data_path path. Based on the months specified in 
# observation_year, observation_month_number & historical_months variables.
# This step is common for both model training pipeline (mtp) & new data pipeline (ndp)

# Define step variables
step_output_data_name_demo = 'demographic_data'
step_output_data_name_trx = 'transaction_data'
step_output_data_name_acc = 'account_data'
step_display_title = "Fetch_Latest_Data"
step_script_name = 'fetch_latest_data.py'
# Workspace 'id' entry of the workspace details, forwarded to the script as --ws_details
ws_details = ws.get_details()['id']

# The three outputs are bound to plain, explicitly named variables (no globals()/eval()
# indirection); the cells below reference demographic_data, transaction_data and
# account_data by these names.

# Output folder for the demographic data, registered as a data asset when the step completes
demographic_data = OutputFileDatasetConfig(
    # name of the file in the blob
    name = step_output_data_name_demo, 
    destination = (default_store, f'{output_data_intermediate_path}/{step_output_data_name_demo}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name_demo}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Output folder for the transaction data, registered as a data asset when the step completes
# NOTE(review): the description interpolates PipelineParameter objects, so it will contain their
# string representation rather than the run-time year/month - confirm this is intended
transaction_data = OutputFileDatasetConfig(
    name = step_output_data_name_trx, 
    destination = (default_store, f'{output_data_intermediate_path}/{step_output_data_name_trx}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name_trx}{dataAssetName_suffix}',
    description = f'{step_output_data_name_trx} for {pp_observation_year}_{pp_observation_month_number}',
    tags = pipeline_tags
)

# Output folder for the account data, registered as a data asset when the step completes
account_data = OutputFileDatasetConfig(
    name = step_output_data_name_acc, 
    destination = (default_store, f'{output_data_intermediate_path}/{step_output_data_name_acc}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name_acc}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Creating the step
fetchLatestDataStep = PythonScriptStep(
    name = step_display_title,
    script_name = step_script_name,
    arguments=[
        "--output_data_path_demo", demographic_data,
        "--output_data_path_trx", transaction_data,
        "--output_data_path_acc", account_data,
        "--observation_year", pp_observation_year,
        "--observation_month_number", pp_observation_month_number,
        "--historical_months", pp_historical_months,
        "--input_raw_data_path", input_raw_data_path,
        "--new_data_flag", new_data_flag,
        "--monitoring_flag", monitoring_flag,
        "--latest_data_ndp_flag", pp_latest_data_ndp_flag,
        "--ws_details", ws_details
    ],
    #inputs=[],
    outputs=[
        transaction_data, 
        demographic_data,
        account_data
    ],    
    compute_target=aml_compute,
    runconfig = aml_run_config,
    source_directory=script_data_transformation_folder,
    allow_reuse = allow_reuse_step
)

print('Fetch_Data step created.')

Fetch_Data step created.


### [5.1.2] Data Sampling

In [31]:
# Step Overview:
# Step will create the sample if reuse_sample_flag = False, it will return previous sample if reuse_sample_flag is True
# This step is common for both model training pipeline (mtp) & new data pipeline (ndp)

# Define step variables
step_output_data_name_demo = 'sample_demographic_data'
step_output_data_name_trx = 'sample_transaction_data'
step_output_data_name_acc = 'sample_account_data'
step_display_title = "Sampling_Data"
step_script_name = 'data_sampling.py'

# The three sampled outputs are bound to plain, explicitly named variables (no globals()/eval()
# indirection). They are written under input_sample_data_path, which carries no timestamp.

# Output folder for the sampled demographic data, registered as a data asset when the step completes
sample_demographic_data = OutputFileDatasetConfig(
    # name of the file in the blob
    name = step_output_data_name_demo, 
    destination = (default_store, f'{input_sample_data_path}/{step_output_data_name_demo}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name_demo}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Output folder for the sampled transaction data, registered as a data asset when the step completes
sample_transaction_data = OutputFileDatasetConfig(
    name = step_output_data_name_trx, 
    destination = (default_store, f'{input_sample_data_path}/{step_output_data_name_trx}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name_trx}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Output folder for the sampled account data, registered as a data asset when the step completes
sample_account_data = OutputFileDatasetConfig(
    name = step_output_data_name_acc, 
    destination = (default_store, f'{input_sample_data_path}/{step_output_data_name_acc}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name_acc}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Creating the step; its inputs are the outputs of the fetch-latest-data step
samplingDataStep = PythonScriptStep(
    name = step_display_title,
    script_name = step_script_name,
    arguments=[
        "--output_data_path_demo", sample_demographic_data,
        "--output_data_path_trx", sample_transaction_data,
        "--output_data_path_acc", sample_account_data,
        "--num_sample_customer_ids", num_sample_customer_ids,
        "--reuse_sample_flag", reuse_sample_flag
    ],
    inputs=[
        transaction_data.read_parquet_files().as_input('raw_data_transaction'),
        demographic_data.read_parquet_files().as_input('raw_data_demographics'),
        account_data.read_parquet_files().as_input('raw_data_account'),
    ],
    outputs=[
        sample_transaction_data, 
        sample_demographic_data, 
        sample_account_data
    ],    
    compute_target=aml_compute,
    runconfig = aml_run_config,
    source_directory=script_data_transformation_folder,
    allow_reuse = allow_reuse_step
)

print('Sampling_Data step created.')

Sampling_Data step created.


### [5.1.3] Prepare Demographics Data

In [32]:
# Step Overview:
# Step will transform the demographics data.
# This step is common for both model training pipeline (mtp) & new data pipeline (ndp)

# Define step variables
step_output_data_name = 'transformed_demographics_data'
step_display_title = "Transform_Demographics_Data"
step_script_name = 'transform_demographics.py'

# Output folder for the transformed demographics, registered as a data asset when the step completes.
# Bound to an explicitly named variable (no globals() indirection); the variable name equals
# step_output_data_name, so lookups by that name keep resolving to this object.
transformed_demographics_data = OutputFileDatasetConfig(
    name = step_output_data_name, 
    destination = (default_store, f'{output_data_intermediate_path}/{step_output_data_name}')
).register_on_complete(
    # data asset name on azure
    name = f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Demographics input: the sampled output when sample_flag is set, otherwise the full fetched data.
# Outputs of previous steps are passed on with .read_parquet_files().as_input()
# raw_data_input=[demographics_data.as_named_input('raw_data_demographics')] # <- if reading direct previous data assets
_demographics_source = sample_demographic_data if sample_flag else demographic_data
raw_data_input = [_demographics_source.read_parquet_files().as_input('raw_data_demographics')]

# Inputs for the step: the raw or sample data first, followed by the mapping files
step_inputs = raw_data_input + [
    mapping_data_region.as_named_input('mapping_data_region'), 
    mapping_data_occupation.as_named_input('mapping_data_occupation'),
    mapping_data_translation.as_named_input('mapping_data_translation')
]
    
# Creating the step
transformingDemographicsStep = PythonScriptStep(
    name = step_display_title,
    script_name = step_script_name,
    arguments=[
        f"--output_data_path", eval(step_output_data_name)
    ],
    inputs=step_inputs,
    outputs=[eval(step_output_data_name)],    
    compute_target=aml_compute,
    runconfig = aml_run_config,
    source_directory=script_data_transformation_folder,
    allow_reuse = allow_reuse_step
)

### [5.1.4] Prepare Transaction Data

In [33]:
# Step Overview:
# Transforms the transaction data.
# Shared by the model training pipeline (mtp) and the new data pipeline (ndp).

# Step settings
step_display_title = "Transform_Transaction_Data"
step_script_name = 'transform_transaction.py'
step_output_data_name = 'transformed_transaction_data'

# Output folder named after step_output_data_name; registered as a data asset via register_on_complete
globals()[step_output_data_name] = (
    OutputFileDatasetConfig(
        name=step_output_data_name,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Raw input: sampled transactions when sample_flag is set, otherwise the entire transaction data
raw_data_input = [
    (sample_transaction_data if sample_flag else transaction_data)
    .read_parquet_files()
    .as_input('raw_data_transaction')
]

# Step inputs: raw (or sample) data first, then the translation mapping and the transformed demographics
step_inputs = raw_data_input + [
    mapping_data_translation.as_named_input('mapping_data_translation'),
    transformed_demographics_data.read_parquet_files().as_input('transformed_demographics_data'),
]

# Build the step
transformingTransactionStep = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_transformation_folder,
    arguments=[
        "--output_data_path", globals()[step_output_data_name],
        "--njobs", njobs,
    ],
    inputs=step_inputs,
    outputs=[globals()[step_output_data_name]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.1.5] Prepare Account Data

In [34]:
# Step Overview:
# Transforms the account data.
# Shared by the model training pipeline (mtp) and the new data pipeline (ndp).

# Step settings
step_display_title = "Transform_Account_Data"
step_script_name = 'transform_account.py'
step_output_data_name = 'transformed_account_data'

# Output folder named after step_output_data_name; registered as a data asset via register_on_complete
globals()[step_output_data_name] = (
    OutputFileDatasetConfig(
        name=step_output_data_name,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Raw input: sampled accounts when sample_flag is set, otherwise the entire account data
raw_data_input = [
    (sample_account_data if sample_flag else account_data)
    .read_parquet_files()
    .as_input('raw_data_account')
]

# Step inputs: raw (or sample) data first, followed by the translation mapping
step_inputs = raw_data_input + [
    mapping_data_translation.as_named_input('mapping_data_translation'),
]

# Build the step
transformingAccountStep = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_transformation_folder,
    arguments=["--output_data_path", globals()[step_output_data_name]],
    inputs=step_inputs,
    outputs=[globals()[step_output_data_name]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.1.6] Feature Engineering Transaction Data

In [35]:
# Step Overview:
# Creates new features from the transformed transaction data.
# Shared by the model training pipeline (mtp) and the new data pipeline (ndp).

# Step settings
step_display_title = "Feature_Engineering-Transaction_Data"
step_script_name = 'feature_eng_transaction.py'
step_output_data_name = 'feature_eng_transaction_data'

# Output folder named after step_output_data_name; registered as a data asset via register_on_complete
globals()[step_output_data_name] = (
    OutputFileDatasetConfig(
        name=step_output_data_name,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step; the observation window is supplied through the pp_* values.
# Note: this script lives in the preprocess folder, not the transformation folder.
featureEngTransactionStep = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_preprocess_folder,
    arguments=[
        "--output_data_path", globals()[step_output_data_name],
        "--observation_month_number", pp_observation_month_number,
        "--observation_year", pp_observation_year,
        "--historical_months", pp_historical_months,
    ],
    inputs=[
        transformed_transaction_data.read_parquet_files().as_input('transformed_transaction_data'),
    ],
    outputs=[globals()[step_output_data_name]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.1.7] Merging Data

In [36]:
# Step Overview:
# Merges the transaction, demographic, account and feature-engineered transaction data.
# Shared by the model training pipeline (mtp) and the new data pipeline (ndp).

# Step settings
step_display_title = "Merge_Transformed_Data"
step_script_name = 'merge_transformed_data.py'
step_output_data_name = 'merged_transformed_data'

# Output folder named after step_output_data_name; registered as a data asset via register_on_complete
globals()[step_output_data_name] = (
    OutputFileDatasetConfig(
        name=step_output_data_name,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step; it consumes the outputs of the four preceding transformation steps
mergingTransformedDataStep = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_transformation_folder,
    arguments=[
        "--output_data_path", globals()[step_output_data_name],
        "--observation_month_number", pp_observation_month_number,
        "--observation_year", pp_observation_year,
    ],
    inputs=[
        feature_eng_transaction_data.read_parquet_files().as_input('feature_eng_transaction_data'),
        transformed_transaction_data.read_parquet_files().as_input('transformed_transaction_data'),
        transformed_demographics_data.read_parquet_files().as_input('transformed_demographics_data'),
        transformed_account_data.read_parquet_files().as_input('transformed_account_data'),
    ],
    outputs=[globals()[step_output_data_name]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.1.8] Create Target Variable - Churn Flag

In [37]:
# Step Overview:
# Creates the churn flag (target variable) for the training and testing set.
# Used only by the model training pipeline (mtp).

# Step settings
step_display_title = "Create_Target-Churn_Flag"
step_script_name = 'create_churn_flag.py'
step_output_data_name = 'created_churn_flag_data'

# Output folder named after step_output_data_name; registered as a data asset via register_on_complete
globals()[step_output_data_name] = (
    OutputFileDatasetConfig(
        name=step_output_data_name,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step; its only data input is the transformed transaction data
creatingChurnFlagStep = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_transformation_folder,
    arguments=[
        "--output_data_path", globals()[step_output_data_name],
        "--njobs", njobs,
    ],
    inputs=[
        transformed_transaction_data.read_parquet_files().as_input('transformed_transaction_data'),
    ],
    outputs=[globals()[step_output_data_name]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.1.9] Merging Data with Target Churn Flag

In [38]:
# Step Overview:
# Merges the churn flag into the previously merged data.
# Used only by the model training pipeline (mtp).

# Step settings
step_display_title = "Merge_Data_with_Churn_Flag"
step_script_name = 'merge_churn_flag.py'
step_output_data_name = 'merged_with_churn_flag_data'

# Output folder named after step_output_data_name; registered as a data asset via register_on_complete
globals()[step_output_data_name] = (
    OutputFileDatasetConfig(
        name=step_output_data_name,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step; it joins the churn flag output with the merged transformed data
mergedWithChurnStep = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_transformation_folder,
    arguments=[
        "--output_data_path", globals()[step_output_data_name],
        "--observation_month_number", pp_observation_month_number,
        "--observation_year", pp_observation_year,
    ],
    inputs=[
        created_churn_flag_data.read_parquet_files().as_input('created_churn_flag_data'),
        merged_transformed_data.read_parquet_files().as_input('merged_transformed_data'),
    ],
    outputs=[globals()[step_output_data_name]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

## [5.2] Data Preprocess Pipeline Steps

### [5.2.1] Data Train Test Split

In [39]:
# Step Overview:
# Step will create training and testing set based on test_size parameter specified in global variables.
# Its input is the merged data that already carries the churn flag.
# This step is only for model training pipeline (mtp).

# Parameters for Train Test Split Step
random_state = 42

# Output variable names from this step
step_output_data_name_train = 'train_data'
step_output_data_name_test = 'test_data'
step_display_title = "Train_Test_Data_Split"
step_script_name = 'train_test_split.py'

# Train split output folder, registered as a data asset on completion.
globals()[step_output_data_name_train] = OutputFileDatasetConfig(
    name = step_output_data_name_train,
    destination = (default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}')
).register_on_complete(
    # data asset name on azure.
    # Built from step_output_data_name_train: the generic step_output_data_name still holds the value
    # left by the previous cell ('merged_with_churn_flag_data'), so using it here registered the train
    # split under the merged-data asset name.
    name = f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Test split output folder, registered as a data asset on completion.
globals()[step_output_data_name_test] = OutputFileDatasetConfig(
    name = step_output_data_name_test,
    destination = (default_store, f'{output_data_intermediate_path}/{step_output_data_name_test}')
).register_on_complete(
    # data asset name on azure; uses the same user prefix / suffix convention as every other output
    # (it was previously registered under the bare name 'test_data').
    # NOTE(review): anything that fetches this asset by the bare name 'test_data' must be updated.
    name = f'{user_name}_{step_output_data_name_test}{dataAssetName_suffix}',
    tags = pipeline_tags
)

# Creating the step. The output configs are looked up by name from globals() instead of eval().
trainTestSplitStep = PythonScriptStep(
    name = step_display_title,
    script_name = step_script_name,
    arguments = [
        "--output_data_path_train", globals()[step_output_data_name_train],
        "--output_data_path_test", globals()[step_output_data_name_test],
        "--test_size", test_size,
        "--random_state", random_state
    ],
    inputs = [
        merged_with_churn_flag_data.read_parquet_files().as_input('merged_with_churn_flag_data'),
    ],
    outputs = [
        globals()[step_output_data_name_train],
        globals()[step_output_data_name_test]
    ],
    compute_target = aml_compute,
    runconfig = aml_run_config,
    source_directory = script_data_preprocess_folder,
    allow_reuse = allow_reuse_step
)

### [5.2.2] Outlier Treatment

In [40]:
# Step Overview:
# Removes outliers according to the outliers_threshold parameter.
# Used only by the model training pipeline (mtp), and applied to the train data set only.

# Outlier treatment parameters
outliers_threshold = 0.05
outliers_method = 'iForest'

# Step settings
step_display_title = "Outlier_Treatment"
step_script_name = 'outlier_treatment.py'
step_output_data_name_train = 'outlier_treated_data_train'
pipelineStep = 'trainOutlierTreatmentStep'
input_data_train = train_data

# Output folder for the treated train set; registered as a data asset via register_on_complete
globals()[step_output_data_name_train] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_train,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step and publish it under the variable name stored in pipelineStep
globals()[pipelineStep] = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_preprocess_folder,
    arguments=[
        "--output_data_path_train", globals()[step_output_data_name_train],
        "--outliers_threshold", outliers_threshold,
        "--outliers_method", outliers_method,
        "--njobs", njobs,
    ],
    inputs=[
        input_data_train.read_parquet_files().as_input('train_data'),
    ],
    outputs=[globals()[step_output_data_name_train]],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)
# Reminder: this step should not be included for new data

### [5.2.3] Multicollinearity Test

In [41]:
# Step Overview:
# Removes highly correlated columns according to the multiCollinearity_threshold parameter.
# Used only by the model training pipeline (mtp).

# Multicollinearity parameters
multiCollinearity_threshold = 0.95
multiCollinearity_method = 'pearson'

# Step settings
step_display_title = "Multi-Collinearity_Treatment"
step_script_name = 'multicollinearity_treatment.py'
step_output_data_name_train = 'multicollinearity_treated_data_train'
step_output_data_name_test = 'multicollinearity_treated_data_test'
pipelineStep = 'multiCollinearityTreatmentStep'

# Inputs: outlier-treated train set and the untouched test split
input_data_train = outlier_treated_data_train
input_data_test = test_data

# Train output folder; registered as a data asset via register_on_complete
globals()[step_output_data_name_train] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_train,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Test output folder; registered as a data asset via register_on_complete
globals()[step_output_data_name_test] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_test,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_test}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_test}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step and publish it under the variable name stored in pipelineStep
globals()[pipelineStep] = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_preprocess_folder,
    arguments=[
        "--output_data_path_train", globals()[step_output_data_name_train],
        "--output_data_path_test", globals()[step_output_data_name_test],
        "--multiCollinearity_threshold", multiCollinearity_threshold,
        "--multiCollinearity_method", multiCollinearity_method,
    ],
    inputs=[
        input_data_train.read_parquet_files().as_input('train_data'),
        input_data_test.read_parquet_files().as_input('test_data'),
    ],
    outputs=[
        globals()[step_output_data_name_train],
        globals()[step_output_data_name_test],
    ],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.2.4] Feature Selection

In [42]:
# Step Overview:
# Selects columns according to featureSelection_method & featureSelection_threshold.
# Used only by the model training pipeline (mtp).

# Feature selection parameters
featureSelection_method = 'lightGBM'
# Features scoring at or above featureSelection_threshold are kept.
# A threshold of 0 means the selection is driven by featureSelection_percentage instead.
featureSelection_threshold = 0  # 0.001
# Keep the top x percent of features, where x/100 = featureSelection_percentage.
# The threshold eliminates some features first; the percentage is then applied to the survivors.
# A percentage of 1 means the selection is driven by featureSelection_threshold only.
featureSelection_percentage = 1
random_state = 42

# Step settings
step_display_title = "Feature Selection - Train Set"
step_script_name = 'feature_selection.py'
step_output_data_name_train = 'featureSelected_data_train'
step_output_data_name_test = 'featureSelected_data_test'
pipelineStep = 'featureSelectionStep'

# Inputs: outputs of the multicollinearity step
input_data_train = multicollinearity_treated_data_train
input_data_test = multicollinearity_treated_data_test

# Train output folder; registered as a data asset via register_on_complete
globals()[step_output_data_name_train] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_train,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Test output folder; registered as a data asset via register_on_complete
globals()[step_output_data_name_test] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_test,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_test}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_test}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step and publish it under the variable name stored in pipelineStep
globals()[pipelineStep] = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_preprocess_folder,
    arguments=[
        "--output_data_path_train", globals()[step_output_data_name_train],
        "--output_data_path_test", globals()[step_output_data_name_test],
        "--featureSelection_threshold", featureSelection_threshold,
        "--featureSelection_percentage", featureSelection_percentage,
        "--featureSelection_method", featureSelection_method,
        "--random_state", random_state,
        "--njobs", njobs,
    ],
    inputs=[
        input_data_train.read_parquet_files().as_input('train_data'),
        input_data_test.read_parquet_files().as_input('test_data'),
    ],
    outputs=[
        globals()[step_output_data_name_train],
        globals()[step_output_data_name_test],
    ],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.2.5] Normalize the Data

In [43]:
# Step Overview:
# Normalizes the data. In the new data pipeline, train and test both point at the same new data set.
# Shared by the model training pipeline (mtp) and the new data pipeline (ndp).
# Manual normalization is skipped here, as azure will handle it for us automatically.

# Alternative: normalization_method = 'standard'
normalization_method = 'skip'

# Defaults: model training pipeline (mtp)
step_display_title = "Data_Normalization"
step_script_name = 'data_normalization.py'
step_output_data_name_train = 'normalized_data_train'
step_output_data_name_test = 'normalized_data_test'
pipelineStep = 'dataNormalizationStep'
input_data_train = featureSelected_data_train
input_data_test = featureSelected_data_test

# Overrides: new data pipeline (ndp).
# No split is needed for new data, so the same data set feeds both the train and the test input.
# With monitoring the data carrying the churn flag is used, without it the plain merged data.
if new_data_flag == True:
    if monitoring_flag == True:
        print('Running Normalization Step for New Data Pipeline')
        step_display_title = "Data_Normalization-New_Data"
        input_data_train = input_data_test = merged_with_churn_flag_data
    elif monitoring_flag == False:
        print('Running Normalization Step for New Data Pipeline')
        step_display_title = "Data_Normalization-New_Data"
        input_data_train = input_data_test = merged_transformed_data

# Train output folder; registered as a data asset via register_on_complete
globals()[step_output_data_name_train] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_train,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Test output folder; registered as a data asset via register_on_complete
globals()[step_output_data_name_test] = (
    OutputFileDatasetConfig(
        name=step_output_data_name_test,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_test}'),
    )
    .register_on_complete(
        name=f'{user_name}_{step_output_data_name_test}{dataAssetName_suffix}',  # data asset name on azure
        tags=pipeline_tags,
    )
)

# Build the step and publish it under the variable name stored in pipelineStep
globals()[pipelineStep] = PythonScriptStep(
    name=step_display_title,
    script_name=step_script_name,
    source_directory=script_data_preprocess_folder,
    arguments=[
        "--output_data_path_train", globals()[step_output_data_name_train],
        "--output_data_path_test", globals()[step_output_data_name_test],
        "--normalization_method", normalization_method,
    ],
    inputs=[
        input_data_train.read_parquet_files().as_input('train_data'),
        input_data_test.read_parquet_files().as_input('test_data'),
    ],
    outputs=[
        globals()[step_output_data_name_train],
        globals()[step_output_data_name_test],
    ],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)

### [5.2.6] Finalize Data

#### [5.2.6.1] For Model Training Pipeline (MTP)

In [44]:
# Step Overview:
# Finalizes the data for model training and testing. Input comes from the previous component,
# i.e. the normalized data (the feature-selected data would be the alternative).
# Used only by the model training pipeline (mtp).

if not new_data_flag:
    # Step settings
    step_display_title = "Finalize_Data-Train_Test"
    step_script_name = 'data_final_prep_mtp.py'
    pipelineStep = 'finalDataPrepStep'

    step_output_data_name_train = 'final_data_train'
    step_output_data_name_test = 'final_data_test'
    step_output_data_name_id_train = 'final_data_id_train'
    step_output_data_name_id_test = 'final_data_id_test'

    # Inputs: outputs of the normalization step
    input_data_train = normalized_data_train
    input_data_test = normalized_data_test

    # Final train / test data sets; registered as data assets via register_on_complete
    globals()[step_output_data_name_train] = (
        OutputFileDatasetConfig(
            name=step_output_data_name_train,
            destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}'),
        )
        .register_on_complete(
            name=f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',  # data asset name on azure
            tags=pipeline_tags,
        )
    )
    globals()[step_output_data_name_test] = (
        OutputFileDatasetConfig(
            name=step_output_data_name_test,
            destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_test}'),
        )
        .register_on_complete(
            name=f'{user_name}_{step_output_data_name_test}{dataAssetName_suffix}',  # data asset name on azure
            tags=pipeline_tags,
        )
    )

    # Outputs that hold the HASHED_CIF dataframe, saved as parquet files
    globals()[step_output_data_name_id_train] = (
        OutputFileDatasetConfig(
            name=step_output_data_name_id_train,
            destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_id_train}'),
        )
        .register_on_complete(
            name=f'{user_name}_{step_output_data_name_id_train}{dataAssetName_suffix}',
            tags=pipeline_tags,
        )
    )
    globals()[step_output_data_name_id_test] = (
        OutputFileDatasetConfig(
            name=step_output_data_name_id_test,
            destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_id_test}'),
        )
        .register_on_complete(
            name=f'{user_name}_{step_output_data_name_id_test}{dataAssetName_suffix}',
            tags=pipeline_tags,
        )
    )

    # Build the step and publish it under the variable name stored in pipelineStep
    globals()[pipelineStep] = PythonScriptStep(
        name=step_display_title,
        script_name=step_script_name,
        source_directory=script_data_preprocess_folder,
        arguments=[
            "--output_data_path_train", globals()[step_output_data_name_train],
            "--output_data_path_test", globals()[step_output_data_name_test],
            "--output_data_path_id_train", globals()[step_output_data_name_id_train],
            "--output_data_path_id_test", globals()[step_output_data_name_id_test],
        ],
        inputs=[
            input_data_train.read_parquet_files().as_input('train_data'),
            input_data_test.read_parquet_files().as_input('test_data'),
        ],
        outputs=[
            globals()[step_output_data_name_train],
            globals()[step_output_data_name_test],
            globals()[step_output_data_name_id_train],
            globals()[step_output_data_name_id_test],
        ],
        compute_target=aml_compute,
        runconfig=aml_run_config,
        allow_reuse=allow_reuse_step,
    )

#### [5.2.6.2] For New Data Pipeline (NDP)

In [45]:
# Step Overview:
# Finalizes the new data for evaluation with the trained model. No outlier treatment, feature
# selection or multicollinearity treatment is run on new data; those values are taken from the
# training data. The input wired in below is the test output of the normalization step.
# Used only by the new data pipeline (ndp).

if new_data_flag:
    # Step settings
    step_display_title = "Finalize_Data-New_Data"
    step_script_name = 'data_final_prep_ndp.py'
    pipelineStep = 'finalDataPrepStep'

    step_output_data_name_new_data = 'final_data_new_data'
    step_output_data_name_id_new_data = 'final_data_id_new_data'

    # Input (alternative would be merged_transformed_data)
    input_data_new_data = normalized_data_test

    # Final new data set; registered as a data asset via register_on_complete
    globals()[step_output_data_name_new_data] = (
        OutputFileDatasetConfig(
            name=step_output_data_name_new_data,
            destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_new_data}'),
        )
        .register_on_complete(
            name=f'{user_name}_{step_output_data_name_new_data}{dataAssetName_suffix}',  # data asset name on azure
            tags=pipeline_tags,
        )
    )

    # Output that holds the HASHED_CIF dataframe, saved as a parquet file
    globals()[step_output_data_name_id_new_data] = (
        OutputFileDatasetConfig(
            name=step_output_data_name_id_new_data,
            destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_id_new_data}'),
        )
        .register_on_complete(
            name=f'{user_name}_{step_output_data_name_id_new_data}{dataAssetName_suffix}',
            tags=pipeline_tags,
        )
    )

    # Build the step and publish it under the variable name stored in pipelineStep
    globals()[pipelineStep] = PythonScriptStep(
        name=step_display_title,
        script_name=step_script_name,
        source_directory=script_data_preprocess_folder,
        arguments=[
            "--output_data_path_new_data", globals()[step_output_data_name_new_data],
            "--output_data_path_id_new_data", globals()[step_output_data_name_id_new_data],
            "--monitoring_flag", monitoring_flag,
        ],
        inputs=[
            input_data_new_data.read_parquet_files().as_input('new_data'),
        ],
        outputs=[
            globals()[step_output_data_name_new_data],
            globals()[step_output_data_name_id_new_data],
        ],
        compute_target=aml_compute,
        runconfig=aml_run_config,
        allow_reuse=allow_reuse_step,
    )

## [5.3] Model Building Pipeline Steps

### [5.3.1] Train, Validate, HyperTune & Test Model Step

Define settings for autogeneration and tuning

In [46]:
# New data pipeline (NDP): final_data_new_data is used for getting predictions, so it stands in
# for both the train and the test data set. Model training is skipped for NDP; the trained model
# from MTP is used instead.
if new_data_flag:
    final_data_train = final_data_test = final_data_new_data

In [47]:
# Hyperparameter tuning parameters. iterations: how many different models are tried with different hyperparameters.
# More iterations => more model algorithms and more hyperparameter combinations are tried.
# Comment out n_cross_validations & validation_size to let Azure ML choose the hyperparameter tuning and cross-validation technique.

# iteration_timeout_minutes: Maximum time in minutes that each iteration can run for before it terminates. 
# If not specified, a value of 1 month or 43200 minutes is used.

# iterations: The total number of different algorithm and hyper parameter combinations to test during an automated ML experiment. 
# If not specified, the default is 1000 iterations.

# experiment_timeout_hours: Maximum amount of time in hours that all iterations combined can take before the experiment terminates. 
# Can be a decimal value like 0.25 representing 15 minutes. If not specified, the default experiment timeout is 6 days. 
# To specify a timeout less than or equal to 1 hour, make sure your dataset's size is not greater than 10,000,000 (rows times column) 
# or an error results.

# primary_metric: The metric that Automated Machine Learning will optimize for model selection.
# see https://docs.microsoft.com/azure/machine-learning/how-to-configure-auto-train#primary-metric.

# max_concurrent_iterations: Represents the maximum number of iterations that would be executed in parallel. The default value is 1.

# enable_early_stopping: Whether to enable early termination if the score is not improving in the short term. The default is True.

# n_cross_validations: How many cross validations to perform when user validation data is not specified.

# validation_size: What fraction of the data to hold out for validation when user validation data is not specified. 
# This should be between 0.0 and 1.0 non-inclusive.


automl_settings = {
    "iteration_timeout_minutes" : 10,
    "iterations" : model_iterations,
    "experiment_timeout_hours" : experiment_timeout_hours,
    "primary_metric" : primary_metric,
    "max_concurrent_iterations" : num_max_nodes,
    "enable_early_stopping" : True,
    "n_cross_validations" : 5, 
    "validation_size" : 0.1,
}

# ensemble_settings = {
#     "ensemble_download_models_timeout_sec": 600,
#     "stack_meta_learner_type": "LogisticRegressionCV",
#     "stack_meta_learner_train_percentage": 0.3,
#     "stack_meta_learner_kwargs": {
#                                 "refit": True,
#                                 "fit_intercept": False,
#                                 "class_weight": "balanced",
#                                 "multi_class": "auto",
#                                 "n_jobs": -1
#                                 }
# }

In [48]:
# AutoML classification configuration for the 'churn_flag' label.
# The train/test step outputs are passed through .read_parquet_files() so that AutoML
# receives tabular data rather than a file-based output.
# NOTE(review): per the azureml SDK docs this call yields an OutputTabularDatasetConfig
# (not a PipelineData) - confirm against the installed SDK version.
automl_config = AutoMLConfig(
    task="classification",
    label_column_name="churn_flag",
    featurization="auto",
    training_data=final_data_train.read_parquet_files(),
    test_data=final_data_test.read_parquet_files(),
    compute_target=aml_compute,
    path=script_modeling_folder,
    debug_log="automated_ml_errors.log",
    # Tuning options defined in automl_settings
    **automl_settings,
    # **ensemble_settings
)

print("AutoML config created.")

AutoML config created.


Specify automated ML outputs

In [49]:
# Outputs of the AutoML step, both stored on the default datastore.

# Metrics training output, exposed as the pipeline output 'metrics_output'.
metrics_data = PipelineData(
    name="metrics_data",
    training_output=TrainingOutput(type="Metrics"),
    pipeline_output_name="metrics_output",
    datastore=default_store,
)

# Model training output, exposed as the pipeline output 'model_output'.
model_data = PipelineData(
    name="best_model_data",
    training_output=TrainingOutput(type="Model"),
    pipeline_output_name="model_output",
    datastore=default_store,
)

Define the step

In [50]:
# Display name of the AutoML step in the pipeline graph.
step_display_title = "Model_Training_Testing"

# AutoML training step: only the explicitly declared metrics/model outputs are produced,
# the SDK's default metrics and model outputs are switched off.
trainWithAutomlStep = AutoMLStep(
    name=step_display_title,
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    enable_default_metrics_output=False,
    enable_default_model_output=False,
    passthru_automl_config=False,
    allow_reuse=allow_reuse_step,
)
print("trainWithAutomlStep created.")

trainWithAutomlStep created.


### [5.3.2] Registering Model Step

In [51]:
# Display name of the model-registration step in the pipeline graph.
step_display_title = "Register_Model"

# Tags for the registered model: the pipeline tags plus the AutoML settings, minus any
# key that is already supplied as a pipeline parameter.
# NOTE(review): update() extends the shared pipeline_tags dict in place, so every later
# use of pipeline_tags (dataset registration, experiment submit) also carries the AutoML
# settings - confirm this is intended.
pipeline_tags.update(automl_settings)
pipeline_tags_model = {
    tag_name: tag_value
    for tag_name, tag_value in pipeline_tags.items()
    if tag_name not in pipeline_parameters
}

# Script step running register_model.py with the AutoML model output and the training data.
registerModelStep = PythonScriptStep(
    name=step_display_title,
    script_name="register_model.py",
    source_directory=script_modeling_folder,
    arguments=[
        "--model_name", pp_model_name,
        "--model_path", model_data,
        "--observation_year", pp_observation_year,
        "--observation_month_number", pp_observation_month_number,
        "--historical_months", pp_historical_months,
        "--latest_data_ndp_flag", pp_latest_data_ndp_flag,
        # The tag dict is passed to the script as its string representation.
        "--model_tags", str(pipeline_tags_model),
        "--user_selected_metric", user_selected_metric,
        "--experiment_name", experiment_name,
        "--ws_details", ws_details,
    ],
    inputs=[
        model_data,
        final_data_train.read_parquet_files().as_input("train_data"),
    ],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=allow_reuse_step,
)
print("Registering Model Step created.")

Registering Model Step created.


### [5.3.3] Model Results

#### [5.3.3.1] Model Training Pipeline (MTP)

In [52]:
# Model Training Pipeline (MTP): prediction step producing predictions for the train
# and test datasets with the model output of the AutoML step.
# The outputs and the step are bound to plain names (predicted_data_train,
# predicted_data_test, modelPredictionStep) instead of going through
# globals()[...] / eval(...); the resulting notebook-level names are unchanged.
if not new_data_flag:
    # Identifiers reused for the output name, the datastore sub-folder and the data asset name.
    step_output_data_name_train = 'predicted_data_train'
    step_output_data_name_test = 'predicted_data_test'

    step_display_title = "Prediction_Results-Train_Test"
    step_script_name = 'get_predictions_mtp.py'
    # Name of the step variable bound below; kept so the notebook namespace is unchanged.
    pipelineStep = 'modelPredictionStep'

    # Save model predictions: each output is written below output_data_intermediate_path
    # on the default datastore and registered as a data asset when the step completes.
    predicted_data_train = OutputFileDatasetConfig(
        name=step_output_data_name_train,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_train}')
    ).register_on_complete(
        # data asset name on azure
        name=f'{user_name}_{step_output_data_name_train}{dataAssetName_suffix}',
        tags=pipeline_tags
    )
    predicted_data_test = OutputFileDatasetConfig(
        name=step_output_data_name_test,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_test}')
    ).register_on_complete(
        # data asset name on azure
        name=f'{user_name}_{step_output_data_name_test}{dataAssetName_suffix}',
        tags=pipeline_tags
    )

    modelPredictionStep = PythonScriptStep(
        name=step_display_title,
        script_name=step_script_name,
        arguments=[
            "--output_data_path_train", predicted_data_train,
            "--output_data_path_test", predicted_data_test,
            "--experiment_name", experiment_name
        ],
        inputs=[
            final_data_train.read_parquet_files().as_input('train_data'),
            final_data_test.read_parquet_files().as_input('test_data'),
            final_data_id_train.read_parquet_files().as_input('train_data_id'),
            final_data_id_test.read_parquet_files().as_input('test_data_id'),
            # Model output of the AutoML step
            model_data
        ],
        outputs=[
            predicted_data_train,
            predicted_data_test,
        ],
        compute_target=aml_compute,
        runconfig=aml_run_config,
        source_directory=script_modeling_folder,
        allow_reuse=allow_reuse_step
    )

#### [5.3.3.2] New Data Pipeline (NDP)

In [53]:
# New Data Pipeline (NDP): prediction step for the new data.
# The output and the step are bound to plain names (predicted_data_new_data,
# modelPredictionStep) instead of going through globals()[...] / eval(...);
# the resulting notebook-level names are unchanged.
if new_data_flag:
    # Identifier reused for the output name, the datastore sub-folder and the data asset name.
    step_output_data_name_new_data = 'predicted_data_new_data'

    step_display_title = "Prediction_Results-New_Data"
    step_script_name = 'get_predictions_ndp.py'
    # Name of the step variable bound below; kept so the notebook namespace is unchanged.
    pipelineStep = 'modelPredictionStep'

    # Predictions are written below output_data_intermediate_path on the default
    # datastore and registered as a data asset when the step completes.
    predicted_data_new_data = OutputFileDatasetConfig(
        name=step_output_data_name_new_data,
        destination=(default_store, f'{output_data_intermediate_path}/{step_output_data_name_new_data}')
    ).register_on_complete(
        # data asset name on azure
        name=f'{user_name}_{step_output_data_name_new_data}{dataAssetName_suffix}',
        tags=pipeline_tags
    )

    modelPredictionStep = PythonScriptStep(
        name=step_display_title,
        script_name=step_script_name,
        arguments=[
            "--output_data_path_new_data", predicted_data_new_data,
            # The model is identified by name and version; model_data is not passed here.
            # NOTE(review): presumably the script loads the registered model itself - confirm.
            "--model_name", model_name,
            "--model_version", pp_model_version_ndp,
            "--experiment_name", experiment_name,
        ],
        inputs=[
            final_data_new_data.read_parquet_files().as_input('new_data'),
            final_data_id_new_data.read_parquet_files().as_input('new_data_id'),
        ],
        outputs=[
            predicted_data_new_data
        ],
        compute_target=aml_compute,
        runconfig=aml_run_config,
        source_directory=script_modeling_folder,
        allow_reuse=allow_reuse_step
    )

In [54]:
# New Data Pipeline (NDP): model-monitoring step, run on the predictions written by
# the NDP prediction step (predicted_data_new_data).
# The step is bound to the plain name modelMonitoringStep instead of going through
# globals()[...]; commented-out remnants copied from the prediction cell were removed.
if new_data_flag:
    step_display_title = "Model_Monitoring-New_Data"
    step_script_name = 'model_monitoring.py'
    # Name of the step variable bound below; kept so the notebook namespace is unchanged.
    pipelineStep = 'modelMonitoringStep'

    # This step declares no outputs of its own.
    modelMonitoringStep = PythonScriptStep(
        name=step_display_title,
        script_name=step_script_name,
        arguments=[
            "--model_name", model_name,
            "--model_version", pp_model_version_ndp,
            "--experiment_name", experiment_name,
            "--user_selected_metric", user_selected_metric
        ],
        inputs=[
            # Prediction output is read as delimited files (not parquet).
            predicted_data_new_data.read_delimited_files().as_input('model_pred_data'),
        ],
        compute_target=aml_compute,
        runconfig=aml_run_config,
        source_directory=script_modeling_folder,
        allow_reuse=allow_reuse_step
    )

## [5.4] Running Pipeline

### [5.4.1] Creating & Validating Pipeline

##### [5.4.1.1] For Model Training Pipeline (MTP)

In [55]:
# Step lists for the Model Training Pipeline (MTP), assembled stage by stage.
# NOTE(review): this cell runs regardless of new_data_flag; presumably every step named
# here is defined in both modes - confirm, otherwise an NDP run raises NameError here.

# Data Transformation Steps
pipeline_steps_data_transformation = [
    fetchLatestDataStep,
    transformingDemographicsStep,
    transformingTransactionStep,
    transformingAccountStep,
    featureEngTransactionStep,
    mergingTransformedDataStep,
    creatingChurnFlagStep,
    mergedWithChurnStep,
]

# The sampling step is put in front only when sample_flag is set; otherwise it is skipped.
if sample_flag:
    pipeline_steps_data_transformation.insert(0, samplingDataStep)
    print('Pipeline will be created with step newSamplingDataStep as sample_flag is set to True')

# Data Preprocess Steps
pipeline_steps_data_preprocess = [
    trainTestSplitStep,
    trainOutlierTreatmentStep,
    multiCollinearityTreatmentStep,
    featureSelectionStep,
    dataNormalizationStep,
    finalDataPrepStep,
]

# Modeling Steps
pipeline_steps_modeling = [
    trainWithAutomlStep,
    registerModelStep,
    modelPredictionStep,
]

# Single flat list of all MTP steps: transformation, then preprocessing, then modeling.
train_test_pipeline_steps = [
    *pipeline_steps_data_transformation,
    *pipeline_steps_data_preprocess,
    *pipeline_steps_modeling,
]

##### [5.4.1.2] For New Data Pipeline (NDP)

In [56]:
# Step lists for the New Data Pipeline (NDP). The names
# pipeline_steps_data_transformation / pipeline_steps_data_preprocess are rebound here
# (they are also assigned in the MTP step-list cell).

# Data Transformation Steps
pipeline_steps_data_transformation = [
    fetchLatestDataStep,
    transformingDemographicsStep,
    transformingTransactionStep,
    transformingAccountStep,
    featureEngTransactionStep,
    mergingTransformedDataStep,
]

# The sampling step is put in front only when sample_flag is set; otherwise it is skipped.
if sample_flag:
    pipeline_steps_data_transformation.insert(0, samplingDataStep)
    print('Pipeline will be created with step newSamplingDataStep as sample_flag is set to True')

# Data Preprocess Steps (dataNormalizationStep is currently disabled for NDP)
pipeline_steps_data_preprocess = [
    finalDataPrepStep,
    modelPredictionStep,
]

# With monitoring enabled on an NDP run, the churn-flag steps and the monitoring step
# are added in front of the respective lists.
# The explicit `== True` comparisons are kept as-is: the flags' types are not visible
# here, and a plain truthiness test could behave differently for non-bool values.
if monitoring_flag == True and new_data_flag == True:
    pipeline_steps_data_transformation = [
        creatingChurnFlagStep,
        mergedWithChurnStep,
        *pipeline_steps_data_transformation,
    ]
    pipeline_steps_data_preprocess = [modelMonitoringStep, *pipeline_steps_data_preprocess]

# Single flat list of all NDP steps
new_data_pipeline_steps = [
    *pipeline_steps_data_transformation,
    *pipeline_steps_data_preprocess,
]

In [57]:
%%time
# Wait for the compute cluster's current operation to finish, showing its status output.
aml_compute.wait_for_completion(show_output=True)

# Define the pipeline from the step list matching the run mode:
# NDP (new data pipeline) when new_data_flag is set, otherwise MTP (model training pipeline).
pipeline = Pipeline(
    ws,
    steps=new_data_pipeline_steps if new_data_flag else train_test_pipeline_steps,
    description=f'Pipeline: {pipeline_name}',
)
print(f'Pipeline {pipeline_name} is built')

# Validate the pipeline definition before it is submitted or published.
pipeline.validate()
print("Simple validation complete")

InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
Pipeline master_Pipeline-DataPrep-ModelTrainTest_20_09_2023_07_55_00 is built
Step Model_Training_Testing is ready to be created [8022d41d]
Step Register_Model is ready to be created [4400684b]
Step Prediction_Results-Train_Test is ready to be created [c045c699]
Simple validation complete
CPU times: user 1.19 s, sys: 113 ms, total: 1.3 s
Wall time: 9.38 s


### [5.4.2] Creating & Running the Experiment

In [58]:
%%time
# Direct (non-scheduled) execution: create the experiment and submit the pipeline to it.
#
# regenerate_outputs flag:
#   If regenerate_outputs is set to True for the Experiment.submit() call, a new submit
#   always forces generation of all step outputs and disallows data reuse for any step
#   of this run. Once this run is complete, however, subsequent runs may reuse the
#   results of this run. The default behavior of Pipelines is regenerate_outputs=False.
if not schedule_pipeline_flag:
    experiment = Experiment(ws, experiment_name)
    print(f'Experiment created {experiment_name}')

    pipeline_run = experiment.submit(
        pipeline,
        name=pipeline_name,
        tags=pipeline_tags,
        pipeline_parameters=pipeline_parameters,
        regenerate_outputs=regenerate_outputs,
    )
    print('Pipeline submitted for execution')

Experiment created master_model_training_pipeline
Created step Fetch_Latest_Data [ba45bd62][51b5569e-db68-4068-bd08-82cae6f99571], (This step is eligible to reuse a previous run's output)
Created step Transform_Demographics_Data [5837c670][ef00461e-0ca2-4720-8d37-f135922ccbe0], (This step is eligible to reuse a previous run's output)
Created step Transform_Transaction_Data [47be9d02][e5082507-8c09-47ff-b969-60e69f83e5cd], (This step is eligible to reuse a previous run's output)
Created step Transform_Account_Data [f9c3f177][f242db27-3d7d-43eb-b328-9831c01e16af], (This step is eligible to reuse a previous run's output)
Created step Feature_Engineering-Transaction_Data [6e158514][fa2e3dcc-1087-4b96-ae51-95f2ae1d5f6a], (This step is eligible to reuse a previous run's output)
Created step Merge_Transformed_Data [603bdfea][e1aa6cb8-dbf6-4c06-9f11-1e79e1f23996], (This step is eligible to reuse a previous run's output)
Created step Create_Target-Churn_Flag [b6ce90e2][7e0454d4-169b-45ec-951b-a



Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d0d2ac3e-43a0-4cba-8d9b-6f6e77a38d3c?wsid=/subscriptions/bdea9b80-e147-4dd1-9e22-afd1228b6d1a/resourcegroups/casa-churn-analysis-ey-demo/workspaces/AzureConnection&tid=55587992-a704-412d-a52c-ce2a96ed3cf4
Pipeline submitted for execution
CPU times: user 1.18 s, sys: 161 ms, total: 1.34 s
Wall time: 4.46 s


In [59]:
%%time
# Block until the submitted pipeline run finishes, streaming its status output;
# a failed run raises an error here (raise_on_error=True).
# Skipped for scheduled pipelines, where no run is submitted from this notebook.
if not schedule_pipeline_flag:
    pipeline_run.wait_for_completion(raise_on_error=True, show_output=True)

PipelineRunId: d0d2ac3e-43a0-4cba-8d9b-6f6e77a38d3c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d0d2ac3e-43a0-4cba-8d9b-6f6e77a38d3c?wsid=/subscriptions/bdea9b80-e147-4dd1-9e22-afd1228b6d1a/resourcegroups/casa-churn-analysis-ey-demo/workspaces/AzureConnection&tid=55587992-a704-412d-a52c-ce2a96ed3cf4
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 1630a665-6cc8-482e-ab69-2f33e1a2e993
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1630a665-6cc8-482e-ab69-2f33e1a2e993?wsid=/subscriptions/bdea9b80-e147-4dd1-9e22-afd1228b6d1a/resourcegroups/casa-churn-analysis-ey-demo/workspaces/AzureConnection&tid=55587992-a704-412d-a52c-ce2a96ed3cf4
StepRun( Fetch_Latest_Data ) Status: NotStarted
StepRun( Fetch_Latest_Data ) Status: Running

StepRun(Fetch_Latest_Data) Execution Summary
StepRun( Fetch_Latest_Data ) Status: Finished
{'runId': '1630a665-6cc8-482e-ab69-2f33e1a2e993', 'target': 'master-cpu-cluster-mtp', 'status': 'Completed', 'star

## [5.5] Scheduling Pipeline

Publish Pipeline

In [None]:
# Scheduled mode only: publish the pipeline (its name doubles as the description) and
# keep the published pipeline id for creating the schedule.
if schedule_pipeline_flag:
    published_pipeline_run = pipeline.publish(name=pipeline_name, description=pipeline_name)
    pub_pipeline_id = published_pipeline_run.id
    print("Newly published pipeline id: {}".format(pub_pipeline_id))

Schedule Pipeline

In [None]:
# Datastore-triggered schedule for the published pipeline.
# The pipeline is triggered when any file content changes or a new file is added to the
# monitored folder. Currently the content change of the updateStatus.json file is what
# is checked: copy new data to 'InputData/InputRawFiles/transaction_data/', then update
# the time and status in updateStatus.json, and the pipeline will be triggered.
if schedule_pipeline_flag:
    # Description and pipeline parameters come from the scheduler YAML config file.
    schedule_info = Schedule.load_yaml(workspace=ws, filename=scheduler_config_file)

    schedule_name = f'{pipeline_name}_schedule'
    schedule = Schedule.create(
        workspace=ws,
        name=schedule_name,
        description=schedule_info.get("description"),
        pipeline_id=pub_pipeline_id,
        pipeline_parameters=schedule_info.get("pipeline_parameters"),
        # Scheduled runs are submitted under a separate '<experiment_name>_scheduler' experiment.
        experiment_name=experiment_name + '_scheduler',
        datastore=default_store,
        path_on_datastore=data_path_to_monitor,
        wait_for_provisioning=True,
    )
    schedule_id = schedule.id
    print("Created schedule {} with id: {}".format(schedule_name, schedule_id))

Get all schedules for a given pipeline

In [None]:
# pub_pipeline_id = '378b2153-9af9-43e9-8dc6-5a3197432891'
# schedules = Schedule.list(ws, pipeline_id=pub_pipeline_id)

# print("Found these schedules for the pipeline id {}:".format(pub_pipeline_id))
# for schedule in schedules: 
#     print(schedule.id)

# # print("Schedule id to be used for schedule operations: {}".format(schedule_id))

Get all schedules in your workspace

In [None]:
# # Use active_only=False to get all schedules including disabled schedules
# schedules = Schedule.list(ws, active_only=True) 
# print("Your workspace has the following schedules set up:")
# for schedule in schedules:
#     print("{} (Published pipeline: {})".format(schedule.id, schedule.pipeline_id))
# schedule_id = schedule.id

Disable the schedule

In [None]:
# def disable_by_schedule_id(ws, schedule_id):
#     s = next(s for s in Schedule.list(ws) if s.id == schedule_id)
#     s.disable()
#     return s

# def disable_by_schedule_name(ws, schedule_name):
#     s = next(s for s in Schedule.list(ws) if s.name == schedule_name)
#     s.disable()
#     return s

# disable_by_schedule_id(ws, 'f98435ff-c140-4d43-b918-90139c48ed95')
# disable_by_schedule_id(ws, schedule_id)

Enable the Schedule

In [None]:
# def enable_by_schedule_id(ws, schedule_id):
#     s = next(s for s in Schedule.list(ws) if s.id == schedule_id)
#     s.enable()
#     return s

# def enable_by_schedule_name(ws, schedule_name):
#     s = next(s for s in Schedule.list(ws) if s.name == schedule_name)
#     s.enable()
#     return s

# enable_by_schedule_id(ws, 'f98435ff-c140-4d43-b918-90139c48ed95')
# enable_by_schedule_id(ws, schedule_id)

## [5.6] Interpret Pipeline Results

In [None]:
# For automated ML runs, to access the charts from a previous run, use the experiment name:

### [5.6.1] Get Feature Importance

In [None]:
# from azureml.pipeline.core import PipelineRun
# from azureml.train.automl.run import AutoMLRun

# experiment = Experiment(ws, experiment_name)
# # pipeline_run = pipeline_run.id
# pipeline_run = PipelineRun(experiment, 'a5e588a0-4879-4b60-8ad8-24ef5cd15c81')
# automl_step_run = pipeline_run.find_step_run('Model_Training_Testing')[0]
# automl_run = AutoMLRun(experiment, automl_step_run.id)
# best_run, model = automl_run.get_output()

In [None]:
# from azureml.interpret import ExplanationClient

# client = ExplanationClient.from_run(best_run)
# engineered_explanations = client.download_model_explanation(raw=False)
# dict_feat_imp = engineered_explanations.get_feature_importance_dict()
# df_feat_imp = pd.DataFrame(dict_feat_imp.items(), columns=['Feature Name', 'Importance Value'])
# df_feat_imp

In [None]:
# client = ExplanationClient.from_run(best_run)
# raw_explanations = client.download_model_explanation(raw=True)
# dict_feat_imp = raw_explanations.get_feature_importance_dict()
# df_feat_imp = pd.DataFrame(dict_feat_imp.items(), columns=['Feature Name', 'Importance Value'])
# df_feat_imp

In [None]:
# Get best model based on metric
# automl_run.get_best_child(metrics='f1-score-macro')

In [None]:
# from azureml.widgets import RunDetails
# from azureml.core.run import Run

# experiment = Experiment (workspace, experiment_name)
# run_id = pipeline_run.id #replace with run_ID
# run = Run(experiment, run_id)
# RunDetails(run).show()