## Lead Scoring Batch Inference Pipeline


In this project, we UCI Bank Marketing dataset for lead scoring. Lead classification is used to classfiy leads

### Configure Workspace

In [1]:
from pyspark.sql import SparkSession
from azureml.core import Workspace, Experiment
import mlflow

# Setup Azure Workspace
ws = Workspace.from_config()
experiment_name = 'leads-pyspark-train'
experiment = Experiment(workspace=ws, name=experiment_name)

# Start MLflow Experiment
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)
run = mlflow.start_run()

# Get default datastore
default_ds = ws.get_default_datastore()

# Get Spark session
spark = SparkSession.builder.getOrCreate()

### Generate and Upload Batch Data

In [2]:
from azureml.core import Dataset
import pandas as pd
import os

df = spark.read.csv(
    path='data/bank-additional-full.csv',
    header="true",
    inferSchema="true",
    sep=";")
trainDF, testDF, batchDF = df.randomSplit([.7, .29, .01], seed=999)
batchData = batchDF.toPandas()

# Create a folder for storing generated batch data
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

# Save each sample as a separate file
print("Saving files...")
x = 0
y = 10
for i in range(int(batchDF.count()/10)):
    filename = str(i+1) + '.csv'
    writeData=batchData[x:y]
    writeData.to_csv(os.path.join(batch_folder, filename), sep=",")
    x+=10
    y+=10

print("files saved!")

# Upload the files to the default datastore
print("Uploading files to datastore...")
default_ds = ws.get_default_datastore()
default_ds.upload(src_dir="batch-data", target_path="batch-data", overwrite=True, show_progress=True)

# Register a dataset for the input data
batch_data_set = Dataset.File.from_files(path=(default_ds, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(workspace=ws, 
                                             name='leads-batch-data',
                                             description='batch data for Marketing Leads UCI',
                                             create_new_version=True)
except Exception as ex:
    print(ex)

print("Done!")

'\n# Create a folder for storing generated batch data\nbatch_folder = \'./batch-data\'\nos.makedirs(batch_folder, exist_ok=True)\nprint("Folder created!")\n\n# Save each sample as a separate file\nprint("Saving files...")\nx = 0\ny = 10\nfor i in range(int(batchDF.count()/10)):\n    filename = str(i+1) + \'.csv\'\n    writeData=batchData[x:y]\n    writeData.to_csv(os.path.join(batch_folder, filename), sep=",")\n    x+=10\n    y+=10\n\nprint("files saved!")\n\n# Upload the files to the default datastore\nprint("Uploading files to datastore...")\ndefault_ds = ws.get_default_datastore()\ndefault_ds.upload(src_dir="batch-data", target_path="batch-data", overwrite=True, show_progress=True)\n\n# Register a dataset for the input data\nbatch_data_set = Dataset.File.from_files(path=(default_ds, \'batch-data/\'), validate=False)\ntry:\n    batch_data_set = batch_data_set.register(workspace=ws, \n                                             name=\'leads-batch-data\',\n                            

### Train Model

In [3]:
# A Transformer used in pipelines for renaming columns 
from pyspark.ml import Transformer
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  

class ColumnRenamer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    '''
    Renames the following columns in the dataframe: 
    employment variation rate
    consumer price index
    consumer confidence index 
    number of employees 
    '''
    def __init__(self):
        super(ColumnRenamer, self).__init__()
        self.columnsToBeRenamed = {
            'emp.var.rate':'emp_var_rate',
            'cons.price.idx':'cons_price_idx',
            'cons.conf.idx':'cons_conf_idx',
            'nr.employed':'nr_employed'}

    def _transform(self, df):
        for key in self.columnsToBeRenamed.keys():
            df = df.withColumnRenamed(key, self.columnsToBeRenamed[key])
        return df    
rename_columns = ColumnRenamer()

# Uses R Formula for automatic conversion of categorical labels to 1 hot encoding
from pyspark.ml.feature import RFormula
rFormula = RFormula(formula="y ~ .", featuresCol="features", labelCol="label", handleInvalid="skip")

# Uses String Indexer and Numeric Columns only for Tree Based Classifiers
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
example_df = rename_columns.transform(trainDF)
categorialColumns = [colname for (colname, dataType) in example_df.dtypes if ((dataType=="string") and (colname!="y"))]
stringIndexer = StringIndexer(inputCols=categorialColumns, outputCols=[c + "Index" for c in categorialColumns])
oheEncoder = OneHotEncoder(inputCols=stringIndexer.getOutputCols(), outputCols=[c + "ohe" for c in categorialColumns])
label_stringIdx = StringIndexer(inputCol="y", outputCol="label")
numericColumns = [colname for (colname, dataType) in example_df.dtypes if (dataType=="int" or dataType=="float" or dataType=="double")]
assembledInputs = numericColumns + [c + "Index" for c in categorialColumns]
vecAssembler = VectorAssembler(inputCols=assembledInputs, outputCol="features")

In [4]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import mlflow.spark
import pandas as pd

# For Tracking Models
model_num=1
pipelineModel = None

# Evaluators for performance metrics
bevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
mevaluator = MulticlassClassificationEvaluator()

# Rename Columns
columnsToBeRenamed = {'emp.var.rate':'emp_var_rate','cons.price.idx':'cons_price_idx','cons.conf.idx':'cons_conf_idx','nr.employed':'nr_employed'}
for key in columnsToBeRenamed.keys():
    trainDF = trainDF.withColumnRenamed(key, columnsToBeRenamed[key])
for key in columnsToBeRenamed.keys():
    testDF = testDF.withColumnRenamed(key, columnsToBeRenamed[key])

# Non Tree Based Models
non_tree_models = [LogisticRegression(), LinearSVC()]
for model in non_tree_models:
    non_tree_pipeline = Pipeline(stages=[rFormula, model])
    pipelineModel = non_tree_pipeline.fit(trainDF)
    predDF = pipelineModel.transform(testDF)

    modelName =str(model_num)+'-'+model.__class__.__name__
    accuracy = mevaluator.setMetricName("accuracy").evaluate(predDF)
    roc = bevaluator.setMetricName("areaUnderROC").evaluate(predDF)
    pr = bevaluator.setMetricName("areaUnderPR").evaluate(predDF)
    model_num += 1


    # Log metrics and model
    mlflow.spark.log_model(pipelineModel, modelName)
    mlflow.log_metrics({"modelNum":model_num, "accuracy":accuracy, "areaUnderROC":roc, "areaUnderPR":pr})
    print("Training complete:",modelName)
'''
# Tree Based Models
tree_models = [DecisionTreeClassifier(), RandomForestClassifier(), GBTClassifier()]
for model in tree_models:
    tree_pipeline = Pipeline(stages=[stringIndexer, oheEncoder, label_stringIdx, vecAssembler,model])
    pipelineModel = tree_pipeline.fit(trainDF)
    predDF = pipelineModel.transform(testDF)

    modelName = str(model_num)+'-'+model.__class__.__name__
    accuracy = mevaluator.setMetricName("accuracy").evaluate(predDF)
    roc = bevaluator.setMetricName("areaUnderROC").evaluate(predDF)
    pr = bevaluator.setMetricName("areaUnderPR").evaluate(predDF)
    model_num += 1

    # Log metrics and model
    mlflow.spark.log_model(pipelineModel, modelName)
    mlflow.log_metrics({"modelNum":model_num, "accuracy":accuracy, "areaUnderROC":roc, "areaUnderPR":pr})
    print("Training complete:",modelName)


pipelineModel.save('model')

from azureml.core import Model
Model.register(
    workspace=ws,
    model_path='model/',
    model_name='pyspark-batch-leads-model',
)

mlflow.end_run()
'''

Training complete: 1-LogisticRegression
Training complete: 2-LinearSVC


'\n# Tree Based Models\ntree_models = [DecisionTreeClassifier(), RandomForestClassifier(), GBTClassifier()]\nfor model in tree_models:\n    tree_pipeline = Pipeline(stages=[stringIndexer, oheEncoder, label_stringIdx, vecAssembler,model])\n    pipelineModel = tree_pipeline.fit(trainDF)\n    predDF = pipelineModel.transform(testDF)\n\n    modelName = str(model_num)+\'-\'+model.__class__.__name__\n    accuracy = mevaluator.setMetricName("accuracy").evaluate(predDF)\n    roc = bevaluator.setMetricName("areaUnderROC").evaluate(predDF)\n    pr = bevaluator.setMetricName("areaUnderPR").evaluate(predDF)\n    model_num += 1\n\n    # Log metrics and model\n    mlflow.spark.log_model(pipelineModel, modelName)\n    mlflow.log_metrics({"modelNum":model_num, "accuracy":accuracy, "areaUnderROC":roc, "areaUnderPR":pr})\n    print("Training complete:",modelName)\n\n\npipelineModel.save(\'model\')\n\nfrom azureml.core import Model\nModel.register(\n    workspace=ws,\n    model_path=\'model/\',\n    mode

In [6]:
predDF.probablity

AttributeError: 'DataFrame' object has no attribute 'probablity'

### Create Compute

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Compute params
compute_name = 'rohan-vm-cluster'
inference_cluster = None

if compute_name in ws.compute_targets:
    inference_cluster = ComputeTarget(ws, compute_name)
    print("Using existing cluster.")
else:
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size ='STANDARD_DS11_V2', 
            max_nodes=2 )
        inference_cluster = ComputeTarget.create(ws, compute_name, compute_config)
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)
    print("Cluster created.")

### Scoring Script

In [None]:
# Create a folder for storing batch-pipeline files
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

In [81]:
%%writefile 'batch-pipeline/batch_environment.yml'
name: mlflow-environment
channels:
  - defaults
  - anaconda
  - conda-forge
dependencies:
  - python=3.6
  - scikit-learn
  - pip
  - pandas
  - numpy
  - openjdk
  - pip:
    - pyspark
    - mlflow
    - azureml-mlflow
    - azureml-defaults

Overwriting batch-pipeline/batch_environment.yml


In [82]:
%%writefile 'batch-pipeline/score.py'
import os
import numpy as np
from azureml.core import Model
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

def init():
    global model, columnsToBeRenamed, spark
    spark = SparkSession.builder.getOrCreate()
    model_path = Model.get_model_path('pyspark-batch-leads-model')
    model= PipelineModel.load(model_path)
    columnsToBeRenamed = {'emp.var.rate':'emp_var_rate','cons.price.idx':'cons_price_idx','cons.conf.idx':'cons_conf_idx','nr.employed':'nr_employed'}

def run(mini_batch):
    # This runs for each batch
    resultList = []
    # process each file in the batch
    for f in mini_batch:
        df = spark.read.csv(path=f,header="true",inferSchema="true",sep=",").drop('_c0')
        for key in columnsToBeRenamed.keys():
            df = df.withColumnRenamed(key, columnsToBeRenamed[key]) 
        prediction = model.transform(df).select('prediction').toPandas().prediction.map({0.0:"no",1.0:"yes"}).to_numpy()
        resultList.append("{}: {}".format(os.path.basename(f), prediction))
    return resultList


Overwriting batch-pipeline/score.py


### Create Pipeline

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Create an Environment for the experiment
batch_env = Environment.from_conda_specification(name="experiment_env", file_path="batch-pipeline/batch_environment.yml")
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

In [None]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import DockerConfiguration

output_dir = OutputFileDatasetConfig(name='inferences')

parallel_run_config = ParallelRunConfig(
    source_directory='batch-pipeline/',
    entry_script="score.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2)

parallelrun_step = ParallelRunStep(
    name='batch-score-leads',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('leads_batch')],
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

print('Steps defined')

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
pipeline_run = Experiment(workspace=ws, name='leads-batch-pipeline').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)



### Fetch predictions from run

In [None]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
result_file = None
shutil.rmtree('leads-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='leads-results')

# Traverse the folder hierarchy and find the results file
for root, dirs, files in os.walk('leads-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]
df.to_csv("leads-results/leads.csv", header=True,index=False)
# Display the first 20 results
df.head(20)

### Publish Pipeline

In [None]:
published_pipeline = pipeline_run.publish_pipeline(name='leads-batch-pipeline', description='Batch scoring of leads data from UCI', version='1.0')
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

### Test the Pipeline

In [None]:
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails
import requests

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, headers=auth_header, json={"ExperimentName": "leads-batch-pipeline"})
run_id = response.json()["Id"]

published_pipeline_run = PipelineRun(ws.experiments['leads-batch-pipeline'], run_id)
published_pipeline_run.wait_for_completion(show_output=True)

In [84]:
prediction

NameError: name 'prediction' is not defined