# Use Selenium tasks to extract information from web pages within AzureML pipelines

In [None]:
# Notebook-wide settings -- adjust these to match your own workspace.
compute_cluster_name = 'cpu-cluster'            # compute target the runs are submitted to
pipeline_name = 'pull-data-from-site-pipeline'  # name given to the published pipeline
experiment_name = 'test-selenium'               # experiment the runs are recorded under
# Name of the environment registered by step00_create_environment.py
environment_name = 'selenium-env'

In [None]:
# The following line builds a docker image with the Selenium requirements
# and registers it as an Environment in your workspace.
# If you have run this already, you can comment it out to speed up 
# notebook execution
!python step00_create_environment.py --env-name $environment_name

## Default imports from the AzureML SDK

In [None]:
import azureml

from azureml.core import Workspace, Experiment, Datastore, Environment
from azureml.core.runconfig import RunConfiguration
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.data.data_reference import DataReference
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.widgets import RunDetails
from azureml.train.estimator import Estimator
import os

# Print the installed SDK version. `azureml.core` is reachable through the bare
# `azureml` name because the `from azureml.core import ...` line above loaded it.
print("Azure ML SDK Version: ", azureml.core.VERSION)

In [None]:
# Connect to the workspace and look up the resources the notebook relies on.
ws = Workspace.from_config()
# Existing compute cluster, resolved by name.
compute_cluster = ComputeTarget(workspace=ws, name=compute_cluster_name)
datastore = ws.get_default_datastore()
# Fetch only the requested environment; `ws.environments` would first list
# every environment in the workspace just to pick out a single entry.
env = Environment.get(workspace=ws, name=environment_name)

## Test the parsing script

Use `ScriptRunConfig` to execute the Selenium-based web page scraping script, which stores all the links from a page in a CSV file in the `selenium/` folder in the root of the default blob storage container attached to the AzureML workspace.

In [None]:
# Verify that the script works by submitting it as a standalone run
from azureml.core import ScriptRunConfig
from azureml.data import OutputFileDatasetConfig

# Destination for the script's results: the "selenium/" folder of the default
# datastore. Run 'help(OutputFileDatasetConfig)' to see the other output options.
output = OutputFileDatasetConfig(destination=(datastore, 'selenium/'))

# Standalone run of the scraping script on the cluster, using the Selenium environment.
script = ScriptRunConfig(
    source_directory='.',
    script='selenium_script.py',
    arguments=['--url', 'https://www.bing.com', '--output-path', output],
    compute_target=compute_cluster,
    environment=env,
)

# Submit the run under the experiment and stream its output until it completes.
exp = Experiment(workspace=ws, name=experiment_name)
run = exp.submit(config=script)
run.wait_for_completion(show_output=True)

## Create repeatable pipeline
Create a pipeline that extracts the data and stores it in the default blob storage container.

In [None]:
# Pipeline parameter that lets each run choose which page to scrape.
website_pipeline_param = PipelineParameter(name="url", default_value="https://www.bing.com")

# Run configuration carrying the environment the step executes in.
run_config = RunConfiguration()
run_config.environment = env

# Pipeline step that runs the same script as the standalone test, with the URL
# supplied by the pipeline parameter instead of a fixed value.
parse_step = PythonScriptStep(
    script_name='selenium_script.py',
    name='Grab web data',
    source_directory='.',
    arguments=['--url', website_pipeline_param, '--output-path', output],
    outputs=[output],
    compute_target=compute_cluster,
    runconfig=run_config,
)

pipeline = Pipeline(workspace=ws, steps=[parse_step])

# Publish the pipeline so it can be triggered again later.
published_pipeline = pipeline.publish(
    name=pipeline_name,
    description="Pipeline to parse links from a web page",
)

In [None]:
from azureml.core.authentication import InteractiveLoginAuthentication
import requests

# Obtain an authentication header through an interactive login.
auth = InteractiveLoginAuthentication()
aad_token = auth.get_authentication_header()

# Trigger the published pipeline over REST, overriding the "url" parameter.
response = requests.post(published_pipeline.endpoint,
                         headers=aad_token,
                         json={"ExperimentName": experiment_name,
                               "ParameterAssignments": {"url": "https://www.microsoft.com"}})

# Fail loudly on a non-2xx reply; printing the Response object alone would
# hide authentication or payload errors behind a bare status code.
response.raise_for_status()

print(response)
# The reply body carries the id of the pipeline run that was just queued.
print("Submitted pipeline run:", response.json().get("Id"))

Check the AutoShutDown example if you want to schedule the published pipeline.