In [1]:
# Load Azure Libaries
from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment
from azureml.core.authentication import InteractiveLoginAuthentication
import logging
import pandas as pd
import numpy as np
import json
import os

#Load Libraries for Deployment
from azureml.core.model import Model
from azureml.pipeline.steps import PythonScriptStep
from azureml.contrib.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data.data_reference import DataReference
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.pipeline.core import Pipeline, PipelineData, PublishedPipeline, PipelineEndpoint,PipelineParameter
from azureml.core.runconfig import RunConfiguration, CondaDependencies, DEFAULT_CPU_IMAGE, DEFAULT_GPU_IMAGE
from azureml.widgets import RunDetails

In [2]:
# Check the base version of AzureMl
import azureml.core
print("You are currently using version " + azureml.core.VERSION + " of the Azure ML SDK")

You are currently using version 1.44.0 of the Azure ML SDK


In [3]:
# Retrieve your workspace by name by filling in the lower case values between double quotes
ws = Workspace.from_config()

In [4]:
# Retrieve your Compute Target for Running Jobs Remotely
compute_name = 'amlccrgp0003498' 
compute_target = ComputeTarget(ws, compute_name)
ws

Workspace.create(name='amlws-nc-rgp0003498-ci', subscription_id='ec5645d4-b966-4814-95c7-7c5649cf2e39', resource_group='rg-p-0003498')

In [5]:
# Give your environment a name and set it to your .yml file
ProjectEnv = Environment(name="Test_Docker") # CHANGE HERE
conda_dep = CondaDependencies()

# Add pip packages
# Packages
conda_dep.add_pip_package("azureml-defaults==1.30.0")
# conda_dep.add_pip_package("feedparser==6.0.8")
# conda_dep.add_pip_package("PyPDF2==1.26.0")
conda_dep.add_pip_package("lxml==4.6.3")
# conda_dep.add_pip_package("datefinder==0.7.1")
conda_dep.add_pip_package("beautifulsoup4==4.9.3")
conda_dep.add_pip_package("selenium==3.141.0")
# conda_dep.add_pip_package("scrapy==3.141.0")
# conda_dep.add_pip_package("delta-fetch==3.141.0")

# Adds dependencies to PythonSection of myenv
ProjectEnv.python.conda_dependencies=conda_dep
 

In [6]:
# # Specify docker steps as a string. 
# dockerfile = r"""
# FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
# RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# RUN dpkg -i ./google-chrome-stable_current_amd64.deb || :
# RUN apt-get -y install -f
# RUN wget https://chromedriver.storage.googleapis.com/106.0.5249.21/chromedriver_linux64.zip
# RUN unzip chromedriver_linux64.zip
# RUN pwd
# """

In [7]:

dockerfile = r"""
FROM mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
RUN apt-get update && apt-get install -y \
    software-properties-common \
    unzip \
    xvfb \
    curl \
    wget
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
RUN apt-get -y update
RUN apt-get install -y google-chrome-stable
RUN apt-get install -yqq unzip
RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/` curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE `/chromedriver_linux64.zip
RUN unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
ENV DISPLAY=:99
"""


In [8]:
# Set base image to None, because the image is defined by dockerfile.
ProjectEnv.docker.base_image = None
ProjectEnv.docker.base_dockerfile = dockerfile

In [9]:
# Register the environment to your workspace
RegisteredEnvironment = ProjectEnv.register(workspace=ws)

In [10]:
# Create a Run Configuration with a Docker Container and your environment settings for your Data Transfer Step
run_config = RunConfiguration()
run_config.environment = RegisteredEnvironment
run_config.environment.docker.enabled = True
# docker_config = DockerConfiguration(use_docker=True)
# run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE # Use DEFAULT_GPU_IMAGE for Deep Learning Jobs

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [11]:
%%writefile test_docker.py

import requests
from bs4 import BeautifulSoup
import re
import lxml
import datetime
import pandas as pd
import urllib

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException   
import time
import os
from urllib.parse import urljoin

print(os.getcwd())


#Setting Options for efficient headless
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')


#Looking for the chromedriver file in the folder directly next to the script. If you move the script change this
chrome_driver = "/usr/local/bin/chromedriver"

#Set up the selenium driver
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver)

#Grab the test Page
driver.get("https://www.elkem.com/media/news/article/?itemid=7251C1FEE289B8FC")

#This is implicit loading. Just a time delay. We hope that the element is loaded by then.
driver.implicitly_wait(10)
print(driver.page_source)

driver.quit()

Overwriting test_docker.py


In [12]:
suffix_pipeline_param = PipelineParameter(name="suffix", default_value='dev')


run_crawls = PythonScriptStep(
    name='Test_Docker',
    source_directory='',
    script_name = "test_docker.py",
    arguments=["--datastore_suffix", suffix_pipeline_param],
    compute_target=compute_target,
    inputs=[],
    outputs=[],  # Only necessary if there's another step in the pipeline
    runconfig=run_config,
    allow_reuse = False
)

In [13]:
# Create your pipeline
pipeline = Pipeline(workspace=ws, steps=[run_crawls]) 

In [14]:
pipeline_run = Experiment(ws, 'Test_Docker').submit(pipeline,pipeline_parameters={"suffix":'dev'}, show_output=True)

Created step Test_Docker [8daee1dc][507b305b-2e4e-4e95-a732-f8e9e1869531], (This step will run and generate new outputs)
Submitted PipelineRun 1bff4ded-bb96-48da-937c-33dafaed531d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1bff4ded-bb96-48da-937c-33dafaed531d?wsid=/subscriptions/ec5645d4-b966-4814-95c7-7c5649cf2e39/resourcegroups/rg-p-0003498/workspaces/amlws-nc-rgp0003498-ci&tid=c3e32f53-cb7f-4809-968d-1cc4ccc785fe


In [15]:
# GUI to see your Pipeline Run
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 1bff4ded-bb96-48da-937c-33dafaed531d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1bff4ded-bb96-48da-937c-33dafaed531d?wsid=/subscriptions/ec5645d4-b966-4814-95c7-7c5649cf2e39/resourcegroups/rg-p-0003498/workspaces/amlws-nc-rgp0003498-ci&tid=c3e32f53-cb7f-4809-968d-1cc4ccc785fe
PipelineRun Status: Running


StepRunId: 3e48bcef-f487-4a76-aa13-85ceed5479f2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3e48bcef-f487-4a76-aa13-85ceed5479f2?wsid=/subscriptions/ec5645d4-b966-4814-95c7-7c5649cf2e39/resourcegroups/rg-p-0003498/workspaces/amlws-nc-rgp0003498-ci&tid=c3e32f53-cb7f-4809-968d-1cc4ccc785fe
StepRun( Test_Docker ) Status: Running

StepRun(Test_Docker) Execution Summary
StepRun( Test_Docker ) Status: Finished
{'runId': '3e48bcef-f487-4a76-aa13-85ceed5479f2', 'target': 'amlccrgp0003498', 'status': 'Completed', 'startTimeUtc': '2022-09-27T14:08:04.738095Z', 'endTimeUtc': '2022-09-27T14:08:14.077433Z', 'services': {}, 'properties': {'C



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '1bff4ded-bb96-48da-937c-33dafaed531d', 'status': 'Completed', 'startTimeUtc': '2022-09-27T14:07:56.047877Z', 'endTimeUtc': '2022-09-27T14:08:15.305909Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{"suffix":"dev"}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://amlsancrgp0003498ci.blob.core.windows.net/azureml/ExperimentRun/dcid.1bff4ded-bb96-48da-937c-33dafaed531d/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=f1wLwPLsX4Pv1yWqjqyUWhOfVOjfVQ73uNF3gBQTjQo%3D&skoid=5d59eaf9-8577-4d09-bb47-26ee41cb30df&sktid=c3e32f53-cb7f-4809-968d-1cc4ccc785fe&skt=2022-09-27T12%3A11%3A16Z&ske=2022-09-28T20%3A21%3A16Z&sks=b&skv=2019-07-07&st=20

'Finished'

In [16]:
published_pipeline = pipeline_run.publish_pipeline(
    name='Test_Docker_Pipeline',\
    description="Docker_Test", version="0.1")


In [17]:
# published_pipeline