# Getting Started with Pilot-Streaming and Edge on LRZ (Jetstream/TACC WIP)

In the first step, we import all required Python packages and modules.

Pilot-Streaming can be used to manage the Dask and Kafka environments both in the cloud and on the edge. 



`resource`: URL of the Local Resource Manager. Examples:

* `slurm://localhost`: Submit to local SLURM resource manager, e.g. on master node of Wrangler or Stampede
* `slurm+ssh://login1.wrangler.tacc.utexas.edu`: Submit to Wrangler master node SLURM via SSH (e.g. on node running a job)
* `os://`: OpenStack
* `ec2://`: Amazon EC2


`type`: The `type` attribute specifies the cluster environment. It can be `Spark`, `Dask` or `Kafka`.


Depending on the resource, additional configuration might be necessary. For example, to ensure that the correct subnet is used, the Spark driver can be configured using environment variables: `os.environ["SPARK_LOCAL_IP"]='129.114.58.2'`



In [None]:
# Pilot-Streaming
import os, sys
import distributed
import json
import pilot.streaming
import getpass
import socket
import pykafka

# Configure logging: the root logger reports everything from DEBUG upwards,
# while the library loggers listed below only report CRITICAL messages.
import logging

logging.getLogger().setLevel(logging.DEBUG)
for quiet_logger_name in (
    "stevedore.extension",
    "keystoneauth",
    "urllib3.connectionpool",
    "asyncio",
):
    logging.getLogger(quiet_logger_name).setLevel(logging.CRITICAL)

# HPC resource: SLURM reached over SSH, plus the working directory to use.
RESOURCE_URL_HPC = "slurm+ssh://login4.stampede2.tacc.utexas.edu"
# expanduser("~") yields $HOME when it is set and still resolves a home
# directory when it is not, instead of raising KeyError.
WORKING_DIRECTORY = os.path.join(os.path.expanduser("~"), "work")

# Edge resource. Alternative endpoints that can be swapped in:
#   "ssh://js-17-136.jetstream-cloud.org"
#   "ssh://localhost"
RESOURCE_URL_EDGE = "os://cc.lrz.de"
WORKING_DIRECTORY_EDGE = "/home/ubuntu"

# 1. Dask on LRZ (pre-launched VM)

## 1.1. Producer

In [None]:
# Load the pilot description for the producer-side Dask cluster.
# The context managers make sure both file handles are closed again.
with open("config/openstack_description_lrz_producer.json", "r") as description_file:
    pilot_compute_description_dask_producer = json.load(description_file)

# Read the OpenStack password from a local file; getpass.getpass() is an
# interactive alternative. The description is intentionally not echoed as
# cell output because it now contains the password.
with open("../../../.password", "r") as password_file:
    pilot_compute_description_dask_producer["os_password"] = password_file.read().strip()

# Ask for a Dask cluster on this pilot.
pilot_compute_description_dask_producer["type"] = "dask"

In [None]:
%%time
# The cell magic above must stay on the first line; it reports how long the
# cell takes to run.
# Create the producer Dask pilot from its description.
dask_pilot_producer = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description_dask_producer)
# Presumably blocks until the pilot is up -- TODO confirm against the
# Pilot-Streaming documentation.
dask_pilot_producer.wait()
# Last expression of the cell, so the notebook displays the pilot details
# (they include the 'master_url' used to connect a Dask client).
dask_pilot_producer.get_details()

## Test edge dask behind firewall

In [None]:
# Connect a Dask client to the 'master_url' reported by the producer pilot.
dask_client_producer  = distributed.Client(dask_pilot_producer.get_details()['master_url'])
#dask_client  = distributed.Client()
# Last expression of the cell: display the scheduler information.
dask_client_producer.scheduler_info()

In [None]:
# Smoke test: square the numbers 0..9 through the producer client and gather
# the results back into the notebook.
dask_client_producer.gather(dask_client_producer.map(lambda a: a*a, range(10)))

In [None]:
# Run socket.gethostname() in ten tasks and gather the names, which shows on
# which host(s) the tasks were executed.
dask_client_producer.gather(dask_client_producer.map(lambda a: socket.gethostname(), range(10)))

## 1.2. Consumer

In [None]:
# Load the pilot description for the consumer-side Dask cluster.
# The context managers make sure both file handles are closed again.
with open("config/openstack_description_lrz_consumer.json", "r") as description_file:
    pilot_compute_description_dask_consumer = json.load(description_file)

# Read the OpenStack password from a local file. The description is
# intentionally not echoed as cell output because it now contains the password.
with open("../../../.password", "r") as password_file:
    pilot_compute_description_dask_consumer["os_password"] = password_file.read().strip()

# Ask for a Dask cluster on this pilot.
pilot_compute_description_dask_consumer["type"] = "dask"

In [None]:
%%time
# The cell magic above must stay on the first line; it reports how long the
# cell takes to run.
# Create the consumer Dask pilot from its description.
dask_pilot_consumer = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description_dask_consumer)
# Presumably blocks until the pilot is up -- TODO confirm against the
# Pilot-Streaming documentation.
dask_pilot_consumer.wait()
# Last expression of the cell, so the notebook displays the pilot details
# (they include the 'master_url' used to connect a Dask client).
dask_pilot_consumer.get_details()

In [None]:
# Connect a Dask client to the 'master_url' reported by the consumer pilot.
dask_client_consumer  = distributed.Client(dask_pilot_consumer.get_details()['master_url'])
#dask_client  = distributed.Client()
# Last expression of the cell: display the scheduler information.
dask_client_consumer.scheduler_info()

In [None]:
# Smoke test: square the numbers 0..9 through the consumer client and gather
# the results back into the notebook.
dask_client_consumer.gather(dask_client_consumer.map(lambda a: a*a, range(10)))

## Stop Cluster

In [None]:
# Cancel the consumer pilot (this cell sits under the "Stop Cluster" heading).
dask_pilot_consumer.cancel()

In [None]:
# Cancel the producer pilot (this cell sits under the "Stop Cluster" heading).
dask_pilot_producer.cancel()

# 2. Start Kafka Cluster


In [None]:
# Load the pilot description for the Kafka cluster.
# The context managers make sure both file handles are closed again.
with open("config/openstack_description_lrz_kafka.json", "r") as description_file:
    pilot_compute_description_kafka = json.load(description_file)

# Read the OpenStack password from a local file; getpass.getpass() is an
# interactive alternative. The description is intentionally not echoed as
# cell output because it now contains the password.
with open("../../../.password", "r") as password_file:
    pilot_compute_description_kafka["os_password"] = password_file.read().strip()

# Ask for a Kafka cluster on this pilot.
pilot_compute_description_kafka["type"] = "kafka"

In [None]:
%%time

# The cell magic above must stay on the first line; it reports how long the
# cell takes to run.
# Create the Kafka pilot from its description.
kafka_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description_kafka)
# Presumably blocks until the pilot is up -- TODO confirm against the
# Pilot-Streaming documentation.
kafka_pilot.wait()


# 3. Start Stream Processing on Kafka/Dask

In [None]:
# Fetch the details of the Kafka pilot.
kafka_details = kafka_pilot.get_details()
# The pilot's 'master_url' is handed to pykafka as the ZooKeeper host list
# (presumably a ZooKeeper address -- confirm against the pilot details).
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])