# Getting Started with Pilot-Streaming on Wrangler

In the first step, we import all required packages and modules and add the parent directory to the Python path.

In [1]:
# System libraries
import os
import sys

# Make modules in the parent directory importable. The membership check keeps
# this cell idempotent: re-running it no longer appends duplicate ".." entries.
if ".." not in sys.path:
    sys.path.append("..")

import pandas as pd

## logging
import logging

# Install the default root handler (if none exists yet) and pin both the root
# logger and the "py4j" logger to ERROR so only errors are emitted.
logging.basicConfig(level=logging.ERROR)
for logger_name in (None, "py4j"):  # None selects the root logger
    logging.getLogger(logger_name).setLevel(logging.ERROR)
 

# Pilot-Streaming
import pilot.streaming
# Echo the loaded module object so the cell output shows which installation
# of pilot.streaming was actually imported.
sys.modules['pilot.streaming']



<module 'pilot.streaming' from '/home/01131/tg804093/anaconda2/lib/python2.7/site-packages/pilot/streaming.pyc'>

The Pilot-Compute Description is a simple key/value-style description of the cluster environment that should be started. Alternatively, the command-line tool delivered with this package can be used:

     pilot-streaming --resource=slurm://localhost --queue=normal --walltime=59 --number_cores=48 --framework spark 

# 1. Spark

In [2]:
### Required Spark configuration that needs to be provided before pyspark is imported and JVM started
#os.environ["SPARK_LOCAL_IP"]='129.114.58.2' #must be done before pyspark is loaded
import pyspark

# Key/value description of the Spark cluster to start; it is handed to
# PilotComputeService.create_pilot() in the next cell.
# NOTE(review): walltime is presumably in minutes — confirm against the Pilot-Streaming docs.
pilot_compute_description = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=59,
    type="spark",
)

Start Spark Cluster and Wait for Startup Completion

In [3]:
%%time

# Submit the Spark pilot described by pilot_compute_description, then call
# wait() so the cell only finishes once the pilot reports startup completion.
spark_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description)
spark_pilot.wait()

**** Job: 59455 State : Pending
Create Spark Context for URL: spark://129.114.58.137:7077
Create Spark Context for URL: spark://129.114.58.137:7077
CPU times: user 131 ms, sys: 96.6 ms, total: 228 ms
Wall time: 23.7 s


In [4]:
# Display the details reported by the running Spark pilot
# (in the recorded run: master_url, spark_home and web_ui_url).
spark_pilot.get_details()

{'master_url': 'spark://129.114.58.137:7077',
 'spark_home': '/work/01131/tg804093/wrangler/work/spark-7f5f4500-f2ff-11e7-82cf-549f35098908/spark-2.2.1-bin-hadoop2.7',
 'web_ui_url': 'http://129.114.58.137:8080'}

In [4]:
# Manual alternative to spark_pilot.get_context() (left disabled): create the
# SparkContext directly against the Spark master URL.
#sc = pyspark.SparkContext(master="spark://129.114.58.135:7077", appName="test")
# NOTE(review): the hard-coded address above differs from the master_url in the
# recorded get_details() output (129.114.58.137) — update it before enabling this line.

In [5]:
# Obtain a context object from the running pilot; the following cell uses it
# as a Spark context (sc.parallelize).
sc = spark_pilot.get_context()

Create Spark Context for URL: spark://129.114.58.137:7077


In [6]:
# Build an RDD from a small list, square each element, and collect the
# results back into the notebook.
rdd = sc.parallelize([1, 2, 3])
squared = rdd.map(lambda value: value * value)
squared.collect()

[1, 4, 9]

In [None]:
# Cancel the Spark pilot once the example is done.
# NOTE(review): presumably this also releases the SLURM job — confirm.
spark_pilot.cancel()

# 2. Kafka

In [9]:
# Key/value description of the Kafka cluster to start; it is handed to
# PilotComputeService.create_pilot() in the next code cell.
# NOTE(review): walltime is presumably in minutes — confirm against the Pilot-Streaming docs.
pilot_compute_description = dict(
    resource="slurm://localhost",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=59,
    type="kafka",
)

**** Job: 59256 State : Pending
look for configs in: /work/01131/tg804093/wrangler/work/kafka-1e2342e4-f0f9-11e7-839e-549f35083c1c/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-1e2342e4-f0f9-11e7-839e-549f35083c1c/config (Wed Jan  3 20:45:14 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-132:9092', 'zookeeper.connect': 'c251-132:2181'}


In [None]:
%%time
# Submit the Kafka pilot described by pilot_compute_description and call
# wait() before continuing, mirroring the Spark section.
kafka_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description)
kafka_pilot.wait()

In [10]:
# Display the details reported by the Kafka pilot
# (in the recorded run: the broker settings under 'details' plus 'master_url').
kafka_pilot.get_details()

{'details': {'broker.id': '0',
  'listeners': 'PLAINTEXT://c251-132:9092',
  'zookeeper.connect': 'c251-132:2181',
  'zookeeper.connection.timeout.ms': '6000'},
 'master_url': 'c251-132:2181'}

In [11]:
# Cancel the Kafka pilot once the example is done.
# NOTE(review): presumably this also releases the SLURM job — confirm.
kafka_pilot.cancel()

# 3. Dask

In [5]:
import distributed

# Key/value description of the Dask cluster to start; it is handed to
# PilotComputeService.create_pilot() in the next cell.
# NOTE(review): walltime is presumably in minutes — confirm against the Pilot-Streaming docs.
pilot_compute_description = dict(
    resource="slurm://localhost",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=59,
    type="dask",
)

In [6]:
%%time
# Submit the Dask pilot described by pilot_compute_description and call
# wait() before continuing, mirroring the Spark and Kafka sections.
dask_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description)
dask_pilot.wait()

**** Job: 59266 State : Running
CPU times: user 181 ms, sys: 93.2 ms, total: 274 ms
Wall time: 6.3 s


In [7]:
# Display the details reported by the Dask pilot; the next cell reads its
# 'master_url' entry to connect a client.
dask_pilot.get_details()

{'master_url': 'tcp://c251-135:8786', 'web_ui_url': 'http://c251-135:8787'}

In [15]:
import distributed

# Connect a Dask client to the scheduler address reported by the pilot, then
# display the scheduler's own view of the cluster.
scheduler_url = dask_pilot.get_details()['master_url']
dask_client = distributed.Client(scheduler_url)
dask_client.scheduler_info()

{'address': 'tcp://129.114.58.135:8786',
 'id': 'Scheduler-363ae53b-1276-4ffc-bdc7-70b1aeb4283a',
 'services': {'bokeh': 8787},
 'type': 'Scheduler',
 'workers': {'tcp://129.114.58.135:41796': {'cpu': 8.0,
   'executing': 0,
   'host': '129.114.58.135',
   'in_flight': 0,
   'in_memory': 10,
   'last-seen': 1515036799.385555,
   'last-task': 1515036750.2663264,
   'local_directory': '/home/01131/tg804093/dask-worker-space/worker-lkiSY_',
   'memory': 103673856,
   'memory_limit': 134778585088,
   'name': 'tcp://129.114.58.135:41796',
   'ncores': 48,
   'num_fds': 24,
   'pid': 44991,
   'read_bytes': 158293.4720896321,
   'ready': 0,
   'services': {'bokeh': 8789, 'nanny': 42225},
   'time': 1515036798.885611,
   'time-delay': 0.00038909912109375,
   'write_bytes': 158293.4720896321}}}

In [14]:
# Submit one squaring task per input value, then gather the results
# back into the notebook.
futures = dask_client.map(lambda value: value * value, range(10))
dask_client.gather(futures)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]