# Getting Started with Pilot-Streaming on Wrangler

In the first step, we import all required packages and modules and extend the Python path.

In [1]:
# System libraries
import logging
import os
import sys

# Extend the module search path with the parent directory before importing
# `pilot` (presumably because the package lives there -- TODO confirm).
sys.path.append("..")
import pandas as pd

## logging
# basicConfig() installs the default root handler if none exists yet; the
# effective level is then set explicitly, because basicConfig() is a no-op
# when the root logger already has handlers. Everything below ERROR is
# suppressed, both on the root logger and on the "py4j" logger.
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)

### Required Spark configuration that needs to be provided before pyspark is imported and JVM started
# Set ahead of `import pilot.streaming` as well, in case that package imports
# pyspark itself (TODO confirm). The address is tied to the machine running
# this notebook -- adjust it for your environment.
os.environ["SPARK_LOCAL_IP"] = '129.114.58.2'

# Pilot-Streaming
import pilot.streaming

import pyspark



The Pilot-Compute Description is a simple key/value description of the cluster environment to be started. Alternatively, the command-line tool delivered with this package can be used:

     pilot-streaming --resource=slurm://localhost --queue=normal --walltime=59 --number_cores=48 --framework spark 

## 1. Spark

In [2]:
# Key/value description of the Spark cluster requested via the SLURM adaptor.
# NOTE(review): the unit of `walltime` is not visible here (presumably minutes).
pilot_compute_description = dict(
    resource="slurm://localhost",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    cores_per_node=1,
    project="TG-MCB090174",
    queue="normal",
    walltime=59,
    type="spark",
)

# Submit the request; the returned object is the handle used by the cells below.
spark_pilot = pilot.streaming.PilotComputeService.create_pilot(
    pilot_compute_description
)

**** Job: 59214 State : Pending
Create Spark Context for URL: spark://129.114.58.133:7077


In [3]:
# Display the pilot's connection details; the output below shows the
# master_url, spark_home and web_ui_url entries.
spark_pilot.get_details()

{'master_url': 'spark://129.114.58.133:7077',
 'spark_home': '/work/01131/tg804093/wrangler/work/spark-7ffff27c-f028-11e7-b547-549f35083c1c/spark-2.2.1-bin-hadoop2.7',
 'web_ui_url': 'http://129.114.58.133:8080'}

In [4]:
# Obtain the Spark context for the pilot's master URL (see the log line below).
sc = spark_pilot.get_context()

Create Spark Context for URL: spark://129.114.58.133:7077


In [5]:
# Smoke test: distribute a small list and square every element.
rdd = sc.parallelize([1, 2, 3])
rdd.map(lambda value: value * value).collect()

[1, 4, 9]

In [None]:
# Cancel the Spark pilot once it is no longer needed
# (presumably this also releases the underlying SLURM job -- TODO confirm).
spark_pilot.cancel()

## 2. Kafka

In [3]:
# Key/value description of the Kafka cluster requested via the SLURM adaptor.
# NOTE(review): the unit of `walltime` is not visible here (presumably minutes).
pilot_compute_description = dict(
    resource="slurm://localhost",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    cores_per_node=1,
    project="TG-MCB090174",
    queue="normal",
    walltime=59,
    type="kafka",
)

# Submit the request; the returned object is the handle used by the cells below.
kafka_pilot = pilot.streaming.PilotComputeService.create_pilot(
    pilot_compute_description
)

**** Job: 59230 State : Pending
look for configs in: /work/01131/tg804093/wrangler/work/kafka-61bbf196-f030-11e7-905c-549f35083c1c/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-61bbf196-f030-11e7-905c-549f35083c1c/config (Tue Jan  2 20:48:20 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-132:9092', 'zookeeper.connect': 'c251-132:2181'}


In [4]:
# Display the pilot's details; the output below shows the broker settings
# under 'details' and the ZooKeeper address under 'master_url'.
kafka_pilot.get_details()

{'details': {'broker.id': '0',
  'listeners': 'PLAINTEXT://c251-132:9092',
  'zookeeper.connect': 'c251-132:2181',
  'zookeeper.connection.timeout.ms': '6000'},
 'master_url': 'c251-132:2181'}

In [5]:
# Cancel the Kafka pilot once it is no longer needed
# (presumably this also releases the underlying SLURM job -- TODO confirm).
kafka_pilot.cancel()

## 3. Dask

In [None]:
# Key/value description of the Dask cluster requested via the SLURM adaptor.
# NOTE(review): the unit of `walltime` is not visible here (presumably minutes).
pilot_compute_description = dict(
    resource="slurm://localhost",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    cores_per_node=1,
    project="TG-MCB090174",
    queue="normal",
    walltime=59,
    type="dask",
)

# Submit the request; the returned object is the handle to the Dask pilot.
dask_pilot = pilot.streaming.PilotComputeService.create_pilot(
    pilot_compute_description
)