# Getting Started with Pilot-Streaming on Wrangler

As a first step, we add the parent directory to the Python path and import all required packages and modules.

In [None]:
# Standard library
import os
import sys

# Put the parent directory on the module search path before importing
# pilot.streaming below (presumably where the package lives — verify for your checkout).
sys.path.append("..")
import pandas as pd

## logging
import logging

# Configure root logging, then pin both the root logger and the py4j logger
# to ERROR so only errors are reported in the notebook output.
logging.basicConfig(level=logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)


# Pilot-Streaming
import pilot.streaming

# Display the loaded module as the cell output to confirm which one was imported.
sys.modules['pilot.streaming']

The Pilot-Compute Description is a simple key/value description of the cluster environment that should be started. Alternatively, the command-line tool delivered with this package can be used:

     pilot-streaming --resource=slurm://localhost --queue=normal --walltime=59 --number_cores=48 --framework spark 

# 1. Spark

## Start and Manage Pilots

In [None]:
# SPARK_LOCAL_IP has to be set before pyspark is imported and the JVM is
# started, so this assignment must stay above the import.
# NOTE(review): the address is hard-coded — presumably the node running this
# notebook; adjust it for your environment.
os.environ["SPARK_LOCAL_IP"] = "129.114.58.2"
import pyspark

# Key/value description of the Spark cluster to start.
# NOTE(review): working directory and project are site/user-specific values.
spark_pilot1_description = {
    "resource": "slurm://localhost",
    "working_directory": os.path.join("/work/01131/tg804093/wrangler/", "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 59,  # presumably minutes — TODO confirm
    "type": "spark",
}

Start Spark Cluster and Wait for Startup Completion

In [None]:
%%time
# Create the pilot from spark_pilot1_description and wait for start-up to
# complete; %%time (which must remain the first line of the cell) reports how
# long the cell took.
spark_pilot1 = pilot.streaming.PilotComputeService.create_pilot(spark_pilot1_description)
spark_pilot1.wait()

In [None]:
# Show the details reported by the first Spark pilot as the cell output.
spark_pilot1.get_details()

In [None]:
# Fetch the context object from the pilot; the following cell uses it to
# create an RDD via sc.parallelize.
sc = spark_pilot1.get_context()

In [None]:
# Smoke test: distribute three integers, square each one, and collect the
# results back into the notebook.
rdd = sc.parallelize([1, 2, 3])
rdd.map(lambda value: value * value).collect()

## Extend Spark Cluster 1 with additional resources

To extend a Spark cluster, simply start another Pilot that references the ID of the existing Pilot-Job via the `parent` key in its Pilot Compute Description.

In [None]:
# ID of the running Spark pilot; passing it as "parent" makes the new pilot
# extend that cluster.
spark_pilot1_jobid = spark_pilot1.get_id()

# Same settings as the first Spark pilot, plus the "parent" reference.
# NOTE(review): working directory and project are site/user-specific values.
spark_pilot_description2 = {
    "resource": "slurm://localhost",
    "working_directory": os.path.join("/work/01131/tg804093/wrangler/", "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 59,  # presumably minutes — TODO confirm
    "type": "spark",
    "parent": spark_pilot1_jobid,
}

In [None]:
%%time
# Create the extension pilot from spark_pilot_description2 (which carries the
# "parent" job id) and wait on it; %%time must remain the first line of the cell.
spark_pilot2 = pilot.streaming.PilotComputeService.create_pilot(spark_pilot_description2)
spark_pilot2.wait()

**Note:** The connection details of the extended Spark cluster are the same as those of the parent Pilot-Job.

In [None]:
# Show the details reported by the extension pilot as the cell output.
spark_pilot2.get_details()

Stop both Pilots

In [None]:
# Cancel the parent pilot first and the extension second (unchanged order).
# try/finally ensures the second cancel() is still attempted when the first
# one raises, so an error cannot silently skip it; the exception still propagates.
try:
    spark_pilot1.cancel()
finally:
    spark_pilot2.cancel()

# 2. Kafka

In [None]:
# Key/value description of the Kafka cluster to start.
# NOTE(review): working directory and project are site/user-specific values.
kafka_pilot_description1 = {
    "resource": "slurm://localhost",
    "working_directory": os.path.join("/work/01131/tg804093/wrangler/", "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 59,  # presumably minutes — TODO confirm
    "type": "kafka",
}

In [None]:
%%time
# Create the Kafka pilot from kafka_pilot_description1 and wait on it;
# %%time must remain the first line of the cell.
kafka_pilot1 = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description1)
kafka_pilot1.wait()

In [None]:
# Show the details reported by the first Kafka pilot as the cell output.
kafka_pilot1.get_details()

## Extend Pilot

In [None]:
# ID of the running Kafka pilot; it is passed as "parent" in the description
# of the second pilot.
kafka_pilot1_jobid = kafka_pilot1.get_id()

# Same settings as the first Kafka pilot, plus the "parent" reference.
# NOTE(review): working directory and project are site/user-specific values.
kafka_pilot_description2 = {
    "resource": "slurm://localhost",
    "working_directory": os.path.join("/work/01131/tg804093/wrangler/", "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 59,  # presumably minutes — TODO confirm
    "type": "kafka",
    "parent": kafka_pilot1_jobid,
}

In [None]:
%%time
# Create the extension pilot from kafka_pilot_description2 (which carries the
# "parent" job id) and wait on it; %%time must remain the first line of the cell.
kafka_pilot2 = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description2)
kafka_pilot2.wait()

In [None]:
# Show the details reported by the Kafka extension pilot as the cell output.
kafka_pilot2.get_details()

Check Brokers of Kafka Cluster

In [None]:
import pykafka

# The pilot details expose the endpoint under 'master_url'; it is handed to
# pykafka as the ZooKeeper host list.
zkhost = kafka_pilot2.get_details()["master_url"]
client = pykafka.KafkaClient(zookeeper_hosts=zkhost)

# Display the brokers known to the client as the cell output.
client.brokers

## Cancel Pilots

In [None]:
# Cancel the parent pilot first and the extension second (unchanged order).
# try/finally ensures the second cancel() is still attempted when the first
# one raises, so an error cannot silently skip it; the exception still propagates.
try:
    kafka_pilot1.cancel()
finally:
    kafka_pilot2.cancel()

# 3. Dask

In [None]:
# Key/value description of the Dask cluster to start.
# NOTE(review): working directory and project are site/user-specific values.
dask_pilot_description1 = {
    "resource": "slurm://localhost",
    "working_directory": os.path.join("/work/01131/tg804093/wrangler/", "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 59,  # presumably minutes — TODO confirm
    "type": "dask",
}

In [None]:
%%time
# Create the Dask pilot from dask_pilot_description1 and wait on it;
# %%time must remain the first line of the cell.
dask_pilot1 = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description1)
dask_pilot1.wait()

In [None]:
# Show the details reported by the first Dask pilot as the cell output.
dask_pilot1.get_details()

## Extension

In [None]:
# ID of the running Dask pilot; it is passed as "parent" in the description
# of the second pilot.
dask_pilot1_jobid = dask_pilot1.get_id()

# Same settings as the first Dask pilot, plus the "parent" reference.
# NOTE(review): working directory and project are site/user-specific values.
dask_pilot_description2 = {
    "resource": "slurm://localhost",
    "working_directory": os.path.join("/work/01131/tg804093/wrangler/", "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 59,  # presumably minutes — TODO confirm
    "type": "dask",
    "parent": dask_pilot1_jobid,
}

In [None]:
dask_pilot2 = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description2)
dask_pilot2.wait()

In [None]:
# Show the details reported by the Dask extension pilot as the cell output.
dask_pilot2.get_details()

Testing new Cluster

In [None]:
import distributed

# Connect a client to the address the first Dask pilot reports under
# 'master_url', then display the scheduler information as the cell output.
dask_client = distributed.Client(dask_pilot1.get_details()["master_url"])
dask_client.scheduler_info()

In [None]:
# Smoke test: square the integers 0..9 through the client and gather the results.
dask_client.gather(
    dask_client.map(lambda value: value * value, range(10))
)

Cancel Pilots

In [None]:
# Cancel the parent pilot first and the extension second (unchanged order).
# try/finally ensures the second cancel() is still attempted when the first
# one raises, so an error cannot silently skip it; the exception still propagates.
try:
    dask_pilot1.cancel()
finally:
    dask_pilot2.cancel()