# Getting Started with Pilot-Streaming on AWS

As a first step, we import all required packages and modules and add the parent directory to the Python path.

The Pilot-Compute Description is a simple key/value style description of the cluster environment that should be started. 

In [None]:
# System libraries
import sys, os
# Make the parent directory importable (presumably the checkout that contains
# the `pilot` package -- confirm for your directory layout).
sys.path.append("..")
import pandas as pd

## Logging
import logging
# basicConfig attaches a default handler and sets the root level to DEBUG; the
# next line then raises the root level to ERROR, so only errors are emitted.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)
import json
import pyspark
import os
import boto3
# Every boto3 call made afterwards uses the credentials of the local "dev" profile.
boto3.setup_default_session(profile_name='dev')
    
# Pilot-Streaming
import pilot.streaming
# Last expression of the cell: the notebook displays the module object, which
# shows where pilot.streaming was loaded from.
sys.modules['pilot.streaming']

# 1. Kinesis

In [None]:
# Key/value description of the Kinesis pilot: target resource URL, number of
# cores and the pilot type.
pilot_compute_description = dict(
    resource="kinesis://awscloud.com",
    number_cores=1,
    type="kinesis",
)

Start the Kinesis pilot and wait for startup completion

In [None]:
%%time
# Create the pilot described by pilot_compute_description, then block in
# wait() until startup has completed.
kinesis_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description)
kinesis_pilot.wait()

In [None]:
# Inspect the details of the Kinesis pilot (displayed as the cell output).
kinesis_pilot.get_details()

In [None]:
# Pilot identifier; its second "/"-separated segment is used as the Kinesis
# stream name when a record is sent to the stream.
kinesis_pilot.get_id()

In [None]:
# Low-level Kinesis client; the region is hard-coded to us-east-1.
kinesis_client = boto3.client('kinesis', region_name='us-east-1')
# Send a single test record. The stream name is taken from the pilot id
# (second "/"-separated segment); the response is kept for inspection.
put_response = kinesis_client.put_record(
                        StreamName=kinesis_pilot.get_id().split("/")[1],
                        Data="Hello World",
                        PartitionKey="A")

# 2. Lambda

In [None]:
def lambda_handler(event, context):
    """Print the decoded payload of every Kinesis record in ``event``.

    The ``base64`` import is kept inside the function so the handler stays
    self-contained (presumably it is shipped to Lambda on its own -- confirm
    against the pilot implementation).

    Args:
        event: Mapping with a ``"Records"`` list; each record holds its
            base64-encoded data under ``record["kinesis"]["data"]``.
        context: Runtime context passed by the caller; not used.

    Raises:
        KeyError: If ``event`` or a record lacks one of the keys above.
    """
    import base64
    print("Lambda Function called")
    for record in event['Records']:
        # Kinesis data is base64 encoded so decode here
        raw = base64.b64decode(record["kinesis"]["data"])
        # b64decode returns bytes; decode to text so the log shows the message
        # itself rather than a bytes repr such as b'Hello World'. Undecodable
        # bytes are replaced instead of aborting the whole batch.
        payload = raw.decode("utf-8", errors="replace")
        print("Decoded payload: " + payload)

In [None]:
# Description of the Lambda pilot: the id of the Kinesis pilot is passed as
# the input data source and lambda_handler as the function to deploy.
pilot_compute_description = dict(
    resource="kinesis://awscloud.com",
    number_cores=1,
    lambda_input_data=kinesis_pilot.get_id(),
    lambda_function=lambda_handler,
    type="lambda",
)

In [None]:
%%time
# Create the Lambda pilot from pilot_compute_description, then block in
# wait() until startup has completed.
lambda_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description)
lambda_pilot.wait()

In [None]:
# Inspect the details of the Lambda pilot (displayed as the cell output).
lambda_pilot.get_details()

In [None]:
def lambda_handler(event, context):
    """Version 2 of the handler: print the decoded payload of every record.

    The ``base64`` import is kept inside the function so the handler stays
    self-contained (presumably it is shipped to Lambda on its own -- confirm
    against the pilot implementation).

    Args:
        event: Mapping with a ``"Records"`` list; each record holds its
            base64-encoded data under ``record["kinesis"]["data"]``.
        context: Runtime context passed by the caller; not used.

    Raises:
        KeyError: If ``event`` or a record lacks one of the keys above.
    """
    import base64
    print("Lambda Function called Version 2")
    for record in event['Records']:
        # Kinesis data is base64 encoded so decode here
        raw = base64.b64decode(record["kinesis"]["data"])
        # b64decode returns bytes; decode to text so the log shows the message
        # itself rather than a bytes repr such as b'Hello World'. Undecodable
        # bytes are replaced instead of aborting the whole batch.
        payload = raw.decode("utf-8", errors="replace")
        print("Decoded payload: " + payload)
        
# Hand the redefined handler to the running pilot.
lambda_pilot.submit(lambda_handler) #update function inside of Lambda

In [None]:
# Clean up: cancel both pilots once the example is finished.
kinesis_pilot.cancel()
lambda_pilot.cancel()

# 3. EC2

In [None]:
import boto3
import json

# Use the credentials of the local "dev" AWS profile for the boto3 calls below.
boto3.setup_default_session(profile_name='dev')
# NOTE: despite its name, ec2_client is a boto3 *resource*; the low-level
# client is available as ec2_client.meta.client.
ec2_client = boto3.resource('ec2', region_name='us-east-1')

# Expected layout of aws_description.json ("xxx" marks values to fill in):
#{
#    "resource":"ec2://awscloud.com",
#    "number_cores" : 1,
#    "ec2_ssh_username":"ubuntu",
#    "ec2_ssh_keyname":"xxx",
#    "ec2_ssh_keyfile":"",
#    "ec2_vpc_id" : "xxx",
#    "ec2_subnet_id": "xxx",
#    "ec2_image_id" : "xxx",
#    "ec2_security_group": "xxxx",
#    "ec2_instance_type" : "t2.nano",
#    "ec2_name": "xxx",
#    "type":"ec2"
#}

# Read the EC2 settings. The context manager closes the file as soon as it
# has been parsed instead of leaving the handle open in the kernel.
with open("aws_description.json", "r") as description_file:
    ec2_description = json.load(description_file)

# Number of instances to launch; passed as both MinCount and MaxCount.
number_vms = 1

ec2_instances = ec2_client.create_instances(
    ImageId=ec2_description["ec2_image_id"],
    InstanceType=ec2_description["ec2_instance_type"],
    KeyName=ec2_description["ec2_ssh_keyname"],
    TagSpecifications=[{
        'ResourceType': 'instance',
        'Tags': [{"Key": "Name", "Value": ec2_description["ec2_name"]}],
    }],
    # Subnet and security group are set on the network interface rather than
    # as top-level SubnetId/SecurityGroupIds: when a network interface is
    # specified, EC2 expects them as part of that interface.
    NetworkInterfaces=[{
        'AssociatePublicIpAddress': True,
        'DeviceIndex': 0,
        'SubnetId': ec2_description["ec2_subnet_id"],
        'Groups': [ec2_description["ec2_security_group"]],
    }],
    # 30 GiB gp2 volume on /dev/sda1 (presumably the image's root device --
    # confirm for the AMI in use).
    BlockDeviceMappings=[{
        'DeviceName': '/dev/sda1',
        'Ebs': {'VolumeSize': 30, 'VolumeType': 'gp2'},
    }],
    MinCount=number_vms,
    MaxCount=number_vms,
)

In [None]:
# Block until each launched instance is running, then report it.
for instance in ec2_instances:
    instance.wait_until_running()
    # Reload the attributes so the state and address printed below are current.
    instance.load()
    summary = "ID: {}, State: {}, IP: {} ".format(
        instance.id, instance.state['Name'], instance.private_ip_address)
    print(summary)

In [None]:
# Terminate every instance held in ec2_instances.
for instance in ec2_instances:
    instance.terminate()

In [None]:
# Print the state of every instance returned by the EC2 resource.
for instance in ec2_client.instances.all():
    print(instance.state)

In [None]:
# Boto 3: query the instance statuses through the low-level client behind
# the EC2 resource and print each entry.
status_response = ec2_client.meta.client.describe_instance_status()
for status in status_response['InstanceStatuses']:
    print(status)

# Pilot-API

In [None]:
# System libraries
import sys, os
# Make the parent directory importable (presumably the checkout that contains
# the `pilot` package -- confirm for your directory layout).
sys.path.append("..")
import pandas as pd

## Logging
import logging
# basicConfig attaches a default handler and sets the root level to DEBUG; the
# next line then raises the root level to ERROR, so only errors are emitted.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)
import json
import pyspark
import os
import boto3
# Every boto3 call made afterwards uses the credentials of the local "dev" profile.
boto3.setup_default_session(profile_name='dev')
    
# Pilot-Streaming
import pilot.streaming
# Last expression of the cell: the notebook displays the module object, which
# shows where pilot.streaming was loaded from.
sys.modules['pilot.streaming']

In [None]:
# Start from the EC2 settings stored in aws_description.json. The context
# manager closes the file as soon as it has been parsed instead of leaving
# the handle open in the kernel.
with open("aws_description.json", "r") as description_file:
    pilot_compute_description = json.load(description_file)

# Overlay the Dask-specific settings on top of the EC2 description.
pilot_compute_description["resource"] = "ec2+dask://awscloud.com"
pilot_compute_description["working_directory"] = os.path.join(os.getcwd(), "work")
pilot_compute_description["number_cores"] = 1
pilot_compute_description["cores_per_node"] = 1
pilot_compute_description["type"] = "dask"

# Create the pilot, then block in wait() until startup has completed.
dask_ec2_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_compute_description)
dask_ec2_pilot.wait()

In [None]:
# Context object of the pilot (presumably a dask.distributed Client -- confirm
# against the pilot implementation).
dask_client=dask_ec2_pilot.get_context()
# Smoke test: map a squaring function over 0..9 and gather the results, which
# are displayed as the cell output.
dask_client.gather(dask_client.map(lambda a: a*a, range(10)))

In [None]:
# Show the scheduler information reported by the client (cell output).
dask_client.scheduler_info()

In [None]:
# Clean up: cancel the Dask pilot once the example is finished.
dask_ec2_pilot.cancel()