# Manage EMR Instance and Jobs

__WHY?__ Use this notebook to set up an EMR cluster, manage executions, and terminate clusters programmatically, without going to the AWS UI

__IMPORTANT!__ Always terminate cluster when done (see end of page)

#### 1. Load AWS Configs

In [None]:
import configparser

# Read the AWS settings from the local dl.cfg file.
config = configparser.ConfigParser()
# Open through a context manager so the file handle is closed once parsed
# (a bare open() passed to read_file() is never explicitly closed).
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Credentials used by the boto3 clients and exported for the AWS CLI.
KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

# Base location that is later prefixed to 'etl.py' when submitting the Spark step.
OUTPUT_DATA = config.get('AWS', 'OUTPUT_DATA')

In [None]:
import os

# Expose the credentials and default region through the standard AWS
# environment variable names, in a single update of the process environment.
os.environ.update({
    "AWS_ACCESS_KEY_ID": KEY,
    "AWS_SECRET_ACCESS_KEY": SECRET,
    "AWS_DEFAULT_REGION": "us-west-2",
})

#### 2. Create EMR Cluster via CLI

In [None]:
# Launch a 5-instance m5.xlarge Spark cluster (release emr-6.2.0) named "udacity-project"
# using the default EMR roles and the "emr_udacity" EC2 key pair.
# --no-auto-terminate is set, so the cluster must be terminated explicitly when done.
!aws emr create-cluster --name udacity-project\
    --release-label emr-6.2.0 \
    --applications Name=Spark \
    --instance-count 5 \
    --use-default-roles \
    --no-auto-terminate \
    --instance-type m5.xlarge \
    --no-enable-debugging \
    --ec2-attributes KeyName="emr_udacity"

#### Create EMR Cluster using Boto3

For Boto 3 cluster setup [see link](https://stackoverflow.com/questions/26314316/how-to-launch-and-configure-an-emr-cluster-using-boto)

#### 2.1 List starting clusters

In [None]:
import boto3

# EMR client bound to us-west-2, authenticated with the keys loaded from dl.cfg.
emr = boto3.client(
    'emr',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [None]:
# Ask EMR only for clusters in the STARTING state, then display the raw response.
starting_states = ['STARTING']
starting_clusters = emr.list_clusters(ClusterStates=starting_states)
starting_clusters

#### 3. S3 client

In [None]:
import boto3

# S3 client bound to us-west-2, authenticated with the keys loaded from dl.cfg.
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

#### 3.1 Get EMR active S3 bucket

In [None]:
# Keep the names of all buckets whose name contains "udacity".
all_buckets = s3.list_buckets()['Buckets']
emr_bucket = [bucket['Name'] for bucket in all_buckets if 'udacity' in bucket['Name']]
emr_bucket

#### 3.2 Upload "etl.py" process file

In [None]:
# Upload the ETL script and its config to the first matching bucket,
# keeping the local file name as the object key.
# NOTE(review): dl.cfg is the file the AWS access keys are read from - confirm
# that storing it in this bucket is intended.
for filename in ('etl.py', 'dl.cfg'):
    s3.upload_file(filename, emr_bucket[0], filename)

In [None]:
# Verify that there is an 'etl.py' file in the S3 bucket:
# show every object whose key contains 'etl.py'.
bucket_objects = s3.list_objects(Bucket=emr_bucket[0])['Contents']
[obj for obj in bucket_objects if 'etl.py' in obj['Key']]

#### 3.3 Add Job Step to process ETL

In [None]:
# Collect the IDs of WAITING clusters, i.e. the candidates to add a step to.
# The state filter is applied by EMR itself (as in the other list_clusters calls
# in this notebook): an unfiltered call returns a single page covering clusters
# in every state, so a waiting cluster could be missing from that page.
waiting_clusters = [
    cluster['Id']
    for cluster in emr.list_clusters(ClusterStates=['WAITING'])['Clusters']
]
waiting_clusters

In [None]:
# Submit etl.py as a Spark step on the first waiting cluster.
# NOTE(review): the script path is built from OUTPUT_DATA, while etl.py is uploaded
# to emr_bucket[0] - presumably both point at the same bucket; confirm.
spark_submit_args = [
    'spark-submit',
    '--deploy-mode', 'cluster',
    '--master', 'yarn',
    OUTPUT_DATA + 'etl.py',
]

etl_step = {
    'Name': 'Spark from boto3',
    # On failure, cancel the remaining steps but keep the cluster waiting.
    'ActionOnFailure': 'CANCEL_AND_WAIT',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': spark_submit_args,
    },
}

response = emr.add_job_flow_steps(
    JobFlowId=waiting_clusters[0],
    Steps=[etl_step],
)
response

In [None]:
# Show the steps that are currently RUNNING on the first waiting cluster.
cluster_steps = emr.list_steps(ClusterId=waiting_clusters[0])['Steps']
[step for step in cluster_steps if step['Status']['State'] == 'RUNNING']

#### 4. Delete standby clusters

__IMPORTANT__: Always run the following steps to guarantee the cluster is terminated after testing.

In [None]:
# Collect every cluster that is still alive so it can be terminated.
# Besides RUNNING and WAITING, clusters that are still STARTING or BOOTSTRAPPING
# are included; otherwise a freshly created cluster would be skipped by this
# cleanup and come up afterwards.
active_states = ['STARTING', 'BOOTSTRAPPING', 'RUNNING', 'WAITING']
standby_clusters = emr.list_clusters(ClusterStates=active_states)

# IDs of the clusters to terminate (consumed by the next cell).
clusters = [cluster["Id"] for cluster in standby_clusters["Clusters"]]

print("Nr clusters: {}".format(len(clusters)))

In [None]:
# Terminate every collected cluster; just report when there is nothing to do.
if not clusters:
    print("No clusters found.")
else:
    emr.terminate_job_flows(JobFlowIds=clusters)
    print("Terminating Clusters")