# AWS Glue DataBrew

# Prerequisites

Add The Following Trust Relationship To Your IAM Role

```
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "databrew.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
```

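Also attach an access policy to the same role that grants the permissions AWS Glue DataBrew needs, including access to the S3 input and output locations used in this notebook.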

# Imports and Settings

In [1]:
import sagemaker
import boto3

# SageMaker session: source of the default S3 bucket and the execution role
# that is later passed to DataBrew as RoleArn.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# One boto3 session shared by the region lookup and both service clients.
boto_session = boto3.Session()
region = boto_session.region_name

sts = boto_session.client(service_name='sts')
db = boto_session.client('databrew')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Look up the AWS account the notebook's credentials belong to.
caller_identity = sts.get_caller_identity()
account_id = caller_identity.get('Account')

In [3]:
import json
import pandas as pd
from botocore.exceptions import ClientError

# Create Dataset

In [4]:
import time
# Epoch seconds; appended to the dataset, recipe, project, and job names below
# so each run of the notebook creates uniquely named resources.
timestamp = int(time.time())

In [5]:
# Name of the DataBrew dataset created in the next cell.
dataset_name = 'reviews-dataset-{}'.format(timestamp)

# Use the session's default bucket rather than a hard-coded, account-specific
# bucket name so the notebook also runs in other accounts and regions.
# NOTE(review): assumes the reviews TSV files have already been copied under
# `key` in this bucket -- confirm before running.
input_bucket = bucket
key = 'amazon-reviews-pds/tsv/'

In [6]:
# Point the DataBrew dataset at the S3 prefix that holds the input files.
s3_input = {
    'S3InputDefinition': {
        'Bucket': input_bucket,
        'Key': key,
    },
}

response = db.create_dataset(Name=dataset_name, Input=s3_input)
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "reviews-dataset-1721203444",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "access-control-allow-headers": "*,Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-Content-Sha256,X-Amz-User-Agent,Date,X-Amz-Target,X-Amzn-Platform-Id,X-Amzn-Trace-Id",
            "access-control-allow-methods": "GET,PUT,POST,DELETE,OPTIONS",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "X-Amzn-RequestId,X-Amzn-ErrorType,X-Amz-Apigw-Id,X-Amzn-Trace-Id",
            "access-control-max-age": "86400",
            "connection": "keep-alive",
            "content-length": "37",
            "content-type": "application/json",
            "date": "Wed, 17 Jul 2024 08:04:04 GMT",
            "via": "1.1 4a91a321d4c2ab7334c6f285093956ae.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "bDAmMFh3oAMEfyA=",
            "x-amz-cf-id": "nwx0BrrAwnKDpQG-phZKG13cnWraTmNMOuh_gQsvvJTfXubs3UEmTQ==",
            

In [7]:
from IPython.display import display, HTML

# Link to the dataset preview in the DataBrew console.
# target="_blank" (with the underscore) opens a new tab; the previous value
# "blank" named a single shared window that every link in the notebook reused.
dataset_url = 'https://console.aws.amazon.com/databrew/home?region={}#dataset-details?dataset={}&tab=preview'.format(region, dataset_name)
display(HTML('<b>Review <a target="_blank" href="{}">Dataset</a></b>'.format(dataset_url)))


In [8]:
# Fetch the full dataset description; its ResourceArn is read in the next section.
response = db.describe_dataset(Name=dataset_name)
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "CreateDate": "2024-07-17 08:04:04.213000+00:00",
    "CreatedBy": "arn:aws:sts::992382405090:assumed-role/AmazonSageMaker-ExecutionRole-20240708T091019/SageMaker",
    "Input": {
        "S3InputDefinition": {
            "Bucket": "sagemaker-us-east-1-992382405090",
            "Key": "amazon-reviews-pds/tsv/"
        }
    },
    "Name": "reviews-dataset-1721203444",
    "ResourceArn": "arn:aws:databrew:us-east-1:992382405090:dataset/reviews-dataset-1721203444",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "access-control-allow-headers": "*,Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-Content-Sha256,X-Amz-User-Agent,Date,X-Amz-Target,X-Amzn-Platform-Id,X-Amzn-Trace-Id",
            "access-control-allow-methods": "GET,PUT,POST,DELETE,OPTIONS",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "X-Amzn-RequestId,X-Amzn-ErrorType,X-Amz-Apigw-Id,X-Amzn-Trace-Id",
            "access-control-

## Get Dataset Resource ARN

In [9]:
# `response` is the describe_dataset result from the previous cell.
dataset_arn = response['ResourceArn']
print(dataset_name, dataset_arn, sep='\n')

reviews-dataset-1721203444
arn:aws:databrew:us-east-1:992382405090:dataset/reviews-dataset-1721203444


# Create Recipe

In [10]:
# Recipe name shares the timestamp suffix used for the dataset.
recipe_name = f'reviews-dataset-recipe-{timestamp}'

# View Recipe File

In [11]:
# Print the recipe JSON with syntax highlighting (requires the `pygmentize` CLI on PATH).
!pygmentize ./amazon-reviews-dataset-recipe.json

[[37m[39;49;00m
[37m  [39;49;00m{[37m[39;49;00m
[37m    [39;49;00m[94m"Action"[39;49;00m:[37m [39;49;00m{[37m[39;49;00m
[37m      [39;49;00m[94m"Operation"[39;49;00m:[37m [39;49;00m[33m"DELETE"[39;49;00m,[37m[39;49;00m
[37m      [39;49;00m[94m"Parameters"[39;49;00m:[37m [39;49;00m{[37m[39;49;00m
[37m        [39;49;00m[94m"sourceColumns"[39;49;00m:[37m [39;49;00m[33m"[\"marketplace\",\"customer_id\",\"review_id\",\"product_id\",\"product_parent\",\"product_title\",\"helpful_votes\",\"total_votes\",\"vine\",\"verified_purchase\",\"review_headline\",\"review_date\",\"year\"]"[39;49;00m[37m[39;49;00m
[37m      [39;49;00m}[37m[39;49;00m
[37m    [39;49;00m}[37m[39;49;00m
[37m  [39;49;00m}[37m[39;49;00m
][37m[39;49;00m


# Load Recipe File

In [12]:
# Read file
# Parse the recipe steps straight from the JSON file.
with open('./amazon-reviews-dataset-recipe.json', 'r') as recipe_file:
    recipe_steps = json.load(recipe_file)

# Echo the parsed steps so they can be checked before the recipe is created.
print(json.dumps(recipe_steps, indent=4, sort_keys=True, default=str))

[
    {
        "Action": {
            "Operation": "DELETE",
            "Parameters": {
                "sourceColumns": "[\"marketplace\",\"customer_id\",\"review_id\",\"product_id\",\"product_parent\",\"product_title\",\"helpful_votes\",\"total_votes\",\"vine\",\"verified_purchase\",\"review_headline\",\"review_date\",\"year\"]"
            }
        }
    }
]


# Create Recipe From File

In [13]:
# Register the steps loaded from the recipe file as a DataBrew recipe.
response = db.create_recipe(
    Name=recipe_name,
    Description='Amazon Customers Reviews Recipe',
    Steps=recipe_steps,
)

In [14]:
# Show the create_recipe response from the previous cell.
recipe_response_json = json.dumps(response, indent=4, sort_keys=True, default=str)
print(recipe_response_json)

{
    "Name": "reviews-dataset-recipe-1721203444",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "access-control-allow-headers": "*,Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-Content-Sha256,X-Amz-User-Agent,Date,X-Amz-Target,X-Amzn-Platform-Id,X-Amzn-Trace-Id",
            "access-control-allow-methods": "GET,PUT,POST,DELETE,OPTIONS",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "X-Amzn-RequestId,X-Amzn-ErrorType,X-Amz-Apigw-Id,X-Amzn-Trace-Id",
            "access-control-max-age": "86400",
            "connection": "keep-alive",
            "content-length": "44",
            "content-type": "application/json",
            "date": "Wed, 17 Jul 2024 08:04:05 GMT",
            "via": "1.1 4a91a321d4c2ab7334c6f285093956ae.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "bDAmYFOvIAMEYVw=",
            "x-amz-cf-id": "v3GCFmIuUIeQs8oVYglL9P8nliDaf1ra_fGu4-y0dO_hKGgAIaclVg==",
     

## Create Project

In [15]:
# Project name shares the timestamp suffix used for the dataset and recipe.
project_name = f'reviews-dataset-project-{timestamp}'

In [16]:
# The project previews the recipe against the first 500 rows of the dataset.
sample_config = {'Size': 500, 'Type': 'FIRST_N'}

response = db.create_project(
    Name=project_name,
    DatasetName=dataset_name,
    RecipeName=recipe_name,
    Sample=sample_config,
    RoleArn=role,
)

In [17]:
# Show the create_project response from the previous cell.
project_response_json = json.dumps(response, indent=4, sort_keys=True, default=str)
print(project_response_json)

{
    "Name": "reviews-dataset-project-1721203444",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "access-control-allow-headers": "*,Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-Content-Sha256,X-Amz-User-Agent,Date,X-Amz-Target,X-Amzn-Platform-Id,X-Amzn-Trace-Id",
            "access-control-allow-methods": "GET,PUT,POST,DELETE,OPTIONS",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "X-Amzn-RequestId,X-Amzn-ErrorType,X-Amz-Apigw-Id,X-Amzn-Trace-Id",
            "access-control-max-age": "86400",
            "connection": "keep-alive",
            "content-length": "45",
            "content-type": "application/json",
            "date": "Wed, 17 Jul 2024 08:04:05 GMT",
            "via": "1.1 4a91a321d4c2ab7334c6f285093956ae.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "bDAmZGA_IAMEPVQ=",
            "x-amz-cf-id": "fs8i9_6cc1JlNpuoBP7iu6UJNMMhttEzvJPEwxD6G0L15a-lCm0tYA==",
    

In [18]:
from IPython.display import display, HTML

# Link to the project workspace in the DataBrew console.
# target="_blank" (with the underscore) opens a new tab; the previous value
# "blank" named a single shared window that every link in the notebook reused.
project_url = 'https://console.aws.amazon.com/databrew/home?region={}#project-workspace?project={}&view=grid'.format(region, project_name)
display(HTML('<b>Review <a target="_blank" href="{}">Project</a></b>'.format(project_url)))

# Create Recipe Job

In [19]:
# Job name shares the timestamp suffix; results are written to the session's
# default bucket under the databrew/ prefix.
job_name = f'reviews-dataset-recipe-job-{timestamp}'
output_bucket = bucket
output_key = 'databrew/'

# TODO: Add the DataBrew Trust Relationship to the IAM Role (see Prerequisites)

In [20]:
# Single CSV output written to s3://<output_bucket>/<output_key>, replacing any
# previous output at that location.
job_outputs = [
    {
        'Format': 'CSV',
        'PartitionColumns': [],
        'Location': {
            'Bucket': output_bucket,
            'Key': output_key,
        },
        'Overwrite': True,
    },
]

# The job is bound to the project via ProjectName, so DatasetName and
# RecipeReference are intentionally not passed here.
response = db.create_recipe_job(
    Name=job_name,
    ProjectName=project_name,
    RoleArn=role,
    Outputs=job_outputs,
    LogSubscription='ENABLE',
    MaxCapacity=10,
    MaxRetries=0,
    Timeout=2880,
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "reviews-dataset-recipe-job-1721203444",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "access-control-allow-headers": "*,Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-Content-Sha256,X-Amz-User-Agent,Date,X-Amz-Target,X-Amzn-Platform-Id,X-Amzn-Trace-Id",
            "access-control-allow-methods": "GET,PUT,POST,DELETE,OPTIONS",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "X-Amzn-RequestId,X-Amzn-ErrorType,X-Amz-Apigw-Id,X-Amzn-Trace-Id",
            "access-control-max-age": "86400",
            "connection": "keep-alive",
            "content-length": "48",
            "content-type": "application/json",
            "date": "Wed, 17 Jul 2024 08:04:06 GMT",
            "via": "1.1 4a91a321d4c2ab7334c6f285093956ae.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "bDAmcHSfIAMEBUQ=",
            "x-amz-cf-id": "qyx__VS1JEVYEMK5P9rno2XU8T970PLcku-sQrPsCS0K1kcZ9SyHIw==",
 

In [21]:
from IPython.display import display, HTML

# Link to the recipe job's run history in the DataBrew console.
# target="_blank" (with the underscore) opens a new tab; the previous value
# "blank" named a single shared window that every link in the notebook reused.
job_url = 'https://console.aws.amazon.com/databrew/home?region={}#job-details?job={}&tab=history'.format(region, job_name)
display(HTML('<b>Review <a target="_blank" href="{}">Recipe Job</a></b>'.format(job_url)))


# Start Job Run

In [22]:
# Kick off a run of the recipe job; the response carries the RunId read below.
response = db.start_job_run(Name=job_name)
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "ResponseMetadata": {
        "HTTPHeaders": {
            "access-control-allow-headers": "*,Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-Content-Sha256,X-Amz-User-Agent,Date,X-Amz-Target,X-Amzn-Platform-Id,X-Amzn-Trace-Id",
            "access-control-allow-methods": "GET,PUT,POST,DELETE,OPTIONS",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "X-Amzn-RequestId,X-Amzn-ErrorType,X-Amz-Apigw-Id,X-Amzn-Trace-Id",
            "access-control-max-age": "86400",
            "connection": "keep-alive",
            "content-length": "79",
            "content-type": "application/json",
            "date": "Wed, 17 Jul 2024 08:04:07 GMT",
            "via": "1.1 4a91a321d4c2ab7334c6f285093956ae.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "bDAmiEIWoAMEgeg=",
            "x-amz-cf-id": "9m3tPufp1N67YJT2Qcg61Kj4GITOZK7fySL4B7Eh9ZZImg3jABUx1A==",
            "x-amz-cf-pop": "IAD12-P1",
            "x

# Get Job Run ID

In [23]:
# `response` is the start_job_run result from the previous cell.
job_run_id = response['RunId']
print(job_run_id)

db_0c415b98d52a5833dcf3b0f8b3c73ea21bfb70d71bf9b57473e3886a4fb8a06d


# List Job Run

In [24]:
# List the runs recorded for this job.
response = db.list_job_runs(Name=job_name)
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "JobRuns": [
        {
            "Attempt": 0,
            "DatasetName": "reviews-dataset-1721203444",
            "ExecutionTime": 0,
            "JobName": "reviews-dataset-recipe-job-1721203444",
            "LogGroupName": "/aws-glue-databrew/jobs-reviews-dataset-recipe-job-1721203444",
            "LogSubscription": "ENABLE",
            "Outputs": [
                {
                    "Format": "CSV",
                    "FormatOptions": {
                        "Csv": {
                            "Delimiter": ","
                        }
                    },
                    "Location": {
                        "Bucket": "sagemaker-us-east-1-992382405090",
                        "BucketOwner": "992382405090",
                        "Key": "databrew/"
                    },
                    "Overwrite": true,
                    "PartitionColumns": []
                }
            ],
            "RecipeReference": {
                "Name": "reviews-datase

In [25]:
# State of the first run in the list_job_runs response from the previous cell.
first_run = response['JobRuns'][0]
status = first_run['State']
print(status)

RUNNING


# _Wait For The Job Run To Complete. The Job Runs For About 30min._

In [26]:
%%time

import time

response = db.list_job_runs(Name=job_name)

while response['JobRuns'][0]['State'] == 'RUNNING':
    response = db.list_job_runs(Name=job_name)
    status = response['JobRuns'][0]['State']
    print('Job Run State: {}'.format(status))
    time.sleep(15)

Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: FAILED
CPU times: user 60.9 ms, sys: 6.93 ms, total: 67.8 ms
Wall time: 3min 16s


# Review S3 Bucket With CSV File

In [27]:
from IPython.display import display, HTML

# Link to the job's output prefix in the S3 console.
# target="_blank" (with the underscore) opens a new tab; the previous value
# "blank" named a single shared window that every link in the notebook reused.
s3_console_url = 'https://console.aws.amazon.com/s3/buckets/{}?region={}&prefix={}'.format(output_bucket, region, output_key)
display(HTML('<b>Review <a target="_blank" href="{}">S3 Bucket</a></b>'.format(s3_console_url)))

# Show the CSV Files

In [28]:
# File name of the first output part, built from the job name.
part_file = f'{job_name}_part00000.csv'
print(part_file)

reviews-dataset-recipe-job-1721203444_part00000.csv


In [29]:
# "<bucket>/<prefix>" string used to build the s3:// URI in the next cell.
s3_output_bucket = f'{output_bucket}/{output_key}'
print(s3_output_bucket)

sagemaker-us-east-1-992382405090/databrew/


In [None]:
!aws s3 cp s3://$s3_output_bucket$part_file ./

In [None]:
import csv

# Read the part file downloaded by the previous cell. The file name is built
# from the timestamped job name, so use `part_file` rather than a hard-coded
# name that does not match what was downloaded.
df_reviews = pd.read_csv('./{}'.format(part_file))

In [None]:
# Preview the first rows of the loaded job output.
df_reviews.head()

In [None]:
%%javascript

// Best-effort cleanup at the end of the notebook: ask the front end to save a
// checkpoint and then delete the notebook's session.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp: any error is deliberately ignored.
    // NOTE(review): presumably this covers front ends where the `Jupyter`
    // global is not defined -- confirm.
}