In [1]:
import json
import os
import time
import uuid

import boto3
import botocore
import pandas as pd

# Service handles used by the cells below.
glue = boto3.client('glue')
lf = boto3.client('lakeformation')
cfn = boto3.client('cloudformation')

# S3 is exposed both as a resource and as a low-level client.
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# An explicit session; its configured region is reused later in the notebook.
session = boto3.session.Session()
region = session.region_name

# Account id of the calling identity, looked up through STS.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')

#### Get the Outputs from the CloudFormation template

In [2]:
# Name of the CloudFormation stack whose outputs configure the rest of the notebook.
STACK_NAME = 'GlueStudioDemoStack'

response = cfn.describe_stacks(StackName=STACK_NAME)

# Fall back to an empty list if the stack description carries no 'Outputs' entry,
# so the check below reports the problem instead of an opaque KeyError here.
outputs = response['Stacks'][0].get('Outputs', [])

# Index the outputs by key; if a key repeats, the last value wins (as before).
output_values = {output['OutputKey']: output['OutputValue'] for output in outputs}

# Fail fast, naming every missing output, rather than leaving a variable
# undefined and hitting a NameError in a later, unrelated cell.
required_keys = ('DataLakeBucketName', 'TaxiDatabase', 'DataLakeRoleArn', 'TaxiDataCrawler')
missing_keys = [key for key in required_keys if key not in output_values]
if missing_keys:
    raise KeyError(
        "Stack '{}' is missing expected output(s): {}".format(STACK_NAME, ', '.join(missing_keys))
    )

bucket = output_values['DataLakeBucketName']
database_name = output_values['TaxiDatabase']
role_arn = output_values['DataLakeRoleArn']
data_crawler = output_values['TaxiDataCrawler']

# Show full-width values (ARNs and bucket names are long) in the rendered table.
pd.set_option('display.max_colwidth', None)
pd.DataFrame(outputs, columns=["OutputKey", "OutputValue"])

Unnamed: 0,OutputKey,OutputValue
0,SageMakerNotebook,arn:aws:sagemaker:us-east-1:649037252677:notebook-instance/gluestudioprepnotebook
1,DataLakeBucketName,gluestudiodemostack-datalakebucket0256ea8e-gft03a97jrzl
2,TaxiDataCrawler,GlueStudioTaxiDemoCrawler
3,TaxiDatabase,taxi_demo
4,DataLakeRoleArn,arn:aws:iam::649037252677:role/GlueStudioDataLakeServiceLinkedRole


### [Upload to S3](https://docs.aws.amazon.com/AmazonS3/latest/dev/Welcome.html)

Next, we will upload the CSV files located in the `data` folder to S3 to be used later in the workshop. We are using a sample file from the New York City Taxi and Limousine Commission (TLC) Trip Record Data dataset, available on the [AWS Open Data Registry](https://registry.opendata.aws/nyc-tlc-trip-records-pds/).

[s3.upload_file](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_file) boto3 documentation

In [3]:
# Local folder that holds the sample CSV files.
path = 'data'

# Maps each prefix under 'datalake/' to the file uploaded beneath it.
uploads = {
    'yellow': 'yellow_tripdata_2020-06.csv',
    'paymenttype': 'paymenttype.csv',
    'ratecode': 'ratecode.csv',
    'taxi_zone_lookup': 'taxi_zone_lookup.csv',
}

data_lake_bucket = session.resource('s3').Bucket(bucket)

for table_prefix, file_name in uploads.items():
    # S3 keys always use '/' as the separator; os.path.join would insert
    # backslashes on Windows, so the key is joined explicitly.
    object_key = '/'.join(('datalake', table_prefix, file_name))
    data_lake_bucket.Object(object_key).upload_file(os.path.join(path, file_name))

#### Load Taxi Demo database with S3 data from Glue Crawler

In [4]:
# Start the crawler named in the stack outputs and display the API response.
start_crawler_response = glue.start_crawler(Name=data_crawler)
start_crawler_response

{'ResponseMetadata': {'RequestId': '89a0f512-5ff2-4971-a885-a5b50b395af9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 08 Oct 2020 00:50:14 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '89a0f512-5ff2-4971-a885-a5b50b395af9'},
  'RetryAttempts': 0}}

In [11]:
# Poll the crawler until it reports the READY state again.
# - Compare with != : the original `not in ('READY')` was a substring test
#   against the string 'READY', not a tuple membership test.
# - Sleep *before* re-polling, so the cell returns as soon as READY is seen
#   instead of waiting another 30 seconds afterwards.
crawler_status = glue.get_crawler(Name=data_crawler)['Crawler']['State']

while crawler_status != 'READY':
    print(crawler_status)
    time.sleep(30)
    crawler_status = glue.get_crawler(Name=data_crawler)['Crawler']['State']

print('Crawler Complete')

Crawler Complete


In [25]:
# Flatten the table definitions returned for the database into a DataFrame
# and display only the identifying and storage columns.
pd.set_option('display.max_colwidth', None)

table_list = glue.get_tables(DatabaseName=database_name)['TableList']
df = pd.json_normalize(table_list)

summary_columns = ["Name", "DatabaseName", "StorageDescriptor.Columns", "StorageDescriptor.Location"]
pd.DataFrame(df, columns=summary_columns)

Unnamed: 0,Name,DatabaseName,StorageDescriptor.Columns,StorageDescriptor.Location
0,paymenttype,taxi_demo,"[{'Name': 'id', 'Type': 'bigint'}, {'Name': 'name', 'Type': 'string'}]",s3://gluestudiodemostack-datalakebucket0256ea8e-gft03a97jrzl/datalake/paymenttype/
1,ratecode,taxi_demo,"[{'Name': 'id', 'Type': 'bigint'}, {'Name': 'name', 'Type': 'string'}]",s3://gluestudiodemostack-datalakebucket0256ea8e-gft03a97jrzl/datalake/ratecode/
2,taxi_zone_lookup,taxi_demo,"[{'Name': 'locationid', 'Type': 'bigint'}, {'Name': 'borough', 'Type': 'string'}, {'Name': 'zone', 'Type': 'string'}, {'Name': 'service_zone', 'Type': 'string'}]",s3://gluestudiodemostack-datalakebucket0256ea8e-gft03a97jrzl/datalake/taxi_zone_lookup/
3,yellow,taxi_demo,"[{'Name': 'vendorid', 'Type': 'bigint'}, {'Name': 'tpep_pickup_datetime', 'Type': 'string'}, {'Name': 'tpep_dropoff_datetime', 'Type': 'string'}, {'Name': 'passenger_count', 'Type': 'bigint'}, {'Name': 'trip_distance', 'Type': 'double'}, {'Name': 'ratecodeid', 'Type': 'bigint'}, {'Name': 'store_and_fwd_flag', 'Type': 'string'}, {'Name': 'pulocationid', 'Type': 'bigint'}, {'Name': 'dolocationid', 'Type': 'bigint'}, {'Name': 'payment_type', 'Type': 'bigint'}, {'Name': 'fare_amount', 'Type': 'double'}, {'Name': 'extra', 'Type': 'double'}, {'Name': 'mta_tax', 'Type': 'double'}, {'Name': 'tip_amount', 'Type': 'double'}, {'Name': 'tolls_amount', 'Type': 'double'}, {'Name': 'improvement_surcharge', 'Type': 'double'}, {'Name': 'total_amount', 'Type': 'double'}, {'Name': 'congestion_surcharge', 'Type': 'double'}]",s3://gluestudiodemostack-datalakebucket0256ea8e-gft03a97jrzl/datalake/yellow/


#### Let's start with Glue Studio



In [42]:
# One-row frame holding the Glue Studio console URL for the session's region.
console_url = "https://console.aws.amazon.com/gluestudio/home?region={0}#/".format(region)
df = pd.DataFrame({'Link': [console_url]})
def make_clickable(val):
    """Return an HTML anchor that opens `val` in a new tab, with `val` as its text."""
    return f'<a href="{val}" target="_blank">{val}</a>'

# Render the link as clickable HTML without the index column.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0, so
# prefer Styler.hide(axis="index") and fall back only on older pandas.
link_styler = df.style
if hasattr(link_styler, "hide"):
    link_styler = link_styler.hide(axis="index")
else:
    link_styler = link_styler.hide_index()
link_styler.format(make_clickable)

Link
https://console.aws.amazon.com/gluestudio/home?region=us-east-1#/
