## Create an IAM Role

In [1]:
%%writefile role-trust.json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "lambda.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

Writing role-trust.json


In [2]:
# Create the IAM role using the trust policy stored in role-trust.json
# (that document allows the lambda.amazonaws.com service to assume the role).
role_name = "wysde2-lambda-etl-role"
!aws iam create-role --role-name {role_name} --assume-role-policy-document file://role-trust.json


{
    "Role": {
        "Path": "/",
        "RoleName": "wysde2-lambda-etl-role",
        "RoleId": "AROAZ6TLRIUJRZE4F354T",
        "Arn": "arn:aws:iam::684199068947:role/wysde2-lambda-etl-role",
        "CreateDate": "2022-10-13T04:49:37+00:00",
        "AssumeRolePolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {
                        "Service": "lambda.amazonaws.com"
                    },
                    "Action": "sts:AssumeRole"
                }
            ]
        }
    }
}


## Create a Policy

In [3]:
%%writefile lamdba-etl-policy.json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "logs:PutLogEvents",
                "logs:CreateLogGroup",
                "logs:CreateLogStream"
            ],
            "Resource": "arn:aws:logs:*:*:*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "s3:*"
            ],
            "Resource": [
                "arn:aws:s3:::wysde2-test/*",
                "arn:aws:s3:::wysde2-test"
            ]
        },
        {
            "Effect": "Allow",
            "Action": [
                "glue:*"
            ],
            "Resource": "*"
        }
    ]
}

Writing lamdba-etl-policy.json


In [4]:
# Register the permissions document as a customer-managed IAM policy.
# NOTE: the "lamdba" spelling is intentional here -- it must match the file
# name used by the %%writefile cell that produced the JSON document.
policy_name = "wysde2-lambda-etl-policy"

!aws iam create-policy --policy-name {policy_name} --policy-document file://lamdba-etl-policy.json

{
    "Policy": {
        "PolicyName": "wysde2-lambda-etl-policy",
        "PolicyId": "ANPAZ6TLRIUJUEIGJAQ4L",
        "Arn": "arn:aws:iam::684199068947:policy/wysde2-lambda-etl-policy",
        "Path": "/",
        "DefaultVersionId": "v1",
        "AttachmentCount": 0,
        "PermissionsBoundaryUsageCount": 0,
        "IsAttachable": true,
        "CreateDate": "2022-10-13T04:58:57+00:00",
        "UpdateDate": "2022-10-13T04:58:57+00:00"
    }
}


## Attach the policy to the role

In [5]:
!aws iam attach-role-policy --policy-arn arn:aws:iam::684199068947:policy/wysde2-lambda-etl-policy --role-name wysde2-lambda-etl-role

## Set up the Lambda Layer

In [None]:
# Download the prebuilt AWS Data Wrangler 2.10.0 layer zip (Python 3.8 build)
# from the project's GitHub releases; the next cell publishes this file.
!wget -q --show-progress https://github.com/awslabs/aws-data-wrangler/releases/download/2.10.0/awswrangler-layer-2.10.0-py3.8.zip

In [None]:
# Publish the downloaded zip as a new Lambda layer version, declared
# compatible with the python3.8 runtime. `fileb://` makes the CLI read the
# zip as raw binary.
layer_name = "wysde2_lambda_etl_aws_data_wrangler_python38"

!aws lambda publish-layer-version --layer-name {layer_name} \
--description "this enables the usage of aws data wrangler library in lambda" \
--zip-file fileb://awswrangler-layer-2.10.0-py3.8.zip \
--compatible-runtimes python3.8 \
--cli-connect-timeout 6000

## Create Python Code

In [None]:
import boto3
import awswrangler as wr
from urllib.parse import unquote_plus

def _parse_db_and_table(key):
    """Derive the Glue database and table names from an S3 object key.

    The key is expected to look like ``<db>/<stage>/<table>/<file>``: the
    database is the fourth-from-last path element and the table is the
    second-from-last. For ``'testdb/raw/testtable/test.csv'`` this returns
    ``('testdb', 'testtable')``.

    Raises:
        IndexError: if the key has fewer than four path elements.
    """
    key_list = key.split("/")
    print(f'key_list: {key_list}')
    if len(key_list) < 4:
        # Same exception type the bare indexing would raise, but with context.
        raise IndexError(
            f"S3 key '{key}' has {len(key_list)} path element(s); expected at "
            "least four: '<db>/<stage>/<table>/<file>'"
        )
    return key_list[-4], key_list[-2]


def _ensure_database(db_name):
    """Create the Glue catalog database ``db_name`` if it is not listed yet."""
    current_databases = wr.catalog.databases()
    if db_name not in current_databases.values:
        print(f'- Database {db_name} does not exist ... creating')
        wr.catalog.create_database(db_name)
    else:
        print(f'- Database {db_name} already exists')


def _convert_object(bucket, key):
    """Read one CSV object and append it, as Parquet, to its catalog table.

    Returns whatever ``wr.s3.to_parquet`` returns for the write.
    """
    # Validate the key first so a malformed key fails before any AWS call.
    db_name, table_name = _parse_db_and_table(key)

    print(f'Bucket: {bucket}')
    print(f'Key: {key}')
    print(f'DB Name: {db_name}')
    print(f'Table Name: {table_name}')

    input_path = f"s3://{bucket}/{key}"
    print(f'Input_Path: {input_path}')
    # The destination bucket is fixed; only the db/table parts vary.
    output_path = f"s3://wysde2-test/{db_name}/cleaned/{table_name}"
    print(f'Output_Path: {output_path}')

    input_df = wr.s3.read_csv([input_path])

    _ensure_database(db_name)

    result = wr.s3.to_parquet(
        df=input_df,
        path=output_path,
        dataset=True,
        database=db_name,
        table=table_name,
        mode="append")

    print("RESULT: ")
    print(f'{result}')

    return result


def lambda_handler(event, context):
    """Convert every CSV object named in an S3 event to Parquet.

    Each entry of ``event['Records']`` is processed in turn (previously only
    the last record was converted, and an empty record list crashed).

    Args:
        event: S3 event notification payload; must contain a ``'Records'``
            list whose items carry ``['s3']['bucket']['name']`` and
            ``['s3']['object']['key']``.
        context: Lambda context object (unused).

    Returns:
        A dict with the same keys as a single ``wr.s3.to_parquet`` result,
        ``'paths'`` and ``'partitions_values'``, merged across all records.
        For a single-record event this equals that record's write result;
        for an empty event both entries are empty.

    Raises:
        IndexError: if an object key has fewer than four path elements.
    """
    combined = {'paths': [], 'partitions_values': {}}

    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        # Keys arrive URL-encoded in the event (e.g. spaces as '+').
        key = unquote_plus(record['s3']['object']['key'])

        result = _convert_object(bucket, key)
        combined['paths'].extend(result.get('paths', []))
        combined['partitions_values'].update(result.get('partitions_values', {}))

    return combined

## Upload the CSV

In [5]:
%%writefile test.csv
Name,favorite_num
Vrinda,22
Tracy,28
Gareth,23
Chris,16
Emma,14
Carlos,7
Cooper,11
Praful,4
David,33
Shilpa,2
Gary,18
Sean,20
Ha-yoon,9
Elizabeth,8
Mary,1
Chen,15
Janet,22
Mariusz,25
Romain,11
Matt,25
Brendan,19
Roger,2
Jack,7
Sachin,17
Francisco,5

Writing test.csv


In [8]:
# Upload under <db>/<stage>/<table>/<file>: lambda_handler takes the database
# name from the fourth-from-last key element ("testdb") and the table name
# from the second-from-last ("testtable").
# NOTE(review): presumably an S3 event notification configured outside this
# notebook invokes the Lambda on this upload -- TODO confirm.
!aws s3 cp test.csv s3://wysde2-test/testdb/raw/testtable/test.csv

upload: ./test.csv to s3://wysde2-test/testdb/raw/testtable/test.csv


## Download the parquet

In [9]:
# Fetch the Parquet object written under testdb/cleaned/testtable/.
# NOTE(review): the object name looks generated per write, so it is probably
# different on every run -- list the prefix with
# `aws s3 ls s3://wysde2-test/testdb/cleaned/testtable/` to find the current one.
!aws s3 cp s3://wysde2-test/testdb/cleaned/testtable/d1d92b3feec54417975b2af70faa59fc.snappy.parquet test.snappy.parquet

download: s3://wysde2-test/testdb/cleaned/testtable/d1d92b3feec54417975b2af70faa59fc.snappy.parquet to ./test.snappy.parquet


In [10]:
import pandas as pd

# Load the downloaded Parquet file and show it as the cell's rich output.
parquet_df = pd.read_parquet("test.snappy.parquet")
parquet_df

Unnamed: 0,name,favorite_num
0,Vrinda,22
1,Tracy,28
2,Gareth,23
3,Chris,16
4,Emma,14
5,Carlos,7
6,Cooper,11
7,Praful,4
8,David,33
9,Shilpa,2
