In [None]:
{
  "transaction_id": "abcd1234",
  "timestamp": "2024-08-20T12:34:56Z",
  "customer_id": "cust5678",
  "items": [
    {"item_id": "item1", "quantity": 2, "price": 19.99},
    {"item_id": "item2", "quantity": 1, "price": 9.99}
  ],
  "total_amount": 49.97
}

In [None]:
Configure the Kinesis Agent

Create or edit the configuration file for the Kinesis Agent (by default, /etc/aws-kinesis/agent.json):

In [None]:
{
  "flows": [
    {
      "deliveryStream": "your-firehose-stream-name",
      "filePattern": "/path/to/your/logs/*.log"
    }
  ]
}

In [None]:
# Start the aws-kinesis-agent service (run with sudo because managing services needs root).
sudo service aws-kinesis-agent start

In [None]:
Here’s the code to process and decrypt data, and then save it to S3.

Lambda Function Code (lambda_function.py):

In [None]:
import base64
import json
import os

import boto3
from cryptography.fernet import Fernet

# Module-level setup: runs once when this module is imported.
# The Fernet key is read from the FERNET_KEY environment variable; a missing
# variable raises KeyError at import time.
key = os.environ["FERNET_KEY"]
cipher_suite = Fernet(key)

# S3 client used by the handler to write the decrypted records.
s3_client = boto3.client("s3")

def lambda_handler(event, context):
    """Decrypt every Kinesis record in ``event`` and store it in S3 as JSON.

    For each entry in ``event['Records']`` the handler base64-decodes
    ``record['kinesis']['data']``, decrypts the result with the module-level
    ``cipher_suite``, parses the plaintext as JSON, and writes it to the
    bucket with ``s3_client.put_object``.

    Args:
        event: Lambda event dict containing a ``'Records'`` list whose items
            carry the payload under ``['kinesis']['data']``.
        context: Lambda context object (unused).

    Returns:
        dict: ``{'statusCode': 200, 'body': <JSON string>}`` once every record
        has been written.

    Raises:
        KeyError: if the event, a record, or a payload lacks an expected key
            (``'Records'``, ``'kinesis'``, ``'data'``, ``'timestamp'``).
        Exception: decode, decryption, JSON, and S3 errors are not caught and
            propagate to the caller.
    """
    bucket_name = 'your-s3-bucket-name'

    for record in event['Records']:
        # Lambda hands over the Kinesis payload base64-encoded. Decode it first
        # to recover the Fernet token bytes; passing the base64 text straight
        # to Fernet would fail token validation.
        encrypted_data = base64.b64decode(record['kinesis']['data'])
        decrypted_data = cipher_suite.decrypt(encrypted_data).decode('utf-8')
        payload = json.loads(decrypted_data)

        # Generate a file name based on the record timestamp.
        # NOTE(review): records that share a timestamp map to the same key and
        # overwrite each other - consider adding a unique suffix.
        file_name = f"{payload['timestamp']}.json"

        # Save the decrypted payload to S3
        s3_client.put_object(
            Bucket=bucket_name,
            Key=file_name,
            Body=json.dumps(payload)
        )

    return {
        'statusCode': 200,
        'body': json.dumps('Processed records successfully.')
    }


In [None]:
# Define a Glue crawler over the JSON files stored under the S3 path below.
# The JSON passed to --targets uses the Glue API's member names, which are
# case-sensitive: "S3Targets" and "Path".
aws glue create-crawler \
    --name your-crawler-name \
    --role your-iam-role-arn \
    --database-name your-database-name \
    --targets '{"S3Targets": [{"Path": "s3://your-bucket-name/path/"}]}' \
    --table-prefix your-table-prefix \
    --schema-change-policy '{"UpdateBehavior": "UPDATE_IN_DATABASE", "DeleteBehavior": "DELETE_FROM_DATABASE"}'
    # Optional flags (to use one, end the previous line with a backslash and uncomment it):
    # --configuration '{"Version": 1, "CrawlerOutput": {"Partitions": {"AddOrUpdateBehavior": "InheritFromTable"}}}' - optional
    # --recrawl-policy '{"RecrawlBehavior": "CRAWL_EVERYTHING"}'  - optional

    # This command only defines the crawler. The table in the Glue Data Catalog is
    # created from the schema of the JSON files once the crawler is run.

In [None]:
Run the Glue Crawler
To start the crawler, use the start-crawler command. This command initiates the schema discovery process.

In [None]:
# Start a run of the crawler named your-crawler-name.
aws glue start-crawler --name your-crawler-name

In [None]:
-- Expose a Glue Data Catalog database inside Redshift as an external schema.
CREATE EXTERNAL SCHEMA your_external_schema
FROM DATA CATALOG -- the external database is defined in the data catalog (AWS Glue)
DATABASE 'your_glue_database'
IAM_ROLE 'your-iam-role' -- placeholder: supply the ARN of the IAM role Redshift should use
CREATE EXTERNAL DATABASE IF NOT EXISTS;

-- This is an external schema, so you can use it to query data described in the Glue Data Catalog.
-- The data is stored in the S3 bucket, and the table definitions are stored in the Glue Data Catalog.
-- The EXTERNAL keyword means the schema's tables are not stored inside the Redshift database itself;
-- the schema only references the catalog database.

-- Since we are running the query in the Redshift cluster, we need to create a schema in
-- the Redshift cluster to access the data from the Glue Data Catalog.

In [None]:
-- Fetch rows newer than 1 August 2024 from the external table.
-- TIMESTAMP is a reserved word in Redshift, so the column name must be double-quoted.
-- NOTE(review): if the crawler typed this column as a string, the filter is a text
-- comparison - confirm the column type.
SELECT *
FROM your_external_schema.your_table
WHERE "timestamp" > '2024-08-01T00:00:00Z';