# Stream S3 Content
An approach for streaming gzip-compressed S3 objects with S3 Select and bulk loading the results into PostgreSQL

#### References:
- [Overview](https://aws.amazon.com/blogs/storage/querying-data-without-servers-or-databases-using-amazon-s3-select)
- [User Guide](https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html)
- [Client.select_object_content](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.select_object_content)

#### To Do:
- Consider [Smart Open](https://github.com/RaRe-Technologies/smart_open)

In [None]:
from contextlib import closing
import csv
from datetime import datetime
import gzip as gz
import logging
from os import environ, path
import tempfile

import boto3
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

## Config

In [None]:
# Root logging setup: INFO level, 12-hour clock timestamps, "time LEVEL: message" lines.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%I:%M:%S %p',
    format='%(asctime)s %(levelname)s: %(message)s',
)

# Logger used by the cells below.
logger = logging.getLogger(__name__)

## Db Setup

In [None]:
# Connection URL of the target PostgreSQL database.
# NOTE(review): the user part of this URL is "localhost" - confirm that is intended.
maehc_db_url = "postgresql://localhost@pgsql-jupyter-lib:5432/psycodb"

# Engine shared by the cells below; echo=False keeps SQL statement logging off.
db_engine = create_engine(
    maehc_db_url,
    echo=False,
)

## S3 References
- [EventStream](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/eventstream.html)
- [Partial record handling](https://github.com/aws/aws-sdk-net/issues/1296#issuecomment-494998477)
- [Example](https://kokes.github.io/blog/2018/07/26/s3-objects-streaming-python.html)

In [None]:
def get_s3_stream(s3_client, bucket: str, pattern: str, source_sql: str):
    """Start an S3 Select query on one object and return its event stream.

    Args:
        s3_client: Object exposing ``select_object_content`` (the notebook
            passes a boto3 S3 client).
        bucket: Name of the bucket that holds the object.
        pattern: Sent as the object ``Key``.
        source_sql: SQL expression evaluated against the object.

    Returns:
        The ``Payload`` entry of the ``select_object_content`` response.
    """
    # The object is read as gzip-compressed, header-less, pipe-delimited CSV.
    # NOTE(review): the quote character equals the field delimiter - confirm
    # this is deliberate.
    input_format = {
        "CSV": {
            "FileHeaderInfo": "NONE",
            "FieldDelimiter": "|",
            "QuoteCharacter": "|",
        },
        "CompressionType": "GZIP",
    }

    # Results come back as CSV with every field quoted.
    output_format = {"CSV": {"QuoteFields": "ALWAYS"}}

    response = s3_client.select_object_content(
        Bucket=bucket,
        Key=pattern,
        ExpressionType="SQL",
        Expression=source_sql,
        RequestProgress={"Enabled": False},
        InputSerialization=input_format,
        OutputSerialization=output_format,
    )
    return response["Payload"]

In [None]:
def to_csv(s3_stream, target_path: str) -> None:
    """Write the record payloads of an S3 Select event stream to a file.

    Args:
        s3_stream: Iterable of event dicts that also provides ``close()``.
            ``Records`` events carry the bytes to write; an ``End`` event
            marks a complete response. Other events are ignored.
        target_path: Path of the file to create (opened in binary mode and
            overwritten if it exists).

    Raises:
        RuntimeError: If the stream finished without an ``End`` event, which
            means the response was cut short and the file is incomplete.
    """
    end_event_received = False

    # closing() releases the stream on every exit path: a failed open, a
    # failed write, an error raised while iterating, or a missing End event.
    with closing(s3_stream), open(target_path, 'wb') as f:
        for event in s3_stream:
            if 'Records' in event:
                f.write(event['Records']['Payload'])
            elif 'End' in event:
                end_event_received = True

    if not end_event_received:
        raise RuntimeError("End event not received, request incomplete.")

## `COPY FROM` References
- [`copy_expert`](https://stackoverflow.com/a/34523707)
- [`copy_expert`/Psycopg2](https://www.psycopg.org/docs/usage.html#using-copy-to-and-copy-from)

In [None]:
def to_sql_bulk(db_engine, source_path: str, target_sql: str) -> int:
    """Bulk load a local file into the database with ``copy_expert``.

    Args:
        db_engine: Object whose ``raw_connection()`` returns a DB-API
            connection with a ``copy_expert``-capable cursor (the notebook
            passes a SQLAlchemy engine backed by psycopg2).
        source_path: Path of the file to send; opened in binary mode.
        target_sql: ``COPY ... FROM STDIN`` statement to execute.

    Returns:
        The cursor's ``rowcount`` after the copy has been committed.

    Raises:
        Exception: Any error from connecting, opening the file, copying or
            committing is re-raised unchanged. Errors after the connection
            is obtained trigger a rollback first.
    """
    # Acquire the connection outside the try block: if this call fails there
    # is nothing to roll back or close, and the original error must surface
    # instead of an UnboundLocalError from the cleanup code.
    connection = db_engine.raw_connection()
    try:
        with closing(connection.cursor()) as cursor, open(source_path, 'rb') as f:
            cursor.copy_expert(target_sql, f)
            connection.commit()

            return cursor.rowcount
    except Exception:
        connection.rollback()
        raise
    finally:
        connection.close()

## Stream S3 Object Content
#### Approach
- Use `select_object_content` to stream content from a gzip-compressed S3 object
- Save as a CSV file to local disk
- Use COPY FROM to bulk import the content to SQL

#### References
- [Parameterization](https://stackoverflow.com/a/1471178)
- [Parameterization/Psycopg2](https://www.psycopg.org/docs/usage.html#passing-parameters-to-sql-queries)

In [None]:
%%time

# Bucket and key prefix of the exported parts. Named once so the listing call
# and the per-object select calls always refer to the same bucket.
s3_bucket = 'my-bucket'
s3_prefix = 'org/unload/data_file_002'

s3_client = boto3.client('s3')

# list_objects_v2 returns at most 1,000 keys per response; walk every page so
# that no part is silently skipped.
paginator = s3_client.get_paginator('list_objects_v2')
parts = [
    obj
    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix)
    for obj in page.get('Contents', [])
]

# Empty the staging table before loading. text() inside engine.begin() works
# on SQLAlchemy 1.x and 2.x: 2.x rejects plain-string statements and does not
# autocommit, so the TRUNCATE needs an explicit transaction.
with db_engine.begin() as co:
    co.execute(text("TRUNCATE TABLE staging.result_bulk"))

# Running total of the row counts reported by each bulk load.
load_count = 0

for part in parts:

    logger.info("Stream file: %s", part['Key'])

    event_stream = get_s3_stream(s3_client,
                                 bucket=s3_bucket,
                                 pattern=part['Key'],
                                 source_sql="""
                                    SELECT * FROM s3object
                                    --LIMIT 1795962
                                """)
    # File name of the key up to its first dot; used for the local CSV name.
    part_name = path.basename(part['Key']).split('.')[0]

    # The CSV lives only inside this temporary directory, which is removed as
    # soon as the part has been loaded. (To keep the files for inspection,
    # write to path.join('data', f'{part_name}.csv') instead.)
    with tempfile.TemporaryDirectory() as tmpdir:

        part_path = path.join(tmpdir, f'{part_name}.csv')

        to_csv(event_stream, part_path)

        logger.info("Downloaded file: %s", part_path)

        load_count += to_sql_bulk(
            db_engine,
            source_path=part_path,
            target_sql="""
                COPY staging.result_bulk (
                    lab_resultid, client_id, patient_id, order_number, "date", "time", code, 
                    "value", units, ref_range, abnormal, comment, observation_coding_system, 
                    observation_description, status, client_lab_result_id, row_hash, order_date,
                    collection_date
                )
                FROM STDIN
                WITH (FORMAT csv)
            """
        )

        # Logged value is the cumulative total so far, not this part's count.
        logger.info("Loaded (staging.result_bulk): %s", load_count)

# Disabled step kept for reference: tag the loaded rows with their source key.
# with db_engine.begin() as co:
#     co.execute(text("UPDATE staging.result_bulk SET source = :source"),
#                {"source": part['Key']})
logger.info("Total Loaded (staging.result_bulk): %s", load_count)