# Upload S3 Content as a Stream

References:
- [Overview](https://aws.amazon.com/blogs/storage/querying-data-without-servers-or-databases-using-amazon-s3-select)
- [User Guide](https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html)
- [Client.select_object_content](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.select_object_content)

To Do:
- Consider [Smart Open](https://github.com/RaRe-Technologies/smart_open)

In [None]:
import codecs
from contextlib import closing
import csv
from datetime import datetime
import gzip as gz
from io import BytesIO
import logging
from os import environ

import boto3
from botocore.exceptions import ClientError, EventStreamError

In [None]:
# Notebook-wide logging: INFO and above, with a 12-hour clock timestamp
# in front of the level name and message.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%I:%M:%S %p',
    format='%(asctime)s %(levelname)s: %(message)s',
)

# Logger shared by the cells below.
logger = logging.getLogger(__name__)

# Stream S3
Use the `Body` attribute of the `get_object` response (a `StreamingBody`) and its `iter_lines()` method.

References:
- [StreamingBody](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/response.html)

## List Bucket

In [None]:
%%time

# Client built from the P3_AWS_* credentials found in the environment.
# A missing variable raises KeyError here, before any request is made.
s3_client = boto3.client(
    's3',
    aws_access_key_id=environ['P3_AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=environ['P3_AWS_SECRET_ACCESS_KEY'],
    aws_session_token=environ['P3_AWS_SESSION_TOKEN'],
)

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page of results instead of reading only the first response.
export_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket='my-bucket',
    Prefix='org/unload',
)

for page in export_pages:
    # 'Contents' is absent from a page when nothing matches the prefix.
    for part in page.get('Contents', []):
        logger.info(f"lake_path is {part['Key']}")

## Download / Upload S3 Object

In [None]:
%%time

# Source client: created without explicit credentials.
s3_hj_client = boto3.client('s3')

# Destination client: credentials are taken from the P3_AWS_* environment
# variables (KeyError if any of them is missing).
p3_credentials = {
    'aws_access_key_id': environ['P3_AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': environ['P3_AWS_SECRET_ACCESS_KEY'],
    'aws_session_token': environ['P3_AWS_SESSION_TOKEN'],
}
s3_client = boto3.client('s3', **p3_credentials)

prefix = 'test-file'
source_key = f'org/unload/{prefix}'
target_key = f'org/upload/{prefix}'

allergy_obj = s3_hj_client.get_object(Bucket='my-bucket', Key=source_key)

# Hand the response body straight to the upload so the object is copied
# between buckets without being written to local disk; closing() releases
# the body once the upload returns or fails.
with closing(allergy_obj['Body']) as body:
    s3_client.upload_fileobj(Fileobj=body, Bucket='my-bucket2', Key=target_key)

## Stream GZip to S3
`StreamingBody` has no `seek()` method, so it cannot be used as part of a stream chain with `gzip.GzipFile`.

References:
- https://kokes.github.io/blog/2018/07/26/s3-objects-streaming-python.html

In [None]:
%%time
import codecs

# Source client: created without explicit credentials.
s3_hj_client = boto3.client('s3')

# Destination client: read the P3_AWS_* credentials from the environment, as
# the other cells in this notebook do. Passing empty strings here would
# override the default credential lookup with unusable values.
s3_client = boto3.client(
    's3',
    aws_access_key_id=environ['P3_AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=environ['P3_AWS_SECRET_ACCESS_KEY'],
    aws_session_token=environ['P3_AWS_SESSION_TOKEN'],
)

allergy_obj = s3_hj_client.get_object(
    Bucket='my-bucket',
    Key='org/unload/test-file',
)

body = allergy_obj['Body']  # botocore.response.StreamingBody

# closing() releases the response body; gz.open() decompresses on read.
with closing(body), gz.open(body, 'rb') as unzipped:
    # unzipped.read() loads the whole decompressed object into memory, so this
    # works but is not streaming. Passing `unzipped` directly as Fileobj
    # fails (missing seek).
    s3_client.upload_fileobj(
        Fileobj=BytesIO(unzipped.read()),
        Bucket='my-bucket2',
        Key='org/unload/test-file',
    )

logger.info("Result is complete")


## Iterator Implementation
Adapting the `EventStream` class to a true "file-like object" is difficult as it is an iterator and not a stream.

In [None]:
class S3SelectContentIterator:
    """Re-batch an S3 Select event stream into byte chunks of a minimum size.

    Wraps the ``Payload`` event stream returned by ``select_object_content``
    (a ``botocore.eventstream.EventStream``) and concatenates the payloads of
    its ``Records`` events. Chunks are cut only between events, and no attempt
    is made to align them with record or character boundaries.

    The object can be used with ``contextlib.closing`` or directly as a
    context manager; both close the underlying event stream.

    Attributes:
        end_event_received: True once an ``End`` event has been seen during
            the most recent iteration; reset to False when iteration starts.
    """

    def __init__(self, raw_stream, chunk_size: int = 1024) -> None:
        """Store the event stream and the minimum chunk size.

        Args:
            raw_stream: Iterable of S3 Select event dicts that also offers
                ``close()`` (``botocore.eventstream.EventStream``).
            chunk_size: Minimum number of bytes to accumulate before a chunk
                is yielded. The final chunk may be smaller.
        """
        self._raw_stream = raw_stream  # botocore.eventstream.EventStream
        self._chunk_size = chunk_size
        self._amount_read = 0  # running total of bytes yielded so far
        self.end_event_received = False

    def __enter__(self):
        """Return the iterator itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the event stream on exit; exceptions are not suppressed."""
        self.close()

    def __iter__(self):
        """Yield the ``Records`` payload bytes in chunks.

        Every chunk except possibly the last holds at least ``chunk_size``
        bytes. Empty chunks are never yielded. If the stream ends without an
        ``End`` event a warning is logged, because the result may then be
        incomplete; any buffered bytes are still yielded afterwards.

        Yields:
            bytes: The concatenated payloads collected since the last chunk.
        """
        self.end_event_received = False
        parts = []      # payloads collected for the chunk being built
        buffered = 0    # total size of the payloads in ``parts``

        for event in self._raw_stream:
            if 'Records' in event:
                payload = event['Records']['Payload']
                parts.append(payload)
                buffered += len(payload)
            elif 'End' in event:
                # End event indicates that the request finished successfully.
                self.end_event_received = True

            # ``buffered`` guards against yielding b'' when chunk_size <= 0.
            if buffered and buffered >= self._chunk_size:
                self._amount_read += buffered
                logger.info("data chunk: %s", self._amount_read)
                # Join once per chunk instead of re-copying on every event.
                yield b''.join(parts)
                parts.clear()
                buffered = 0

        if not self.end_event_received:
            logger.warning("End event not received, request incomplete.")

        if buffered:
            self._amount_read += buffered
            logger.info("last data chunk: %s", self._amount_read)
            yield b''.join(parts)

    def close(self) -> None:
        """Close the underlying event stream."""
        self._raw_stream.close()

# NOTE: s3_hj_client is not used in this cell; it is kept so the name stays
# defined exactly as before.
s3_hj_client = boto3.client('s3')
s3_client = boto3.client('s3')

# Run an S3 Select query over a CSV object whose first line is a header and
# ask for the result as CSV. Progress events are disabled.
content = s3_client.select_object_content(
    Bucket='my-bucket',
    Key="test-file.csv",
    RequestProgress={'Enabled': False},
    ExpressionType="SQL",
    Expression="""
        SELECT  *
        FROM    s3object AS s3
        --LIMIT 20
    """,
    InputSerialization={"CSV": {'FileHeaderInfo': 'USE'}},
    OutputSerialization={"CSV": {}},
)

event_stream = content['Payload']  # botocore.eventstream.EventStream
stream_adapter = S3SelectContentIterator(event_stream, 10024)

# Chunks end at arbitrary byte offsets, so a multi-byte UTF-8 character can be
# split across two chunks. An incremental decoder keeps the incomplete tail
# until the next chunk arrives instead of raising UnicodeDecodeError.
decoder = codecs.getincrementaldecoder('utf-8')()

with closing(stream_adapter):
    for data in stream_adapter:
        records = decoder.decode(data)

    # Flush the decoder: raises UnicodeDecodeError if the stream ended in the
    # middle of a character.
    decoder.decode(b'', final=True)

logger.info("Result is complete")