# Purpose

This notebook will create a template document containing some basic info within S3

# Discussion

this assumes that
* aws credentials have been previously setup
* Boto3 has been installed:  `pip3 install boto3`

# Functions

## create_bucket(bucket_name, s3_resource)

`bucket_create_response = create_bucket(bucket_name, s3_resource)`

In [1]:
def create_bucket(bucket_name, s3_resource):
    return s3_resource.create_bucket(Bucket=bucket_name)

## searching for files

`get_matching_s3_objects(bucket, prefix="", suffix="")`

`key = get_matching_s3_keys(bucket, prefix="", suffix="")`

In [2]:
#https://alexwlchan.net/2017/07/listing-s3-keys/
#https://alexwlchan.net/2019/07/listing-s3-keys/

import boto3

def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    kwargs = {'Bucket': bucket}

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        kwargs["Prefix"] = key_prefix

        for page in paginator.paginate(**kwargs):
            try:
                contents = page["Contents"]
            except KeyError:
                return

            for obj in contents:
                key = obj["Key"]
                if key.endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    for obj in get_matching_s3_objects(bucket, prefix, suffix):
        yield obj["Key"]

#for key in get_matching_s3_keys(bucket='testname.asyla.org', prefix='BlueMarble/', suffix='.jpg'):
#    print(key)
#print('\n\n')
#for key in get_matching_s3_keys(bucket='testname.asyla.org', suffix=('.jpg', '.JPG')):
#    print(key)

## does_key_exist(bucket_name, file_name)

`boolean = does_key_exist(bucket_name, file_name)`

In [3]:
#https://stackoverflow.com/questions/33842944/check-if-a-key-exists-in-a-bucket-in-s3-using-boto3

def does_key_exist(bucket_name, file_name):
    try:
        s3.Object(bucket_name, file_name).load()
    except:
        #print ('error')
        return (False)
    else:
        #print ('worked')
        return (True)

## yamlDump(dataDict)

`yamlData = yamlDump(dataDict)`

read in a python data structure and return it in YAML format

In [4]:
#yamlData = yamlDump(dataDict)
#read in a python data structure and return it in YAML format

def yamlDump (data):
    from ruamel.yaml import YAML #[ruamel.yaml documentation](https://yaml.readthedocs.io/en/latest/index.html)
    yaml = YAML()

    #define the output format
    yaml.version = (1, 2) #https://yaml.readthedocs.io/en/latest/detail.html#document-version-support
    yaml.default_flow_style = False #https://yaml.readthedocs.io/en/latest/basicuse.html#basic-usage
    yaml.indent(mapping=2, sequence=4, offset=2) #https://yaml.readthedocs.io/en/latest/detail.html#indentation-of-block-sequences
    #yaml.top_level_colon_align = True #https://yaml.readthedocs.io/en/latest/detail.html#positioning-in-top-level-mappings-prefixing
    yaml.explicit_start=True #guessed from: https://pyyaml.org/wiki/PyYAMLDocumentation
    yaml.explicit_end=True #guessed from: https://pyyaml.org/wiki/PyYAMLDocumentation
    yaml.sort_keys=False #guessed from: [sort_keys=False](https://stackoverflow.com/a/55171433/12400492)

    from io import StringIO

    old_stdout = sys.stdout #save the original stdout
    sys.stdout = mystdout = StringIO() #redirect stdout
    yaml.dump(data, sys.stdout) #dump YAML to stdout
    sys.stdout = old_stdout #restore stdout

    return mystdout.getvalue() #return the YAML structure

## updateDatabase(bucket, filename, recordId, recordData)

In [26]:
#Update the database file
#
#This assumes the database
#  - resides in S3
#  - in YAML format
#  - it is a dictionary format
#  - and is small enough to reside in RAM

def updateDatabase(bucket, filename, recordId, recordData):
    from ruamel.yaml import YAML #[ruamel.yaml documentation](https://yaml.readthedocs.io/en/latest/index.html)
    yaml = YAML()
    
    dbContent = {} #create an empty dictionary
    
    #populate the datastructure with data if there is any data to work with
    if does_key_exist(bucket, filename):
        #fetch the DB and return it as a string
        #https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.get
        #https://stackoverflow.com/questions/31976273/open-s3-object-as-a-string-with-boto3#35376156
        fileBody = s3.Object(bucket, filename).get()['Body'].read().decode('utf-8') 
        #print (response)
    
        dbContent = yaml.load(fileBody) #read the file into an OrderdDictionary
        #print (fileData)

    
    dbContent[recordId] = recordData #save the new record to the structure
    #print (dbContent)

    result = s3.Object(bucket, filename).put( #save the data to a S3 key
        Body=yamlDump(dbContent) #convert the datastructure into YAML
    )
    
    return result

# main program

## Connect to S3

In [6]:
import boto3
s3 = boto3.resource('s3')

In [7]:
#Note that the name of a bucket must be unique to all of S3 DNS namespace
#Names can only start with [a-z0-9] but may include [a-z0-9-_./]

#bucket_name = 'Documents' #this will fail
#bucket_name = create_unique_name('') #this is safer
#bucket_name = create_unique_name('documents'+'--'+str(uuid.uuid4())) #this is easier to directly work with

bucket_name = 'documents--88767106-9edc-4028-a451-0da43b669d7f' #hardcode this so it doesnt change
#bucket_name = 'testname.asyla.org'

print('bucket_name='+bucket_name)

bucket_name=documents--88767106-9edc-4028-a451-0da43b669d7f


## Create bucket if needed

In [8]:
if s3.Bucket(bucket_name).creation_date is None: #there is no date if it doesnt exist
    response = create_bucket(bucket_name, s3)
    #print(response)
    print ('bucket created')

## Create a new (yaml) file

### Create the name

In [9]:
#generate all the file name permutations we may need

document={}

#generate the base name (documentID)
import uuid
document['base_name']=str(uuid.uuid4())
print("document['base_name']="+document['base_name'])

#generate the prefix (the path based upon the documentID)
import re
#couldnt get `re.sub()` to cooperate so did this instead
document['file_prefix'] = ''
for part in document['base_name'].split('-'):
    document['file_prefix'] += re.search('^.{2}', part).group(0) + '/' #match the first 2 chars and output the first match
print("document['file_prefix']="+document['file_prefix'])

#set the file suffix (extension)
document['file_suffix']='.ymal'
print("document['file_suffix']="+document['file_suffix'])

#generate the full filename (documentID + extension)
document['file_name']=document['base_name']+document['file_suffix']
print("document['file_name']="+document['file_name'])

#generate the full key (path + full filename)
document['key_name']=document['file_prefix']+document['file_name']
print("document['key_name']="+document['key_name'])

#document

document['base_name']=6b34e2f1-c927-41dd-964d-b606ac9c0b2f
document['file_prefix']=6b/c9/41/96/b6/
document['file_suffix']=.ymal
document['file_name']=6b34e2f1-c927-41dd-964d-b606ac9c0b2f.ymal
document['key_name']=6b/c9/41/96/b6/6b34e2f1-c927-41dd-964d-b606ac9c0b2f.ymal


### Ensure the file doesnt already exist

In [10]:
#make sure the file does not exist
#UUIDv4 generates random strings so there is a (small) chance of a duplicate
import sys

if does_key_exist(bucket_name, document['key_name']):
    #print ('ERROR: File exists: '+document['key_name'])
    sys.exit('ERROR: File exists: '+document['key_name'])

### Collect nessessary info

In [11]:
#Using YAML format for the document template because its what this will end up as and base Python structures dont play well with comments

from datetime import date

YAMLcontent = """
'id': """+document['base_name']+""" # UUIDv4
'title': '!'
'subtitle': '!'
'author': '!' # who wrote the document
'abstract': '!'
'lang': 'en'
'type': '!' # Options: [regulation|policy|standard|guidance|requirement|control|procedure]
'source': '!'
'classification': 'public' # Options: [public|private|confidential]
referenceTag:
  keywords:
    - '!'
  relationship: '!' # Format: `ID : [parent|peer|child]`
status: # this section is to provide some automatic documentation management
  status: draft # Options: [draft|review|complete|expire]
  effective: '!'
  expire: '!'
revision:
  - date: '"""+date.today().strftime('%Y%m%d')+"""'
    name: N/A
    reason: Initial template created
'body':  |
  # Blank template

  Put your text here.
"""

#print (YAMLcontent)

In [12]:
#prompt the user for input on certain fields

from ruamel.yaml import YAML #[ruamel.yaml documentation](https://yaml.readthedocs.io/en/latest/index.html)
yaml = YAML()

YAMLdata = yaml.load(YAMLcontent) #Convert the YAML text into an OrderdDictionary

#walk the data structure
for key, value in YAMLdata.items():
    #print (key, value)

    if (key == 'title') or (key == 'subtitle') or (key == 'author') or (key == 'abstract') or (key == 'type') or (key == 'source'):
        data = input(key+': '+value)
        if data != '': YAMLdata[key] = data

#print (YAMLdata)

title: ! 
subtitle: ! 
author: ! 
abstract: ! 
type: ! 
source: ! 


### Set the S3 object params

In [13]:
#set the file permissions:
#https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.put
#ACL='private'|'public-read'|'public-read-write'|'authenticated-read'|'aws-exec-read'|'bucket-owner-read'|'bucket-owner-full-control',

#print (content['classification'])

if re.search('public', YAMLdata['classification'], flags=re.IGNORECASE):
    put_ACL='public-read'
elif re.search('private', YAMLdata['classification'], flags=re.IGNORECASE):
    put_ACL='authenticated-read'
else:
    put_ACL='private'

In [14]:
#set the file's metadata
put_Metadata = { 'classification': YAMLdata['classification'] }

### Write the file to S3

In [15]:
#https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.put
s3.Object(bucket_name, document['key_name']).put(
    ACL=put_ACL,
    ContentLanguage=YAMLdata['lang'],
    Metadata=put_Metadata,
    Body=yamlDump(YAMLdata)
)

{'ResponseMetadata': {'RequestId': 'FFFA418642BF220A',
  'HostId': '8PbMZ3j0px5l1b+IsZIlh4ejyAYujHsatoIyes0Bt6XmIoDaO8Nsc2/MSZTFAmpqrD+16s8iYVc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '8PbMZ3j0px5l1b+IsZIlh4ejyAYujHsatoIyes0Bt6XmIoDaO8Nsc2/MSZTFAmpqrD+16s8iYVc=',
   'x-amz-request-id': 'FFFA418642BF220A',
   'date': 'Thu, 21 Nov 2019 23:02:39 GMT',
   'etag': '"57eee83b4ca85775a3740182106b1dce"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"57eee83b4ca85775a3740182106b1dce"'}

## Update the database

The `contents.yaml` will be structured as follows:

``` yaml
id1:
  record1
id2:
  record2
idN:
  recordN
```

Where 'id' is the ID of the file and 'record' contains all but 'body'

In [16]:
file_name='contents.yaml'

### Format new record

In [17]:
#we dont want to save these in the DB file
YAMLdata.pop('id',None)
YAMLdata.pop('body',None) 
YAMLdata

ordereddict([('title', '!'), ('subtitle', '!'), ('author', '!'), ('abstract', '!'), ('lang', 'en'), ('type', '!'), ('source', '!'), ('classification', 'public'), ('referenceTag', ordereddict([('keywords', ['!']), ('relationship', '!')])), ('status', ordereddict([('status', 'draft'), ('effective', '!'), ('expire', '!')])), ('revision', [ordereddict([('date', '20191121'), ('name', 'N/A'), ('reason', 'Initial template created')])])])

#### Fetch the object's URL

In [18]:
#https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-presigned-urls.html
#note that this is doing more then just fetching the URL
#this is promising but is incomplete:  https://stackoverflow.com/a/48197877/12400492

url = boto3.client('s3').generate_presigned_url('get_object',Params={'Bucket': bucket_name,'Key': document['key_name']},ExpiresIn=60*60*24)
#print (url)

YAMLdata['object_url'] = url.split('?')[0] #for now, just remove the extra stuff

print (YAMLdata['object_url'])

https://documents--88767106-9edc-4028-a451-0da43b669d7f.s3.amazonaws.com/6b/c9/41/96/b6/6b34e2f1-c927-41dd-964d-b606ac9c0b2f.ymal


### Update the tracking DB with latest info

In [27]:
updateDatabase(bucket_name, file_name, document['base_name'], YAMLdata)

{'ResponseMetadata': {'RequestId': 'E6E6E38581CBD67D',
  'HostId': 'n0jeR3/26qGPKMZIsrUw2a9Ge0c0Brl3rvCVlEEw2m0Cq+yE90lIWClOwVLIArx9OT1HxjDI/2E=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'n0jeR3/26qGPKMZIsrUw2a9Ge0c0Brl3rvCVlEEw2m0Cq+yE90lIWClOwVLIArx9OT1HxjDI/2E=',
   'x-amz-request-id': 'E6E6E38581CBD67D',
   'date': 'Fri, 22 Nov 2019 13:58:40 GMT',
   'etag': '"704703d04e4d97ea5d308432d351a531"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"704703d04e4d97ea5d308432d351a531"'}