In [None]:
def iterate_bucket_items(bucket):
    """
    Lazily walk an s3 bucket and yield its object listings one at a time.

    The listing is fetched page by page through the ``list_objects_v2``
    paginator, so nothing is requested until the generator is consumed.

    See http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Client.list_objects_v2
    for the format of each yielded entry.

    :param bucket: name of s3 bucket
    :return: generator of per-object metadata dicts (the entries of each
        page's 'Contents' list)
    """
    import boto3

    listing_pages = (
        boto3.client('s3')
        .get_paginator('list_objects_v2')
        .paginate(Bucket=bucket)
    )

    for listing in listing_pages:
        # Only touch 'Contents' on pages that report at least one key
        # (presumably the entry is absent on empty listings).
        if listing['KeyCount'] > 0:
            yield from listing['Contents']

In [None]:
def id_target_images(cache_objects, start_time, end_time):
    """
    Collect the unique image ids of cache objects modified inside a time window.

    An object is considered only when its key has at least four
    '/'-separated segments; the image id is the third segment. The window is
    inclusive on both ends and is compared against each object's
    "LastModified" value (modification time, not access time).

    :param cache_objects: iterable of dicts with "Key" (str) and
        "LastModified" (comparable with start_time/end_time) entries
    :param start_time: inclusive lower bound of the window
    :param end_time: inclusive upper bound of the window
    :return: list of distinct image ids, in order of first appearance
    """
    image_ids = []
    # Companion set gives O(1) duplicate checks; the list keeps first-seen order.
    seen_ids = set()

    for obj in cache_objects:
        key_parts = obj["Key"].split("/")
        # Short keys are rejected before "LastModified" is read.
        if len(key_parts) < 4:
            continue
        if not (start_time <= obj["LastModified"] <= end_time):
            continue

        image_id = key_parts[2]
        if image_id not in seen_ids:
            seen_ids.add(image_id)
            image_ids.append(image_id)

    return image_ids

In [None]:
import datetime
import pytz
# Two-minute UTC window (2022-06-29 00:39 to 00:41, inclusive) to search.
start_datetime = datetime.datetime(2022, 6, 29, 0, 39, 0, 0, pytz.UTC)
end_datetime = datetime.datetime(2022, 6, 29, 0, 41, 0, 0, pytz.UTC)
images = id_target_images(iterate_bucket_items("ssda-jpg-cache"), start_datetime, end_datetime)
print(len(images))
# Show a sample id only when something matched; indexing an empty
# result would raise IndexError and abort the cell.
if images:
    print(images[0])

10
0e4216d05278327d8900d8e32be0c446


In [None]:
"""
in case you broke it


today_items = []                
count = 0
for i in iterate_bucket_items(bucket='ssda-jpg-cache'):
    if len(i["Key"].split("/")) < 4:
        continue
    if i["LastModified"].strftime("%Y") != "2022" or i["LastModified"].strftime("%m") != "06" or i["LastModified"].strftime("%d") != "27" or (i["LastModified"].strftime("%H") != "21" and i["LastModified"].strftime("%H") != "22")  or (int(i["LastModified"].strftime("%M")) < 50 and int(i["LastModified"].strftime("%M")) > 10):
        continue
    in_list = False
    for item in today_items:
        if item["Key"].split("/")[2] == i["Key"].split("/")[2]:
            in_list = True
    if in_list == False:
        today_items.append(i)
        
len(today_items)
"""

'\nin case you broke it\n\n\ntoday_items = []                \ncount = 0\nfor i in iterate_bucket_items(bucket=\'ssda-jpg-cache\'):\n    if len(i["Key"].split("/")) < 4:\n        continue\n    if i["LastModified"].strftime("%Y") != "2022" or i["LastModified"].strftime("%m") != "06" or i["LastModified"].strftime("%d") != "27" or (i["LastModified"].strftime("%H") != "21" and i["LastModified"].strftime("%H") != "22")  or (int(i["LastModified"].strftime("%M")) < 50 and int(i["LastModified"].strftime("%M")) > 10):\n        continue\n    in_list = False\n    for item in today_items:\n        if item["Key"].split("/")[2] == i["Key"].split("/")[2]:\n            in_list = True\n    if in_list == False:\n        today_items.append(i)\n        \nlen(today_items)\n'

In [None]:
def manual_cache_clear(image_ids, bucket_name):
    """
    Delete the cached image objects and info document for each image id.

    For every id, each object whose key starts with ``cache/image/<id>`` is
    deleted individually, then ``cache/info/<id>.json`` is deleted.

    Empty (or None) ids are skipped: an empty id would reduce the prefix to
    ``cache/image/`` and match every cached image in the bucket.

    :param image_ids: iterable of image id strings
    :param bucket_name: name of the s3 bucket holding the cache
    :return: number of image ids processed (skipped ids are not counted)
    """
    import boto3

    bucket = boto3.resource("s3").Bucket(bucket_name)
    cleared = 0

    for im_id in image_ids:
        # Guard against wiping the whole cache through a blank id.
        if not im_id:
            continue

        # Prefix match: any key that begins with this string is removed.
        im_prefix = "cache/image/" + im_id
        for obj in bucket.objects.filter(Prefix=im_prefix):
            bucket.Object(obj.key).delete()

        meta_key = "cache/info/" + im_id + ".json"
        bucket.Object(meta_key).delete()
        cleared += 1

    return cleared

In [None]:
# Destructive step: deletes the matching objects from the bucket, then
# reports the count returned by manual_cache_clear.
images_cleared = manual_cache_clear(image_ids=images, bucket_name="ssda-jpg-cache")
print(images_cleared)

10
