In [None]:
#default_exp s3_scrape

In [1]:
#export

import boto3
import pandas as pd
import os
from PIL import Image
import json

In [9]:
def write_server(outfile, server, default):
    if server == None:
        outfile.write(default)
    else:
        outfile.write(server)

def manifest_from_cs(path_to_cs_batch, list_of_images, output_dir_prefix=None, manifest_server=None, image_server=None, default_server=None):
    '''
    Creates a IIIF manifests for volumes based on metadata from a CloudSearch upload.
        path_to_cs_batch: Path to a JSON file containing a CloudSearch document batch **modified to include the "volumes" key**
        list_of_images: a list of dictionaries in which each dictionary contains the filename and dimensions for a single image
        output_dir_prefix (optional): path to append output manifest file name to, defaults to local
        manifest_server (optional): base URL for manifest server, including trailing forward slash
        
        returns: number of manifests created
    '''    
    import json
    
    manifests = 0
    
    if default_server == None:
        default_server = "https://images.slavesocieties.org/"   
    
    with open(path_to_cs_batch, encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)
        
    for volume in data["volumes"]:
        if output_dir_prefix != None:
            volume_json_path = output_dir_prefix + '\\' + volume["id"] + ".json"
        else:
            volume_json_path = volume["id"] + ".json"   
    
        with open(volume_json_path, "w", encoding = "utf-8") as outfile:
            outfile.write('{\n')
            #add indentation eventually?
            outfile.write("\"@context\": \"http://iiif.io/api/presentation/2/context.json\",\n")
            outfile.write("\"@type\": \"sc:Manifest\",\n")
            outfile.write("\"@id\": \"")
            write_server(outfile, manifest_server, default_server)        
            outfile.write("manifest/" + volume["id"] + ".json\",\n")
            if ("title" in volume["fields"]) and (volume["fields"]["title"] != None):
                outfile.write("\"label\": \"" + volume["fields"]["title"].replace("\"", "\\\"") + "\",\n")
            else:
                outfile.write("\"label\": \"\",\n")
            if "description" in volume["fields"]:
                outfile.write("\"description\": \"" + volume["fields"]["description"].replace("\"", "\\\"").replace("  ", ' ') + "\",\n")
            else:
                outfile.write("\"description\": \"\",\n")
            outfile.write("\"metadata\": [\n")
            outfile.write("{\n")
            outfile.write("\"label\": \"Title\",\n")
            if ("title" in volume["fields"]) and (volume["fields"]["title"] != None):
                outfile.write("\"value\": \"" + volume["fields"]["title"].replace("\"", "\\\"") + "\"\n")
            else:
                outfile.write("\"value\": \"\"\n")
            outfile.write("},\n")        
            if "creator" in volume["fields"]:
                outfile.write("{\n")
                outfile.write("\"label\": \"Creator\",\n")
                outfile.write("\"value\": \"" + volume["fields"]["creator"].replace("  ", ' ') + "\"\n")        
                outfile.write("},\n")        
            if ("subject" in volume["fields"]) and (volume["fields"]["subject"] != None):
                vol_subject = ''
                for subject in volume["fields"]["subject"]:
                    if len(vol_subject) > 0:
                        vol_subject += '; '
                    vol_subject += subject           
                outfile.write("{\n")
                outfile.write("\"label\": \"Subject\",\n")
                outfile.write("\"value\": \"" + vol_subject + "\"\n")
                outfile.write("},\n")
            outfile.write("{\n")
            outfile.write("\"label\": \"Digitized by\",\n")
            if "publisher" in volume["fields"]:
                outfile.write("\"value\": \"" + volume["fields"]["publisher"] + "\"\n")
            else:
                outfile.write("\"value\": \"Slave Societies Digital Archive\"\n")
            outfile.write("},\n")
            outfile.write("{\n")
            outfile.write("\"label\": \"Identifier\",\n")
            outfile.write("\"value\": \"" + volume["id"] + "\"\n")
            outfile.write("},\n")
            if ("start_date" in volume["fields"]) and ("end_date" in volume["fields"]):
                outfile.write("{\n")
                outfile.write("\"label\": \"Date\",\n")
                outfile.write("\"value\": \"" + volume["fields"]["start_date"][:volume["fields"]["start_date"].find('T')] + '-' + volume["fields"]["end_date"][:volume["fields"]["end_date"].find('T')] + "\"\n")
                outfile.write("},\n")
            vol_lang = ''
            if "language" in volume["fields"]:            
                for language in volume["fields"]["language"]:
                    if len(vol_lang) > 0:
                        vol_lang += '; '
                    vol_lang += language
            outfile.write("{\n")
            outfile.write("\"label\": \"Language\",\n")
            outfile.write("\"value\": \"" + vol_lang + "\"\n")
            outfile.write("}\n")
            outfile.write("],\n")            

            outfile.write("\"attribution\": \"Slave Societies Digital Archive\",\n")
            outfile.write("\"logo\": \"")
            write_server(outfile, image_server, default_server) 
            outfile.write("iiif/3/ssda_logo_horizontal.jpg/full/max/0/default.jpg\",\n")

            outfile.write("\"sequences\": [\n")
            outfile.write("{\n")
            outfile.write("\"@type\": \"sc:Sequence\",\n")
            outfile.write("\"@id\": \"")
            write_server(outfile, manifest_server, default_server)           
            outfile.write("sequence/" + volume["id"] + ".json\",\n")
            outfile.write("\"canvases\": [\n")

            first_image = True
            for image in list_of_images:
                if not first_image:
                    outfile.write(",\n")
                else:
                    first_image = False

                outfile.write("{\n")
                outfile.write("\"@type\": \"sc:Canvas\",\n")            
                outfile.write("\"@id\": \"")
                write_server(outfile, manifest_server, default_server)
                image_number = int(image["file name"][image["file name"].find('-') + 1:image["file name"].find('.')])
                image_id = "0" * (4 - len(str(image_number))) + str(image_number)
                canvas_id = volume["id"] + '-' + image_id
                outfile.write("canvas/" + canvas_id + ".json\",\n")
                outfile.write("\"label\": \"" + str(image_number) + "\",\n")
                outfile.write("\"width\": " + str(image["width"]) + ",\n")
                outfile.write("\"height\": " + str(image["height"]) + ",\n")
                outfile.write("\"images\": [\n")
                outfile.write("{\n")
                outfile.write("\"@type\": \"oa:Annotation\",\n")            
                outfile.write("\"@id\": \"")
                write_server(outfile, manifest_server, default_server)
                outfile.write("annotation/" + str(canvas_id) + ".json\",\n")
                outfile.write("\"motivation\": \"sc:painting\",\n")
                outfile.write("\"on\": \"")
                write_server(outfile, manifest_server, default_server)
                outfile.write("canvas/" + canvas_id + ".json\",\n")
                outfile.write("\"resource\": {\n")            
                outfile.write("\"@type\": \"dctypes:Image\",\n")            
                outfile.write("\"format\": \"image/jpg\",\n")
                outfile.write("\"@id\": \"")
                write_server(outfile, image_server, default_server)
                outfile.write("iiif/3/" + canvas_id + ".jpg/full/max/0/default.jpg\",\n")
                outfile.write("\"width\": " + str(image["width"]) + ",\n")
                outfile.write("\"height\": " + str(image["height"]) + ",\n")
                outfile.write("\"service\": {\n")
                outfile.write("\"@id\": \"")
                write_server(outfile, image_server, default_server)
                outfile.write( "iiif/3/" + canvas_id + ".jpg\",\n")
                outfile.write("\"@context\": \"http://iiif.io/api/image/3/context.json\",\n")
                outfile.write("\"profile\": \"http://iiif.io/api/image/3/level2.json\"\n")
                outfile.write("}\n")
                outfile.write("}\n")
                outfile.write("}\n")
                outfile.write("]\n")
                outfile.write("}")

            outfile.write("\n")
            outfile.write("]\n")
            outfile.write("}\n")
            outfile.write("]\n")
            outfile.write("}\n")
            
        manifests += 1
    
    return manifests
        

In [10]:
manifest_from_cs("la_aurora_cs.json", images, output_dir_prefix=None, manifest_server="https://ssda-iiif.s3.amazonaws.com/", image_server="https://images.slavesocieties.org/", default_server="https://images.slavesocieties.org/")

1

In [3]:
print(images[:5])

[{'width': 6000, 'height': 4000, 'file name': '702001-0001.jpg'}, {'width': 6000, 'height': 4000, 'file name': '702001-0002.jpg'}, {'width': 6000, 'height': 4000, 'file name': '702001-0003.jpg'}, {'width': 6000, 'height': 4000, 'file name': '702001-0004.jpg'}, {'width': 6000, 'height': 4000, 'file name': '702001-0005.jpg'}]


In [18]:
#export

def bucket_copy(source_bucket_name, target_bucket_name, source_object_prefix = ""):
    s3_resource = boto3.resource("s3")
    s3_client = boto3.client("s3")
    
    source_bucket = s3_resource.Bucket(source_bucket_name)
    copied_objects = 0
    
    for obj in source_bucket.objects.filter(Prefix = source_object_prefix):
        copy_source = {"Bucket": source_bucket_name, "Key": obj.key}
        if "Process" in obj.key:
            s3_client.download_file(source_bucket_name, obj.key, "temp.jpg")
            image_size = os.stat('temp.jpg').st_size
            while image_size > 3000000:
                im = Image.open("temp.jpg")
                width, height = im.size
                im = im.resize((int(round(width * .75)), int(round(height * .75))))
                im.save("temp.jpg")
                image_size = os.stat("temp.jpg").st_size
            s3_client.upload_file("temp.jpg", target_bucket_name, obj.key[obj.key.index("/") + 1:])            
        else:
            s3_client.copy(copy_source, target_bucket_name, obj.key)
        copied_objects += 1
        
    if "Process" in obj.key:
        os.remove("temp.jpg")
        
    return str(copied_objects) + " objects copied from " + source_bucket_name + " to " + target_bucket_name

In [17]:
bucket_copy("ssda-assets", "ssda-misc", source_object_prefix = "la-aurora/")

'724 objects copied from ssda-assets to ssda-misc'

In [3]:
#no_test

s3_resource = boto3.resource("s3")
s3_client = boto3.client("s3")
import rawpy
import imageio
    
source_bucket = s3_resource.Bucket("ssda-misc")

for obj in source_bucket.objects.filter(Prefix = "la-aurora"):    
    if obj.key.endswith("CR2"):
        s3_client.download_file("ssda-misc", obj.key, "temp.cr2")
        with rawpy.imread("temp.cr2") as raw:                
            rgb = raw.postprocess()        
        imageio.imsave("temp.jpg", rgb)        
        image_size = os.stat("temp.jpg").st_size
        while image_size > 3000000:
            im = Image.open("temp.jpg")
            width, height = im.size
            im = im.resize((int(round(width * .75)), int(round(height * .75))))
            im.save("temp.jpg")
            image_size = os.stat("temp.jpg").st_size
        s3_client.upload_file("temp.jpg", "ssda-misc", obj.key[:obj.key.find("CR2")] + "jpg")        
        
os.remove("temp.cr2")
os.remove("temp.jpg")

In [7]:
#no_test

for obj in source_bucket.objects.filter(Prefix = "la-aurora"):
    if obj.key.endswith("jpg"):
        s3_client.download_file("ssda-misc", obj.key, obj.key)    

In [1]:
#export

def copy_jpgs(json_path, source_bucket, target_bucket):
    s3_client = boto3.client('s3')
    
    images = 0
    
    with open(json_path, encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)
        
    for volume in data["volumes"]:
        print("Now working on " + volume["identifier"])
        for image in volume["images"]:            
            #copy_source = {"Bucket": source_bucket, "Key": volume["s3_path"] + "/JPG/" + str(image["file_name"]) + ".JPG"}
            try:
                s3_client.download_file(source_bucket, volume["s3_path"] + "/JPG/" + str(image["file_name"]) + ".JPG", "temp.jpg")
            except:
                s3_client.download_file(source_bucket, volume["s3_path"] + "/JPG/" + str(image["file_name"]) + ".jpg", "temp.jpg")            
            image_size = os.stat("temp.jpg").st_size
            while (image_size > 3000000):                
                im = Image.open("temp.jpg")
                width, height = im.size
                im = im.resize((int(round(width * .75)), int(round(height * .75))))
                im.save("temp.jpg")
                image_size = os.stat("temp.jpg").st_size            
            image_number = str(image["file_name"] - 1000)
            padded_number = '0' * (4 - len(image_number)) + image_number
            #s3_client.copy(copy_source, target_bucket, str(volume["identifier"]) + '-' + padded_number + ".jpg", ExtraArgs={'ContentType': "image/jpeg", 'Metadata': {"x-amz-meta-width": str(image["width"]), "x-amz-meta-height": str(image["height"])}})
            s3_client.upload_file("temp.jpg", target_bucket, str(volume["identifier"]) + '-' + padded_number + ".jpg", ExtraArgs={'ContentType': "image/jpeg", 'Metadata': {"width": str(image["width"]), "height": str(image["height"])}})
            images += 1
            
    os.remove("temp.jpg")
            
    return str(images) + " images copied from " + source_bucket + " to " + target_bucket

In [None]:
#no_test

copy_jpgs("brazil.json", "ssda-assets", "ssda-production-jpgs")

Now working on 701236
10579083
1175750
13419804
1691496
14241074
1902434
11893601
1261265
10828054
1078452
10960378
1115240
11835791
1300910
12164965
1364123
12077714
1343849
12362793
1400880
11633209
1284579
12162233
1371021
11832232
1298795
12083546
1344742
11618214
1266248
11669816
1270145
11662191
1269467


In [1]:
def copy_jpgs_for_prod(source_bucket_name, volume_id, target_bucket_name = "ssda-production-jpgs", source_prefix = "", copy_to_archive = False):
    '''
    Copies images of a single volume from one S3 bucket to another. Objects to be copied can be filtered by file type and/or key prefix.
        source_bucket_name: name of S3 bucket to copy from
        target_bucket_name: name of S3 bucket to copy to (specify if not ssda-production-jps)
        volume_id: id of the volume in question
        source_prefix (optional): S3 object key prefix to filter by
        copy_to_archive: set to true to send a copy of each jpg to the archive bucket too
        
        returns: list of images with filenames and dimensions as well as a count of objects copied
    '''
    import boto3
    import os
    from PIL import Image
    
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource("s3")
    source_bucket = s3_resource.Bucket(source_bucket_name)    
    
    image_count = 0
    images = []   
    
    for obj in source_bucket.objects.filter(Prefix = source_prefix):
        if obj.key.endswith("jpg"):                     
            s3_client.download_file(source_bucket_name, obj.key, "temp.jpg")            
            image = {} 
            
            with Image.open("temp.jpg") as im:
                width, height = im.size               
                image_size = os.stat("temp.jpg").st_size
                while (image_size > 3000000):                   
                    width, height = im.size
                    im = im.resize((int(round(width * .75)), int(round(height * .75))))
                    im.save("temp.jpg")
                    image_size = os.stat("temp.jpg").st_size  
                image["width"] = width
                image["height"] = height            
                    
            image_number = str(image_count + 1)
            padded_number = '0' * (4 - len(image_number)) + image_number
            image["file name"] = str(volume_id) + '-' + padded_number + ".jpg"
                        
            s3_client.upload_file("temp.jpg", target_bucket_name, image["file name"], ExtraArgs={'ContentType': "image/jpeg", 'Metadata': {"width": str(image["width"]), "height": str(image["height"])}})
            if copy_to_archive:
                s3_client.upload_file("temp.jpg", "ssda-archive", image["file name"], ExtraArgs={'ContentType': "image/jpeg", 'Metadata': {"width": str(image["width"]), "height": str(image["height"])}})            
            image_count += 1
            images.append(image)
            
    os.remove("temp.jpg")
             
    return images, image_count

In [2]:
images, count = copy_jpgs_for_prod("ssda-misc", 702001, source_prefix = "la-aurora")

now working on la-aurora/
now working on la-aurora/IMG_1901.jpg
now working on la-aurora/IMG_1902.jpg
now working on la-aurora/IMG_1903.jpg
now working on la-aurora/IMG_1904.jpg
now working on la-aurora/IMG_1905.jpg
now working on la-aurora/IMG_1906.jpg
now working on la-aurora/IMG_1907.jpg
now working on la-aurora/IMG_1908.jpg
now working on la-aurora/IMG_1909.jpg
now working on la-aurora/IMG_1910.jpg
now working on la-aurora/IMG_1911.jpg
now working on la-aurora/IMG_1912.jpg
now working on la-aurora/IMG_1913.jpg
now working on la-aurora/IMG_1914.jpg
now working on la-aurora/IMG_1915.jpg
now working on la-aurora/IMG_1916.jpg
now working on la-aurora/IMG_1917.jpg
now working on la-aurora/IMG_1918.jpg
now working on la-aurora/IMG_1919.jpg
now working on la-aurora/IMG_1920.jpg
now working on la-aurora/IMG_1921.jpg
now working on la-aurora/IMG_1922.jpg
now working on la-aurora/IMG_1923.jpg
now working on la-aurora/IMG_1924.jpg
now working on la-aurora/IMG_1925.jpg
now working on la-aurora

now working on la-aurora/IMG_2116.jpg
now working on la-aurora/IMG_2117.jpg
now working on la-aurora/IMG_2118.jpg
now working on la-aurora/IMG_2119.jpg
now working on la-aurora/IMG_2120.jpg
now working on la-aurora/IMG_2121.jpg
now working on la-aurora/IMG_2122.jpg
now working on la-aurora/IMG_2123.jpg
now working on la-aurora/IMG_2124.jpg
now working on la-aurora/IMG_2125.jpg
now working on la-aurora/IMG_2126.jpg
now working on la-aurora/IMG_2127.jpg
now working on la-aurora/IMG_2128.jpg
now working on la-aurora/IMG_2129.jpg
now working on la-aurora/IMG_2130.jpg
now working on la-aurora/IMG_2131.jpg
now working on la-aurora/IMG_2132.jpg
now working on la-aurora/IMG_2133.jpg
now working on la-aurora/IMG_2134.jpg
now working on la-aurora/IMG_2135.jpg
now working on la-aurora/IMG_2136.jpg
now working on la-aurora/IMG_2137.jpg
now working on la-aurora/IMG_2138.jpg
now working on la-aurora/IMG_2139.jpg
now working on la-aurora/IMG_2140.jpg
now working on la-aurora/IMG_2141.jpg
now working 

now working on la-aurora/IMG_2332.jpg
now working on la-aurora/IMG_2333.jpg
now working on la-aurora/IMG_2334.jpg
now working on la-aurora/IMG_2335.jpg
now working on la-aurora/IMG_2336.jpg
now working on la-aurora/IMG_2337.jpg
now working on la-aurora/IMG_2338.jpg
now working on la-aurora/IMG_2339.jpg
now working on la-aurora/IMG_2340.jpg
now working on la-aurora/IMG_2341.jpg
now working on la-aurora/IMG_2342.jpg
now working on la-aurora/IMG_2343.jpg
now working on la-aurora/IMG_2344.jpg
now working on la-aurora/IMG_2345.jpg
now working on la-aurora/IMG_2346.jpg
now working on la-aurora/IMG_2347.jpg
now working on la-aurora/IMG_2348.jpg
now working on la-aurora/IMG_2349.jpg
now working on la-aurora/IMG_2350.jpg
now working on la-aurora/IMG_2351.jpg
now working on la-aurora/IMG_2352.jpg
now working on la-aurora/IMG_2353.jpg
now working on la-aurora/IMG_2354.jpg
now working on la-aurora/IMG_2355.jpg
now working on la-aurora/IMG_2356.jpg
now working on la-aurora/IMG_2357.jpg
now working 

now working on la-aurora/IMG_2548.jpg
now working on la-aurora/IMG_2549.jpg
now working on la-aurora/IMG_2550.jpg
now working on la-aurora/IMG_2551.jpg
now working on la-aurora/IMG_2552.jpg
now working on la-aurora/IMG_2553.jpg
now working on la-aurora/IMG_2554.jpg
now working on la-aurora/IMG_2555.jpg
now working on la-aurora/IMG_2556.jpg
now working on la-aurora/IMG_2557.jpg
now working on la-aurora/IMG_2558.jpg
now working on la-aurora/IMG_2559.jpg
now working on la-aurora/IMG_2560.jpg
now working on la-aurora/IMG_2561.jpg
now working on la-aurora/IMG_2562.jpg
now working on la-aurora/IMG_2563.jpg
now working on la-aurora/IMG_2564.jpg
now working on la-aurora/IMG_2565.jpg
now working on la-aurora/IMG_2566.jpg
now working on la-aurora/IMG_2567.jpg
now working on la-aurora/IMG_2568.jpg
now working on la-aurora/IMG_2569.jpg
now working on la-aurora/IMG_2570.jpg
now working on la-aurora/IMG_2571.jpg
now working on la-aurora/IMG_2572.jpg
now working on la-aurora/IMG_2573.jpg
now working 

In [9]:
#export

def build_volume_records(json_path, target_bucket):
    s3_resource = boto3.resource('s3')
    s3_client = boto3.client('s3')
    #bucket = s3_resource.Bucket(target_bucket)    
    
    volumes = 0
    
    with open(json_path, encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)
        
    for volume in data["volumes"]:
        keys_to_delete = ["s3_path", "contributor"]
        keys_to_add = []
        values_to_add = []
        for key in volume:
            if key in keys_to_delete:                
                continue
            elif (type(volume[key]) == list) and ((key == "creator") or (key == "format")):                
                temp = ''
                for element in volume[key]:
                    temp += element + "; "
                temp = temp[:len(temp) - 2]
                volume[key] = temp
            elif key == "date":
                if volume[key]["start year"] == None:
                    start_date = "1500"
                else:
                    start_date = str(volume[key]["start year"])
                if volume[key]["start month"] != None:
                    str_start_month = str(volume[key]["start month"])
                    start_date += '-' + '0' * (2 - len(str_start_month)) + str_start_month
                    if volume[key]["start day"] != None:
                        str_start_day = str(volume[key]["start day"])
                        start_date += '-' + '0' * (2 - len(str_start_day)) + str_start_day
                if volume[key]["end year"] == None:
                    end_date = "2000"
                else:
                    end_date = str(volume[key]["end year"])
                if volume[key]["end month"] != None:
                    str_end_month = str(volume[key]["end month"])
                    end_date += '-' + '0' * (2 - len(str_end_month)) + str_end_month
                    if volume[key]["end day"] != None:
                        str_end_day = str(volume[key]["end day"])
                        end_date += '-' + '0' * (2 - len(str_end_day)) + str_end_day                
                keys_to_delete.append(key)                
            elif type(volume[key]) == dict:
                for key_key in volume[key]:
                    keys_to_add.append(key_key)
                    values_to_add.append(volume[key][key_key])
                keys_to_delete.append(key)
            elif volume[key] == None:
                volume[key] = ''
                
        for i in range(len(keys_to_add)):
            if values_to_add[i] == None:
                values_to_add[i] = ''
            
            if (keys_to_add[i] == "coords") and (values_to_add[i] != ''):
                coords = values_to_add[i].split(',')
                volume[keys_to_add[i]] = '(' + coords[0] + ", " + coords[1] + ')'
                continue
            elif keys_to_add[i] == "coords":
                volume[keys_to_add[i]] = "(0,0)"
                continue
                
            volume[keys_to_add[i]] = values_to_add[i]
            
        volume["start_date"] = start_date
        volume["end_date"] = end_date
        
        for key in keys_to_delete:            
            del volume[key]
                    
        with open("temp.json", 'w', encoding="utf-8") as outfile:
            json.dump(volume, outfile)               
        s3_client.upload_file("temp.json", target_bucket, str(volume["identifier"]) + ".json", ExtraArgs={'ContentType': "application/json"})
        volumes += 1
        
    return "Metadata for " + str(volumes) + " volumes uploaded to S3."

In [11]:
#no_test

build_volume_records("us-vol.json", "ssda-volume-metadata")

'Metadata for 30 volumes uploaded to S3.'

In [25]:
#export

def build_cloudsearch_batch(json_path, output_dir):
    
    s3_resource = boto3.resource('s3')
    s3_client = boto3.client('s3')
    #bucket = s3_resource.Bucket(target_bucket)    
    
    volumes = 0
    batch = []
    
    with open(json_path, encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)
        
    for volume in data["volumes"]:
        keys_to_delete = ["s3_path", "contributor"]
        keys_to_add = []
        values_to_add = []
        for key in volume:
            if key in keys_to_delete:                
                continue
            elif (type(volume[key]) == list) and ((key == "creator") or (key == "format")):                
                temp = ''
                for element in volume[key]:
                    temp += element + "; "
                temp = temp[:len(temp) - 2]
                volume[key] = temp
            elif key == "date":
                if volume[key]["start year"] == None:
                    start_date = "1500"
                else:
                    start_date = str(volume[key]["start year"])
                if volume[key]["start month"] != None:
                    str_start_month = str(volume[key]["start month"])
                    start_date += '-' + '0' * (2 - len(str_start_month)) + str_start_month
                    if volume[key]["start day"] != None:
                        str_start_day = str(volume[key]["start day"])
                        start_date += '-' + '0' * (2 - len(str_start_day)) + str_start_day
                if volume[key]["end year"] == None:
                    end_date = "2000"
                else:
                    end_date = str(volume[key]["end year"])
                if volume[key]["end month"] != None:
                    str_end_month = str(volume[key]["end month"])
                    end_date += '-' + '0' * (2 - len(str_end_month)) + str_end_month
                    if volume[key]["end day"] != None:
                        str_end_day = str(volume[key]["end day"])
                        end_date += '-' + '0' * (2 - len(str_end_day)) + str_end_day                
                keys_to_delete.append(key)                
            elif type(volume[key]) == dict:
                for key_key in volume[key]:
                    keys_to_add.append(key_key)
                    values_to_add.append(volume[key][key_key])
                keys_to_delete.append(key)
            elif volume[key] == None:
                volume[key] = ''
                
        for i in range(len(keys_to_add)):
            if values_to_add[i] == None:
                values_to_add[i] = ''
            
            if (keys_to_add[i] == "coords") and (values_to_add[i] != ''):
                coords = values_to_add[i].split(',')
                if ';' in coords[1]:
                    coords[1] = coords[1][:coords[1].find(';')]
                volume[keys_to_add[i]] = coords[0] + ", " + coords[1]
                continue
            elif keys_to_add[i] == "coords":
                volume[keys_to_add[i]] = "0, 0"
                continue
                
            volume[keys_to_add[i]] = values_to_add[i]            
        
        if len(start_date) == 9:
            date_parts = start_date.split('-')
            if len(date_parts) == 2:
                start_date = start_date[:4]
            else:           
                date_parts = start_date.split('-')                
                if len(date_parts[0]) == 1:                    
                    start_date = date_parts[2] + '-0' + date_parts[0] + '-' + date_parts[1]
                else:
                    start_date = date_parts[2] + '-' + date_parts[1] + '-0' + date_parts[0]
            
        if len(end_date) == 9:
            date_parts = end_date.split('-')
            if len(date_parts) == 2:                
                end_date = end_date[5:]
            else:                
                date_parts = end_date.split('-')
                if len(date_parts[0]) == 1:
                    end_date = date_parts[2] + '-0' + date_parts[0] + '-' + date_parts[1]
                else:
                    end_date = date_parts[2] + '-' + date_parts[1] + '-0' + date_parts[0]
                    
        long_months = ["01", "03", "05", "07", "08", "10", "12"]
        date_parts = end_date.split('-')
        
        if not ('1' in start_date):
            start_date = "Unknown"
        if not ('1' in end_date):
            end_date = "Unknown"
        if "??" in start_date:
            start_date = start_date.replace("??", "01")
        if end_date[5:7] == "??":
            end_date = end_date[:5] + "12" + end_date[7:]
        if end_date[8:] == "??":
            if date_parts[1] in long_months:
                end_date = end_date[:8] + "31"
            elif date_parts[1] == "02":
                end_date = end_date[:8] + "28"
            else:
                end_date = end_date[:8] + "30"
                
        start_date = start_date.replace('+', '-')
        end_date = end_date.replace('+', '-')
        start_date = start_date.replace('00', '01')
        end_date = end_date.replace('00', '01')
        
        if ',' in start_date:
            start_date = start_date[:4]
        if ',' in end_date:
            end_date = end_date.replace(' ', '')
            end_date = end_date[-4:]            
        
        date_parts = end_date.split('-')
        
        if len(start_date) == 10:
            volume["start_date"] = start_date + "T00:00:00Z"
        elif start_date == "Unknown":
            volume["start_date"] = "1500-01-01T00:00:00Z"
        elif len(start_date) == 7:
            volume["start_date"] = start_date + "-01T00:00:00Z"
        else:
            volume["start_date"] = start_date + "-01-01T00:00:00Z"       
        
        if len(date_parts) == 2:
            if date_parts[1] in long_months:
                end_month_length = "31"
            elif date_parts[1] == "02":
                end_month_length = "28"
            else:
                end_month_length = "30"            
        
        if len(end_date) == 10:
            volume["end_date"] = end_date + "T23:59:59Z"
        elif end_date == "Unknown":
            volume["end_date"] = "1999-12-31T23:59:59Z"
        elif len(end_date) == 7:
            volume["end_date"] = end_date + '-' + end_month_length + "T23:59:59Z"
        else:
            volume["end_date"] = end_date + "-12-31T23:59:59Z"
        
        for key in keys_to_delete:            
            del volume[key]
                    
        batch_record = {"type": "add", "id": str(volume["identifier"]), "fields": volume}
        batch.append(batch_record)
        volumes += 1
        
    with open(output_dir, 'w', encoding="utf-8") as outfile:
        json.dump(batch, outfile)   
        
    return "Metadata for " + str(volumes) + " volumes combined in a CloudSearch batch request available at " + output_dir

In [26]:
#no_test

build_cloudsearch_batch("colombia-vol.json", "colombia-cs-batch.json")

'Metadata for 556 volumes combined in a CloudSearch batch request available at colombia-cs-batch.json'

In [None]:
#export

def scrape_bucket(bucket_name, prefix=None):
    s3_resource = boto3.resource('s3')
    s3_client = boto3.client('s3')
    bucket = s3_resource.Bucket(bucket_name)
    
    volume_ids = []
    titles = []
    volume_roots = []
    image_counts = []
    has_jpg = []
    has_tif = []
    has_other = []
    other = []
    has_pdf = []
    has_metadata = []
    volume_metadata = []
    
    folders = ["jpg", "tif", "metadata"]   
    
    for obj in bucket.objects.filter(Prefix = prefix):    
        if (len(obj.key.split('/')) >= 4) and (obj.key.split('/')[3].isdigit()) and (obj.key.split('/')[3] != '') and (obj.key.split('/')[3] not in volume_ids):
            volume_ids.append(obj.key.split('/')[3])
            volume_root = obj.key[:obj.key.find('/', obj.key.find(obj.key.split('/')[3]))]
            volume_roots.append(volume_root)            
            has_metadata.append(False)
            has_pdf.append(False)
            has_jpg.append(False)
            has_tif.append(False)
            has_other.append(False)
            other.append(None)
            for volume_obj in bucket.objects.filter(Prefix = volume_root):
                if "DC.xml" in volume_obj.key:                    
                    has_metadata[-1] = True
                    s3_client.download_file(bucket.name, volume_obj.key, "temp.xml")
                    vol_dict = ssda_volume_xml_to_dict("temp.xml", volume_root)
                    volume_metadata.append(vol_dict)
                    if "title" in vol_dict:
                        titles.append(vol_dict["title"])
                    else:
                        titles.append("no title")
                    os.remove("temp.xml")
                elif "pdf" in volume_obj.key.lower():
                    has_pdf[-1] = True
                elif (has_jpg[-1] == False) and (volume_obj.key.lower().split('/')[4] == "jpg"):
                    has_jpg[-1] = True
                elif (has_tif[-1] == False) and (volume_obj.key.lower().split('/')[4] == "tif"):
                    has_tif[-1] = True
                elif (len(volume_obj.key.split('/')) > 5) and (volume_obj.key.lower().split('/')[4] not in folders) and ((other[-1] == None) or (volume_obj.key.lower().split('/')[4] not in other[-1])):
                    has_other[-1] = True
                    if other[-1] == None:
                        other[-1] = volume_obj.key.lower().split('/')[4]
                    else:
                        other[-1] = other[-1] + '|' + volume_obj.key.lower().split('/')[4]
                
            image_metadata = []
            prod_imgs = None
            if has_jpg[-1]:
                prod_imgs = "jpg"
            elif has_tif[-1]:
                prod_imgs = "tif"
                
            if prod_imgs == None:
                volume_metadata[-1]["images"] = []
                image_counts.append(0)
            else:
                bad_images = 0
                for image_obj in bucket.objects.filter(Prefix = volume_root + '/' + prod_imgs.upper()):
                    if ('.' + prod_imgs) in image_obj.key.lower():
                        file_name = image_obj.key[image_obj.key.rfind('/') + 1:image_obj.key.rfind('.')]
                        if not file_name.isdigit():
                            print("found bad image file name at " + image_obj.key)
                            bad_images += 1
                            continue
                        extension = image_obj.key[image_obj.key.rfind('.') + 1:]
                        temp_path = file_name + '.' + extension
                        s3_client.download_file(bucket.name, image_obj.key, temp_path)
                        im = Image.open(temp_path)
                        width, height = im.size
                        im.close()
                        os.remove(temp_path)
                        image = {"file_name": int(file_name), "extension": extension, "height": height, "width": width}
                        image_metadata.append(image)
                volume_metadata[-1]["images"] = image_metadata
                image_counts.append(len(image_metadata) + bad_images)           
            
            print("Completed " + titles[-1])
        elif (len(obj.key.split('/')) >= 5) and (not obj.key.split('/')[3].isdigit()) and (obj.key.split('/')[4] != '') and (obj.key.split('/')[4] not in volume_ids):
            volume_ids.append(obj.key.split('/')[4])
            volume_root = obj.key[:obj.key.find('/', obj.key.find(obj.key.split('/')[4]))]
            volume_roots.append(volume_root)            
            has_metadata.append(False)
            has_pdf.append(False)
            has_jpg.append(False)
            has_tif.append(False)
            has_other.append(False)
            other.append(None)
            for volume_obj in bucket.objects.filter(Prefix = volume_root):
                if "DC.xml" in volume_obj.key:                    
                    has_metadata[-1] = True
                    s3_client.download_file(bucket.name, volume_obj.key, "temp.xml")
                    vol_dict = ssda_volume_xml_to_dict("temp.xml", volume_root)
                    volume_metadata.append(vol_dict)
                    if ("title" in vol_dict) and (vol_dict["title"] != None):
                        titles.append(vol_dict["title"])
                    else:
                        titles.append("no title")
                    os.remove("temp.xml")
                elif "pdf" in volume_obj.key.lower():
                    has_pdf[-1] = True
                elif (has_jpg[-1] == False) and (volume_obj.key.lower().split('/')[5] == "jpg"):
                    has_jpg[-1] = True
                elif (has_tif[-1] == False) and (volume_obj.key.lower().split('/')[5] == "tif"):
                    has_tif[-1] = True
                elif (len(volume_obj.key.split('/')) > 6) and (volume_obj.key.lower().split('/')[5] not in folders) and ((other[-1] == None) or (volume_obj.key.lower().split('/')[5] not in other[-1])):
                    has_other[-1] = True
                    if other[-1] == None:
                        other[-1] = volume_obj.key.lower().split('/')[5]
                    else:
                        other[-1] = other[-1] + '|' + volume_obj.key.lower().split('/')[5]
                        
            if has_metadata[-1] == False:
                titles.append("no title")
                print("Failed to find metadata for " + volume_root)
                
            image_metadata = []
            prod_imgs = None
            if has_jpg[-1]:
                prod_imgs = "jpg"
            elif has_tif[-1]:
                prod_imgs = "tif"
                
            if prod_imgs == None:
                volume_metadata[-1]["images"] = []
                image_counts.append(0)
            else:
                bad_images = 0
                for image_obj in bucket.objects.filter(Prefix = volume_root + '/' + prod_imgs.upper()):
                    if ('.' + prod_imgs) in image_obj.key.lower():
                        file_name = image_obj.key[image_obj.key.rfind('/') + 1:image_obj.key.rfind('.')]
                        if not file_name.isdigit():
                            print("found incorrect image file name at " + image_obj.key)
                            bad_images += 1
                            continue
                        extension = image_obj.key[image_obj.key.rfind('.') + 1:]
                        temp_path = file_name + '.' + extension
                        s3_client.download_file(bucket.name, image_obj.key, temp_path)
                        try:
                            im = Image.open(temp_path)
                            width, height = im.size
                            im.close()                        
                            image = {"file_name": int(file_name), "extension": extension, "height": height, "width": width}
                            image_metadata.append(image)
                        except:
                            print("found bad image file at " + image_obj.key)
                            bad_images += 1
                        os.remove(temp_path)
                volume_metadata[-1]["images"] = image_metadata
                image_counts.append(len(image_metadata) + bad_images)           
            
            try:
                print("Completed " + titles[-1])
            except:
                print("Completed")
                print(titles[-1])
                
    volumes_dict = {"id": volume_ids, "title": titles, "images": image_counts, "s3 root": volume_roots, "metadata": has_metadata, "has pdf": has_pdf, "has jpg": has_jpg, "has tif": has_tif, "has other": has_other, "other": other}
    volumes_df = pd.DataFrame.from_dict(volumes_dict)
    
    return volumes_df, volume_metadata  

In [None]:
#export

def ssda_volume_xml_to_dict(volume_xml, s3_path):
    import xml.etree.ElementTree as ET
    tree = ET.parse(volume_xml)
    root = tree.getroot()
    volume_dict = {}
    volume_dict["s3_path"] = s3_path
    for item in root:
        if "{http://purl.org/dc/elements/1.1/}" in item.tag:
            item.tag = item.tag[item.tag.find('}') + 1:]
        if item.text == None:
            if item.tag not in volume_dict:
                volume_dict[item.tag] = None
            continue
        if item.text[0] == ' ':
            item.text = item.text[1:]        
        if item.tag == "subject":
            if "subject" in volume_dict:
                volume_dict["subject"].append(item.text.split("--"))
            else:
                volume_dict["subject"] = item.text.split("--")
        elif item.tag == "title":
            volume_dict["title"] = item.text
        elif item.tag == "contributor":
            if (item.text.find('(') != -1) and (item.text.find(')') != -1):
                name = item.text[:item.text.find('(')]
                role = item.text[item.text.find('(') + 1:item.text.find(')')]
            else:
                continue
            if "contributor" in volume_dict:
                volume_dict["contributor"].append({"name": name, "role": role})
            else:
                volume_dict["contributor"] = [{"name": name, "role": role}]
        elif item.tag == "identifier":
            volume_dict["identifier"] = item.text[item.text.find(':') + 1:]
        elif item.tag == "coverage":
            if ('.' in item.text) and (',' in item.text) and ("Archives" not in item.text):
                if "coverage" in volume_dict:
                    volume_dict["coverage"]["coords"] = item.text
                else:
                    volume_dict["coverage"] = {"coords": item.text}
            elif "--" in item.text:
                places = item.text.split("--")
                if len(places) == 4:
                    if "coverage" in volume_dict:
                        volume_dict["coverage"]["country"] = places[1]
                        volume_dict["coverage"]["state"] = places[2]
                        volume_dict["coverage"]["city"] = places[3]
                    else:
                        volume_dict["coverage"] = {"country": places[1], "state": places[2], "city": places[3]}                
        elif item.tag == "source":
            if "coverage" in volume_dict:
                volume_dict["coverage"]["institution"] = item.text
            else:
                volume_dict["coverage"] = {"institution": item.text}
        elif ((item.tag == "type") and (item.text == "Text")) or (item.tag == "rights"):
            continue
        elif (item.tag == "creator") and (';' in item.text):            
            creators = item.text.split(';')
            for creator in creators:                
                if (len(creator) > 1) and (creator[0] == ' '):
                    creator = creator[1:]
                if "creator" in volume_dict:
                    volume_dict["creator"].append(creator)
                else:
                    volume_dict["creator"] = [creator]
        elif (item.tag == "language") and (';' in item.text):
            languages = item.text.split(';')
            for language in languages:
                if (len(language) > 1) and (language[0] == ' '):
                    language = language[1:]
                if "language" in volume_dict:
                    volume_dict["language"].append(language)
                else:
                    volume_dict["language"] = [language]
        else:            
            if item.tag in volume_dict:                
                volume_dict[item.tag].append(item.text)                
            else:
                volume_dict[item.tag] = [item.text]       
        
    if "coverage" not in volume_dict:
        volume_dict["coverage"] = {}               
        volume_dict["coverage"]["country"] = volume_dict["s3_path"].split('/')[0].replace('_', ' ')
        volume_dict["coverage"]["state"] = volume_dict["s3_path"].split('/')[1].replace('_', ' ')
        volume_dict["coverage"]["city"] = volume_dict["s3_path"].split('/')[2].replace('_', ' ')
        volume_dict["coverage"]["institution"] = volume_dict["s3_path"].split('/')[3].replace('_', ' ')
    elif "institution" not in volume_dict["coverage"]:
        volume_dict["coverage"]["institution"] = volume_dict["s3_path"].split('/')[3].replace('_', ' ')
        
    return volume_dict

In [None]:
#no_test

prefixes = ["Colombia/Chocó/Quibdó/Notaria_Primera_de_Quibdó/56975"]

for pref in prefixes:
    df, metadata = scrape_bucket("ssda-assets", prefix=pref)
    pref = pref.replace('/', '_')
    pref = pref[:-1]
    df.to_csv(pref.lower() + ".csv", index = False)
    with open(pref.lower() + ".json", 'w', encoding="utf-8") as outfile:
        outfile.write('{\n\"volumes\": \n')
        json.dump(metadata, outfile)
        outfile.write('}')
    print("Finished working on " + pref)

Completed Libro de Venta de Esclavo
Finished working on Colombia_Chocó_Quibdó_Notaria_Primera_de_Quibdó_5697


In [None]:
#no_test

from nbdev.export import notebook2script
notebook2script()

Converted manifest_generation.ipynb.
Converted s3_scrape.ipynb.
Converted s3_scrape_dev.ipynb.
