In [None]:
def train_upside_down_classifier(data_dir, batch_size=32, img_height=180, img_width=180, val_split=0.2, epochs=10, seed=123):
    """Train a small convolutional image classifier on the images in ``data_dir``.

    The images are loaded twice with
    ``tf.keras.utils.image_dataset_from_directory`` (once per subset), cached,
    and used to fit a three-block Conv2D/MaxPooling2D network followed by two
    dense layers.

    Args:
        data_dir: Directory handed to ``image_dataset_from_directory``; class
            labels are whatever that loader reports as ``class_names``.
        batch_size: Batch size used for both subsets.
        img_height: Height every image is resized to on load.
        img_width: Width every image is resized to on load.
        val_split: Fraction of the data held out for validation.
        epochs: Number of training epochs.
        seed: Seed used by the loader to shuffle and split the files.

    Returns:
        Tuple ``(img_height, img_width, class_names, model)``. The model's last
        layer has no activation, so ``model.predict`` yields raw logits; apply
        a softmax to turn them into probabilities.
    """
    import tensorflow as tf

    from tensorflow.keras import layers
    from tensorflow.keras.models import Sequential

    # Both subsets MUST be requested with the same split fraction and seed;
    # otherwise the loader partitions the files differently for each call and
    # the training and validation sets can overlap.
    split_kwargs = dict(
        validation_split=val_split,
        seed=seed,
        image_size=(img_height, img_width),
        batch_size=batch_size,
    )

    train_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir, subset="training", **split_kwargs)

    val_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir, subset="validation", **split_kwargs)

    # Read the class names before the dataset is wrapped by cache()/prefetch().
    class_names = train_ds.class_names
    num_classes = len(class_names)

    AUTOTUNE = tf.data.AUTOTUNE

    train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

    model = Sequential([
          # Scale pixel values from [0, 255] to [0, 1].
          layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
          layers.Conv2D(16, 3, padding='same', activation='relu'),
          layers.MaxPooling2D(),
          layers.Conv2D(32, 3, padding='same', activation='relu'),
          layers.MaxPooling2D(),
          layers.Conv2D(64, 3, padding='same', activation='relu'),
          layers.MaxPooling2D(),
          layers.Flatten(),
          layers.Dense(128, activation='relu'),
          # No activation: the loss below is configured with from_logits=True.
          layers.Dense(num_classes)
        ])

    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.fit(
      train_ds,
      validation_data=val_ds,
      epochs=epochs
    )

    return img_height, img_width, class_names, model

In [None]:
def download_volume(vol_id, tar_dir="", bucket_name = "ssda-production-jpgs", start_image = 1, end_image = None):
    """Download the images of one volume from an S3 bucket to a local directory.

    Object keys are expected to have the form ``<vol_id>-<image number>.<ext>``;
    the image number is parsed from the text between the last hyphen and the
    first following dot.

    Args:
        vol_id: Volume identifier; only keys starting with ``"<vol_id>-"`` are
            considered.
        tar_dir: Local directory the files are written to (file name = object
            key). It is not created here. An empty string means the current
            working directory.
        bucket_name: Name of the source S3 bucket.
        start_image: Lowest image number to download (inclusive).
        end_image: Highest image number to download (inclusive), or ``None``
            for no upper bound.

    Raises:
        ValueError: If a matching key's image number is not an integer.

    Prints the number of downloaded images; returns nothing.
    """
    import os

    import boto3

    s3_resource = boto3.resource("s3")
    s3_client = boto3.client("s3")
    bucket = s3_resource.Bucket(bucket_name)

    images = 0

    # The trailing hyphen keeps e.g. volume 8186 from also matching keys of
    # volume 81860.
    for obj in bucket.objects.filter(Prefix = str(vol_id) + "-"):
        image_part = obj.key.rpartition('-')[2]
        image = int(image_part.split('.')[0])
        if image < start_image:
            continue
        # Skip rather than stop: keys are listed in lexicographic order, which
        # differs from numeric order when image numbers are not zero-padded
        # ("10" sorts before "2"), so later keys may still be in range.
        if end_image is not None and image > end_image:
            continue
        s3_client.download_file(bucket_name, obj.key, os.path.join(tar_dir, obj.key))
        images += 1

    print(str(images) + " images downloaded.")

In [None]:
# Fetch images 25 through 64 (both inclusive) of volume 8186 into a local folder.
# NOTE(review): the target is a hard-coded Windows path, and download_volume does
# not create it — presumably the folder must already exist; confirm.
download_volume(8186, tar_dir = 'E:\\ssda-htr-data\\8186_transcribed', start_image = 25, end_image = 64)

40 images downloaded.


In [None]:
def rectify_volume(vol_id, classifier_training_data_dir = "E:\\folio_samples", target_bucket_name = "ssda-production-jpgs"):
    """Download a volume, normalize every image, and upload the results to S3.

    For each downloaded image the function:

    1. shrinks it by 25% per step until the JPEG file is at most 3,000,000 bytes,
    2. rotates landscape images (width > height) by 90 degrees,
    3. rotates it by 180 degrees when the freshly trained classifier predicts
       the class ``"upside_down"`` with a softmax score of at least 0.8,
    4. uploads it to ``target_bucket_name`` under the original file name, with
       the final pixel width and height stored as object metadata.

    All intermediate files live in a private temporary directory that is
    removed when the function finishes, whether it succeeds or raises.

    Args:
        vol_id: Volume identifier passed to ``download_volume``.
        classifier_training_data_dir: Training image directory passed to
            ``train_upside_down_classifier``.
        target_bucket_name: Bucket the processed images are uploaded to. With
            the default value this is the same bucket ``download_volume`` reads
            from by default, so the originals are overwritten.

    Prints progress messages; returns nothing.
    """
    import boto3
    import os
    import shutil
    import tempfile
    from PIL import Image
    import tensorflow as tf
    import numpy as np

    max_upload_bytes = 3000000   # files larger than this are shrunk
    shrink_factor = .75          # per-step scale applied to both dimensions
    min_confidence = .8          # softmax score required to flip an image

    s3_client = boto3.client("s3")
    images = 0

    # A fresh, uniquely named work directory avoids colliding with leftovers
    # of an earlier run that crashed.
    work_dir = tempfile.mkdtemp(prefix="rectify_volume_")
    try:
        download_dir = os.path.join(work_dir, "volume")
        os.makedirs(download_dir)
        # Kept outside download_dir so the directory walk never picks it up.
        scratch_path = os.path.join(work_dir, "scratch.jpg")

        download_volume(vol_id, tar_dir = download_dir)
        img_height, img_width, class_names, model = train_upside_down_classifier(classifier_training_data_dir)
        print("Upside-down classifier trained.")

        for root, dirs, files in os.walk(download_dir):
            for file in files:
                shutil.copyfile(os.path.join(root, file), scratch_path)

                # copy() detaches the pixel data from the file handle, so the
                # scratch file can be overwritten safely below.
                with Image.open(scratch_path) as opened:
                    im = opened.copy()

                while os.stat(scratch_path).st_size > max_upload_bytes:
                    width, height = im.size
                    im = im.resize((int(round(width * shrink_factor)), int(round(height * shrink_factor))))
                    im.save(scratch_path)

                if im.width > im.height:
                    im = im.transpose(Image.ROTATE_90)
                    im.save(scratch_path)

                img = tf.keras.utils.load_img(
                    scratch_path, target_size=(img_height, img_width)
                )

                img_array = tf.keras.utils.img_to_array(img)
                img_array = tf.expand_dims(img_array, 0) # Create a batch

                predictions = model.predict(img_array)
                # The classifier returns logits; convert them to probabilities.
                score = tf.nn.softmax(predictions[0])

                if (class_names[np.argmax(score)] == "upside_down") and (np.max(score) >= min_confidence):
                    im = im.transpose(Image.ROTATE_180)
                    im.save(scratch_path)

                # Read the size only after every transformation so the metadata
                # describes the image that is actually uploaded.
                width, height = im.size

                s3_client.upload_file(scratch_path, target_bucket_name, file, ExtraArgs={'ContentType': "image/jpeg", 'Metadata': {"width": str(width), "height": str(height)}})
                images += 1
    finally:
        # Best-effort cleanup; a failure here must not mask the real error.
        shutil.rmtree(work_dir, ignore_errors=True)

    print(str(images) + " images downloaded, rectified, and uploaded.")

In [None]:
# Rectify every image of volume 704115 using the default training directory
# and target bucket. Each call retrains the classifier from scratch and uploads
# every processed image under its original file name.
rectify_volume(704115)

80 images downloaded.
Found 2342 files belonging to 2 classes.
Using 1874 files for training.
Found 2342 files belonging to 2 classes.
Using 468 files for validation.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Upside-down classifier trained.
80 images downloaded, rectified, and uploaded.
