In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))


In [None]:
import os

# Directory that is scanned (recursively) for the input images.
INPUT_FILES = '/kaggle/input/cropped-cats-and-dogs'

# Identity of the dataset that the upload cell publishes, and the local
# directory the generated images are written to before upload.
OUTPUT_DATASET_ID = 'augmented-cats-and-dogs'
OUTPUT_DATASET_NAME = 'Augmented Cats and Dogs'
OUTPUT_PATH = './output'

# kaggle_secrets not supported by Google Cloud Platform for Kaggle(Beta) at this time
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
#API_TOKEN = user_secrets.get_secret("Crop Cats and Cogs YOLOv3")

# Credentials are taken from the environment when available so that real
# secrets never have to be saved inside the notebook. The placeholders are
# kept as fallbacks so the cell behaves as before when the variables are unset.
USER_ID = os.environ.get('KAGGLE_USERNAME', 'KAGGLE-USERNAME') # use your own username
API_TOKEN = os.environ.get('KAGGLE_KEY', 'KAGGLE-API-TOKEN') # use your own kaggle api key

# Number of images per augmentation batch and number of passes over the
# training split made by the augmentation loop.
BATCH_SIZE = 16
ITERATIONS = 10

# final image size
# Same size is used in Crop Cats and Dogs
X_SIZE = 224
Y_SIZE = 224

In [None]:
import os

# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
# if pad == True, maintain aspect ratio and pad images otherwise just rescale
def image_resize(image, x_size, y_size, pad):
    """Resize ``image`` to ``x_size`` (width) by ``y_size`` (height) pixels.

    Args:
        image: image array laid out as (rows, cols, channels). The padded
            branch pastes it onto a 3-channel canvas, so it must have
            3 channels when ``pad`` is true.
        x_size: target width in pixels.
        y_size: target height in pixels.
        pad: when true, keep the aspect ratio and centre the scaled image on a
            green ``uint8`` canvas; when false, stretch to the target size.

    Returns:
        The resized image; for ``pad`` true a new ``(y_size, x_size, 3)``
        ``uint8`` array.

    Raises:
        ValueError: if ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("image_resize received None instead of an image")

    if not pad:
        return cv2.resize(image, (x_size, y_size))

    # NumPy image arrays are (rows, cols, ...) == (height, width, ...).
    height, width = image.shape[:2]

    # The largest uniform scale at which BOTH dimensions still fit the canvas.
    # For a square target this equals target / max(height, width).
    scale_factor = min(x_size / width, y_size / height)

    image = cv2.resize(image, (0, 0), fx=scale_factor, fy=scale_factor)

    # Guard against a one-pixel overshoot caused by rounding in the resize.
    image = image[:y_size, :x_size]
    new_height, new_width = image.shape[:2]

    # Canvas filled with the padding colour.
    new_image = np.full((y_size, x_size, 3), (0, 255, 0), dtype=np.uint8) # (B, G, R) -- pure green padding

    # Centre the scaled image on the canvas.
    x_offset = (x_size - new_width) // 2
    y_offset = (y_size - new_height) // 2
    new_image[y_offset:y_offset + new_height, x_offset:x_offset + new_width] = image

    return new_image

This section is inspired by: [Keras ImageDataGenerator and Data Augmentation](https://www.pyimagesearch.com/2019/07/08/keras-imagedatagenerator-and-data-augmentation/) by [Adrian Rosebrock](https://www.pyimagesearch.com/author/adrian/)


In [None]:
# install imutils; its `paths` helper is imported in the next cell
! pip install imutils

In [None]:
import os
import cv2
import time
import imageio
import numpy as np
import matplotlib.pyplot as plt

from imutils import paths

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.utils import to_categorical

In [None]:
# remove all masking artifacts from image
# the background test is vectorised with NumPy instead of looping over pixels
from PIL import Image, ImageDraw

def mask_alpha(img):
    """Add an alpha channel that hides background pixels, then crop to content.

    A pixel counts as background when it is exactly pure black (0, 0, 0) or
    exactly pure green (0, 1, 0). Near-matches are kept.

    Args:
        img: array of shape (rows, cols, 3). The values are multiplied by 255
            before conversion to ``uint8``, so they are expected to be floats
            in the range [0, 1].

    Returns:
        A ``uint8`` RGBA array cropped to the bounding box of the pixels that
        remain visible after masking.
    """
    RED, GREEN, BLUE = (0, 1, 2)

    red = img[:, :, RED]
    green = img[:, :, GREEN]
    blue = img[:, :, BLUE]

    # Pure black and pure green share red == 0 and blue == 0; they differ
    # only in the green value (0 or 1).
    is_background = (red == 0.0) & (blue == 0.0) & ((green == 0.0) | (green == 1.0))

    # 0.0 = transparent (background), 1.0 = opaque (content); float64 like
    # the array the original per-pixel loop filled in.
    alpha_channel = np.where(is_background, 0.0, 1.0)

    # erode mask to get rid of more of the green screen
    kernel = np.ones((5,5), np.uint8)
    alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1)
    alpha_channel = np.expand_dims(alpha_channel,axis=2)

    img = np.concatenate((img, alpha_channel), axis=2)

    # crop image using alpha mask: composite onto a fully transparent image,
    # then take the bounding box of what is left
    pil_img = Image.fromarray((img * 255).astype(np.uint8))
    mask = Image.new("RGBA", pil_img.size, (0, 0, 0, 0))
    bbox_image = Image.composite(pil_img, mask, pil_img)
    bbox = bbox_image.convert("RGBa").getbbox()
    pil_img = pil_img.crop(bbox)
    img = np.array(pil_img)

    return img

In [None]:
# grab the list of images in our dataset directory, then initialize
# the list of data (i.e., images) and the list of their file names

print("loading images from:", INPUT_FILES)

data = []
file_names = []
imagePaths = list(paths.list_images(INPUT_FILES))

# loop over the image paths
for imagePath in imagePaths:

    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread returns None (it does not raise) for files it cannot decode
        print("skipping unreadable image:", imagePath)
        continue

    image = image_resize(image, X_SIZE, Y_SIZE, True)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # switch the color order

    # output original image as png with mask removed
    original_image = np.array(image, dtype="float") / 255.0
    original_image = mask_alpha(original_image)

    # base name without directory or extension; reused for the output files
    file_name = os.path.splitext(os.path.basename(imagePath))[0]
    imageio.imwrite(os.path.join(OUTPUT_PATH, file_name + '-original.png'), original_image)

    # update the data and file name lists, respectively
    data.append(image)
    file_names.append(file_name)
    print(file_name)

In [None]:
# convert the list of images into a NumPy array and scale all pixel
# intensities to the range [0, 1]. The result gets its own name so that
# re-running this cell does not divide already-scaled data by 255 again.
data_scaled = np.array(data, dtype="float") / 255.0

# encode the file names (which are strings) as integers; the encoded values
# are only printed here, the split below keeps the original strings
le = LabelEncoder()
labels = le.fit_transform(file_names)
print(labels)
#labels = to_categorical(labels)
#print(labels[0])

# partition the data into training and testing splits using 75% of
# the data for training and the remaining 25% for testing; the file names
# travel along as the "labels" so each image keeps its name
(trainX, testX, trainY, testY) = train_test_split(data_scaled, file_names, test_size=0.25, random_state=42)

In [None]:
# sanity check: show the base name (no directory, no extension) of the first loaded image
print(file_names[0])

In [None]:
# Augmentation generator: random rotation, zoom, shifts, shear and
# horizontal flips.
# fill_mode="constant" with cval = 0 -- presumably pixels uncovered by a
# transform are filled with 0, i.e. black (confirm against the Keras docs).
# mask_alpha treats pure black pixels as background.
aug = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="constant",
    cval = 0)

In [None]:
# Preview one augmented batch. Dedicated names are used so the encoded
# `labels` array from the earlier cell is not overwritten.
test_gen = aug.flow(trainX, trainY, batch_size=BATCH_SIZE)
preview_images, preview_names = next(test_gen)  # one batch of images and their file names

for preview_name, preview_image in zip(preview_names, preview_images):
    print(preview_name)
    plt.imshow(preview_image)
    plt.show()


In [None]:
print("Processing Images")
start_time = time.time()

# running counter that makes every output file name unique
file_num = 0

# one pass over the training split per iteration
for e in range(ITERATIONS):

    batches = 0

    for batch_images, batch_names in aug.flow(trainX, trainY, batch_size=BATCH_SIZE):

        for i in range(batch_images.shape[0]):
            img = mask_alpha(batch_images[i])

            file_num = file_num + 1
            imageio.imwrite(os.path.join(OUTPUT_PATH, batch_names[i] + '-' + str(file_num) + '.png'), img)

        # we need to break the loop by hand because the generator loops indefinitely.
        # The check runs AFTER the batch has been written; checking first would
        # drop the last batch of every pass (and write nothing at all when the
        # training split fits in a single batch).
        batches += 1
        if batches >= len(trainX) / BATCH_SIZE:
            break

run_time = time.time()-start_time
print('Done Processing Images - Total Time: {:.1f}'.format(run_time) + ' Secs')

In [None]:
# list the contents of the output directory to check what was written
!ls $OUTPUT_PATH

This section is inspired by: [kaggle uploader](https://www.kaggle.com/donkeys/kaggle-uploader)

In [None]:
# install the uploader package from the TestPyPI index; --no-deps skips installing its dependencies
! python -m pip install --index-url https://test.pypi.org/simple/ --no-deps kaggle_uploader-screamatthewind

In [None]:
import os
import time

from kaggle_uploader import kaggle_uploader

# Upload every file in the output directory as a new version of the dataset.
print("Saving Images")
start_time = time.time()

# kaggle_secrets are not supported by Google Cloud Platform for Kaggle(Beta) at this time
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# api_secret = user_secrets.get_secret("Crop Cats and Cogs YOLOv3")

# Configure the uploader module: start with an empty resource list, pass the
# credentials, then set the source directory and the dataset identity.
# NOTE(review): the order of these assignments relative to init_on_kaggle may
# matter to kaggle_uploader -- confirm before reordering.
kaggle_uploader.resources = []
kaggle_uploader.init_on_kaggle(USER_ID, API_TOKEN)
kaggle_uploader.base_path = OUTPUT_PATH
kaggle_uploader.title = OUTPUT_DATASET_NAME
kaggle_uploader.dataset_id = OUTPUT_DATASET_ID
kaggle_uploader.user_id = USER_ID

# Register each entry of the output directory; the file name is passed for
# both arguments of add_resource.
for filename in os.listdir(kaggle_uploader.base_path):
    kaggle_uploader.add_resource(filename, filename)
    
# "new version" is the string handed to update() -- presumably the version note; verify.
kaggle_uploader.update("new version")

run_time = time.time()-start_time
print('Done Saving Images - Total Time: {:.1f}'.format(run_time) + ' Secs')
