# Preprocessing

In [1]:
!pip install Pillow



In [2]:
import os
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from PIL import Image

from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.layers import Flatten
from keras.models import Sequential
from keras.utils import img_to_array

2024-03-11 03:07:26.721913: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-11 03:07:26.745209: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 03:07:26.745229: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 03:07:26.745768: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-11 03:07:26.749702: I tensorflow/core/platform/cpu_feature_guar

In [3]:
# Run configuration: image geometry, output location and input data layout
SHAPE = (224, 224)                         # size passed to Image.resize for every cropped image
folder_to_save = 'theFeaturesByBreed'      # directory that receives one features CSV per breed
annotation_dir = 'annotations/Annotation'  # annotation files, looked up per breed folder and image id
image_dir = 'images/Images'                # one sub-folder of .jpg images per breed

In [4]:
# Load the ImageNet-pretrained VGG16 *including* its top (classifier) layers.
# classifier_activation=None turns off the activation of the final layer, so the
# model presumably emits raw class scores (logits) rather than probabilities --
# confirm against the Keras VGG16 documentation.
base_model = VGG16(
    include_top=True,
    weights='imagenet',
    classifier_activation=None,
)

# The network is used for inference only, so none of its weights are trainable
base_model.trainable = False

# Wrap the base model so that every image yields a flat feature vector.
# NOTE(review): with include_top=True the base output is presumably already
# one vector per image, which would make Flatten a no-op -- confirm.
feature_extraction_model = Sequential()
feature_extraction_model.add(base_model)
feature_extraction_model.add(Flatten())

2024-03-11 03:07:27.862113: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 03:07:27.876096: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 03:07:27.876134: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 03:07:27.876938: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 03:07:27.876969: I external/local_xla/xla/stream_executor

In [5]:
def predict(img):
    """Return the feature vector that `feature_extraction_model` produces for one image.

    Args:
        img: a single preprocessed image array (no batch axis).

    Returns:
        The first (and only) row of the model's batched prediction.
    """
    # The model works on batches, so wrap the image in a batch of size one
    single_image_batch = np.expand_dims(img, axis=0)
    batch_features = feature_extraction_model.predict(single_image_batch)
    return batch_features[0]

In [10]:
# Walk the image directory breed by breed, crop every annotated image to its
# bounding box, run it through the feature extractor and write one CSV per breed.
#
# Breeds whose CSV already exists in `folder_to_save` are skipped, so an
# interrupted run can simply be restarted.


def _breed_from_folder(breed_folder):
    """Return the lowercase breed name encoded in a breed folder name.

    Folder names are presumably of the form '<id>-<breed>'. Only the FIRST
    hyphen is treated as the separator: breed names may themselves contain
    hyphens (e.g. 'flat-coated_retriever'), and keeping only the text after
    the last hyphen would truncate them and make different breeds collide on
    the same output file.
    """
    return breed_folder.split('-', 1)[-1].lower()


def _read_bounding_box(annotation_file):
    """Return (xmin, ymin, xmax, ymax) of the first bounding box in an annotation file.

    The file is parsed as XML instead of by fixed line number, so the result
    does not depend on how the file happens to be laid out.

    Raises:
        ValueError: if the file has no <bndbox> element or a coordinate is missing.
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(annotation_file).getroot()
    # Document order: the first <bndbox> belongs to the first annotated object
    bndbox = next(root.iter('bndbox'), None)
    if bndbox is None:
        raise ValueError(f"No <bndbox> element found in {annotation_file}")

    coords = []
    for tag in ('xmin', 'ymin', 'xmax', 'ymax'):
        text = bndbox.findtext(tag)
        if text is None:
            raise ValueError(f"Missing <{tag}> in {annotation_file}")
        coords.append(int(text))
    return tuple(coords)


def _load_cropped_image(image_path, box):
    """Load an image, crop it to `box`, resize it to SHAPE and preprocess it for VGG16."""
    # The context manager closes the underlying file once the pixels are loaded
    with Image.open(image_path) as source_img:
        # Force three channels: grayscale, palette or RGBA files would otherwise
        # produce an array whose channel count the model cannot accept
        cropped = source_img.convert('RGB').crop(box).resize(SHAPE)
    return preprocess_input(img_to_array(cropped))


# to_csv does not create missing directories
os.makedirs(folder_to_save, exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the row order of each CSV) reproducible
for breed_folder in sorted(os.listdir(image_dir)):
    breed_path = os.path.join(image_dir, breed_folder)
    if not os.path.isdir(breed_path):
        continue

    breed = _breed_from_folder(breed_folder)  # Lowercase for uniformity
    output_path = os.path.join(folder_to_save, breed + '.csv')

    # Skip breeds that were already written by a previous run
    if os.path.exists(output_path):
        continue

    print("Breed is:", breed)
    raw_data = []

    # Identify each image in the subfolder
    for image_file in sorted(os.listdir(breed_path)):
        if not image_file.endswith('.jpg'):
            continue

        print("Image is:", image_file)
        image_id = os.path.splitext(image_file)[0]
        image_path = os.path.join(breed_path, image_file)

        # The annotation file carries the image id as its name, without extension
        annotation_file = os.path.join(annotation_dir, breed_folder, image_id)
        if not os.path.exists(annotation_file):
            continue  # images without an annotation are left out

        box = _read_bounding_box(annotation_file)
        img = _load_cropped_image(image_path, box)

        # extract features
        features = predict(img)

        # NOTE(review): `features` is presumably a NumPy array, so to_csv stores
        # its string form in the 'Features' column -- confirm that downstream
        # code parses that representation as intended.
        raw_data.append({
            'Breed': breed,
            'Features': features
        })

    # Do not write an empty CSV: it would make this breed look "done" on every
    # later run even though nothing was extracted for it
    if not raw_data:
        print("No annotated images found for breed:", breed)
        continue

    df = pd.DataFrame(raw_data)
    df.to_csv(output_path, index=False)

Breed is: chesapeake_bay_retriever
Image is: n02099849_541.jpg
Image is: n02099849_3007.jpg
Image is: n02099849_3289.jpg
Image is: n02099849_4677.jpg
Image is: n02099849_4767.jpg
Image is: n02099849_410.jpg
Image is: n02099849_3220.jpg
Image is: n02099849_4636.jpg
Image is: n02099849_1115.jpg
Image is: n02099849_3128.jpg
Image is: n02099849_3383.jpg
Image is: n02099849_1068.jpg
Image is: n02099849_2706.jpg
Image is: n02099849_1776.jpg
Image is: n02099849_3563.jpg
Image is: n02099849_2130.jpg
Image is: n02099849_669.jpg
Image is: n02099849_1050.jpg
Image is: n02099849_3976.jpg
Image is: n02099849_1234.jpg
Image is: n02099849_1837.jpg
Image is: n02099849_1265.jpg
Image is: n02099849_2581.jpg
Image is: n02099849_671.jpg
Image is: n02099849_1997.jpg
Image is: n02099849_1575.jpg
Image is: n02099849_693.jpg
Image is: n02099849_263.jpg
Image is: n02099849_1922.jpg
Image is: n02099849_3753.jpg
Image is: n02099849_4322.jpg
Image is: n02099849_3016.jpg
Image is: n02099849_3494.jpg
Image is: n020