## Part 1: Load Libraries, functions, and data
First we need to load the required libraries, define any functions that are going to be used repeatedly, and then finally load the full train and test data.

In [0]:
import numpy as np
import pandas as pd
import os
import math
import random
from matplotlib import pyplot as plt
import shutil
import cv2
from datetime import datetime
from urllib import request
%matplotlib inline

# Work from this folder so that the relative 'Data/...' and 'images/...' paths used in later cells resolve.
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive beforehand - confirm.
os.chdir('/content/drive/My Drive')

In [0]:
# The listing CSVs ship without a header row, so the column names are supplied here.
header_names = ['ASIN', 'Filename', 'Image URL', 'Title', 'Author', 'Category ID', 'Category']


def _read_listing(csv_path):
    """Read one headerless book listing CSV into a DataFrame with `header_names` as columns.

    The file is opened as UTF-8 with errors='ignore', so bytes that cannot be decoded
    are dropped instead of aborting the load.
    """
    with open(csv_path, mode='r', encoding='utf-8', errors='ignore') as listing_file:
        return pd.read_csv(listing_file, delimiter=",", header=None, names=header_names)


train_data = _read_listing('Data/new-book30-listing-train.csv')

test_data = _read_listing('Data/new-book30-listing-test.csv')

## Part 2: Split out validation data
We need to split out the training data to produce an explicit validation dataset for usage in training

In [0]:
grouped_train = train_data.groupby('Category')

# Stratified split: draw 10% of every category with a fixed seed so the split is reproducible.
# The per-category samples are concatenated directly instead of going through
# grouped_train.apply(...): apply() prepends the group key to the index, which would give
# val_data a (Category, row) MultiIndex and make the validation CSVs written later carry an
# extra leading column that the train/test CSVs do not have. Concatenating keeps the flat
# row index of train_data and still keeps the 'Category' column in the result.
val_data = pd.concat(
    [category_rows.sample(frac=0.1, random_state = 1234) for _, category_rows in grouped_train]
)

print(f'Validation set contains {val_data.shape[0]} of the records.')

Validation set contains 5130 of the records.


In [0]:
# Everything whose ASIN was sampled into the validation set is removed from training.
in_validation = train_data['ASIN'].isin(val_data['ASIN'])
new_train = train_data.loc[~in_validation]
print(f'New training set has {new_train.shape[0]} records, down from {train_data.shape[0]}')

New training set has 46170 records, down from 51300


In [0]:
# Save data: first the full listings for each split, then the reduced label files.
# All files are written without a header row (the row index is still written as the first column).
listing_targets = [
    (new_train, 'Data/final-book30-listing-train.csv'),
    (val_data, 'Data/final-book30-listing-valid.csv'),
    (test_data, 'Data/final-book30-listing-test.csv'),
]
for split_frame, listing_path in listing_targets:
    split_frame.to_csv(listing_path, header = False)

# Label files only keep the image filename and its numeric category.
label_targets = [
    (new_train, 'Data/final-book30-labels-train.csv'),
    (val_data, 'Data/final-book30-labels-valid.csv'),
    (test_data, 'Data/final-book30-labels-test.csv'),
]
for split_frame, labels_path in label_targets:
    split_frame.to_csv(labels_path, header = False, columns = ['Filename', 'Category ID'])

## Part 3: Make Folders


In [0]:
# Output layout: one folder per split, each with one sub-folder per pre-processing variant.
folders = ['images/Train', 'images/Test', 'images/Valid', 
           'images/Train/noprep', 'images/Test/noprep', 'images/Valid/noprep', 
           'images/Train/padded', 'images/Test/padded', 'images/Valid/padded', 
           'images/Train/cropped', 'images/Test/cropped', 'images/Valid/cropped']
for fol in folders:
    # exist_ok makes the cell safe to re-run and avoids the check-then-create race
    # of testing os.path.exists() before calling makedirs().
    os.makedirs(fol, exist_ok=True)

## Part 3.5: Manual fixes
Some of the files got a bad request error when we looked for their images, so I need to manually find and download the image for them. This adds more uncertainty and difference between the previous work and this one but there's nothing else to be done.

In [0]:
# Find files that fail to load for whatever reason
def _find_unreadable_images(listing):
    """Return the filenames in `listing` whose image cannot be loaded.

    `listing` must have 'Category' and 'Filename' columns; each image is looked up at
    images/<Category>/<Filename>. A running row counter is printed every 100 rows.
    """
    failed = []
    for i, (fcat, fname) in enumerate(zip(listing["Category"], listing["Filename"])):
        if i % 100 == 0:
            print(i)
        try:
            plt.imread(f'images/{fcat}/{fname}')
        except Exception:
            # Any load failure (missing file, corrupt data, ...) marks the file for manual
            # fixing. Exception rather than a bare except, so Ctrl-C still interrupts the scan.
            failed.append(fname)
    return failed


train_error_list = _find_unreadable_images(new_train)

test_error_list = _find_unreadable_images(test_data)

valid_error_list = _find_unreadable_images(val_data)

In [269]:
# Total number of unreadable images across the three splits.
total_errors = sum(len(failed) for failed in (train_error_list, test_error_list, valid_error_list))
print('There was a total of', total_errors, 'errors that need to be manually corrected.')

There was a total of 7 errors that need to be manually corrected.


In [0]:
# Pair every split with the filenames that failed to load from it, then collect the
# matching listing rows from all three splits into a single frame.
failed_by_split = [
    (new_train, train_error_list),
    (test_data, test_error_list),
    (val_data, valid_error_list),
]
errors_to_fix = pd.concat(
    [split_frame[split_frame['Filename'].isin(failed)] for split_frame, failed in failed_by_split]
)

In [0]:
# Save the list of problem records in case it is needed later (written with a header row)
errors_to_fix.to_csv('Data/manual_errors_to_fix.csv', header = True)

In [0]:
# Work out where each problem image lives on disk: images/<Category>/<Filename>
full_paths = 'images/' + errors_to_fix['Category'] + '/' + errors_to_fix['Filename']
to_delete = errors_to_fix.assign(full_path = full_paths)

In [279]:
# delete issue files
for fl in to_delete['full_path']:
    print('Deleting ', fl)
    try:
        os.remove(fl)
    except FileNotFoundError:
        # A file can be on the error list precisely because it was never downloaded;
        # there is nothing to delete then, and the remaining files must still be handled.
        print('  nothing to delete, file does not exist')


Deleting  images/Self-Help/1416298126.jpg
Deleting  images/Calendars/1419717200.jpg
Deleting  images/Self-Help/B008J3X2CS.jpg
Deleting  images/Self-Help/0399170626.jpg
Deleting  images/Calendars/1449469418.jpg
Deleting  images/Arts & Photography/1560109904.jpg


In [0]:
# redownload them
for i, (url, fl_path) in enumerate(zip(to_delete['Image URL'], to_delete['full_path'])):
    print(i)
    # Fetch the whole image first so a failed download never leaves a truncated file behind.
    # The with-blocks close the HTTP response and the file even if an error occurs.
    with request.urlopen(url) as downloaded_img:
        image_bytes = downloaded_img.read()
    with open(fl_path, mode='wb') as f:
        f.write(image_bytes)

## Part 4: Produce 3 different datasets
Here we try 3 different approaches to image pre-processing, because the CNNs we will be using require square images. We already have the first, which is just the default non-cropped, non-padded data, but we will scale this down a little for filesize before moving it into its own folder.

We will also produce a version where the image is padded with white pixels to force a square shape, and finally a cropped version to force a square shape.

In all cases we will downscale to 299x299 pixels before saving. This needs to be reduced further for use by some of the CNNs, but it is the largest size required by any of the 3 CNNs we will use.

We also take this opportunity to produce a file that has some information regarding the image itself for further analysis outside of this notebook.

In [0]:
# Columns of the per-image statistics tables filled in while the images are processed
column_names = ["Filename", "Category", "Dataset", "Height", "Width", "Avg_Red", "Avg_Green", "Avg_Blue"]

# Number of images in each split
num_images_train, num_images_test, num_images_val = (
    len(split_frame) for split_frame in (new_train, test_data, val_data)
)


def _empty_image_stats(n_rows):
    """Return an all-empty stats table with rows labelled 0..n_rows-1 and `column_names` as columns."""
    return pd.DataFrame(index=np.arange(0, n_rows), columns=column_names)


# Preallocate the space to speed up the work
image_dat_train = _empty_image_stats(num_images_train)
image_dat_test = _empty_image_stats(num_images_test)
image_dat_valid = _empty_image_stats(num_images_val)

In [0]:
def _square_variants(image):
    """Return (cropped, padded) square versions of a height x width x channels image array.

    cropped: the longer dimension is cut down to the shorter one, keeping the centre.
    padded:  the shorter dimension is extended to the longer one with white borders.
    When the difference between the two dimensions is odd, the extra pixel goes to the
    leading side (top/left), both for the crop offset and for the padding.
    An image that is already square is returned unchanged for both variants.
    """
    height, width = image.shape[0], image.shape[1]
    if height == width:
        return image, image

    excess = abs(height - width)
    trailing = excess // 2
    leading = excess - trailing  # equals trailing, or trailing + 1 when excess is odd
    white = [255, 255, 255]

    if height > width:
        # Taller than wide: crop rows, pad left/right
        cropped = image[leading:leading + width, :, :]
        padded = cv2.copyMakeBorder(image, 0, 0, leading, trailing, cv2.BORDER_CONSTANT, value=white)
    else:
        # Wider than tall: crop columns, pad top/bottom
        cropped = image[:, leading:leading + height, :]
        padded = cv2.copyMakeBorder(image, leading, trailing, 0, 0, cv2.BORDER_CONSTANT, value=white)
    return cropped, padded


def prep_images_and_save(df, df2, pth, n, dataset, resize_shape):
    """Create the noprep / padded / cropped versions of every image listed in `df`.

    For each of the first `n` rows of `df` the image at images/<Category>/<Filename> is
    loaded, its size and average channel values are recorded in `df2`, and three
    resize_shape x resize_shape versions are written to <pth>/noprep, <pth>/padded and
    <pth>/cropped under the original filename.

    Parameters
    ----------
    df : DataFrame with 'Filename' and 'Category' columns describing the images.
    df2 : DataFrame that receives one row per image at index label i (0, 1, 2, ...);
        the row holds filename, category, dataset, height, width and the average of
        channels 0, 1 and 2, in that order. Modified in place.
    pth : output folder that already contains 'noprep', 'padded' and 'cropped' sub-folders.
    n : number of images to process (also used in the progress message).
    dataset : label stored in the dataset column of every row written to `df2`.
    resize_shape : edge length in pixels of the saved square images.

    Returns
    -------
    None. Progress is printed every 200 images.
    """
    target_size = (resize_shape, resize_shape)
    # Zip the Category column alongside the filenames instead of doing a positional
    # df.iloc[i] row lookup for every image.
    for i, fname, fcat in zip(range(n), df["Filename"], df["Category"]):
        if i % 200 == 0:
            print(f'{datetime.now()} Running image {i} / {n}')

        # Load the image
        current_image = plt.imread(f'images/{fcat}/{fname}')
        # Some images get loaded in greyscale (2-D array), so convert them to 3 channels.
        # cv2.imread is not used for loading because some of the .jpg files carry an
        # alpha channel, which broke that read.
        if len(current_image.shape) == 2:
            current_image = cv2.cvtColor(current_image, cv2.COLOR_GRAY2RGB)

        # Record the properties of the (unresized) image
        height, width = current_image.shape[0], current_image.shape[1]
        df2.loc[i] = [fname,
                      fcat,
                      dataset,
                      height,
                      width,
                      np.average(current_image[:, :, 0]),  # average red value
                      np.average(current_image[:, :, 1]),  # average green value
                      np.average(current_image[:, :, 2])]  # average blue value

        cropped_image, padded_image = _square_variants(current_image)

        # Resize every variant to the target size and save it. The image was loaded in
        # RGB order while cv2.imwrite expects BGR, hence the conversion before writing.
        variants = (('noprep', current_image), ('padded', padded_image), ('cropped', cropped_image))
        for folder, variant in variants:
            resized = cv2.resize(variant, dsize=target_size, interpolation=cv2.INTER_CUBIC)
            cv2.imwrite(f'{pth}/{folder}/{fname}', cv2.cvtColor(resized, cv2.COLOR_RGB2BGR))

In [0]:
# Write the 299x299 noprep/padded/cropped versions of every test image under images/Test and
# fill image_dat_test with the per-image stats, then save those stats for later analysis.
prep_images_and_save(test_data, image_dat_test, 'images/Test', num_images_test, 'Test', 299)
image_dat_test.to_csv('Data/dims_and_colours_to_analyse_test.csv', header = True)


In [298]:
# Write the 299x299 noprep/padded/cropped versions of every validation image under images/Valid and
# fill image_dat_valid with the per-image stats, then save those stats for later analysis.
prep_images_and_save(val_data, image_dat_valid, 'images/Valid', num_images_val, 'Valid', 299)
image_dat_valid.to_csv('Data/dims_and_colours_to_analyse_valid.csv', header = True)


2020-05-24 20:40:45.068972 Running image 0 / 5130
2020-05-24 20:40:51.903629 Running image 200 / 5130
2020-05-24 20:40:58.610409 Running image 400 / 5130
2020-05-24 20:41:05.501958 Running image 600 / 5130
2020-05-24 20:41:12.719085 Running image 800 / 5130
2020-05-24 20:41:19.518211 Running image 1000 / 5130
2020-05-24 20:41:26.636889 Running image 1200 / 5130
2020-05-24 20:41:33.831603 Running image 1400 / 5130
2020-05-24 20:41:40.837637 Running image 1600 / 5130
2020-05-24 20:41:47.753951 Running image 1800 / 5130
2020-05-24 20:41:54.507005 Running image 2000 / 5130
2020-05-24 20:42:02.287641 Running image 2200 / 5130
2020-05-24 20:42:09.927488 Running image 2400 / 5130
2020-05-24 20:42:16.690382 Running image 2600 / 5130
2020-05-24 20:42:23.822091 Running image 2800 / 5130
2020-05-24 20:42:30.741587 Running image 3000 / 5130
2020-05-24 20:42:37.777843 Running image 3200 / 5130
2020-05-24 20:42:44.676694 Running image 3400 / 5130
2020-05-24 20:42:51.818106 Running image 3600 / 5130


In [299]:
# Write the 299x299 noprep/padded/cropped versions of every training image under images/Train and
# fill image_dat_train with the per-image stats, then save those stats for later analysis.
prep_images_and_save(new_train, image_dat_train, 'images/Train', num_images_train, 'Train', 299)
image_dat_train.to_csv('Data/dims_and_colours_to_analyse_train.csv', header = True)

2020-05-24 20:44:15.738417 Running image 0 / 46170
2020-05-24 20:44:22.983781 Running image 200 / 46170
2020-05-24 20:44:30.009870 Running image 400 / 46170
2020-05-24 20:44:37.092980 Running image 600 / 46170
2020-05-24 20:44:44.197505 Running image 800 / 46170
2020-05-24 20:44:51.223005 Running image 1000 / 46170
2020-05-24 20:44:58.392252 Running image 1200 / 46170
2020-05-24 20:45:05.465334 Running image 1400 / 46170
2020-05-24 20:45:12.507543 Running image 1600 / 46170
2020-05-24 20:45:19.825536 Running image 1800 / 46170
2020-05-24 20:45:26.807241 Running image 2000 / 46170
2020-05-24 20:45:34.678250 Running image 2200 / 46170
2020-05-24 20:45:41.892942 Running image 2400 / 46170
2020-05-24 20:45:48.855450 Running image 2600 / 46170
2020-05-24 20:45:55.916979 Running image 2800 / 46170
2020-05-24 20:46:02.957404 Running image 3000 / 46170
2020-05-24 20:46:09.975299 Running image 3200 / 46170
2020-05-24 20:46:16.848234 Running image 3400 / 46170
2020-05-24 20:46:24.586373 Running 