## Part 1: Load Libraries, load data and prep cuts of data
First we need to load all the data in, for the complete dataset and then the exisitng train and test sets. We then create sets of the keys from these data to identify what records have not been yet used in the training or test set as our remaining pool of records for later.

In [0]:
# Import libraries and set workspace location
import numpy as np
import pandas as pd
import os
import shutil
import random
from matplotlib import pyplot as plt
from collections import Counter
from copy import deepcopy
from IPython import display
import time
%matplotlib inline

os.chdir('/content/drive/My Drive')


In [0]:
# Function to take a record number and a dataframe and plot the image
def show_record(ASIN, df):
    cat = df[df['ASIN'] == str(ASIN)].iloc[0]['Category']
    plt.imshow(plt.imread(f'images/{cat}/{str(ASIN).zfill(10)}.jpg'))

In [0]:
# Set the headers and read in the files
header_names = ['ASIN', 'Filename', 'Image URL', 'Title', 'Author', 'Category ID', 'Category']

# All data
with open('Data/book32-listing.csv', mode='r', encoding='utf-8', errors='ignore') as f:
    all_images = pd.read_csv(f, delimiter=",", header=None, names=header_names)

# Training data
with open('Data/book30-listing-train.csv', mode='r', encoding='utf-8', errors='ignore') as f:
    train = pd.read_csv(f, delimiter=",", header=None, names=header_names)

# Test data
with open('Data/book30-listing-test.csv', mode='r', encoding='utf-8', errors='ignore') as f:
    test = pd.read_csv(f, delimiter=",", header=None, names=header_names)
    

In [0]:
# Create sets of ASINs in each group to be able to create new records
categories = train.Category.unique()

all_asins = dict()
train_asins = dict()
test_asins = dict()
pos_asins = dict()

for cat in categories:
    all_asins[cat] = set(all_images.query(f'Category == "{cat}"')['ASIN'])
    train_asins[cat] = set(train.query(f'Category == "{cat}"')['ASIN'])
    test_asins[cat] = set(test.query(f'Category == "{cat}"')['ASIN'])

pos_asins = {cat : all_asins[cat].difference(train_asins[cat]).difference(test_asins[cat]) for cat in categories}


In [5]:
for k in pos_asins.keys():
    print(k, '\t Starting Volume:', len(all_asins[k]), '\t Remaining volume:', len(pos_asins[k]))

Biographies & Memoirs 	 Starting Volume: 4261 	 Remaining volume: 2361
Children's Books 	 Starting Volume: 13605 	 Remaining volume: 11705
Engineering & Transportation 	 Starting Volume: 2672 	 Remaining volume: 772
Christian Books & Bibles 	 Starting Volume: 9139 	 Remaining volume: 7239
Sports & Outdoors 	 Starting Volume: 5968 	 Remaining volume: 4068
Health, Fitness & Dieting 	 Starting Volume: 11886 	 Remaining volume: 9986
Medical Books 	 Starting Volume: 12086 	 Remaining volume: 10186
Science & Math 	 Starting Volume: 9276 	 Remaining volume: 7376
Travel 	 Starting Volume: 18338 	 Remaining volume: 16438
Business & Money 	 Starting Volume: 9965 	 Remaining volume: 8065
Cookbooks, Food & Wine 	 Starting Volume: 8802 	 Remaining volume: 6902
Politics & Social Sciences 	 Starting Volume: 3402 	 Remaining volume: 1502
Crafts, Hobbies & Home 	 Starting Volume: 9934 	 Remaining volume: 8034
Religion & Spirituality 	 Starting Volume: 7559 	 Remaining volume: 5659
Literature & Fiction 

# Part 2: Identify possible box-set books
We want to remove any possible book covers that aren't "front facing" as is often the case with boxset offerings. These will need to be manually reviewed and we'll keep any set that has a single front facing image still, but side on images or side by sides will be removed.

In [13]:
# Identify any potential record we might want to remove
train_issues = []
test_issues = []

search_terms = ['(boxed set)', '(boxset)', '(box set)', '(anthology)', '(bundle)', '\d(-book)', '\d( book)']

for term in search_terms:
    train_issues.extend(train[train['Title'].str.lower().str.contains(term, regex = True)]['ASIN'])
    test_issues.extend(test[test['Title'].str.lower().str.contains(term, regex = True)]['ASIN'])

# Get unique records only
train_issues_clean = list(set(train_issues))
test_issues_clean = list(set(test_issues))
print('Potential number of issues: ', len(train_issues_clean) + len(test_issues_clean))

  return func(self, *args, **kwargs)


Potential number of issues:  264


In [0]:
# Move all files for manual review to it's own filter, and create a document to complete for the manual review
if os.path.exists('images/Manual Review'):
    shutil.rmtree('images/Manual Review')

os.makedirs('images/Manual Review')

for i, asin in zip(range(len(train_issues_clean)), train_issues_clean):
    print('Copying image', i)
    cat = all_images[all_images['ASIN'] == str(asin)].iloc[0]['Category']
    pth = f'images/{cat}/{asin.zfill(10)}.jpg'
    # google drive sorting doesn't work for some reason, so just going to manually number these up...
    shutil.copy(pth, f'images/Manual Review/train_{str(i).zfill(4)}.jpg')

for i, asin in zip(range(len(test_issues_clean)), test_issues_clean):
    print('Copying image', i)
    cat = all_images[all_images['ASIN'] == str(asin)].iloc[0]['Category']
    pth = f'images/{cat}/{asin.zfill(10)}.jpg'
    # google drive sorting doesn't work for some reason, so just going to manually number these up...
    shutil.copy(pth, f'images/Manual Review/test_{str(i).zfill(4)}.jpg')

In [0]:
# Write a file with the new names to review
new_names_train = pd.DataFrame(zip(range(len(train_issues_clean)), train_issues_clean, ['Train']*len(train_issues_clean)), columns = ['new_filename', 'ASIN', 'Dataset'])
new_names_test = pd.DataFrame(zip(range(len(test_issues_clean)), test_issues_clean, ['Test']*len(test_issues_clean)), columns = ['new_filename', 'ASIN', 'Dataset'])
new_names = new_names_train.append(new_names_test)
data_to_write = all_images[all_images['ASIN'].isin(train_issues_clean + test_issues_clean)].merge(new_names, on = 'ASIN', how = 'left').sort_values(['Dataset', 'new_filename'])[['ASIN', 'Dataset', 'new_filename', 'Category', 'Title']]
data_to_write['Remove'] = 0
data_to_write.to_csv('Data/manual_data_file.csv', mode= 'w+')

## Part 3: Load in review data and top up
Here we load back in the data that we manually reviewed, remove any records that were not front facing single images, and then use the remaining pool of records to replace these records with new ones. Importantly, we check these as well to ensure they don't fall into the same problem.

In [20]:
# Load data back in, filter to those that we want to remove
with open('Data/manual_data_file_completed.csv', mode='r', encoding='utf-8', errors='ignore') as f:
    reviewed_data = pd.read_csv(f, delimiter=",", header=0)

reviewed_data = reviewed_data[reviewed_data['Remove'] == 1]
print(f'In total you are removing {len(reviewed_data)} records from your datasets.')

In total you are removing 94 records from your datasets.


In [0]:
# count number to remove per category
num_issues_train = Counter(reviewed_data[reviewed_data['Dataset'] == 'Train']['Category'].tolist())
num_issues_test = Counter(reviewed_data[reviewed_data['Dataset'] == 'Test']['Category'].tolist())

In [0]:
# set seed and sample this many new ones, check the names then change seed if needed...
pos_asins2 = deepcopy(pos_asins)
replacement_asins_train = []
for k, v in num_issues_train.items():
    random.seed(10) # go in loop due to possible different order of loop
    replacement_asins_train.extend(random.sample(pos_asins[k], v)) 
    # make sure to remove before we do the test
    pos_asins2[k] = pos_asins[k].difference(set(replacement_asins_train))

# remove those just selected before doing it for testing
replacement_asins_test = []
for k, v in num_issues_test.items():
    random.seed(10)
    replacement_asins_test.extend(random.sample(pos_asins2[k], v))



In [0]:
# Manually test these book covers as well, just do this locally as we expect it to mostly be fine
for i, record in zip(range(len(replacement_asins_train)), replacement_asins_train):
    print(i)
    show_record(record, all_images)
    display.clear_output(wait=True)
    display.display(plt.gcf())
    time.sleep(1.0)

In [0]:
for i, record in zip(range(len(replacement_asins_test)), replacement_asins_test):
    print(i)
    show_record(record, all_images)
    display.clear_output(wait=True)
    display.display(plt.gcf())
    time.sleep(1.0)

In [0]:
# Need to manually remove 3 records from the new train set
new_to_remove = [replacement_asins_train[0], replacement_asins_train[23], replacement_asins_train[26]]

new_issues_train = Counter(all_images[all_images['ASIN'].isin(new_to_remove)]['Category'])

In [73]:
extra_replacement_asins_train = []
for k, v in new_issues_train.items():
    random.seed(12) # new seed otherwise we'd just get the same ones
    extra_replacement_asins_train.extend(random.sample(pos_asins[k], v)) 

# quick check we're not using them as extras already
print(f'Overlap with new train set: {len(set(extra_replacement_asins_train).intersection(set(replacement_asins_train)))}')
print(f'Overlap with new test set: {len(set(extra_replacement_asins_train).intersection(set(replacement_asins_test)))}')


print(f'New ASINs are: {extra_replacement_asins_train}')

Overlap with new train set: 0
Overlap with new test set: 0
New ASINs are: ['1449465749', '870335898', '1452501475']


In [0]:
replacement_asins_train = list(set(replacement_asins_train).difference(set(new_to_remove))) + extra_replacement_asins_train

replacement_images_train = all_images[all_images['ASIN'].isin(replacement_asins_train)]
replacement_images_test = all_images[all_images['ASIN'].isin(replacement_asins_test)]

In [83]:
# remove from the training and test set and randomly replace with the new records
new_train = train[~train['ASIN'].isin(reviewed_data[reviewed_data['Dataset'] == 'Train']['ASIN'].tolist())].append(replacement_images_train)

new_test = test[~test['ASIN'].isin(reviewed_data[reviewed_data['Dataset'] == 'Test']['ASIN'].tolist())].append(replacement_images_test)

print(f'Number of records in new train: {len(new_train)}')
print(f'Number of records in new test: {len(new_test)}')

Number of records in new train: 51300
Number of records in new test: 5700


In [0]:
# save files and be done
new_train.to_csv('Data/new-book30-listing-train.csv', header = False)
new_train.to_csv('Data/new-bookcover30-labels-train.csv', header = False, columns = ['Filename', 'Category ID'])

new_train.to_csv('Data/new-book30-listing-test.csv', header = False)
new_test.to_csv('Data/new-bookcover30-labels-test.csv', header = False, columns = ['Filename', 'Category ID'])