# Overview
This notebook prepares the data used to train an image model (for tasks such as image classification or similarity detection) and is the first step in that pipeline. It loads the image and product tables, cleans them, derives a category label for every image, saves the label encoder/decoder together with the merged training table, and finally runs the source images through a cleaning step that brings them to a common size.



# Import the necessary libraries

In [22]:
import pandas as pd
import torch
from torchvision.transforms import v2
from clean_tabular_data import LoadTabularData, RemoveNullValues, ConvertPricesToNumericalFormat, RemoveRowsWithZeroPrice


# Processing Tabular Data

## Load Data

### Images

In [23]:
# Read the image metadata table, then print a quick overview of it
images = LoadTabularData('Images.csv')

# Table size as (rows, columns)
print('\n----- Data shape -----', images.shape, sep='\n')

# Column names, non-null counts and dtypes
print('\n----- Data information -----')
images.info()

# First rows, rendered as the cell's output
print('\n----- Data sample -----')
images.head()





############## Loading data from source file: Images.csv ##############


----> Success. Data loaded successfully


----- Data shape -----
(12604, 3)

----- Data information -----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12604 entries, 0 to 12603
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  12604 non-null  int64 
 1   id          12604 non-null  object
 2   product_id  12604 non-null  object
dtypes: int64(1), object(2)
memory usage: 295.5+ KB
None

----- Data sample -----
   Unnamed: 0                                    id  \
0           0  912bb259-3ad9-457b-9db1-ce1da9016057   
1           1  b166d305-b852-4bdd-83f4-465b20da94fa   
2           2  68f5a29d-0075-4d60-81c1-ab684a82e50c   
3           3  f6a309d7-d247-446a-9b5e-aceefdd4334d   
4           4  2c2b3a6f-15b3-4289-937a-15482d9f5781   

                             product_id  
0  5f5f57d7-778f-4336-bb10-b43863418c8c  
1  5f5f57d7-7

Unnamed: 0.1,Unnamed: 0,id,product_id
0,0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c
1,1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c
2,2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
3,3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
4,4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf


### Products

In [24]:
# Read the product listings table, then print a quick overview of it
products = LoadTabularData('Products.csv')

# Table size as (rows, columns)
print('\n----- Data shape -----', products.shape, sep='\n')

# Column names, non-null counts and dtypes
print('\n----- Data information -----')
products.info()

# First rows, rendered as the cell's output
print('\n----- Data sample -----')
products.head()



############## Loading data from source file: Products.csv ##############


----> Success. Data loaded successfully


----- Data shape -----
(7156, 7)

----- Data information -----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7156 entries, 0 to 7155
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           7156 non-null   int64 
 1   id                   7156 non-null   object
 2   product_name         7156 non-null   object
 3   category             7156 non-null   object
 4   product_description  7156 non-null   object
 5   price                7156 non-null   object
 6   location             7156 non-null   object
dtypes: int64(1), object(6)
memory usage: 391.5+ KB
None

----- Data sample -----
   Unnamed: 0                                    id  \
0           0  243809c0-9cfc-4486-ad12-3b7a16605ba9   
1           1  1c58d3f9-8b93-47ea-9415-204fcc2a22e6   
2           2  860673f1

Unnamed: 0.1,Unnamed: 0,id,product_name,category,product_description,price,location
0,0,243809c0-9cfc-4486-ad12-3b7a16605ba9,"Mirror wall art | in Wokingham, Berkshire | Gu...","Home & Garden / Dining, Living Room Furniture ...","Mirror wall art. Posted by Nisha in Dining, Li...",£5.00,"Wokingham, Berkshire"
1,1,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,"Stainless Steel Food Steamer | in Inverness, H...",Home & Garden / Other Household Goods,Morphy Richard’s (model no 48755)Stainless ste...,£20.00,"Inverness, Highland"
2,2,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,"Sun loungers | in Skegness, Lincolnshire | Gum...",Home & Garden / Garden & Patio / Outdoor Setti...,I have 2 of these - collection only as I don’t...,£20.00,"Skegness, Lincolnshire"
3,3,59948726-29be-4b35-ade5-bb2fd7331856,Coffee side table from Ammunition ammo box hai...,"Home & Garden / Dining, Living Room Furniture ...",Great reclaimed army ammunition box used as co...,£115.00,"Radstock, Somerset"
4,4,16dbc860-696e-4cda-93f6-4dd4926573fb,Modern Shannon Sofa for sale at low cost | in ...,"Home & Garden / Dining, Living Room Furniture ...",New Design Shannon Corner sofa 5 Seater Avail...,£450.00,"Delph, Manchester"


## Clean the Data

### Images

In [25]:
# Remove null values from the image table.
# RemoveNullValues is imported from clean_tabular_data; its result replaces `images`.
# In the recorded run it reported 0 columns and 0 rows removed.
images = RemoveNullValues(images)



############## Cleaning null values ##############

----> Null values removed successfully. 

  ----> 0 columns removed

  ----> 0 rows removed



### Products

In [26]:
# Remove null values from the product table.
# RemoveNullValues is imported from clean_tabular_data; its result replaces `products`.
# In the recorded run it reported 0 columns and 0 rows removed.
products = RemoveNullValues(products)



############## Cleaning null values ##############

----> Null values removed successfully. 

  ----> 0 columns removed

  ----> 0 rows removed



In [27]:
# Convert the prices into a numerical format.
# The recorded outputs show e.g. '£5.00' before this cell and 5.0 after it.
# The result replaces `products`.
products = ConvertPricesToNumericalFormat(products)



############## Converting price into numerical format: ##############

----> Product price column changed successfully



## Extracting Labels for Classification

### Labels

In [28]:
# Derive the classification label: the root category is whatever precedes the
# first ' / ' separator in 'category'
# (e.g. 'Home & Garden / Other Household Goods' -> 'Home & Garden')
products['label'] = products['category'].str.split(' / ', n=1).str.get(0)
products

Unnamed: 0.1,Unnamed: 0,id,product_name,category,product_description,price,location,label
0,0,243809c0-9cfc-4486-ad12-3b7a16605ba9,"Mirror wall art | in Wokingham, Berkshire | Gu...","Home & Garden / Dining, Living Room Furniture ...","Mirror wall art. Posted by Nisha in Dining, Li...",5.0,"Wokingham, Berkshire",Home & Garden
1,1,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,"Stainless Steel Food Steamer | in Inverness, H...",Home & Garden / Other Household Goods,Morphy Richard’s (model no 48755)Stainless ste...,20.0,"Inverness, Highland",Home & Garden
2,2,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,"Sun loungers | in Skegness, Lincolnshire | Gum...",Home & Garden / Garden & Patio / Outdoor Setti...,I have 2 of these - collection only as I don’t...,20.0,"Skegness, Lincolnshire",Home & Garden
3,3,59948726-29be-4b35-ade5-bb2fd7331856,Coffee side table from Ammunition ammo box hai...,"Home & Garden / Dining, Living Room Furniture ...",Great reclaimed army ammunition box used as co...,115.0,"Radstock, Somerset",Home & Garden
4,4,16dbc860-696e-4cda-93f6-4dd4926573fb,Modern Shannon Sofa for sale at low cost | in ...,"Home & Garden / Dining, Living Room Furniture ...",New Design Shannon Corner sofa 5 Seater Avail...,450.0,"Delph, Manchester",Home & Garden
...,...,...,...,...,...,...,...,...
7151,7151,c4148656-78a9-4f3e-b393-134fdc5ef900,Sony PlayStation VR Move Bundle | in Acocks Gr...,Video Games & Consoles / Consoles / PS4 (Sony ...,Sony PlayStation VR Move Bundle353CASH ON COLL...,260.0,"Acocks Green, West Midlands",Video Games & Consoles
7152,7152,564e3411-768d-4250-a624-b119d696f103,"Playstation VR V2 Bundle | in Acocks Green, We...",Video Games & Consoles / Consoles / PS4 (Sony ...,Playstation VR V2 Bundle355CASH ON COLLECTION ...,235.0,"Acocks Green, West Midlands",Video Games & Consoles
7153,7153,2b0a652b-46a2-4297-b619-5efeeb222787,"Oculus quest 2 256gb | in Montrose, Angus | Gu...",Video Games & Consoles / Other Video Games & C...,Pick up only £250Comes with two pistols stocks...,250.0,"Montrose, Angus",Video Games & Consoles
7154,7154,719fd40a-870e-4144-b324-55dff2e66fb4,Logitech driving force shifter | in Carrickfer...,Video Games & Consoles / Video Game Accessorie...,Bought at christmas from currys retailing at £...,30.0,"Carrickfergus, County Antrim",Video Games & Consoles


### Encoder and Decoder

In [29]:
# Build the encoder (category name -> integer). Each distinct label receives the
# position at which it first appears in products['label'].
unique_labels = products['label'].unique()
category_encoder = dict(zip(unique_labels, range(len(unique_labels))))
category_encoder

{'Home & Garden': 0,
 'Baby & Kids Stuff': 1,
 'DIY Tools & Materials': 2,
 'Music, Films, Books & Games': 3,
 'Phones, Mobile Phones & Telecoms': 4,
 'Clothes, Footwear & Accessories': 5,
 'Other Goods': 6,
 'Health & Beauty': 7,
 'Sports, Leisure & Travel': 8,
 'Appliances': 9,
 'Computers & Software': 10,
 'Office Furniture & Equipment': 11,
 'Video Games & Consoles': 12}

In [30]:
# Invert the encoder so an integer index can be turned back into its category name
category_decoder = dict(zip(category_encoder.values(), category_encoder.keys()))
category_decoder

{0: 'Home & Garden',
 1: 'Baby & Kids Stuff',
 2: 'DIY Tools & Materials',
 3: 'Music, Films, Books & Games',
 4: 'Phones, Mobile Phones & Telecoms',
 5: 'Clothes, Footwear & Accessories',
 6: 'Other Goods',
 7: 'Health & Beauty',
 8: 'Sports, Leisure & Travel',
 9: 'Appliances',
 10: 'Computers & Software',
 11: 'Office Furniture & Equipment',
 12: 'Video Games & Consoles'}

#### Save Encoder and Decoder to a file for future use

In [31]:
import pickle

# Target file for the label mappings
filename = 'image_decoder.pkl'

# Bundle both directions of the mapping under a single object
data = dict(encoder=category_encoder, decoder=category_decoder)

# Serialise the bundle to disk.
# Pickle files should only ever be loaded back from a trusted location.
with open(filename, mode='wb') as pickle_file:
    pickle.dump(data, pickle_file)

print('Dictionary saved to {}'.format(filename))

Dictionary saved to image_decoder.pkl


## Merge dataframes

In [32]:
# Attach each image's label by joining images.product_id -> products.id.
# A left join keeps every image row, so an image without a matching product
# shows up later with a NaN label instead of silently disappearing.
training_data = (
    pd.merge(
        images,
        products[['id', 'label']],
        left_on='product_id',
        right_on='id',
        how='left',
    )
    # Both inputs carry an 'id' column, so pandas suffixes them: 'id_x' is the
    # image id, 'id_y' is the products-side join key (redundant with 'product_id')
    .drop(columns=['id_y'])
    # Rename the columns for clarity
    .rename(columns={'Unnamed: 0': 'index', 'id_x': 'image_id'})
)

# A duplicated product id would fan one image out into several rows; fail loudly
assert len(training_data) == len(images), 'merge changed the number of image rows'

# Display the merged DataFrame
training_data

Unnamed: 0,index,image_id,product_id,label
0,0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c,Home & Garden
1,1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c,Home & Garden
2,2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,Home & Garden
3,3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,Home & Garden
4,4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf,Home & Garden
...,...,...,...,...
12599,12599,cdec1c5c-c4b1-42db-afbe-3fa68ea4b87d,2b0a652b-46a2-4297-b619-5efeeb222787,Video Games & Consoles
12600,12600,dc99e40f-6b15-494d-9fb7-f0d02e9781f9,719fd40a-870e-4144-b324-55dff2e66fb4,Video Games & Consoles
12601,12601,c8488028-bf07-4258-a4c2-56d2fe387835,719fd40a-870e-4144-b324-55dff2e66fb4,Video Games & Consoles
12602,12602,c6113145-89c8-47cd-9211-38f29d016cc7,86d1806b-5575-4a7e-9160-f24f12be6c95,Video Games & Consoles


## Check the distribution of labels

In [33]:
# Count the number of images per label.
# When the notebook runs top to bottom, 'label' still holds the category name
# here (encoding happens in a later cell), so the counts are readable as they are.
label_counts = training_data['label'].value_counts()

# Display the counts
print("Number of images per label:")
print(label_counts)

Number of images per label:
label
Home & Garden                       1471
Office Furniture & Equipment        1177
Computers & Software                1136
Health & Beauty                     1088
Music, Films, Books & Games         1033
DIY Tools & Materials                938
Appliances                           917
Other Goods                          908
Sports, Leisure & Travel             860
Video Games & Consoles               828
Phones, Mobile Phones & Telecoms     786
Clothes, Footwear & Accessories      771
Baby & Kids Stuff                    691
Name: count, dtype: int64


## Encode the labels

In [34]:
# Encode the labels (category name -> integer index).
# Values that are not encoder keys are passed through unchanged. This keeps the
# cell safe to re-run: already-encoded integers stay as they are instead of
# being mapped to NaN, and NaN labels from unmatched images remain NaN so the
# missing-value check further down still catches them.
training_data['label'] = training_data['label'].map(
    lambda value: category_encoder.get(value, value)
)
training_data


Unnamed: 0,index,image_id,product_id,label
0,0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c,0
1,1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c,0
2,2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,0
3,3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,0
4,4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf,0
...,...,...,...,...
12599,12599,cdec1c5c-c4b1-42db-afbe-3fa68ea4b87d,2b0a652b-46a2-4297-b619-5efeeb222787,12
12600,12600,dc99e40f-6b15-494d-9fb7-f0d02e9781f9,719fd40a-870e-4144-b324-55dff2e66fb4,12
12601,12601,c8488028-bf07-4258-a4c2-56d2fe387835,719fd40a-870e-4144-b324-55dff2e66fb4,12
12602,12602,c6113145-89c8-47cd-9211-38f29d016cc7,86d1806b-5575-4a7e-9160-f24f12be6c95,12


In [35]:
# Dataset details: prints the column dtypes and non-null counts of the merged table
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12604 entries, 0 to 12603
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   index       12604 non-null  int64 
 1   image_id    12604 non-null  object
 2   product_id  12604 non-null  object
 3   label       12604 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 394.0+ KB


In [36]:
# Check for unmatched images: select every row that holds at least one NaN value.
# An empty result means every image received a label.
rows_with_nan = training_data.loc[training_data.isna().any(axis='columns')]
rows_with_nan

Unnamed: 0,index,image_id,product_id,label


## Final Output for Tabular Data

In [37]:
# Save the training data to a CSV file
import os

target_folder = 'cleaned_data'

# Remember whether the folder was already there so the "created" message is
# only printed when this cell actually creates it
folder_already_present = os.path.isdir(target_folder)
# exist_ok=True avoids the check-then-create race of a separate exists() test
os.makedirs(target_folder, exist_ok=True)
if not folder_already_present:
    print(f'----> Target folder created successfully: {target_folder}\n')

# Build the path with os.path.join instead of hand-written separators;
# index=False keeps the DataFrame's row index out of the file
training_data.to_csv(os.path.join(target_folder, 'training_data.csv'), index=False)

print('----> Success. The training data has been saved to .csv file. \n')

----> Success. The training data has been saved to .csv file. 



# Processing Images

## Set up variables

In [38]:
########## VARIABLES ##########
source_img_dir = 'source_data/images'
cleaned_img_dir = 'cleaned_data/images/'  # Folder containing the cleaned images
final_size = 224    # Final size of the images after cleaning and transformation

# Transform pipeline handed to clean_image_data together with the two folders above.
# NOTE(review): RandomResizedCrop / RandomHorizontalFlip are random augmentations, and
# Normalize produces values outside [0, 1] right before ToPILImage. If clean_image_data
# saves the transformed image (the run log says images are "resized and saved"), the
# augmentation and normalisation are baked into the files on disk - confirm this is
# intended rather than applying them at training time.
transformSourceImg = v2.Compose([
    v2.ToImage(), # Convert to tensor, only needed for PIL images
    v2.RandomResizedCrop(size=(final_size, final_size), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True), # this has replaced ToTensor()
    # ImageNet per-channel statistics; the third std value is 0.225 (it was mistyped as 0.255)
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    v2.ToPILImage()
])



## Clean images

In [39]:
# clean_image_data lives in the project's clean_images module
from clean_images import clean_image_data

# Run the cleaning step: source folder, target folder, and the transform pipeline
# defined in the variables cell. The recorded run log reports that the target folder
# was created and that the images were resized and saved there.
# NOTE(review): the recorded run processed 12668 files while Images.csv has 12604
# rows - confirm whether the extra files are expected.
clean_image_data(source_img_dir, cleaned_img_dir, transformSourceImg)



############## Cleaning image data ##############

----> Target folder created successfully: cleaned_data/images/

[############################################################] 100.0%  [12668 / 12668]
----> Images resized and saved successfully in the target folder: cleaned_data/images/

----> Image data cleaning completed successfully

