<a href="https://www.kaggle.com/code/candacevogel/jewelry-recognition-candace-fork?scriptVersionId=264763579" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms.v2 as transforms
from torchvision import datasets 
from torchvision.transforms import ToTensor

from os import path
from PIL import Image

# This next cell will create the paths to the files we need to load our data

In [2]:
# Root directory of the CelebA dataset input.
dataset_root = '/kaggle/input/celeba-dataset'

# CSV of per-image attributes, and the (nested) folder that holds the images.
attributes_path = path.join(dataset_root, 'list_attr_celeba.csv')
image_path = path.join(dataset_root, 'img_align_celeba/img_align_celeba')

# Reading the attributes of the pictures

In [3]:
# Parse the attribute CSV into a DataFrame, then preview its first five rows.
df_attributes = pd.read_csv(filepath_or_buffer=attributes_path)
df_attributes.head(n=5)

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
3,000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
4,000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1


# Change the attribute encoding from -1/1 to 0/1 (every -1 becomes 0)

In [4]:
# The attribute columns hold -1 / 1; map -1 to 0 so each label is 0 / 1.
# Reassign rather than using inplace=True: `replace` returns a new frame, which
# avoids silently mutating an object that an earlier cell already displayed.
df_attributes = df_attributes.replace(-1, 0)
df_attributes['Wearing_Necklace'].head(200)

0      0
1      0
2      0
3      1
4      0
      ..
195    0
196    0
197    0
198    0
199    0
Name: Wearing_Necklace, Length: 200, dtype: int64

# Checking how many people in the dataset are wearing necklaces

In [5]:
# Count the rows for each label value to see how the two classes compare.
df_attributes.Wearing_Necklace.value_counts()

Wearing_Necklace
0    177686
1     24913
Name: count, dtype: int64

# Balance dataset

In [6]:
# Fixed seed so the randomly down-sampled subset is identical on every run
# (Restart Kernel -> Run All gives the same balanced frame).
BALANCE_SEED = 42

necklace_df = df_attributes[df_attributes['Wearing_Necklace'] == 1]
no_necklace_df = df_attributes[df_attributes['Wearing_Necklace'] == 0]

necklace_count = len(necklace_df)

# Down-sample the "no necklace" rows to the number of "necklace" rows.
no_necklace_same_size = no_necklace_df.sample(necklace_count, random_state=BALANCE_SEED)

df_necklace_training = pd.concat([necklace_df, no_necklace_same_size], axis=0)

# Sanity check: the balanced frame holds the same number of rows per class.
assert len(df_necklace_training) == 2 * necklace_count

# Narrow view with only the file name and the label, for a quick look.
df_necklaces = df_necklace_training[['image_id', 'Wearing_Necklace']]
df_necklaces.sample(10, random_state=BALANCE_SEED)

Unnamed: 0,image_id,Wearing_Necklace
36244,036245.jpg,1
12226,012227.jpg,1
102571,102572.jpg,0
37902,037903.jpg,1
77048,077049.jpg,0
168411,168412.jpg,1
124779,124780.jpg,1
166240,166241.jpg,0
137500,137501.jpg,0
123933,123934.jpg,0


# Create dataset class

In [7]:
class CelebDataset(Dataset):
    """Map-style dataset pairing face images with one integer attribute label.

    Each item is ``(image, label)``. The image is opened from ``images_path``,
    converted to single-channel grayscale (PIL mode ``'L'``) and, if given,
    passed through ``img_transform``. The label is the integer value of
    ``label_column`` for the same row and, if given, passed through
    ``attr_transform``.

    Args:
        images_path: Directory containing the image files.
        attributes_dataframe: DataFrame with an ``image_id`` column (file
            names relative to ``images_path``) and a ``label_column`` column.
        img_transform: Optional callable applied to the PIL image.
        attr_transform: Optional callable applied to the label.
        label_column: Name of the column used as the label. Defaults to
            ``'Wearing_Necklace'``.

    Raises:
        KeyError: If ``image_id`` or ``label_column`` is not a column of
            ``attributes_dataframe``.
    """

    def __init__(self, images_path, attributes_dataframe, img_transform=None,
                 attr_transform=None, label_column='Wearing_Necklace'):
        self.images_path = images_path
        self.attributes_dataframe = attributes_dataframe
        self.img_transform = img_transform
        self.attr_transform = attr_transform
        self.label_column = label_column
        # Snapshot file names and labels together, once, so that they always
        # stay aligned and __getitem__ does not have to build a full row
        # Series from the DataFrame on every access.
        self.image_filename = attributes_dataframe['image_id'].tolist()
        self.labels = attributes_dataframe[label_column].to_numpy(dtype='int')

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at position ``index``."""
        image_path = path.join(self.images_path, self.image_filename[index])

        # convert() returns a new image, so the file can be closed right away.
        with Image.open(image_path) as source_image:
            img = source_image.convert('L')

        necklace = self.labels[index]

        if self.img_transform is not None:
            img = self.img_transform(img)

        if self.attr_transform is not None:
            necklace = self.attr_transform(necklace)

        return img, necklace

    def __len__(self):
        """Return the number of rows (items) in the dataset."""
        return len(self.image_filename)

# Define image transformations

In [8]:
image_size = 128

# Resize, center-crop to an image_size x image_size square, then convert the
# PIL image to a float32 tensor scaled to [0, 1].
# ToImage() + ToDtype(..., scale=True) is the documented v2 replacement for
# the deprecated v2.ToTensor().
image_transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.CenterCrop([image_size, image_size]),
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
])

# Build the dataset from the balanced frame created earlier.
dataset = CelebDataset(image_path, df_necklace_training, image_transform)




# Divide dataset into training and test data

In [9]:
# 80% training
# 20% testing
# Seed the split so the same images land in train/test on every run;
# otherwise results are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(dataset, (0.8, 0.2), generator=split_generator)

# Create dataloaders for training & testing data

In [10]:
batch_size = 32
# The data is too big to process at once, so it is fed to the network in
# batches of `batch_size` samples (which is also faster).
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps test runs
# repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# The DataLoaders are what we iterate over to send batches to our NN.

# Visualize example images