In [19]:
# Data investigation
import os
import pandas as pd

In [9]:
from fastai import *

In [15]:
os.chdir("/Users/sucooper/Documents/brown-bag/clothing-classifier/clothing_classification")
os.listdir()

['export.pkl',
 'apparel-dataset',
 'apparel-dataset.zip',
 '.gitignore',
 'Model_training.ipynb',
 '.git']

In [None]:
# Load dataset
if 'apparel-dataset' not in os.listdir():
    if 'apparel-dataset.zip' not in os.listdir():
        print('Dataset has not been downloaded. Downloading now from kaggle')
        !kaggle datasets download -d kaiska/apparel-dataset
    # unzip
    !unzip apparel-dataset.zip -d ./apparel-dataset
# move dataset into the correct location
else:
    print("Dataset already loaded")

In [54]:
# Tabulate the kinds of data
df = pd.DataFrame([item.split('_') for item in os.listdir('apparel-dataset')])
df.columns = ['Colour', 'Item']
df['Count'] = [len(os.listdir('apparel-dataset/' + item)) for item in os.listdir('apparel-dataset')]
df['Count'] = df['Count'].astype(int)


In [55]:
shaped_df = df.pivot(index='Colour', columns='Item', values='Count')
shaped_df[shaped_df.isna()] = 0

In [56]:
shaped_df

Item,dress,hoodie,pants,shirt,shoes,shorts,skirt,suit
Colour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
black,450.0,0.0,870.0,715.0,766.0,328.0,0.0,320.0
blue,502.0,0.0,798.0,741.0,523.0,299.0,0.0,0.0
brown,0.0,188.0,311.0,0.0,464.0,0.0,0.0,0.0
green,0.0,0.0,227.0,230.0,455.0,135.0,0.0,243.0
pink,0.0,347.0,246.0,0.0,0.0,0.0,513.0,0.0
red,800.0,349.0,308.0,332.0,610.0,0.0,0.0,0.0
silver,0.0,0.0,0.0,0.0,403.0,0.0,361.0,0.0
white,818.0,0.0,274.0,0.0,600.0,120.0,0.0,354.0
yellow,566.0,0.0,0.0,0.0,0.0,195.0,409.0,0.0


In [20]:
# convert to multilabel format
dataset_dir = Path('apparel-dataset')
# create a csv file with | filelocation | labels |
fns = get_image_files(dataset_dir)


In [38]:
labels = [{"fname":image, "labels":image.parent.name.replace('_', ' ')} for image in fns]
csv_labels = pd.DataFrame(labels)
csv_labels.to_csv('multi_labels.csv', index=False)

In [63]:
# Create a sub dataset
subset = [{"fname": 'apparel-dataset/' + cat +'/' + i, "labels":cat.replace('_', ' ')} for cat in os.listdir('apparel-dataset') for i in os.listdir('apparel-dataset/' + cat)[0:10]]
subset_df = pd.DataFrame(subset)
subset_df.to_csv('multilabel_subset.csv', index=False)

In [66]:
# Image Block and Category Block
filename = 'multilabel_subset.csv'
multilables = pd.read_csv(filename)

# Turning it into a multilabel data block
path = Path('.')
def get_x(r): return path/r['fname']
def get_y(r): return r['labels'].split(' ')
dblock = DataBlock(blocks=(ImageBlock, MultiCategoryBlock), get_x = get_x, get_y = get_y, item_tfms=RandomResizedCrop(128, min_scale=0.35))
dsets = dblock.datasets(multilables)

In [67]:
dsets

(#370) [(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),(PILImage mode=RGB size=256x256, TensorMultiCategory([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),

In [43]:
len(dsets.train),len(dsets.valid)

(12936, 3234)

In [32]:
# Multilabel training
clothes = DataBlock(
    blocks=(ImageBlock, CategoryBlock), 
    get_items=get_image_files, 
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    get_y=parent_label,
    item_tfms=RandomResizedCrop(460, min_scale=0.5),
    batch_tfms=aug_transforms(size=224, min_scale=0.7))