In [50]:
import os
import pandas as pd
import json
from pathlib import Path
from collections import defaultdict
from copy import deepcopy

texts = json.load(open('/home/soon/datasets/deepfashion_multimodal/captions.json'))
text_ids = list(texts.keys())

data_file = 'train.txt'

root = Path('/home/soon/datasets/deepfashion_inshop')
image_root = root/'img_highres'
pose_root = root/'smpl'
style_root = root/'styles'


In [51]:
def select_style_dir(person_dir):
    subdir_dict = {}
    max_count = -1
    max_name = None
    for root, dirs, files in os.walk(person_dir):
        if len(dirs) == 0:
            count = len(files)
            name = root.split('/')[-1]
            subdir_dict[name] = count
            if count > max_count:
                max_count = count
                max_name = name
    return max_name

def get_style_folder(image_file):
    f_dir, f_name = os.path.split(image_file)
    ret = f_name.split('.')[0].split('_')
    person_id = ret[0]
    style_id = '_'.join(ret[1:])
    
    person_dir = style_root/f_dir/person_id

    if person_dir.exists():
        if (person_dir/style_id).exists():
            selected = style_id
        else:
            selected = select_style_dir(person_dir)

        style_dir = os.path.join(f_dir, person_id, selected)
    else:
        style_dir = ''
    return str(style_dir)

def get_smpl_id(image_file):
    if (pose_root/image_file).exists():
        return image_file.replace('.jpg','')
    else:
        return ''
    
def get_text_base(text_id):
    i = deepcopy(text_id)
    i = i[:i.rfind('_')]
    i = i[:i.rfind('_')]  
    return i

def get_text_dict(text_ids):
    d = defaultdict(list)
    for text_id in text_ids:
        i = get_text_base(text_id)
        d[i].append(text_id)
    return d

text_dict = get_text_dict(text_ids)

def get_text_id(image_file):
    image_id = image_file.replace('/','-')

    if image_id in text_ids:
        return image_id
    else:
        text_base = get_text_base(image_id)
        text_list = text_dict.get(text_base, [''])
        if len(text_list) == 0:
            return text_list[0]
        else: # search for full description
            for t in text_list:
                if 'full' in t:
                    return t
            return t

In [52]:
df = pd.read_csv(data_file, names=['image'])
image_files = list(df.image)
df['styles'] = [get_style_folder(f) for f in image_files ]
df['pose'] = [get_smpl_id(f) for f in image_files ]
df['text'] = [get_text_id(f) for f in image_files]

drop_indices = df[(df['text']=='') & (df['styles']=='')].index
df = df.drop(drop_indices)

In [53]:
df.to_csv(data_file.replace('.txt','.csv'))

In [54]:
print(drop_indices)

Int64Index([ 2683,  7710,  9092, 11147, 14624, 19551, 19995, 20033, 22127,
            22180, 25533, 27515, 27699, 30135, 31042, 33463, 34144, 40273,
            43636, 43755, 44837, 45178],
           dtype='int64')


In [55]:
row = df.iloc[2]

In [56]:
print(row)

image          MEN/Pants/id_00001611/01_1_front.jpg
styles             MEN/Pants/id_00001611/01/1_front
pose               MEN/Pants/id_00001611/01_1_front
text      MEN-Pants-id_00001611-01_7_additional.jpg
Name: 2, dtype: object


In [57]:
df.head()

Unnamed: 0,image,styles,pose,text
0,WOMEN/Skirts/id_00000629/04_3_back.jpg,WOMEN/Skirts/id_00000629/04/4_full,WOMEN/Skirts/id_00000629/04_3_back,WOMEN-Skirts-id_00000629-04_4_full.jpg
1,WOMEN/Tees_Tanks/id_00007838/05_7_additional.jpg,WOMEN/Tees_Tanks/id_00007838/05/4_full,,WOMEN-Tees_Tanks-id_00007838-05_4_full.jpg
2,MEN/Pants/id_00001611/01_1_front.jpg,MEN/Pants/id_00001611/01/1_front,MEN/Pants/id_00001611/01_1_front,MEN-Pants-id_00001611-01_7_additional.jpg
3,WOMEN/Shorts/id_00006215/07_4_full.jpg,WOMEN/Shorts/id_00006215/07/4_full,WOMEN/Shorts/id_00006215/07_4_full,WOMEN-Shorts-id_00006215-07_4_full.jpg
4,MEN/Tees_Tanks/id_00001222/04_3_back.jpg,MEN/Tees_Tanks/id_00001222/04/4_full,MEN/Tees_Tanks/id_00001222/04_3_back,MEN-Tees_Tanks-id_00001222-04_3_back.jpg


In [61]:
from glob import glob
style_files = glob(str(style_root/row.styles/'*.jpg'))

['/home/soon/datasets/deepfashion_inshop/styles/MEN/Pants/id_00001611/01/1_front/background.jpg',
 '/home/soon/datasets/deepfashion_inshop/styles/MEN/Pants/id_00001611/01/1_front/shoes.jpg',
 '/home/soon/datasets/deepfashion_inshop/styles/MEN/Pants/id_00001611/01/1_front/top.jpg',
 '/home/soon/datasets/deepfashion_inshop/styles/MEN/Pants/id_00001611/01/1_front/bottom.jpg']

In [62]:
style_names = ['face', 'background', 'top', 'bottom', 'shoes', 'accesories']

In [73]:
import torch
from torchvision import transforms as T
clip_transform = T.Compose([
            T.ToTensor(),
            T.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
        ])

style_images = []
for style_name in style_names[:]:
    f_path = style_root/row.styles/f'{style_name}.jpg'
    if f_path.exists():
        style_image = clip_transform(Image.open(f_path))
    else:
        style_image = torch.zeros(3, 224, 224)
    style_images.append(style_image)
torch.stack(style_images)    

In [77]:
row.pose+'.jpg'

'MEN/Pants/id_00001611/01_1_front.jpg'

In [42]:
def get_style_folder(image_file):
    f_dir, f_name = os.path.split(image_file)
    ret = f_name.split('.')[0].split('_')
    person_id = ret[0]
    style_id = '_'.join(ret[1:])
    
    person_dir = style_root/f_dir/person_id
    import pdb
    #pdb.set_trace()
    if person_dir.exists():
        if (person_dir/style_id).exists():
            selected = style_id
        else:
            selected = select_style_dir(person_dir)

        style_dir = os.path.join(f_dir, person_id, selected)
    else:
        style_dir = ''
    return str(style_dir)

get_style_folder(str(row.image))

{'4_full': 4}


'WOMEN/Rompers_Jumpsuits/id_00001045/07/4_full'

In [81]:
len(df[(df.pose == '')].index)

623

In [85]:
df = pd.read_csv('train.csv')

In [86]:
df.iloc[34943]

Unnamed: 0                                             34960
image         WOMEN/Jackets_Coats/id_00000474/02_1_front.jpg
styles                                                   NaN
pose              WOMEN/Jackets_Coats/id_00000474/02_1_front
text          WOMEN-Jackets_Coats-id_00000474-02_1_front.jpg
Name: 34943, dtype: object

In [92]:
len(df[df.styles.isnull()])

2615

In [93]:
len(df)

48018