In [1]:
import pandas as pd
import os
from PIL import Image
from matplotlib.pyplot import imshow
from shutil import copyfile
import numpy as np

%matplotlib inline

# Join Datasets
Combines the aboutyou dataset and the fashionID dataset into a single dataset, based on the columns that they have in common.

In [2]:
# Root data folder; each dataset sub-folder holds a tab-separated attribute table.
data_path = '../../../data/'
read_options = {'sep': '\t', 'encoding': 'utf-8'}

about_csv = os.path.join(data_path, 'aboutyou/img_attr.csv')
zalando_csv = os.path.join(data_path, 'zalando/img_attr.csv')

df_about = pd.read_csv(about_csv, **read_options)
df_zalando = pd.read_csv(zalando_csv, **read_options)

In [3]:
# Column names that occur in both raw attribute tables, printed alphabetically.
shared_names = set(df_about.columns) & set(df_zalando.columns)
common_cols = list(shared_names)
print(sorted(common_cols))

['ausschnitt_rückenausschnitt', 'ausschnitt_v-ausschnitt', 'category_hosen', 'category_jeans', 'category_kleider', 'category_roecke', 'category_shirts', 'color_beige', 'color_black', 'color_blue', 'color_gray', 'color_green', 'color_pink', 'color_red', 'color_white', 'color_yellow', 'img_path', 'länge_knielang', 'länge_normale länge', 'muster_gepunktet', 'muster_gestreift', 'passform_skinny', 'ärmellänge_dreiviertelarm', 'ärmellänge_halbarm', 'ärmellänge_langarm', 'ärmellänge_ärmellos']


In [4]:
# German attribute-group prefix used in the raw column names -> English group name.
# The insertion order is kept: it determines the column order produced later on.
col_names_mappings = dict([
    ('ausschnitt', 'neckline'),
    ('länge', 'length'),
    ('ärmellänge', 'sleeves'),
    ('passform', 'fit'),
    ('muster', 'pattern'),
    ('category', 'category'),
    ('color', 'color'),
])

In [5]:
def rename_column_names(df_columns, mapping):
    """Translate the group prefix of ``<group>_<value>`` column names.

    Parameters
    ----------
    df_columns : iterable of str
        Column names, typically of the form ``<group>_<value>``.
    mapping : dict
        Maps a group prefix to its replacement. Prefixes that are not a key
        of ``mapping`` are kept as they are.

    Returns
    -------
    list of str
        The renamed column names, in the same order as ``df_columns``.
    """
    renamed_cols = []

    for col in df_columns:
        # Split on the FIRST underscore only: a value part that itself contains
        # underscores must stay intact instead of breaking a two-name unpacking.
        col_name, separator, col_value = col.partition('_')

        if not separator:
            # No "<group>_<value>" structure at all: leave the name untouched.
            renamed_cols.append(col)
            continue

        renamed_cols.append(mapping.get(col_name, col_name) + separator + col_value)

    return renamed_cols

In [6]:
# Translate the group prefixes of both tables in place, then show the zalando result.
for frame in (df_about, df_zalando):
    frame.columns = rename_column_names(frame.columns, col_names_mappings)

print(df_zalando.columns)

Index(['img_path', 'category_blusen-tuniken', 'category_hosen',
       'category_jacken-maentel', 'category_jeans', 'category_kleider',
       'category_pullover-und-strickjacken', 'category_roecke',
       'category_shirts', 'color_beige', 'color_black', 'color_blue',
       'color_gray', 'color_green', 'color_pink', 'color_red', 'color_white',
       'color_yellow', 'sleeves_dreiviertelarm', 'sleeves_extrakurzer arm',
       'sleeves_extralanger arm', 'sleeves_halbarm', 'sleeves_kurzarm',
       'sleeves_langarm', 'sleeves_spaghettiträger', 'sleeves_ärmellos',
       'length_7/8 länge', 'length_extra kurz', 'length_extra lang',
       'length_knielang', 'length_knöchellang', 'length_kurz', 'length_lang',
       'length_normale länge', 'length_oberschenkellang', 'length_wadenlang',
       'pattern_geblümt', 'pattern_gepunktet', 'pattern_gestreift',
       'pattern_kariert', 'pattern_meliert', 'pattern_print',
       'pattern_unifarben', 'fit_flare', 'fit_körpernah', 'fit_loose fit',
 

In [7]:
def unstack_columns(df, mappings):
    """Collapse indicator columns ``<group>_<value>`` into one label column per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``img_path`` column plus numeric indicator columns named
        ``<group>_<value>``.
    mappings : dict
        Only the dict's values are used; each value is a group name whose
        ``<group>_...`` columns are collapsed.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``img_path``, with one column per group (in the order of
        ``mappings``). Each cell holds the ``<value>`` part of the column that
        has the largest non-zero entry in that row, or the string ``'nan'``
        when the row has no non-zero entry in the group.
    """
    indexed = df.set_index('img_path')
    labels_by_group = {}

    for group in mappings.values():
        # Match on the "<group>_" prefix; a plain substring test would also pick
        # up unrelated columns such as "category_outfit" for the group "fit".
        prefix = group + '_'
        cols = [col for col in indexed.columns if col.startswith(prefix)]

        # Default label for rows without any active indicator in this group.
        labels = np.full(len(indexed), 'nan', dtype=object)

        if cols:
            flags = indexed[cols]
            flags = flags.where(flags != 0)  # zeros count as "not set"
            has_label = flags.notna().any(axis=1).values

            # idxmax is only evaluated on rows that have at least one value,
            # because its behavior on all-NaN rows is deprecated in newer pandas.
            if has_label.any():
                winners = flags.loc[has_label].idxmax(axis=1)
                labels[has_label] = [str(name)[len(prefix):] for name in winners]

        labels_by_group[group] = labels

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(labels_by_group, index=indexed.index,
                        columns=list(labels_by_group))

In [8]:
# Collapse the indicator columns of both tables into one label column per attribute group.
df_about, df_zalando = (
    unstack_columns(frame, col_names_mappings) for frame in (df_about, df_zalando)
)

df_zalando.head()

Unnamed: 0_level_0,neckline,length,sleeves,fit,pattern,category,color
img_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pullover-und-strickjacken/0RA41G002-C11.jpg,,normale länge,langarm,normal,meliert,pullover-und-strickjacken,gray
hosen/0VB21A009-B11.jpg,,lang,,straight leg,unifarben,hosen,beige
roecke/0VB21B007-K11.jpg,,knielang,,normal,unifarben,roecke,blue
jacken-maentel/0VB21P000-G11.jpg,,wadenlang,extralanger arm,oversized,unifarben,jacken-maentel,red
hosen/10K21A001-Q11.jpg,,knöchellang,,tapered leg,,hosen,black


In [9]:
# Value normalization table, consumed by rename_column_values.
# Structure: {label column: {English label: [raw values folded into that label]}}.
# Columns without an entry here (e.g. 'color') and raw values that appear in no
# list are not covered by this table.
col_values_mappings = {
    'neckline': {
        'round': ['rundhals', 'tiefer rundhals', 'rundhals-ausschnitt'],
        'v': ['v-ausschnitt', 'tiefer v-ausschnitt', 'cache-coeur'],
        'back': ['rückenausschnitt'],
        'lined': ['eingefasster ausschnitt'],
        'wide': ['weiter ausschnitt', 'carmen', 'tiefer ausschnitt/dekolleté', 'u-boot', 'u-boot-ausschnitt']
    },
    'category': {
        'tops': ['shirts', 'tops'],
        'pants': ['hosen', 'jeans'],
        'jackets': ['jacken-maentel', 'jacken'],
        'dresses': ['kleider'],
        'skirts': ['roecke'],
        'knitwear': ['pullover-und-strickjacken', 'strick'],
        'blouses': ['blusen-tuniken', 'blusen-und-tuniken'],
        'jumpsuits': ['jumpsuits-und-overalls']
    },
    'length': {
        'short': ['extra kurz', 'kurz', 'kurz/mini', 'oberschenkellang', 'kurzer schnitt'],
        'normal': ['normale länge'],
        'knee': ['knielang'],
        '3-4': ['3/4-lang', '7/8 länge', '7/8-lang', 'wadenlang'],
        'long': ['knöchellang', 'extra lang', 'lang', 'lang/maxi', 'langer schnitt']
    },
    'pattern': {
        'floral': ['geblümt', 'geblümt/floral'],
        'polkadots': ['gepunktet'],
        'stripes': ['gestreift'],
        'print': ['kariert', 'meliert', 'print', 'all-over-muster'],
        'lace': ['spitze'],
        'unicolors': ['unifarben']
    },
    'fit': {
        'normal': ['flare', 'normal', 'straight leg', 'regular', 'normale passform'],
        # Both spellings ('tailiert' / 'tailliert') are listed on purpose so either one is matched.
        'tight': ['körpernah', 'schmal', 'skinny', 'slim fit', 'tailiert', 'tailliert',
                  'figurbetonte passform', 'slimfit', 'schmale passform'],
        'loose': ['oversized', 'tapered leg', 'weit geschnitten', 'lockere passform',
                 'loose fit', 'loosefit']
    },
    'sleeves': {
        'short': ['extrakurzer arm', 'kurzarm', 'viertelarm'],
        'half': ['dreiviertelarm', 'halbarm'],
        'long': ['extralanger arm', 'langarm'],
        'sleeveless': ['spaghettiträger', 'ärmellos']
    }
}


In [10]:
def rename_column_values(df, col_values_mapping):
    """Replace raw attribute values by their normalized labels, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in ``col_values_mapping``.
        It is modified in place.
    col_values_mapping : dict
        ``{column: {new label: [raw values to replace]}}``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment by the caller.

    Raises
    ------
    KeyError
        If a column named in ``col_values_mapping`` is missing from ``df``.
    """
    # Use the mapping that was passed in, not a module-level global.
    for col_name, col_mappings in col_values_mapping.items():
        # Flatten to raw value -> label. The first label listing a raw value wins.
        lookup = {}
        for col_key, col_values in col_mappings.items():
            for raw_value in col_values:
                lookup.setdefault(raw_value, col_key)

        # One pass over the original values, so a freshly assigned label can
        # never be re-mapped by a later rule. Unknown values stay unchanged.
        df[col_name] = df[col_name].map(
            lambda raw_value, lookup=lookup: lookup.get(raw_value, raw_value))

    return df

In [11]:
# Normalize the raw attribute values of both tables to the shared English labels.
df_about, df_zalando = (
    rename_column_values(frame, col_values_mappings) for frame in (df_about, df_zalando)
)

In [12]:
# Check that the two dataframes have exactly the same columns. The symmetric
# difference lists columns that are missing from EITHER frame, so an empty
# list really means "all columns in common".
diff = sorted(set(df_about.columns).symmetric_difference(df_zalando.columns))
print(diff)

[]


## Create new folder with merged dataset

In [13]:
# Remember the dataset sub-folder each row came from; the copy step below
# needs it to locate the source image.
df_about['data_path'] = 'aboutyou'
df_zalando['data_path'] = 'zalando'

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of both
# frames in the same order (zalando first, then aboutyou).
df = pd.concat([df_zalando, df_about])
print('Num data points', df.shape)
df.head()

Num data points (82783, 8)


Unnamed: 0_level_0,neckline,length,sleeves,fit,pattern,category,color,data_path
img_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pullover-und-strickjacken/0RA41G002-C11.jpg,,normal,long,normal,print,knitwear,gray,zalando
hosen/0VB21A009-B11.jpg,,long,,normal,unicolors,pants,beige,zalando
roecke/0VB21B007-K11.jpg,,knee,,normal,unicolors,skirts,blue,zalando
jacken-maentel/0VB21P000-G11.jpg,,3-4,long,loose,unicolors,jackets,red,zalando
hosen/10K21A001-Q11.jpg,,long,,loose,,pants,black,zalando


In [None]:
# Spot check: show the merged row(s) stored under one original image path.
df.loc['blusen-tuniken/2ET21E0L1-A11.jpg']

In [14]:
# Root folder of the merged dataset. exist_ok=True replaces the separate
# existence check (no check-then-create race) and keeps the cell re-runnable.
merged_folder = os.path.join(data_path, 'fashion')
os.makedirs(merged_folder, exist_ok=True)

In [None]:
# Copy every image from <data_path>/<dataset folder>/<original path> to
# <merged_folder>/<category>/<file name>.
#
# Rows are iterated positionally instead of via df.loc[img_path, ...]: a label
# lookup returns a Series (not a string) when the same img_path occurs more
# than once in the index, which would break os.path.join.
#
# NOTE(review): images that share file name and category overwrite each other
# in the destination folder — confirm file names are unique across datasets.
created_folders = set()
rows = zip(df.index, df['data_path'], df['category'])

for idx, (img_path, source_folder, category) in enumerate(rows):
    if idx % 5000 == 0:
        print('Copied images: ', idx)

    src_path = os.path.join(data_path, source_folder, img_path)
    dst_folder = os.path.join(merged_folder, category)
    dst_path = os.path.join(dst_folder, os.path.basename(img_path))

    # Create each category folder only once.
    if dst_folder not in created_folders:
        os.makedirs(dst_folder, exist_ok=True)
        created_folders.add(dst_folder)

    copyfile(src_path, dst_path)

Update the image paths in the dataframe so that they match the new folder structure (`<category>/<file name>`).

In [15]:
# Re-key every row as "<category>/<file name>" and drop the helper column
# that recorded the source dataset folder.
df = (
    df.drop(['data_path'], axis=1)
      .reset_index()
      .assign(img_path=lambda frame: frame['category'] + '/'
              + frame['img_path'].apply(os.path.basename))
      .set_index('img_path')
)
df.head()

Unnamed: 0_level_0,neckline,length,sleeves,fit,pattern,category,color
img_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
knitwear/0RA41G002-C11.jpg,,normal,long,normal,print,knitwear,gray
pants/0VB21A009-B11.jpg,,long,,normal,unicolors,pants,beige
skirts/0VB21B007-K11.jpg,,knee,,normal,unicolors,skirts,blue
jackets/0VB21P000-G11.jpg,,3-4,long,loose,unicolors,jackets,red
pants/10K21A001-Q11.jpg,,long,,loose,,pants,black


In [16]:
# Replace any remaining missing values (NaN/None) with 0 before the one-hot encoding.
# NOTE(review): the dummy columns shown further down include "*_nan" but no "*_0",
# so missing labels appear to already be the string 'nan' at this point, which
# fillna does not touch — confirm whether this step is still needed.
df = df.fillna(0)
df.head()

Unnamed: 0_level_0,neckline,length,sleeves,fit,pattern,category,color
img_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
knitwear/0RA41G002-C11.jpg,,normal,long,normal,print,knitwear,gray
pants/0VB21A009-B11.jpg,,long,,normal,unicolors,pants,beige
skirts/0VB21B007-K11.jpg,,knee,,normal,unicolors,skirts,blue
jackets/0VB21P000-G11.jpg,,3-4,long,loose,unicolors,jackets,red
pants/10K21A001-Q11.jpg,,long,,loose,,pants,black


In [17]:
# One-hot encode every label column. The explicit cast keeps the indicators as
# 0/1 integers on every pandas version: pandas >= 2.0 returns booleans by
# default, which would end up as True/False in the exported CSV.
df_dum = pd.get_dummies(df).astype('uint8')
df_dum.head()

Unnamed: 0_level_0,neckline_back,neckline_lined,neckline_nan,neckline_round,neckline_v,neckline_wide,length_3-4,length_knee,length_long,length_nan,...,category_tops,color_beige,color_black,color_blue,color_gray,color_green,color_pink,color_red,color_white,color_yellow
img_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
knitwear/0RA41G002-C11.jpg,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
pants/0VB21A009-B11.jpg,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
skirts/0VB21B007-K11.jpg,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
jackets/0VB21P000-G11.jpg,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
pants/10K21A001-Q11.jpg,0,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [18]:
# Keep every indicator except the "<group>_nan" placeholders for missing labels.
# Matching on '_nan' rather than 'nan' avoids dropping a real label whose
# name merely ends in "nan".
non_nan_cols = [col for col in df_dum.columns if not col.endswith('_nan')]
non_nan_cols

['neckline_back',
 'neckline_lined',
 'neckline_round',
 'neckline_v',
 'neckline_wide',
 'length_3-4',
 'length_knee',
 'length_long',
 'length_normal',
 'length_short',
 'sleeves_half',
 'sleeves_long',
 'sleeves_short',
 'sleeves_sleeveless',
 'fit_loose',
 'fit_normal',
 'fit_tight',
 'pattern_floral',
 'pattern_lace',
 'pattern_polkadots',
 'pattern_print',
 'pattern_stripes',
 'pattern_unicolors',
 'category_blouses',
 'category_dresses',
 'category_jackets',
 'category_jumpsuits',
 'category_knitwear',
 'category_pants',
 'category_skirts',
 'category_tops',
 'color_beige',
 'color_black',
 'color_blue',
 'color_gray',
 'color_green',
 'color_pink',
 'color_red',
 'color_white',
 'color_yellow']

In [None]:
# Percentage of rows carrying each attribute value, keyed by the parts of the
# indicator column name split on '_', and saved next to the merged dataset.
new_cols = [tuple(name.split('_')) for name in df_dum.columns.values]

df_stats = df_dum.copy()
df_stats.columns = pd.MultiIndex.from_tuples(new_cols)

n_rows = df_stats.shape[0]
df_stats = df_stats.sum() / n_rows * 100

stats_file = os.path.join(merged_folder, 'data_stats.csv')
df_stats.to_csv(stats_file)
df_stats

In [19]:
# Keep only the columns selected in non_nan_cols and write the final attribute table.
attr_file = os.path.join(merged_folder, 'img_attr.csv')
df_dum = df_dum.loc[:, non_nan_cols]
df_dum.to_csv(attr_file, encoding='utf-8')