In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Widen the column display so long attribute strings are not truncated.
pd.options.display.max_colwidth = 1000

# FashionID Dataset

## Attributes Analysis
Loads the data.csv file and prints the most common attributes (tags). Then creates columns for the selected attributes and converts them to dummy variables.

In [None]:
# Directory holding the FashionID export; also used later when writing results.
data_path = '../../../data/fashionid'
# data.csv is semicolon-separated and UTF-8 encoded.
df = pd.read_csv(
    os.path.join(data_path, 'data.csv'),
    encoding='utf-8',
    sep=';',
)

In [None]:
# Show summary statistics for the loaded dataframe.
df.describe()

In [None]:
# Split the comma-separated attribute string into a list of lower-cased tags.
# Rows without an attribute string (e.g. NaN from an empty CSV field) get an
# empty list instead of raising AttributeError on `.lower()`.
df['tags'] = df['attributes'].apply(
    lambda attrs: attrs.lower().split(',') if isinstance(attrs, str) else []
)

Count how often each tag occurs across the attribute lists of all images.

In [None]:
# Running tally: lower-cased tag -> number of occurrences seen so far.
tag_count = {}


def count_tags(tags):
    """Add every tag in `tags` (case-insensitively) to the global `tag_count` tally."""
    for raw_tag in tags:
        key = raw_tag.lower()
        tag_count[key] = tag_count.get(key, 0) + 1
            
# Run count_tags over every row's tag list; the call is made for its side
# effect on tag_count, the returned Series only holds None values.
a = df['tags'].map(count_tags)

Compute each tag's share of images (in percent), keep only the tags above 1 %, and sort them from most to least used.

In [None]:
# Total number of rows, used as the denominator for the percentage.
num_imgs = df.shape[0]
tag_df = (
    pd.Series(tag_count)
    .reset_index()
    .rename(columns={'index': 'tag', 0: 'count'})
    # Percentage of rows carrying the tag, rounded to two decimals.
    .assign(ratio=lambda t: (pd.to_numeric(t['count']) / num_imgs * 100).round(2))
    # Keep only tags that occur on more than 1 % of the rows.
    .loc[lambda t: t['ratio'] > 1]
    .sort_values('ratio', ascending=False)
)

In [None]:
# Show the distinct tags contained in tag_df as a plain list.
tag_df['tag'].unique().tolist()

## Select useful attributes
Find attributes that are used a lot and can be useful for the application.
Create a special column in the dataframe with each of the useful tags.

In [None]:
def create_tag_column(df, tag_list, column_name):
    """
    Add `column_name` to `df`, derived from each row's `tags` list.

    For every row, the first tag that also appears in `tag_list` is taken and
    the text after its last ':' (with surrounding whitespace stripped) becomes
    the cell value. Rows without any matching tag get np.nan.

    Args:
        df: DataFrame with a 'tags' column holding a list of tag strings per row.
        tag_list: Tags to look for.
        column_name: Name of the column to create (or overwrite).

    Returns:
        The same DataFrame object; it is modified in place.
    """
    # A set makes the per-tag membership test constant-time instead of a list scan.
    wanted = set(tag_list)

    def first_match(tags):
        # Only the first matching tag of a row is used; later matches are ignored.
        for tag in tags:
            if tag in wanted:
                return tag.split(':')[-1].strip()
        return np.nan

    df[column_name] = [first_match(tags) for tags in df['tags']]

    return df

### Ärmellänge

In [None]:
# List the rows of tag_df whose tag text contains 'ärmel'.
tag_df[tag_df['tag'].str.contains('ärmel')]

In [None]:
# Collect every tag in tag_df that mentions 'ärmel' and derive the sleeve column from it.
is_sleeve_tag = tag_df['tag'].str.contains('ärmel')
aermel_list = tag_df.loc[is_sleeve_tag, 'tag'].tolist()
df = create_tag_column(df, aermel_list, 'ärmellänge')

In [None]:
# Inspect the distinct values of the new 'ärmellänge' column.
df['ärmellänge'].unique()

Merging similar attributes together

In [None]:
# Both spellings of "sleeveless" collapse into the single value 'ärmellos';
# every other value (including NaN) passes through unchanged.
sleeveless_aliases = {
    'ärmelloser schnitt': 'ärmellos',
    'ärmelloser': 'ärmellos',
}
df['ärmellänge'] = [sleeveless_aliases.get(value, value) for value in df['ärmellänge']]

Renaming attributes according to the aboutyou dataset

In [None]:
# Old sleeve label -> new label; values not listed here are kept as they are.
sleeve_renames = {
    'lange ärmel': 'langarm',
    'kurze ärmel': 'viertelarm',
    'dreiviertel-ärmel': 'dreiviertelarm',
    'dreiviertel-ärmel mit ausgestellten abschlüssen': 'dreiviertelarm',
}
df['ärmellänge'] = [sleeve_renames.get(value, value) for value in df['ärmellänge']]

Deleting useless tags

In [None]:
# Anything that is not one of the four final sleeve labels becomes NaN.
kept_sleeves = {'ärmellos', 'langarm', 'viertelarm', 'dreiviertelarm'}
df['ärmellänge'] = [
    value if value in kept_sleeves else np.nan
    for value in df['ärmellänge']
]

### Muster

In [None]:
# List the rows of tag_df matching one of the pattern keywords.
# Note: the '$' anchor applies only to the 'spitze' alternative of the regex.
tag_df[(tag_df['tag'].str.contains(r'floral|streif|punkt|muster|spitze$'))]

In [None]:
# Collect every tag in tag_df matching the pattern keywords and derive the 'muster' column from it.
is_pattern_tag = tag_df['tag'].str.contains(r'floral|streif|punkt|muster|spitze$')
muster_list = tag_df.loc[is_pattern_tag, 'tag'].tolist()
df = create_tag_column(df, muster_list, 'muster')

In [None]:
# Inspect the distinct values of the new 'muster' column.
df['muster'].unique()

In [None]:
# Exact-match renames first; values not listed here are kept as they are.
muster_renames = {
    'streifenmuster': 'gestreift',
    'florales muster': 'geblümt/floral',
    'allover-muster': 'all-over-muster',
}
renamed_muster = [muster_renames.get(value, value) for value in df['muster']]
# Afterwards every value whose text contains 'spitze' collapses to plain 'spitze'.
df['muster'] = [
    'spitze' if 'spitze' in str(value) else value
    for value in renamed_muster
]

In [None]:
# Anything that is not one of the four final pattern labels becomes NaN.
kept_muster = {'gestreift', 'geblümt/floral', 'all-over-muster', 'spitze'}
df['muster'] = [
    value if value in kept_muster else np.nan
    for value in df['muster']
]

## Create dummies
Turn the created attribute columns into dummy variables.

In [None]:
# Index by image URL. Guarded so that re-running this cell does not raise a
# KeyError once 'img_url' has already been moved into the index.
if df.index.name != 'img_url':
    df = df.set_index('img_url')
# Encode the selected attribute columns as dummy variables; reset_index turns
# the 'img_url' index back into a regular column.
df_dum = pd.get_dummies(df[['category', 'color', 'ärmellänge', 'muster']]).reset_index()

In [None]:
# Preview the first rows of the dummy-encoded frame.
df_dum.head()

Join the relative image path based on the img_url and drop the img_url. This creates a dataframe that holds the relative image path and all its dummy variables.

In [None]:
# Collapse to one row per image URL: a dummy column is set if any of the
# URL's rows had it set.
df_dum = df_dum.groupby(['img_url']).max().reset_index()
# First available image path per URL; the column is selected before
# aggregating so the other columns are not aggregated needlessly.
df_paths = df.groupby(['img_url'])['img_path'].first().reset_index()
# Join on the URL explicitly instead of relying on implicit common-column
# detection, then drop the URL so only the path and the dummies remain.
df_dum = df_paths.merge(df_dum, how='inner', on='img_url').drop(['img_url'], axis=1)

In [None]:
# Preview the first rows of the final image-path / dummy-variable frame.
df_dum.head()

In [None]:
# Write the result as a tab-separated, UTF-8 encoded file without the row index.
# The os.path.join call is now closed before the to_csv keyword arguments.
df_dum.to_csv(os.path.join(data_path, 'img_attr.csv'), sep='\t', encoding='utf-8', index=False)