# Clean Metadata

##  Imports

In [5]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path  # for OS-safe path handling
import os
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

In [6]:
# Ignore the warning 'A value is trying to be set on a copy of a slice from a DataFrame'
pd.options.mode.chained_assignment = None

In [7]:
#run_timestamp = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}'
#image_folder = Path('X:\ImageClassifier\ResizedImages')

image_folder = '/home/ubuntu/efs/original-data/images-small-new'


## Pipeline

#### Load raw data from pickle

In [8]:
#path_pickle = Path('data') / 'parsed_records_dump_20200103.p'
path_pickle='parsed_records_dump_20200103.p'
parsed_records = pickle.load(open(path_pickle, 'rb'))

In [9]:
#pickle.dump(parsed_records, open( Path('data') / f'metadata_{datetime.now().strftime("%Y%m%d_%H%M%S")}.p', 'wb' ))

#### Convert to dataframe

In [10]:
meta_data_columns = [
    'globus_id',
    'descr',
    'name',
    'gender',
    'source_color',
    'color',
    'url',
    'material',
    'features',
    'season',
    'hierarchy_full',
    #     'hierarchy_1','hierarchy_2', 'hierarchy_3', 'hierarchy_4', 'hierarchy_5', 'hierarchy_6',
    'href',
    'prod_id'
]

In [11]:
df_metadata = pd.DataFrame(parsed_records, columns=meta_data_columns)

In [12]:
df_metadata

Unnamed: 0,globus_id,descr,name,gender,source_color,color,url,material,features,season,hierarchy_full,href,prod_id
0,367762.0,Buch,Globi,kinder,,,https://apitest.censhare.globus.ch/image/36776...,[],"[globus:pim.heimhaushalt.sprache-deutsch, glob...",B,globus:pim.category.kinder.spielwarencdbuecher,https://apitest.censhare.globus.ch/products/53...,10665956000000
1,367760.0,Buch,Globi,kinder,,,https://apitest.censhare.globus.ch/image/36776...,[],"[globus:pim.heimhaushalt.sprache-deutsch, glob...",B,globus:pim.category.kinder.spielwarencdbuecher,https://apitest.censhare.globus.ch/products/53...,10665992000000
2,367726.0,Buch,H&H No Name,kinder,,,https://apitest.censhare.globus.ch/image/36772...,[],"[globus:pim.heimhaushalt.sprache-deutsch, glob...",B,globus:pim.category.kinder.spielwarencdbuecher,https://apitest.censhare.globus.ch/products/53...,10665994000000
3,,mascaras,Diorshow Mascara,,,,,[],"[globus:pim.beauty.naturkosmetik-False, globus...",B,globus:pim.category.beauty.make-up,https://apitest.censhare.globus.ch/products/53...,BP106660110011298
4,492739.0,HoerbuchCD,Globi,kinder,,,https://apitest.censhare.globus.ch/image/49273...,[globus:pim.general.material-pim-kunststoff],"[globus:pim.heimhaushalt.sprache-deutsch, glob...",B,globus:pim.category.kinder.spielwarencdbuecher,https://apitest.censhare.globus.ch/products/53...,10666015000000
5,22870.0,,Redwood Creek Chardonnay,,,,https://apitest.censhare.globus.ch/image/22870...,[],"[globus:pim.general.ursprungsland-US, globus:p...",B,globus:pim.category.delicatessa.getranke.weine,https://apitest.censhare.globus.ch/products/53...,10666173000000
6,22876.0,,"Redwood Creek, Merlot 2012",,,,https://apitest.censhare.globus.ch/image/22876...,[],"[globus:pim.general.ursprungsland-US, globus:p...",B,globus:pim.category.delicatessa.getranke.weine,https://apitest.censhare.globus.ch/products/53...,10666175000000
7,393497.0,serum,Powercell Serum,,,,https://apitest.censhare.globus.ch/image/39349...,[],"[globus:pim.beauty.naturkosmetik-False, globus...",B,globus:pim.category.beauty.pflege,https://apitest.censhare.globus.ch/products/53...,10666523000000
8,37326.0,toilettenbursten,Ersatz-Toilettenbürstenkopf,,weiss,weiss,https://apitest.censhare.globus.ch/image/37326...,[globus:pim.general.material-pim-kunststoff],[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.heim-haushalt.bad.badacces...,https://apitest.censhare.globus.ch/products/53...,1066657600000
9,393501.0,serum,Powercell Serum,,,,https://apitest.censhare.globus.ch/image/39350...,[],"[globus:pim.beauty.naturkosmetik-False, globus...",B,globus:pim.category.beauty.pflege,https://apitest.censhare.globus.ch/products/53...,10666618000000


In [13]:
len(df_metadata['globus_id'].unique())

141405

#### Treat material as a feature

In [14]:
df_metadata['features'] = df_metadata['material'] + df_metadata['features']
df_metadata.drop(['material'], axis=1, inplace=True)

#### Drop all rows that do not have: url/features/hierarchy/descr


In [15]:
df_metadata.dropna(subset=['url'], inplace=True)
df_metadata.dropna(subset=['features'], inplace=True)
df_metadata.dropna(subset=['hierarchy_full'], inplace=True)
df_metadata.dropna(subset=['descr'], inplace=True)
df_metadata.dropna(subset=['source_color'], inplace=True)

#### Delete duplicates in features.


In [16]:
df_metadata['features'] = df_metadata.apply(
    lambda row: list(set(row['features'])), axis=1)

#### Make GlobusId an INT

In [17]:
df_metadata['globus_id'] = df_metadata['globus_id'].astype('int')

In [18]:
df_metadata

Unnamed: 0,globus_id,descr,name,gender,source_color,color,url,features,season,hierarchy_full,href,prod_id
8,37326,toilettenbursten,Ersatz-Toilettenbürstenkopf,,weiss,weiss,https://apitest.censhare.globus.ch/image/37326...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.heim-haushalt.bad.badacces...,https://apitest.censhare.globus.ch/products/53...,1066657600000
13,37328,zahnburstenbehalter,Zahnbürstenbehälter,,grau,ton,https://apitest.censhare.globus.ch/image/37328...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.heim-haushalt.bad.badacces...,https://apitest.censhare.globus.ch/products/53...,1066668300015
15,37338,seifenschalen,Seifenschale aus,,grau,ton,https://apitest.censhare.globus.ch/image/37338...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.heim-haushalt.bad.badacces...,https://apitest.censhare.globus.ch/products/53...,1066669100015
32,26678,jeansregularfit,G-STAR,damen,blau,dunkelblau,https://apitest.censhare.globus.ch/image/26678...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damen.jeans,https://apitest.censhare.globus.ch/products/53...,1066588200530
35,26653,umhangetasche,MICHAEL MICHAEL KORS,damen,blau,navy,https://apitest.censhare.globus.ch/image/26653...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800528
36,26663,umhangetasche,MICHAEL MICHAEL KORS,damen,schwarz,schwarz,https://apitest.censhare.globus.ch/image/26663...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800739
37,26665,umhangetasche,MICHAEL MICHAEL KORS,damen,braun,braun,https://apitest.censhare.globus.ch/image/26665...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800820
38,26655,umhangetasche,MICHAEL MICHAEL KORS,damen,grün,grün,https://apitest.censhare.globus.ch/image/26655...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800620
39,38482,umhangetasche,MICHAEL MICHAEL KORS,damen,grün,hellgrün,https://apitest.censhare.globus.ch/image/38482...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800601
40,46584,umhangetasche,MICHAEL MICHAEL KORS,damen,weiss,weiss,https://apitest.censhare.globus.ch/image/46584...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800000


#### Split hierarchy into multiple hierarchy-levels


In [19]:
df_tmp = df_metadata['hierarchy_full'].apply(lambda x:
                                             (pd.Series(x.split('.')))[2:])

hierarchy_columns = [
    f'hierarchy_{col+1}' for col in range(len(df_tmp.columns))
]
df_metadata[hierarchy_columns] = df_tmp

#### Only use items that are connected to FASHION

Manual definition of categories to be included in training the model.
These categories were selected since they belong to clothes, have proper
data available and occur often enough. Thus, smaller categories were
excluded but could be included in further development of the algorithm.
For the features, this is handled by the function "listRareFeat" that
returns a list of the features with less than 1'000 occurences, which are
also excluded.

In [20]:
categories_fashion = [
    'taschen', 'schuhe', 'pullover-strick', 'top-shirts-sweats', 'schals',
    'hemden', 'hosen', 'oberteile', 'blusen-tuniken', 'shirts-tops-sweats',
    'kleider'
]

In [21]:
df_clean = df_metadata[df_metadata['hierarchy_2'].isin(categories_fashion)]

####   Fill the gender gaps
Since only gender are missing for lingerie, a simple "fillna" command can be used.

In [22]:
df_clean['gender'].fillna('damen', inplace=True)

#### Fill missing color with source_color

In [23]:
df_clean['color'].fillna(df_clean['source_color'], inplace=True)

#### Extract pattern

In [24]:
fashion_patterns = [
    'gestreift', 'klein gemustert', 'kariert', 'karo', 'Glattleder', 'Lack'
]

In [25]:
def pattern_from_color(color):
    """Check if the color contains one of the specified patterns. else, return NaN"""
    for pattern in fashion_patterns:
        if pattern in color:
            return [pattern]

    return []

In [26]:
df_clean['pattern'] = df_clean['color'].apply(pattern_from_color)

#### Get clean color

In [27]:
def clean_color(color):
    """ Method to remove pattern description from the color. """

    # If a pattern is in the color, remove it.
    for pattern in fashion_patterns:
        color = color.replace(pattern, '')

    return [color]

In [28]:
df_clean['color_clean'] = df_clean['color'].apply(clean_color)

#### Get cleaned seasons

In [29]:
def clean_season(season):
    """Method to transform the season letter into a word."""
    if season == 'W':
        return ['winter']
    elif season == 'S':
        return ['sommer']
    elif season == 'B':
        return ['beidesaison']
    else:
        return []

In [30]:
df_clean['season_clean'] = df_clean['season'].apply(clean_season)

#### Get Fit-Type

In [31]:
fashion_fit_types = [
    'slimfit', 'regularfit', 'skinnyfit', 'loosfit', 'taperedfit', 'tailliert'
]

In [32]:
def get_fit_type(descr):
    """
    Method to find the fit-type if provided in the string
    """
    for fit_type in fashion_fit_types:
        if fit_type in descr:
            return [fit_type]
    return []

In [33]:
df_clean['fit_type'] = df_clean['descr'].apply(get_fit_type)

In [34]:
df_clean

Unnamed: 0,globus_id,descr,name,gender,source_color,color,url,features,season,hierarchy_full,href,prod_id,hierarchy_1,hierarchy_2,hierarchy_3,pattern,color_clean,season_clean,fit_type
35,26653,umhangetasche,MICHAEL MICHAEL KORS,damen,blau,navy,https://apitest.censhare.globus.ch/image/26653...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800528,damenaccessoires,taschen,,[],[navy],[beidesaison],[]
36,26663,umhangetasche,MICHAEL MICHAEL KORS,damen,schwarz,schwarz,https://apitest.censhare.globus.ch/image/26663...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800739,damenaccessoires,taschen,,[],[schwarz],[beidesaison],[]
37,26665,umhangetasche,MICHAEL MICHAEL KORS,damen,braun,braun,https://apitest.censhare.globus.ch/image/26665...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800820,damenaccessoires,taschen,,[],[braun],[beidesaison],[]
38,26655,umhangetasche,MICHAEL MICHAEL KORS,damen,grün,grün,https://apitest.censhare.globus.ch/image/26655...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800620,damenaccessoires,taschen,,[],[grün],[beidesaison],[]
39,38482,umhangetasche,MICHAEL MICHAEL KORS,damen,grün,hellgrün,https://apitest.censhare.globus.ch/image/38482...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800601,damenaccessoires,taschen,,[],[hellgrün],[beidesaison],[]
40,46584,umhangetasche,MICHAEL MICHAEL KORS,damen,weiss,weiss,https://apitest.censhare.globus.ch/image/46584...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800000,damenaccessoires,taschen,,[],[weiss],[beidesaison],[]
46,26647,umhangetasche,MICHAEL MICHAEL KORS,damen,beige,sand,https://apitest.censhare.globus.ch/image/26647...,[globus:pim.general.produktbeschreibungextern-...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800012,damenaccessoires,taschen,,[],[sand],[beidesaison],[]
71,322479,winterschuhe,UGG,kinder,weiss,ecru,https://apitest.censhare.globus.ch/image/32247...,[globus:pim.accessoires.verschlussschuhe-zumsc...,W,globus:pim.category.kinder.schuhe,https://apitest.censhare.globus.ch/products/53...,1066697900003,kinder,schuhe,,[],[ecru],[winter],[]
84,36001,hemdregularfit,TOMMY HILFIGER,herren,blau,navy,https://apitest.censhare.globus.ch/image/36001...,[globus:pim.damenfashion.navmusterung-gemuster...,W,globus:pim.category.herren.hemden,https://apitest.censhare.globus.ch/products/53...,1066570700528,herren,hemden,,[],[navy],[winter],[regularfit]
112,36113,chinohose,TOMMY HILFIGER,herren,rot,bordeaux,https://apitest.censhare.globus.ch/image/36113...,[globus:pim.damenfashion.bundhohe-tiefebundhoh...,W,globus:pim.category.herren.hosen,https://apitest.censhare.globus.ch/products/53...,1066575400330,herren,hosen,,[],[bordeaux],[winter],[]


#### Check if is "Bügelfrei"

In [35]:
def check_if_bugelfrei(descr):
    if 'bugelfrei' in descr:
        return ['bugelfrei']
    else:
        return []

In [36]:
df_clean['is_bugelfrei'] = df_clean['descr'].apply(check_if_bugelfrei)

#### Remove fit-type and bügelfrei from description

In [37]:
def clean_description(descr):
    """Remove keywords from the description string."""
    for keyword in fashion_fit_types + ['bugelfrei', 'ausseide', '3fur2']:
        descr = descr.replace(keyword, '')
    return descr

In [38]:
df_clean['descr_clean'] = df_clean['descr'].apply(clean_description)

#### Add new features (fit-type, pattern, bügelfrei, color_clean, season_clean) to features column

In [39]:
df_clean['features'] = df_clean['features'] + df_clean['fit_type'] + df_clean[
    'pattern'] + df_clean['is_bugelfrei'] + df_clean['color_clean'] + df_clean[
        'season_clean']

# convert color back from list to single entry
df_clean['color_clean'] = df_clean['color_clean'].apply(lambda x: x[0])

In [40]:
df_clean[df_clean['globus_id']==36001]

Unnamed: 0,globus_id,descr,name,gender,source_color,color,url,features,season,hierarchy_full,...,prod_id,hierarchy_1,hierarchy_2,hierarchy_3,pattern,color_clean,season_clean,fit_type,is_bugelfrei,descr_clean
84,36001,hemdregularfit,TOMMY HILFIGER,herren,blau,navy,https://apitest.censhare.globus.ch/image/36001...,[globus:pim.damenfashion.navmusterung-gemuster...,W,globus:pim.category.herren.hemden,...,1066570700528,herren,hemden,,[],navy,[winter],[regularfit],[],hemd


In [41]:
df_clean[df_clean['globus_id']==36001]['fit_type']

84    [regularfit]
Name: fit_type, dtype: object

In [42]:
pd.set_option('display.max_colwidth', -1)
df_clean[df_clean['globus_id']==36001]['features']

84    [globus:pim.damenfashion.navmusterung-gemustertodermitdruck, globus:pim.general.pflegehinweis.trocknen-1, globus:pim.damenfashion.armellange-langarm, globus:pim.general.material-CO, globus:pim.general.pflegehinweis.bleichen-0, globus:pim.general.pflegehinweis.buegeln-2, globus:pim.damenfashion.anlass-casual, globus:pim.general.pflegehinweis.chemReinigen-4, globus:pim.general.passform-001, globus:pim.herrenfashion.hemdenkragenform-buttondownkragen, globus:pim.damenfashion.musterung-karomuster, globus:pim.general.pflegehinweis.waschen-3, globus:pim.heimhaushalt.qualitaet-100baumwolle, regularfit, navy, winter]
Name: features, dtype: object

#### Standardize Hierarchy

In [43]:
df_clean['hierarchy_clean'] = df_clean['hierarchy_1'] + '/' + df_clean[
    'hierarchy_2'] + '/' + df_clean['descr_clean'] + '/' + df_clean[
        'globus_id'].apply(str)

#### Remove umlaute and special characters

In [44]:
# Skipped for now

#### Drop rare (<500 occurences) features

In [52]:

# create dummy variables out of all features
mlb = MultiLabelBinarizer()
df_dummy_features = pd.DataFrame(mlb.fit_transform(df_clean['features']),
                                 columns=mlb.classes_,
                                 index=df_clean.index)
print(f'All: {len(df_dummy_features.columns)}')
# drop all columns which occur less than 1000 times
df_dummy_features.drop(
    [col for col, val in df_dummy_features.sum().iteritems() if val < 1],
    axis=1,
    inplace=True)
print(f'Non-Rare: {len(df_dummy_features.columns)}')


All: 738
Non-Rare: 738


#### Concat the remaining features, clean prefix and typo

In [53]:
def existing_features_to_list(row, feature_names):
    feature_list = [
        feature_names[i] for i in range(len(feature_names)) if row[i] != 0
    ]
    # remove 'pim-' prefix, correct typo in mntel
    feature_list = [
        feature.replace('pim-', '').replace('mntel', 'mantel')
        for feature in feature_list
    ]
    return list(set(feature_list))

In [54]:
feature_names = df_dummy_features.columns
df_clean['features_common'] = df_dummy_features.apply(
    lambda row: existing_features_to_list(row, feature_names), axis=1)

#### Get rid of descriptions in features and vice versa.

In [55]:
def descr_in_feature(row):
    reduced_features = [
        f for f in row['features_common']
        if not row['descr_clean'] in f 
        and not f in row['descr_clean']
    ]
    return reduced_features

In [56]:
df_clean['features_clean'] = df_clean[['descr_clean', 'features_common'
                                       ]].apply(descr_in_feature, axis=1)

In [57]:
df_clean['features_clean'] = df_clean['features_clean'].apply(lambda x: [el.split('.')[-1] for el in x])

#### Define Image path

In [58]:
# Skipped, location is based on run timestamp and globus ID

# df_clean['image_path'] = df_clean['globus_id'].apply(
#     lambda x: image_folder / f'run_{run_timestamp}' / f'{x}.png')

root = '/home/ubuntu/efs/original-data/images-small-new'
df_indexed = df_clean.set_index(['globus_id'])

def pic_path(globus_id):
    """
    takes a globus id and returns a path to the 500x500 small .png
    """
    # index the df for quick lookup speed
    
    
    hier1 = df_indexed.loc[globus_id, 'hierarchy_1']
    hier2 = df_indexed.loc[globus_id, 'hierarchy_2']
    
    path = os.path.join(root, hier1, hier2, str(globus_id) +'.png')    
    
    return path


df_clean['image_path']= [pic_path(glob_id) for glob_id in df_clean['globus_id']]


## Find image paths that are valid and filter the df so that only valid paths remain

In [59]:
%%time
paths_valid=[os.path.exists(path) for path in df_clean['image_path']]

CPU times: user 216 ms, sys: 600 ms, total: 816 ms
Wall time: 45.7 s


In [60]:
np.sum(paths_valid)

33354

In [61]:
df_clean=df_clean[paths_valid]

#### Define Image Classification

Hierarchy without the lowest level (i.e. the Globus ID)

In [62]:
df_clean['img_class'] = df_clean['hierarchy_clean'].apply(
        os.path.dirname).str.replace('/', '_').str.replace('-', '')

#### Get unique colors

In [63]:
df_colors_unique = pd.DataFrame(data=df_clean['color_clean'].unique(),
                                columns=['colors_unique'])

#### Select relevant columns

In [64]:
relevant_columns = [
    'globus_id', 'hierarchy_clean', 'features_clean', 'color_clean', 'url', 'img_class', 'image_path', 'hierarchy_2'
]

# skipped image_path

In [65]:
df_final = df_clean[relevant_columns]

df_final.columns = ['globus_id', 'hierarchy', 'features', 'color', 'image_url', 'img_class', 'image_path', 'hierarchy_2']

In [66]:
df_final

Unnamed: 0,globus_id,hierarchy,features,color,image_url,img_class,image_path,hierarchy_2
35,26653,damenaccessoires/taschen/umhangetasche/26653,"[produktbeschreibungextern-False, verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, navy, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",navy,https://apitest.censhare.globus.ch/image/26653/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26653.png,taschen
36,26663,damenaccessoires/taschen/umhangetasche/26663,"[produktbeschreibungextern-False, verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, schwarz, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",schwarz,https://apitest.censhare.globus.ch/image/26663/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26663.png,taschen
37,26665,damenaccessoires/taschen/umhangetasche/26665,"[produktbeschreibungextern-False, verschlussartfashion-reisverschluss, braun, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",braun,https://apitest.censhare.globus.ch/image/26665/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26665.png,taschen
38,26655,damenaccessoires/taschen/umhangetasche/26655,"[produktbeschreibungextern-False, verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, grün, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",grün,https://apitest.censhare.globus.ch/image/26655/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26655.png,taschen
39,38482,damenaccessoires/taschen/umhangetasche/38482,"[produktbeschreibungextern-False, verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, hellgrün, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",hellgrün,https://apitest.censhare.globus.ch/image/38482/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/38482.png,taschen
40,46584,damenaccessoires/taschen/umhangetasche/46584,"[produktbeschreibungextern-False, weiss, verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",weiss,https://apitest.censhare.globus.ch/image/46584/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/46584.png,taschen
46,26647,damenaccessoires/taschen/umhangetasche/26647,"[produktbeschreibungextern-False, verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, sand, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",sand,https://apitest.censhare.globus.ch/image/26647/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26647.png,taschen
71,322479,kinder/schuhe/winterschuhe/322479,"[verschlussschuhe-zumschlupfen, decksohle-textil, produktbeschreibungextern-False, ecru, material-L10, formschuhe-runderschuhspitze, qualitaet-100wolle, laufsohle-leder]",ecru,https://apitest.censhare.globus.ch/image/322479/webp,kinder_schuhe_winterschuhe,/home/ubuntu/efs/original-data/images-small-new/kinder/schuhe/322479.png,schuhe
84,36001,herren/hemden/hemd/36001,"[navmusterung-gemustertodermitdruck, trocknen-1, armellange-langarm, material-CO, bleichen-0, buegeln-2, anlass-casual, chemReinigen-4, passform-001, navy, regularfit, musterung-karomuster, waschen-3, winter, qualitaet-100baumwolle]",navy,https://apitest.censhare.globus.ch/image/36001/webp,herren_hemden_hemd,/home/ubuntu/efs/original-data/images-small-new/herren/hemden/36001.png,hemden
112,36113,herren/hosen/chinohose/36113,"[bordeaux, bundhohe-tiefebundhohe, trocknen-1, chemReinigen-0, verschlussartfashion-reisverschluss, material-EL, navmusterung-unifarben, material-CO, bleichen-0, buegeln-2, musterung-unifarben, taschen-schragertascheneingriff, waschen-4, hosenlange-normalelange, qualitaet-naturfasermitelasthan, taschen-aufgesetztetaschen, passform-004, winter, hosenbeinform-geradegeschnittenesbein, navhosen-chinos]",bordeaux,https://apitest.censhare.globus.ch/image/36113/webp,herren_hosen_chinohose,/home/ubuntu/efs/original-data/images-small-new/herren/hosen/36113.png,hosen


In [67]:
df_final[df_final['globus_id']==36001]

Unnamed: 0,globus_id,hierarchy,features,color,image_url,img_class,image_path,hierarchy_2
84,36001,herren/hemden/hemd/36001,"[navmusterung-gemustertodermitdruck, trocknen-1, armellange-langarm, material-CO, bleichen-0, buegeln-2, anlass-casual, chemReinigen-4, passform-001, navy, regularfit, musterung-karomuster, waschen-3, winter, qualitaet-100baumwolle]",navy,https://apitest.censhare.globus.ch/image/36001/webp,herren_hemden_hemd,/home/ubuntu/efs/original-data/images-small-new/herren/hemden/36001.png,hemden


# Remove all colors and weird features

In [68]:
list_of_features_to_remove = ['blau', 'grau', 'navy','schwarz','produktbeschreibungextern-False','rot','weiss']

feature_filter = lambda features: [feature for feature in features if feature not in list_of_features_to_remove]

df_final['features'] = df_final.features.apply(feature_filter)

In [69]:
df_final

Unnamed: 0,globus_id,hierarchy,features,color,image_url,img_class,image_path,hierarchy_2
35,26653,damenaccessoires/taschen/umhangetasche/26653,"[verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",navy,https://apitest.censhare.globus.ch/image/26653/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26653.png,taschen
36,26663,damenaccessoires/taschen/umhangetasche/26663,"[verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",schwarz,https://apitest.censhare.globus.ch/image/26663/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26663.png,taschen
37,26665,damenaccessoires/taschen/umhangetasche/26665,"[verschlussartfashion-reisverschluss, braun, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",braun,https://apitest.censhare.globus.ch/image/26665/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26665.png,taschen
38,26655,damenaccessoires/taschen/umhangetasche/26655,"[verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, grün, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",grün,https://apitest.censhare.globus.ch/image/26655/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26655.png,taschen
39,38482,damenaccessoires/taschen/umhangetasche/38482,"[verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, hellgrün, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",hellgrün,https://apitest.censhare.globus.ch/image/38482/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/38482.png,taschen
40,46584,damenaccessoires/taschen/umhangetasche/46584,"[verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",weiss,https://apitest.censhare.globus.ch/image/46584/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/46584.png,taschen
46,26647,damenaccessoires/taschen/umhangetasche/26647,"[verschlussartfashion-reisverschluss, navmusterung-unifarben, musterung-unifarben, beidesaison, material-L1, sand, qualitaet-echtesleder, innenausstattung-kreditkartenfacher, navdamentaschen-umhangeschultertaschen]",sand,https://apitest.censhare.globus.ch/image/26647/webp,damenaccessoires_taschen_umhangetasche,/home/ubuntu/efs/original-data/images-small-new/damenaccessoires/taschen/26647.png,taschen
71,322479,kinder/schuhe/winterschuhe/322479,"[verschlussschuhe-zumschlupfen, decksohle-textil, ecru, material-L10, formschuhe-runderschuhspitze, qualitaet-100wolle, laufsohle-leder]",ecru,https://apitest.censhare.globus.ch/image/322479/webp,kinder_schuhe_winterschuhe,/home/ubuntu/efs/original-data/images-small-new/kinder/schuhe/322479.png,schuhe
84,36001,herren/hemden/hemd/36001,"[navmusterung-gemustertodermitdruck, trocknen-1, armellange-langarm, material-CO, bleichen-0, buegeln-2, anlass-casual, chemReinigen-4, passform-001, regularfit, musterung-karomuster, waschen-3, winter, qualitaet-100baumwolle]",navy,https://apitest.censhare.globus.ch/image/36001/webp,herren_hemden_hemd,/home/ubuntu/efs/original-data/images-small-new/herren/hemden/36001.png,hemden
112,36113,herren/hosen/chinohose/36113,"[bordeaux, bundhohe-tiefebundhohe, trocknen-1, chemReinigen-0, verschlussartfashion-reisverschluss, material-EL, navmusterung-unifarben, material-CO, bleichen-0, buegeln-2, musterung-unifarben, taschen-schragertascheneingriff, waschen-4, hosenlange-normalelange, qualitaet-naturfasermitelasthan, taschen-aufgesetztetaschen, passform-004, winter, hosenbeinform-geradegeschnittenesbein, navhosen-chinos]",bordeaux,https://apitest.censhare.globus.ch/image/36113/webp,herren_hosen_chinohose,/home/ubuntu/efs/original-data/images-small-new/herren/hosen/36113.png,hosen


### Write to CSV

In [70]:
# Code for Globus:
#df_final.to_csv(Path('data') / f'metadata_cleaned.csv')
#df_colors_unique.to_csv(Path('data') / f'colors_cleaned.csv')

df_final.to_csv('metadata_cleaned3.csv')
