In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path  # for OS-safe path handling
import os
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

# Load the raw metadata

In [2]:
#Globus path:
#path_pickle = Path('data') / '../1_cleaning/parsed_records_dump_20200103.p'

path_pickle = '../../efs/parsed_records_dump_20200103.p'

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only load
# dumps that come from a trusted source; never point this at an untrusted file.
# The context manager guarantees the file handle is closed once loading finishes.
with open(path_pickle, 'rb') as pickle_file:
    parsed_records = pickle.load(pickle_file)

In [3]:
# Column names passed to the DataFrame constructor when df_metadata is built
# from parsed_records.
meta_data_columns = [
    'globus_id',
    'descr',
    'name',
    'gender',
    'source_color',
    'color',
    'url',
    'material',
    'features',
    'season',
    'hierarchy_full',
    #     'hierarchy_1','hierarchy_2', 'hierarchy_3', 'hierarchy_4', 'hierarchy_5', 'hierarchy_6',
    'href',
    'prod_id'
]

In [4]:
df_metadata = pd.DataFrame(parsed_records, columns=meta_data_columns)

In [5]:
df_metadata[['name','features', 'hierarchy_full']]

Unnamed: 0,name,features,hierarchy_full
0,Globi,"[globus:pim.heimhaushalt.sprache-deutsch, glob...",globus:pim.category.kinder.spielwarencdbuecher
1,Globi,"[globus:pim.heimhaushalt.sprache-deutsch, glob...",globus:pim.category.kinder.spielwarencdbuecher
2,H&H No Name,"[globus:pim.heimhaushalt.sprache-deutsch, glob...",globus:pim.category.kinder.spielwarencdbuecher
3,Diorshow Mascara,"[globus:pim.beauty.naturkosmetik-False, globus...",globus:pim.category.beauty.make-up
4,Globi,"[globus:pim.heimhaushalt.sprache-deutsch, glob...",globus:pim.category.kinder.spielwarencdbuecher
5,Redwood Creek Chardonnay,"[globus:pim.general.ursprungsland-US, globus:p...",globus:pim.category.delicatessa.getranke.weine
6,"Redwood Creek, Merlot 2012","[globus:pim.general.ursprungsland-US, globus:p...",globus:pim.category.delicatessa.getranke.weine
7,Powercell Serum,"[globus:pim.beauty.naturkosmetik-False, globus...",globus:pim.category.beauty.pflege
8,Ersatz-Toilettenbürstenkopf,[globus:pim.general.produktbeschreibungextern-...,globus:pim.category.heim-haushalt.bad.badacces...
9,Powercell Serum,"[globus:pim.beauty.naturkosmetik-False, globus...",globus:pim.category.beauty.pflege


In [6]:
def split_hierarchy(x):
    """Return the fourth dot-separated component of a hierarchy string.

    Parameters
    ----------
    x : str
        Dotted hierarchy path, e.g. 'globus:pim.category.kinder.schuhe'.

    Returns
    -------
    pandas.Series or str
        A Series holding the component at position 3 (empty Series when the
        path has fewer than four components), or '' when ``x`` is not a
        string (e.g. None / NaN).
    """
    try:
        return (pd.Series(x.split('.')))[3:4]
    except (AttributeError, TypeError):
        # Only non-string inputs are treated as "no hierarchy"; anything else
        # (including KeyboardInterrupt) must propagate instead of being hidden.
        return ''

# Exploratory check on the first 10 rows only; `dfx` is not referenced by any later cell shown here.
dfx = df_metadata['hierarchy_full'].head(10).apply(split_hierarchy)
                                             

# hierarchy_columns = [
#     f'hierarchy_{col+1}' for col in range(len(df_tmp.columns))
# ]
# df_metadata[hierarchy_columns] = df_tmp

In [7]:
# Discard every record that has a missing value in any of these fields.
required_fields = ['url', 'features', 'hierarchy_full', 'descr', 'source_color']
df_metadata.dropna(subset=required_fields, inplace=True)

## Get only Fashion Entries

In [8]:
# hierarchy_2 values that are treated as fashion categories when filtering df_metadata.
categories_fashion = [
    'taschen', 'schuhe', 'pullover-strick', 'top-shirts-sweats', 'schals',
    'hemden', 'hosen', 'oberteile', 'blusen-tuniken', 'shirts-tops-sweats',
    'kleider'
]

In [9]:
def _hierarchy_tail(path):
    """Split a dotted hierarchy path and drop its first two components."""
    return (pd.Series(path.split('.')))[2:]


# One column per remaining hierarchy level; rows with shorter paths are padded with NaN.
df_tmp = df_metadata['hierarchy_full'].apply(_hierarchy_tail)

# Name the levels hierarchy_1, hierarchy_2, ... in positional order.
hierarchy_columns = [
    f'hierarchy_{level}' for level in range(1, len(df_tmp.columns) + 1)
]
df_metadata[hierarchy_columns] = df_tmp

In [10]:
df_metadata[df_metadata['hierarchy_2'].isin( categories_fashion)]

Unnamed: 0,globus_id,descr,name,gender,source_color,color,url,material,features,season,hierarchy_full,href,prod_id,hierarchy_1,hierarchy_2,hierarchy_3
35,26653.0,umhangetasche,MICHAEL MICHAEL KORS,damen,blau,navy,https://apitest.censhare.globus.ch/image/26653...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800528,damenaccessoires,taschen,
36,26663.0,umhangetasche,MICHAEL MICHAEL KORS,damen,schwarz,schwarz,https://apitest.censhare.globus.ch/image/26663...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800739,damenaccessoires,taschen,
37,26665.0,umhangetasche,MICHAEL MICHAEL KORS,damen,braun,braun,https://apitest.censhare.globus.ch/image/26665...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800820,damenaccessoires,taschen,
38,26655.0,umhangetasche,MICHAEL MICHAEL KORS,damen,grün,grün,https://apitest.censhare.globus.ch/image/26655...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800620,damenaccessoires,taschen,
39,38482.0,umhangetasche,MICHAEL MICHAEL KORS,damen,grün,hellgrün,https://apitest.censhare.globus.ch/image/38482...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800601,damenaccessoires,taschen,
40,46584.0,umhangetasche,MICHAEL MICHAEL KORS,damen,weiss,weiss,https://apitest.censhare.globus.ch/image/46584...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800000,damenaccessoires,taschen,
46,26647.0,umhangetasche,MICHAEL MICHAEL KORS,damen,beige,sand,https://apitest.censhare.globus.ch/image/26647...,[globus:pim.general.material-L1],[globus:pim.accessoires.navdamentaschen-umhang...,B,globus:pim.category.damenaccessoires.taschen,https://apitest.censhare.globus.ch/products/53...,1066588800012,damenaccessoires,taschen,
71,322479.0,winterschuhe,UGG,kinder,weiss,ecru,https://apitest.censhare.globus.ch/image/32247...,[globus:pim.general.material-L10],"[globus:pim.accessoires.decksohle-textil, glob...",W,globus:pim.category.kinder.schuhe,https://apitest.censhare.globus.ch/products/53...,1066697900003,kinder,schuhe,
84,36001.0,hemdregularfit,TOMMY HILFIGER,herren,blau,navy,https://apitest.censhare.globus.ch/image/36001...,[globus:pim.general.material-CO],"[globus:pim.general.passform-001, globus:pim.g...",W,globus:pim.category.herren.hemden,https://apitest.censhare.globus.ch/products/53...,1066570700528,herren,hemden,
112,36113.0,chinohose,TOMMY HILFIGER,herren,rot,bordeaux,https://apitest.censhare.globus.ch/image/36113...,"[globus:pim.general.material-CO, globus:pim.ge...","[globus:pim.general.pflegehinweis.waschen-4, g...",W,globus:pim.category.herren.hosen,https://apitest.censhare.globus.ch/products/53...,1066575400330,herren,hosen,


# Build a dataframe with all the Globus attribute group names as column names:

In [11]:
def values_per_hierarchy2(features):
    """Turn a list of '<group>-<value>' strings into a {group: value} dict.

    Each entry is split on its first '-' only, so values may themselves
    contain hyphens. If the same group occurs more than once, the last
    value wins. An entry without any '-' raises IndexError.
    """
    pieces = (entry.split('-', 1) for entry in features)
    return {piece[0]: piece[1] for piece in pieces}

In [12]:
all_features=df_metadata['features']

In [13]:
exp_all_dictionaries = all_features.apply(values_per_hierarchy2)

In [14]:
# Expand each per-product dict into one row: dict keys become columns, dict values the cells.
exp_all_flat = exp_all_dictionaries.apply(pd.Series)

In [15]:
exp_all_flat.shape

(87055, 309)

In [16]:
exp_all_flat.columns[exp_all_flat.isin(['unifarben']).any()] 

Index(['globus:pim.damenfashion.musterung', 'globus:pim.damenfashion.navmusterung'], dtype='object')

# Load the feature names of the model's output layer and format them

In [17]:
# The CSV has no header row, so the column is named explicitly after loading.
features_csv_path = '../4_post_processing/features.csv'
features = pd.read_csv(features_csv_path, header=None)
features.columns = ['features']

## Split the unique feature names (the output labels of our model) into 2 parts: "attribute-group" and "attribute"

In [18]:
# Split every model output label into "<attribute_group>-<attribute>".
# Only the FIRST hyphen separates group from attribute (same rule as
# values_per_hierarchy2), so attributes that contain a hyphen are kept whole
# instead of being truncated at their second hyphen.
list_features = features['features']
attr_group = []
attributes = []

for feature_name in list_features:
    group, separator, attribute = feature_name.partition('-')
    if separator:
        attr_group.append(group)
        attributes.append(attribute)
    else:
        # Labels without a hyphen carry no group information.
        attr_group.append('attribute not known')
        attributes.append(feature_name)

In [19]:
features['attribute_group']=attr_group
features['attribute']=attributes

In [20]:
features.head()

Unnamed: 0,features,attribute_group,attribute
0,absatzart-blockabsatz,absatzart,blockabsatz
1,absatzart-flach,absatzart,flach
2,anlass-abend,anlass,abend
3,anlass-business,anlass,business
4,anlass-casual,anlass,casual


# Map the Globus attribute group names to the features

In [21]:

# For each attribute value in features['attribute'], collect the names of all
# exp_all_flat columns (Globus attribute groups) in which that value occurs at least once.
# Values that occur nowhere yield an empty Index.
list_groups = [
    exp_all_flat.columns[exp_all_flat.isin([attribute_value]).any()]
    for attribute_value in features['attribute']
]
        

In [22]:
features['globus_attr_group']=list_groups

In [23]:
features['globus_attr_group']=features['globus_attr_group'].apply(list)

In [24]:
features.head()

Unnamed: 0,features,attribute_group,attribute,globus_attr_group
0,absatzart-blockabsatz,absatzart,blockabsatz,[globus:pim.accessoires.absatzart]
1,absatzart-flach,absatzart,flach,[globus:pim.accessoires.absatzart]
2,anlass-abend,anlass,abend,[globus:pim.damenfashion.anlass]
3,anlass-business,anlass,business,[globus:pim.damenfashion.anlass]
4,anlass-casual,anlass,casual,[globus:pim.damenfashion.anlass]


## Cleaning is required, because the same feature name (e.g "unifarben") can be mapped to multiple globus attribute groups ('globus:pim.damenfashion.navmusterung' or 'globus:pim.damenfashion.musterung')

In [49]:
features

Unnamed: 0,features,attribute_group,attribute,globus_attr_group
0,absatzart-blockabsatz,absatzart,blockabsatz,[globus:pim.accessoires.absatzart]
1,absatzart-flach,absatzart,flach,[globus:pim.accessoires.absatzart]
2,anlass-abend,anlass,abend,[globus:pim.damenfashion.anlass]
3,anlass-business,anlass,business,[globus:pim.damenfashion.anlass]
4,anlass-casual,anlass,casual,[globus:pim.damenfashion.anlass]
5,anlass-freizeit,anlass,freizeit,[globus:pim.kinder.anlass]
6,anthrazit,attribute not known,anthrazit,[]
7,armabschluss-manschettenmiteinemknopf,armabschluss,manschettenmiteinemknopf,[globus:pim.damenfashion.armabschluss]
8,armabschluss-mitverstellbarenmanschetten,armabschluss,mitverstellbarenmanschetten,[globus:pim.damenfashion.armabschluss]
9,armelform-raglanarmel,armelform,raglanarmel,[globus:pim.damenfashion.armelform]


In [25]:
# Cleaning step: drop candidate Globus groups whose name does not contain the feature's attribute_group.
def cleaning_function(row):
    """Return the entries of row['globus_attr_group'] that contain row['attribute_group'].

    Matching is a plain substring test, so e.g. 'musterung' keeps both
    '...musterung' and '...navmusterung'.
    """
    return list(
        filter(
            lambda group_name: row['attribute_group'] in group_name,
            row['globus_attr_group'],
        )
    )
    

In [28]:
features.iloc[155]

features                                              trocknen-0
attribute_group                                         trocknen
attribute                                                      0
globus_attr_group    [globus:pim.general.pflegehinweis.trocknen]
Name: 155, dtype: object

In [26]:
features['globus_attr_group']=features.apply(cleaning_function, axis=1)

In [27]:
features.head()

Unnamed: 0,features,attribute_group,attribute,globus_attr_group
0,absatzart-blockabsatz,absatzart,blockabsatz,[globus:pim.accessoires.absatzart]
1,absatzart-flach,absatzart,flach,[globus:pim.accessoires.absatzart]
2,anlass-abend,anlass,abend,[globus:pim.damenfashion.anlass]
3,anlass-business,anlass,business,[globus:pim.damenfashion.anlass]
4,anlass-casual,anlass,casual,[globus:pim.damenfashion.anlass]


In [29]:
# None disables column-width truncation; the legacy -1 sentinel is deprecated
# and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
features[features['globus_attr_group'].str.len()>1]

Unnamed: 0,features,attribute_group,attribute,globus_attr_group
24,besonderheiten-applikationen,besonderheiten,applikationen,"[globus:pim.herrenfashion.besonderheiten, globus:pim.damenfashion.besonderheiten]"
26,besonderheiten-glitzerapplikationen,besonderheiten,glitzerapplikationen,"[globus:pim.accessoires.besonderheiten, globus:pim.damenfashion.besonderheiten]"
99,musterung-unifarben,musterung,unifarben,"[globus:pim.damenfashion.musterung, globus:pim.damenfashion.navmusterung]"


In [30]:
len(features[features['globus_attr_group'].str.len()>1])

3

In [31]:
features[features['globus_attr_group'].str.len()==0]

Unnamed: 0,features,attribute_group,attribute,globus_attr_group
6,anthrazit,attribute not known,anthrazit,[]
20,beidesaison,attribute not known,beidesaison,[]
21,beige,attribute not known,beige,[]
35,bordeaux,attribute not known,bordeaux,[]
36,braun,attribute not known,braun,[]
47,ecru,attribute not known,ecru,[]
52,futtermaterial-L18,futtermaterial,L18,[]
53,futtermaterial-PES,futtermaterial,PES,[]
54,futtermaterial-textil,futtermaterial,textil,[]
58,grün,attribute not known,grün,[]


In [32]:
len(features[features['globus_attr_group'].str.len()==0])

35

# These features (colors, material codes, seasons) have no globus attribute group name: 

In [33]:
exp_all_flat.columns[exp_all_flat.isin(['pink']).any()]

Index([], dtype='object')

In [34]:
exp_all_flat.columns[exp_all_flat.isin(['slimfit']).any()]

Index([], dtype='object')

In [35]:
exp_all_flat.columns[exp_all_flat.isin(['regularfit']).any()]

Index([], dtype='object')

In [36]:
exp_all_flat.columns[exp_all_flat.isin(['L18']).any()]

Index([], dtype='object')

In [37]:
exp_all_flat.columns[exp_all_flat.isin(['grün']).any()]

Index([], dtype='object')

# Keep only the features that can be unambiguously mapped to a single Globus attribute group:

In [38]:
features_with_attribute_group=features[features['globus_attr_group'].str.len()==1]

In [39]:
len(features_with_attribute_group)

146

In [41]:
features_with_attribute_group.head()

Unnamed: 0,features,attribute_group,attribute,globus_attr_group
0,absatzart-blockabsatz,absatzart,blockabsatz,[globus:pim.accessoires.absatzart]
1,absatzart-flach,absatzart,flach,[globus:pim.accessoires.absatzart]
2,anlass-abend,anlass,abend,[globus:pim.damenfashion.anlass]
3,anlass-business,anlass,business,[globus:pim.damenfashion.anlass]
4,anlass-casual,anlass,casual,[globus:pim.damenfashion.anlass]


In [None]:
# NOTE(review): the final cell of this notebook rebinds features_with_attribute_group to a
# different frame and writes it to this same filename, overwriting whatever is saved here.
features_with_attribute_group.to_csv('features_mapped_to_globus_attribute_groups.csv')

# Features with 0 or more than 1 attribute groups:

In [42]:
# Number of candidate Globus groups remaining per feature after cleaning.
group_counts = features['globus_attr_group'].str.len()

features_with_0_attribute_group = features[group_counts == 0]
features_with_multiple_attribute_groups = features[group_counts > 1]


In [43]:
features_with_0_attribute_group.to_csv('features_with_0_attribute_groups.csv')
features_with_multiple_attribute_groups.to_csv('features_with_multiple_attribute_groups.csv')

# Add the column with the Globus attribute group name and save the csv

In [44]:
# Keep the group list only where exactly one group remains; every other row gets ''.
has_single_group = features['globus_attr_group'].str.len() == 1
features['globus_attr_group_to_use'] = features['globus_attr_group'].where(has_single_group, '')


In [45]:
features_with_attribute_group=features[['features','attribute_group', 'attribute', 'globus_attr_group_to_use']]

In [46]:
features_with_attribute_group.shape

(184, 4)

In [47]:
features_with_attribute_group.to_csv('features_mapped_to_globus_attribute_groups.csv')