In [54]:
import pandas as pd
import numpy as np
import math
import shutil

In [55]:
hnm_file_format = 'data/HnM_{}.parquet'
hnm_img_file = 'images/HnM_{}{}'
cna_file_format = 'data/CnA_{}.parquet'
cna_img_file = 'images/CnA_{}{}'
cna_img_file_test = 'images_test/CnA_{}{}'

In [4]:
cats = pd.read_parquet(cna_file_format.format('Categories'))
filters = pd.read_parquet(cna_file_format.format('Filters'))
items = pd.read_parquet(cna_file_format.format('Items'))
attributes = pd.read_parquet(cna_file_format.format('FilterAttributes'))

In [5]:
cats.head()

Unnamed: 0,Category,URL,Parent_Category,Top_Category,Skip
2,Kleider & Jumpsuits,https://www.c-and-a.com/de/de/shop/damen-bekle...,,Women,False
3,Blusen,https://www.c-and-a.com/de/de/shop/damen-bekle...,,Women,False
4,T-Shirts & Tops,https://www.c-and-a.com/de/de/shop/damen-bekle...,,Women,False
5,Pullover & Strickjacken,https://www.c-and-a.com/de/de/shop/damen-bekle...,,Women,False
6,Jeans,https://www.c-and-a.com/de/de/shop/damen-bekle...,,Women,False


### Mapping category names between CnA and HnM

In [7]:
## load the data frame
df = pd.read_parquet(hnm_file_format.format('Final_Input_Df'))
df['Category_Name'] = df['Category_Name'].astype('category')
df['Category_Name_Code'] = df['Category_Name'].cat.codes

In [11]:
df['Category_Name'].unique().tolist()

['Women_Shirts-&-Tops',
 'Women_Hemden-&-Blusen',
 'Women_Cardigans-&-Pullover',
 'Women_Jacken-&-Mäntel',
 'Women_Hosen',
 'Women_Jeans',
 'Women_Shorts',
 'Women_Röcke',
 'Women_Kleider',
 'Women_Schuhe',
 'Women_Accessoires',
 'Women_Bademode',
 'Women_Unterwäsche',
 'Women_Nachtwäsche',
 'Women_Socken-&-Strumpfhosen',
 'Women_Sportbekleidung',
 'Women_Beauty',
 'Women_Hoodies-&-Sweatshirts',
 'Men_Hoodies-&-Sweatshirts',
 'Men_Cardigans-&-Pullover',
 'Men_Hemden',
 'Men_T-Shirts',
 'Men_Hosen',
 'Men_Schuhe',
 'Men_Accessoires',
 'Baby_Kleidung',
 'Baby_Accessoires',
 'Kids_Kleidung',
 'Kids_Outdoor',
 'Kids_Accessoires',
 'Kids_Schuhe']

In [20]:
cat_mapping = {
    'Women_Kleider & Jumpsuits': 'Women_Kleider',
    'Women_Blusen': 'Women_Hemden-&-Blusen',
    'Women_T-Shirts & Tops': 'Women_Shirts-&-Tops',
    'Women_Pullover & Strickjacken': 'Women_Cardigans-&-Pullover',
    'Women_Jeans': 'Women_Jeans',
    'Women_Shorts': 'Women_Shorts',
    'Women_Hosen': 'Women_Hosen',
    'Women_Röcke': 'Women_Röcke',
    'Women_Sportbekleidung': 'Women_Sportbekleidung',
    'Women_Loungewear': None,
    'Women_Jacken': 'Women_Jacken-&-Mäntel',
    'Women_Blazer': 'Women_Jacken-&-Mäntel',
    'Women_Bademode': 'Women_Bademode',
    'Women_Unterwäsche': 'Women_Unterwäsche',
    'Women_Nachtwäsche': 'Women_Nachtwäsche',
    'Women_Socken & Strumpfhosen': 'Women_Socken-&-Strumpfhosen',
    'Women_Basics': None,
    'Women_Business-Mode': None,
    'Women_Abendmode': None,
    'Women_Kaschmir': None,
    'Men_T-Shirts & Polos': 'Men_T-Shirts',
    'Men_Hemden': 'Men_Hemden',
    'Men_Sweatshirts & Sweatjacken': 'Men_Hoodies-&-Sweatshirts',
    'Men_Pullover & Strickjacken': 'Men_Cardigans-&-Pullover',
    'Men_Hosen': 'Men_Hosen',
    'Men_Jeans': 'Men_Hosen',
    'Men_Shorts': None,
    'Men_Anzüge': None,
    'Men_Jacken': None,
    'Men_Sportbekleidung': None,
    'Men_Bademode': None,
    'Men_Pyjamas': None,
    'Men_Unterwäsche': None,
    'Men_Socken': None,
    'Men_Basics': None,
    'Baby_Outfits': 'Baby_Kleidung',
    'Baby_Oberteile': None,
    'Baby_Unterteile': None,
    'Baby_Kleider & Röcke': 'Baby_Kleidung',
    'Baby_Multipacks': None,
    'Baby_Bademode': None,
    'Baby_Nachtwäsche': None,
    'Baby_Socken & Strumpfhosen': None,
    'Baby_Accessoires': 'Baby_Accessoires',
    'Baby_Schuhe': None,
    'Baby_Jacken': None,
    'Baby_Schneeanzüge': None,
    'Kids-Girls_Multipacks': None,
    'Kids-Girls_T-Shirts & Blusen': None,
    'Kids-Girls_Kleider & Röcke': 'Kids_Kleidung',
    'Kids-Girls_Shorts': None,
    'Kids-Girls_Jeans': None,
    'Kids-Girls_Hosen': None,
    'Kids-Girls_Wäsche': None,
    'Kids-Girls_Sets': None,
    'Kids-Girls_Bademode': None,
    'Kids-Girls_Socken': None,
    'Kids-Girls_Pullover & Sweatshirts': None,
    'Kids-Girls_Jacken': None,
    'Kids-Girls_Accessoires': 'Kids_Accessoires',
    'Kids-Girls_Skibekleidung': None,
    'Kids-Girls_Basics': None,
    'Kids-Boys_Multipacks': None,
    'Kids-Boys_T-Shirts & Hemden': None,
    'Kids-Boys_Shorts': None,
    'Kids-Boys_Jeans': None,
    'Kids-Boys_Hosen': None,
    'Kids-Boys_Wäsche': None,
    'Kids-Boys_Sets': None,
    'Kids-Boys_Bademode': None,
    'Kids-Boys_Pullover & Sweatshirts': None,
    'Kids-Boys_Socken': None,
    'Kids-Boys_Jacken': None,
    'Kids-Boys_Accessoires': 'Kids_Accessoires',
    'Kids-Boys_Skibekleidung': None,
    'Kids-Boys_Basics': None
}

In [16]:
cats['Cat_Name'] = cats.apply(lambda row: f"{row['Top_Category']}_{row['Category']}", axis=1)

In [22]:
cats['Cat_Name_Mapped'] = cats.apply(lambda row: cat_mapping.get(row['Cat_Name'], None), axis=1)

In [29]:
cats.isna().sum()

Category             0
URL                  0
Parent_Category     76
Top_Category         0
Skip                 0
Cat_Name             0
Cat_Name_Mapped    467
dtype: int64

In [37]:
parent_cats_mapped = cats[cats['Parent_Category'].isna()][~cats['Cat_Name_Mapped'].isna()].index.tolist()

  parent_cats_mapped = cats[cats['Parent_Category'].isna()][~cats['Cat_Name_Mapped'].isna()].index.tolist()


### Getting some items

In [28]:
# add parent category into the items dataframe
items = items.merge(cats[['Parent_Category', 'Cat_Name_Mapped']], how='left', left_on='Category', right_index=True)

In [51]:
items[items['Parent_Category'].isin(parent_cats_mapped)].groupby('Parent_Category')['Code'].count().min()

56

In [52]:
sub_items = items[items['Parent_Category'].isin(parent_cats_mapped)].groupby('Parent_Category').sample(50)

In [53]:
sub_items.shape

(1350, 7)

In [63]:
# sub_items = sub_items.merge(cats['Cat_Name_Mapped'], how='left', left_on='Parent_Category', right_index=True)

In [68]:
sub_items

Unnamed: 0,Code,Name,URL,Img_URL,Img_Ext,Category,Parent_Category,Cat_Name_Mapped
416,2158240_1,CLOCKHOUSE - Kleid - kariert,https://www.c-and-a.com/de/de/shop/clockhouse-...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,75.0,2.0,Women_Kleider
328,2173987_2,CLOCKHOUSE - Kleid - geblümt,https://www.c-and-a.com/de/de/shop/clockhouse-...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,75.0,2.0,Women_Kleider
116,2165021_2,Strickkleid,https://www.c-and-a.com/de/de/shop/strickkleid...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,73.0,2.0,Women_Kleider
658,2155417_1,Kleid,https://www.c-and-a.com/de/de/shop/kleid-21554...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,81.0,2.0,Women_Kleider
164,2130669_1,Strickkleid,https://www.c-and-a.com/de/de/shop/strickkleid...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,73.0,2.0,Women_Kleider
...,...,...,...,...,...,...,...,...
41668,2161629_1,Sonnenbrille,https://www.c-and-a.com/de/de/shop/sonnenbrill...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,1006.0,882.0,Kids_Accessoires
41675,2182663_1,Slazenger - Sneaker,https://www.c-and-a.com/de/de/shop/slazenger-s...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,1009.0,882.0,Kids_Accessoires
41650,2149725_1,Paw Patrol - Set - Mütze und Handschuhe - 2 te...,https://www.c-and-a.com/de/de/shop/paw-patrol-...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,1001.0,882.0,Kids_Accessoires
41646,2110896_1,Paw Patrol - Mütze,https://www.c-and-a.com/de/de/shop/paw-patrol-...,https://www.c-and-a.com/productimages/b_rgb:EB...,.jpg,1000.0,882.0,Kids_Accessoires


### Copy these images into a separate folder and export the dataframe as a test df

In [58]:
i = 0
for idx, row in sub_items.iterrows():
    src = cna_img_file.format(row['Code'], row['Img_Ext'])
    dst = cna_img_file_test.format(row['Code'], row['Img_Ext'])
    shutil.copy(src, dst)
    i += 1
    if i % 100 == 0:
        print(f"Completed {i}")

Completed 100
Completed 200
Completed 300
Completed 400
Completed 500
Completed 600
Completed 700
Completed 800
Completed 900
Completed 1000
Completed 1100
Completed 1200
Completed 1300


In [70]:
sub_items.to_parquet(cna_file_format.format('Final_Df_Test'))