In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import requests
from PIL import Image
from io import BytesIO
import os

In [2]:
# Load the two cleaned variants of the dataset: one where incomplete rows were
# dropped and one where missing values were imputed.
Drop_file_path = "Data/dataset_droped.csv"
df_drop = pd.read_csv(Drop_file_path)
Imputedfile_path = "Data/dataset_imputed.csv"
# Read the imputed file from its own path; previously this line re-read
# Drop_file_path, so df_imputed was just a second copy of df_drop.
df_imputed = pd.read_csv(Imputedfile_path)

# Data Preprocessing

In [3]:
# Columns whose cells are split on ';' into label lists and then multi-hot
# encoded further down in this notebook.
multi_label_columns = [
    'category',
    'style',
    'colors',
    'gender',
    'pattern',
    'occasion',
    'fit',
    'Type',
    'lenghth',
]

In [4]:
def transform_to_label_lists(df, columns):
    """Split ';'-separated label strings into Python lists, column by column.

    Args:
        df: DataFrame to convert. It is modified in place (pass a copy to keep
            the original untouched) and is also returned.
        columns: Names of the columns to convert.

    Returns:
        The same DataFrame object. Every listed column holds one list of label
        strings per row; missing cells (NaN/None) become empty lists.
    """
    for col in columns:
        # Record missing cells BEFORE casting to str: the cast can turn
        # NaN/None into the literal strings "nan"/"None", which would then be
        # kept as bogus labels instead of yielding an empty list.
        is_missing = df[col].isna()
        as_text = df[col].astype(str)
        df[col] = pd.Series(
            [
                [] if missing else text.split(";")
                for text, missing in zip(as_text, is_missing)
            ],
            index=df.index,
            dtype=object,
        )
    return df

In [5]:
# Work on a copy so df_imputed itself keeps its raw ';'-separated strings.
df_transformed_Imputed = transform_to_label_lists(
    df=df_imputed.copy(),
    columns=multi_label_columns,
)

In [6]:
# Work on a copy so df_drop itself keeps its raw ';'-separated strings.
df_transformed_Droped = transform_to_label_lists(
    df=df_drop.copy(),
    columns=multi_label_columns,
)

In [7]:
# Preview the first five rows of df_transformed_Imputed.
df_transformed_Imputed.head(n=5)

Unnamed: 0,image_url_1,category,style,colors,gender,pattern,occasion,barcode,brand,fit,Type,lenghth
0,https://fotos.skm.be/article/489205/489205_02_...,[shirt],[casual],"[magenta, white]",[female],[shapes],"[semi-formal, summer]",48920502040101,ETERNA,[Regular_Fit],"[straight-sleeve, high-neck]","[short, long-sleeve]"
1,https://fotos.skm.be/article/507927/507927_01_...,[sweater],[casual],"[coral, magenta]",[female],"[chevron, harlequin]",[winter],50792701020101,Geisha,[Regular_Fit],"[straight-sleeve, turtle-neck]","[short, long-sleeve]"
2,https://fotos.skm.be/article/505938/505938_01_...,[trousers_and_chinos],[business],[gray],[female],[houndstooth],[semi-formal],50593801040101,TONI,[Slim_Fit],"[high-waist, classic-trousers]",[full-length]
3,https://fotos.skm.be/article/487940/487940_01_...,[sweatshirt],[casual],"[bordeaux, gray]",[female],"[marled, varsity-striped]","[home-lounge, winter]",48794001050101,Rabe,[Regular_Fit],"[straight-sleeve, crew-neck]","[short, long-sleeve]"
4,https://fotos.skm.be/article/499751/499751_01_...,[shirt],[casual],"[beige, black]",[female],[geometric],"[semi-formal, summer]",49975101020101,ONLY,[Regular_Fit],[straight-sleeve],"[short, long-sleeve]"


In [8]:
# Preview the first five rows of df_transformed_Droped.
df_transformed_Droped.head(n=5)

Unnamed: 0,image_url_1,category,style,colors,gender,pattern,occasion,barcode,brand,fit,Type,lenghth
0,https://fotos.skm.be/article/489205/489205_02_...,[shirt],[casual],"[magenta, white]",[female],[shapes],"[semi-formal, summer]",48920502040101,ETERNA,[Regular_Fit],"[straight-sleeve, high-neck]","[short, long-sleeve]"
1,https://fotos.skm.be/article/507927/507927_01_...,[sweater],[casual],"[coral, magenta]",[female],"[chevron, harlequin]",[winter],50792701020101,Geisha,[Regular_Fit],"[straight-sleeve, turtle-neck]","[short, long-sleeve]"
2,https://fotos.skm.be/article/505938/505938_01_...,[trousers_and_chinos],[business],[gray],[female],[houndstooth],[semi-formal],50593801040101,TONI,[Slim_Fit],"[high-waist, classic-trousers]",[full-length]
3,https://fotos.skm.be/article/487940/487940_01_...,[sweatshirt],[casual],"[bordeaux, gray]",[female],"[marled, varsity-striped]","[home-lounge, winter]",48794001050101,Rabe,[Regular_Fit],"[straight-sleeve, crew-neck]","[short, long-sleeve]"
4,https://fotos.skm.be/article/499751/499751_01_...,[shirt],[casual],"[beige, black]",[female],[geometric],"[semi-formal, summer]",49975101020101,ONLY,[Regular_Fit],[straight-sleeve],"[short, long-sleeve]"


# Label encoding

In [9]:
def multi_hot_encode(df, columns):
    """Multi-hot encode the given label-list columns.

    A separate MultiLabelBinarizer is fitted per column. In the returned copy
    each encoded cell is the binarizer's output row converted to a plain list.

    Args:
        df: DataFrame whose listed columns hold an iterable of labels per row.
            It is not modified.
        columns: Names of the columns to encode.

    Returns:
        A tuple ``(encoded_df, encoders)``: the encoded copy of ``df`` and a
        dict mapping each column name to its fitted binarizer, which is needed
        to decode the column again.
    """
    result = df.copy()
    binarizers = {}

    for name in columns:
        binarizer = MultiLabelBinarizer()
        # Fit on the untouched input column and keep one list per row.
        result[name] = binarizer.fit_transform(df[name]).tolist()
        binarizers[name] = binarizer

    return result, binarizers

In [10]:
def multi_hot_decode(df_encoded, mlb_encoders):
    """Turn multi-hot encoded columns back into their labels.

    Args:
        df_encoded: DataFrame whose encoded columns hold one list of 0/1 flags
            per row. It is not modified.
        mlb_encoders: Dict mapping column name to the fitted binarizer that
            produced that column.

    Returns:
        A copy of ``df_encoded`` in which every column named in
        ``mlb_encoders`` holds whatever ``inverse_transform`` returns for each
        row (a tuple of labels in this notebook's printed output).
    """
    restored = df_encoded.copy()

    for name in mlb_encoders:
        # Stack the per-row flag lists into one array for the binarizer.
        flag_matrix = np.array(df_encoded[name].tolist())
        restored[name] = mlb_encoders[name].inverse_transform(flag_matrix)

    return restored


In [11]:
# Encode df_transformed_Droped, then decode it again with the same encoders so
# the round trip can be inspected in the next cell.
df_drop_encoded, mlb_encoders = multi_hot_encode(
    df=df_transformed_Droped,
    columns=multi_label_columns,
)
df_drop_decoded = multi_hot_decode(
    df_encoded=df_drop_encoded,
    mlb_encoders=mlb_encoders,
)


In [12]:
# Round-trip check on row 0 of 'colors': encoded vector, original labels,
# decoded labels (one per line).
print(
    df_drop_encoded['colors'][0],
    df_transformed_Droped['colors'][0],
    df_drop_decoded['colors'][0],
    sep="\n",
)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
['magenta', 'white']
('magenta', 'white')


In [13]:
# Encode df_transformed_Imputed. Note that mlb_encoders is rebound here, so from
# this point on it holds the encoders fitted on the imputed data.
df_imputed_encoded, mlb_encoders = multi_hot_encode(df_transformed_Imputed, multi_label_columns)
# Decode the imputed encoding itself. Previously df_drop_encoded was passed
# here, so the round-trip check below did not exercise the imputed data.
df_imputed_decoded = multi_hot_decode(df_imputed_encoded, mlb_encoders)

In [14]:
# Round-trip check on row 0 of 'colors': encoded vector, original labels,
# decoded labels (one per line).
print(
    df_imputed_encoded['colors'][0],
    df_transformed_Imputed['colors'][0],
    df_imputed_decoded['colors'][0],
    sep="\n",
)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
['magenta', 'white']
('magenta', 'white')


# Downloading the Images

In [15]:
def download_images_and_create_id(df, image_dir, image_col="image_url_1", barcode_col="barcode"):
    """Download one image per row and add ``Image_id`` / ``Image_Path`` columns.

    Each image is saved as ``<image_dir>/image_<barcode>.jpg``. Downloading is
    best effort: a row whose image cannot be fetched or saved gets ``None`` as
    its ``Image_Path`` and the reason is logged as a warning.

    Args:
        df: Input DataFrame. It is not modified.
        image_dir: Directory that receives the images; created if missing.
        image_col: Name of the column holding the image URL.
        barcode_col: Name of the column holding the barcode used in the id.

    Returns:
        A copy of ``df`` with ``Image_id`` as first and ``Image_Path`` as
        second column. Existing columns with those names are replaced, so the
        function can be re-run on its own output.
    """
    import logging  # local import keeps this cell self-contained

    logger = logging.getLogger(__name__)
    # Modes the JPEG writer accepts as-is; others (e.g. RGBA, P) are converted.
    jpeg_modes = ("RGB", "L", "CMYK")

    df = df.copy()
    os.makedirs(image_dir, exist_ok=True)

    image_ids = []
    image_paths = []

    # Iterate the two columns directly: iterrows() builds one Series per row
    # and may upcast an integer barcode to float ("123" -> "123.0").
    for image_url, raw_barcode in zip(df[image_col], df[barcode_col]):
        barcode = str(raw_barcode)
        image_id = f"image_{barcode}"
        image_path = os.path.join(image_dir, f"{image_id}.jpg")
        saved_path = None

        if isinstance(image_url, str) and image_url.strip():
            try:
                response = requests.get(image_url, timeout=10)
                if response.status_code == 200:
                    with Image.open(BytesIO(response.content)) as source:
                        # Converting drops transparency, but without it the
                        # JPEG save fails for such images.
                        img = source if source.mode in jpeg_modes else source.convert("RGB")
                        img.save(image_path)
                    saved_path = image_path
                else:
                    logger.warning(
                        "Download of %s failed with HTTP status %s",
                        image_url, response.status_code,
                    )
            except Exception as exc:
                # Deliberately broad: one bad image must not abort the run.
                logger.warning("Could not download/save %s: %s", image_url, exc)
        else:
            logger.warning("No image URL for %s; skipping download", image_id)

        image_ids.append(image_id)
        image_paths.append(saved_path)

    # Insert new columns at the beginning, replacing stale ones from a
    # previous run instead of raising on the duplicate name.
    new_columns = (("Image_id", image_ids), ("Image_Path", image_paths))
    df = df.drop(columns=[name for name, _ in new_columns if name in df.columns])
    for position, (name, values) in enumerate(new_columns):
        df.insert(position, name, values)

    return df

In [16]:
# Target directory for the images of df_imputed_encoded; created if missing.
# NOTE(review): the CSVs above are read from "Data/" (capital D) while this
# writes under "data/"; on a case-sensitive filesystem these are different
# directories - confirm which one is intended.
image_dir_imputed = "data/images_imputed"
os.makedirs(name=image_dir_imputed, exist_ok=True)

In [17]:
# Download the images and prepend the Image_id / Image_Path columns. The result
# is bound to the same name, so this cell changes df_imputed_encoded.
df_imputed_encoded = download_images_and_create_id(
    df=df_imputed_encoded,
    image_dir=image_dir_imputed,
)

In [18]:
# Target directory for the images of df_drop_encoded; created if missing.
image_dir_droped = "data/images_droped"
os.makedirs(name=image_dir_droped, exist_ok=True)

In [19]:
# Download the images and prepend the Image_id / Image_Path columns. The result
# is bound to the same name, so this cell changes df_drop_encoded.
df_drop_encoded = download_images_and_create_id(
    df=df_drop_encoded,
    image_dir=image_dir_droped,
)

In [21]:
# Persist both encoded frames without the index column. The encoded cells are
# Python lists, so CSV stores them as their text representation.
df_drop_encoded.to_csv(path_or_buf="data/df_drop_encoded.csv", index=False)
df_imputed_encoded.to_csv(path_or_buf="data/df_imputed_encoded.csv", index=False)