# Loading dataset and splitting into train, val and test

First, set the column width to be wider so that the DataFrame is displayed more clearly

In [None]:
import pandas as pd
# Raise the per-column display width limit to 500 so long text cells are not cut off when shown.
pd.set_option('display.max_colwidth', 500)
# Uncomment the next line to restore every pandas option to its default value.
#pd.reset_option('all')

Then it's time to load the dataset for preprocessing

In [None]:
from datasets import load_dataset

In [14]:
# Load the "train" split of the "product2query_V1" configuration of smartcat/Amazon_Sample_Metadata_2023.
dataset = load_dataset("smartcat/Amazon_Sample_Metadata_2023", name="product2query_V1", split="train")

In [30]:
# Turn the loaded dataset into a pandas DataFrame; all later cleaning steps operate on `df`.
df = pd.DataFrame(dataset)

Then, move the 'parent_asin' column to the first position, as it's the product index

In [31]:
# Put "parent_asin" first and keep every other column in its existing relative order.
new_column_order = ["parent_asin", *(name for name in df.columns if name != "parent_asin")]
df = df.loc[:, new_column_order]

In [32]:
# Preview the first three rows to confirm that parent_asin is now the leading column.
df.head(3)

Unnamed: 0,parent_asin,main_category,title,features,description,price,store,brand,manufacturer,short_query,long_query
0,B0C1ND3FVR,All Beauty,Murray & Lanman Florida Water Plastic Bottle 7.5 oz,Unisex\nNot tested on animals,"The enduring popularity of world-famous Florida Water is due to its wonderful light floral scent with lemon overtones. Unchanged since 1808, this revitalizing scent is used as an invigorating splash, aftershave, or fragrant addition to the bath. For more than 200 years men and women have loved the Florida Water scent, named for the Fountain of Youth.",,Murray & Lanman,Murray & Lanman,Atlas Ethnic,Florida Water Fragrance,Murray & Lanman Florida Water 7.5 oz Plastic Bottle with Classic Floral Scent
1,B001PIVFU0,All Beauty,"GiGi Professional Multi-Purpose Wax Warmer, with See-Through Cover, White",Thermostat-controlled heating,GiGi Professional Multi-Purpose Wax Warmer with See-Through Cover,25.87,GiGi,GiGi,Barg Engine,GiGi Wax Warmer,GiGi Professional Multi-Purpose Wax Warmer with See-Through Cover
2,B004ZWH3XG,All Beauty,"Garnier Fructis Color Sealer, Instant, Lightweight Leave-In, Color Shield, For Color-Treated Hair, 6 oz.",Seals In Color And Conditions For Extra Softness\nInfused With Acai Berry And Grape Seed Oil\nFor Color Treated Hair\nLightweight Formula,"Proven to stop dry-out. Fight fade-out. The lightweight formula, infused with acai berry and grape seed oil, instantly seals in color for longer-lasting vibrancy while improving manageability and delivering lasting softness. Uva and uvb protectant help protect hair. Garnier fructis color shield instant color sealer lightweight leave-in instantly seals in color and conditions for extra softness.",24.0,Garnier,Garnier LLC,Garnier LLC,Garnier Fructis Color Sealer,Garnier Fructis Color Shield Instant Color Sealer for Color-Treated Hair


Some helper functions to check missing values

In [33]:
def check_na(df):
    """
    Prints the number of missing (NA) values found in each column.

    Args:
       - df (DataFrame): the DataFrame chunk to inspect

    Prints:
       - A separator line followed by the per-column NA counts
         (all columns are listed, including those with zero NAs).
    """
    # isna() flags every missing cell; summing the boolean frame counts them per column.
    null_counts = df.isna().sum()
    print("--------------------")
    print("Nulls in dataset:\n", null_counts)

def check_empty_lists(df):
    """
    Prints sum of empty lists by columns in DataFrame.

    A cell is counted only when it is a ``list`` instance with zero elements.
    Cells of any other type (strings, numbers, None, arrays, ...) are never
    counted, which mirrors the type-checked test used by
    ``check_empty_dictionaries``.

    Parameters:
        - df (DataFrame): the DataFrame chunk

    Prints:
        - Number of empty lists by column (every column is listed, including
          columns whose count is zero).
    """
    def _is_empty_list(cell):
        # Check the type before testing emptiness. A bare `cell == []` is not
        # a scalar test for array-like cells: the comparison is carried out
        # element-wise and can raise or return a non-boolean result.
        return isinstance(cell, list) and len(cell) == 0

    # Inner apply flags each cell, sum() counts the flags, outer apply repeats per column.
    empty_list_counts = df.apply(lambda col: col.apply(_is_empty_list).sum())
    print("--------------------")
    print("Empty lists:")

    print(empty_list_counts)


def check_empty_strings(df):
    """
    Prints how many cells equal the empty string "" in each column.

    Parameters:
        - df (DataFrame): the DataFrame chunk

    Prints:
        - Number of empty strings by column, restricted to the columns
          that contain at least one empty string.
    """
    # Element-wise equality against "" yields a boolean frame; summing counts matches per column.
    blank_counts = df.eq("").sum()
    columns_with_blanks = blank_counts.loc[blank_counts > 0]
    print("--------------------")
    print("Empty strings:")
    print(columns_with_blanks)


def check_empty_dictionaries(df):
    """
    Prints how many cells hold an empty dictionary in each column.

    Parameters:
        - df (pd.DataFrame): The DataFrame to check for empty dictionaries.

    Prints:
        - Number of empty dictionaries by column, restricted to the columns
          that contain at least one empty dictionary.
    """
    def _count_empty_dicts(column):
        # A cell qualifies only if it is a dict AND has no entries.
        flags = column.apply(lambda cell: isinstance(cell, dict) and len(cell) == 0)
        return flags.sum()

    per_column_counts = df.apply(_count_empty_dicts)
    columns_with_empty_dicts = per_column_counts[per_column_counts > 0]

    print("--------------------")
    print("Empty dictionaries:")
    print(columns_with_empty_dicts)

In [34]:
# Run all four missing-value helpers on the freshly loaded DataFrame to see what needs cleaning.
check_na(df)
check_empty_lists(df)
check_empty_dictionaries(df)
check_empty_strings(df)

--------------------
Nulls in dataset:
 parent_asin           0
main_category         0
title                 0
features              0
description           0
price            143604
store                 0
brand             62970
manufacturer      62970
short_query           0
long_query            0
dtype: int64
--------------------
Empty lists:
parent_asin      0
main_category    0
title            0
features         0
description      0
price            0
store            0
brand            0
manufacturer     0
short_query      0
long_query       0
dtype: int64
--------------------
Empty dictionaries:
Series([], dtype: int64)
--------------------
Empty strings:
main_category    14537
title               22
features         10974
store              659
short_query        482
long_query         482
dtype: int64


We need to focus on products that have a title, so every row without one will be dropped.

In [41]:
# Rows whose title is the empty string.
df_empty_title = df.loc[df["title"] == '']

# Drop every row whose parent_asin appears among the empty-title rows.
empty_title_asins = df_empty_title["parent_asin"].tolist()
has_empty_title_asin = df["parent_asin"].isin(empty_title_asins)
df = df.loc[~has_empty_title_asin]

The same goes for 'short_query', because it will be our label for training

In [44]:
# Rows whose short_query is the empty string.
df_short_query = df.loc[df["short_query"] == '']

# Drop every row whose parent_asin appears among the empty-short_query rows.
empty_query_asins = df_short_query["parent_asin"].tolist()
has_empty_query_asin = df["parent_asin"].isin(empty_query_asins)
df = df.loc[~has_empty_query_asin]

In [45]:
# Re-run the empty-string check to see which columns still contain empty strings after the drops.
check_empty_strings(df)

--------------------
Empty strings:
main_category    14504
features         10956
store              659
dtype: int64


Empty categories are set to 'Unknown'

In [46]:
# Overwrite, in place, every empty-string main_category with the placeholder 'Unknown'.
df.loc[df['main_category'] == '', 'main_category'] = 'Unknown'

In [47]:
# Re-run the empty-string check to confirm main_category no longer contains empty strings.
check_empty_strings(df)

--------------------
Empty strings:
features    10956
store         659
dtype: int64


In [48]:
def analyze_ordinal_columns(df, process_columns=None, skip_columns=None):
    """
    Prints a distinct-value report for selected columns of a DataFrame.

    Parameters:
        - df (pd.DataFrame): The DataFrame to analyze.
        - process_columns (list): Column names to report on. When None, every
          column of df is a candidate.
        - skip_columns (list): Column names to leave out of the report. When
          None, nothing is skipped.

    Prints (for each reported column, in the order given):
        - A header naming the column.
        - The number of unique values (as computed by Series.nunique()).
        - The unique values themselves (as returned by Series.unique()).
        - One line per value with its number of occurrences
          (from Series.value_counts()).
        - A blank line separating it from the next column.
    """
    separator = "--------------------"
    excluded = [] if skip_columns is None else skip_columns
    candidates = df.columns.tolist() if process_columns is None else process_columns

    # Lazily drop the skipped names while keeping the caller's ordering.
    for name in (candidate for candidate in candidates if candidate not in excluded):
        print(separator)
        print(f"Analyzing column: '{name}'")
        print(separator)

        series = df[name]
        distinct_count = series.nunique()
        distinct_values = series.unique()

        print(f"  Number of unique values: {distinct_count}")
        print(separator)
        print(f"  Unique values: {distinct_values}")

        # Frequency table for the column, one line per value.
        occurrences = series.value_counts()
        print(separator)
        for value, count in occurrences.items():
            print(f"  Number of occurrences of '{value}': {count}")

        print()  # Blank line between consecutive column reports

In [49]:
# Report the unique values and their frequencies for the main_category column only.
analyze_ordinal_columns(df, ["main_category"])

--------------------
Analyzing column: 'main_category'
--------------------
  Number of unique values: 36
--------------------
  Unique values: ['All Beauty' 'Industrial & Scientific' 'Health & Personal Care' 'Baby'
 'AMAZON FASHION' 'Amazon Home' 'Arts, Crafts & Sewing'
 'Tools & Home Improvement' 'Unknown' 'Office Products'
 'Sports & Outdoors' 'Automotive' 'Grocery' 'All Electronics'
 'Car Electronics' 'Camera & Photo' 'Pet Supplies' 'Toys & Games'
 'Digital Music' 'Handmade' 'Cell Phones & Accessories' 'Computers'
 'Musical Instruments' 'Premium Beauty' 'Appliances' 'Movies & TV'
 'Amazon Devices' 'Books' 'Video Games' 'Home Audio & Theater'
 'Collectible Coins' 'Portable Audio & Accessories' 'GPS & Navigation'
 'Sports Collectibles' 'Collectibles & Fine Art' 'Amazon Fire TV']
--------------------
  Number of occurrences of 'AMAZON FASHION': 168511
  Number of occurrences of 'All Beauty': 36412
  Number of occurrences of 'Sports & Outdoors': 22845
  Number of occurrences of 'Unknow

### TODO:
- all categories that have under 150 products should go to test split
- check the percentage of matches between store, brand, manufacturer
- split the dataset
- shuffle the dataset
- upload the df to Hugging Face