# Loading dataset and splitting into train, val and test

First import the requiered libraries, and then set the column width to be wider so that the DataFrame is displayed more clearly

In [1]:
import pandas as pd
from datasets import DatasetDict, Dataset, load_dataset

pd.set_option('display.max_colwidth', 500)
#To reset this option, uncomment the line below and re-run the cell
#pd.reset_option('all')

Then it's time to load the dataset for preprocessing

In [2]:
#Load dataset from Hugging Face
dataset = load_dataset("smartcat/Amazon_Sample_Metadata_2023", name="product2query_V1", split="train")

Convert dataset into Pandas DataFrame for easier processing

In [3]:
#Convert dataset to pandas DataFrame
df = pd.DataFrame(dataset)

Then, set the column 'parent_asin' to be the first in line, as it's the product id.

In [4]:
new_column_order = ["parent_asin"] + [col for col in df.columns if col != "parent_asin"] 
df = df[new_column_order]

In [5]:
df.head(3)

Unnamed: 0,parent_asin,main_category,title,features,description,price,store,brand,manufacturer,short_query,long_query
0,B0C1ND3FVR,All Beauty,Murray & Lanman Florida Water Plastic Bottle 7.5 oz,Unisex\nNot tested on animals,"The enduring popularity of world-famous Florida Water is due to its wonderful light floral scent with lemon overtones. Unchanged since 1808, this revitalizing scent is used as an invigorating splash, aftershave, or fragrant addition to the bath. For more than 200 years men and women have loved the Florida Water scent, named for the Fountain of Youth.",,Murray & Lanman,Murray & Lanman,Atlas Ethnic,Florida Water Fragrance,Murray & Lanman Florida Water 7.5 oz Plastic Bottle with Classic Floral Scent
1,B001PIVFU0,All Beauty,"GiGi Professional Multi-Purpose Wax Warmer, with See-Through Cover, White",Thermostat-controlled heating,GiGi Professional Multi-Purpose Wax Warmer with See-Through Cover,25.87,GiGi,GiGi,Barg Engine,GiGi Wax Warmer,GiGi Professional Multi-Purpose Wax Warmer with See-Through Cover
2,B004ZWH3XG,All Beauty,"Garnier Fructis Color Sealer, Instant, Lightweight Leave-In, Color Shield, For Color-Treated Hair, 6 oz.",Seals In Color And Conditions For Extra Softness\nInfused With Acai Berry And Grape Seed Oil\nFor Color Treated Hair\nLightweight Formula,"Proven to stop dry-out. Fight fade-out. The lightweight formula, infused with acai berry and grape seed oil, instantly seals in color for longer-lasting vibrancy while improving manageability and delivering lasting softness. Uva and uvb protectant help protect hair. Garnier fructis color shield instant color sealer lightweight leave-in instantly seals in color and conditions for extra softness.",24.0,Garnier,Garnier LLC,Garnier LLC,Garnier Fructis Color Sealer,Garnier Fructis Color Shield Instant Color Sealer for Color-Treated Hair


Some helper functions to check missing values

In [6]:
def check_na(df):
    """_summary_
    Prints sum of NA by columns in DataFrame.

    Args:
       - df (DataFrame): the DataFrame chunk

    Prints:
       - Number of NA values by column.
    """
    NANs = df.isnull().sum()
    print("--------------------")
    print("Nulls in dataset:\n", NANs)

def check_empty_lists(df):
    """
    Prints sum of empty lists by columns in DataFrame.

    Parameters:
        - df (DataFrame): the DataFrame chunk

    Prints:
        - Number of empty lists by column.
    """
    empty_list_counts = df.apply(lambda col: col.apply(lambda x: x == []).sum())
    print("--------------------")
    print("Empty lists:")

    print(empty_list_counts)


def check_empty_strings(df):
    """
    Prints sum of empty strings by columns in DataFrame.

    Parameters:
        - df (DataFrame): the DataFrame chunk

    Prints:
        - Number of empty strings by column.
    """
    empty_strings = (df == "").sum()
    print("--------------------")
    print("Empty strings:")
    print(empty_strings[empty_strings > 0])


def check_empty_dictionaries(df):
    """
    Prints the count of empty dictionaries by column in the DataFrame.

    Parameters:
        - df (pd.DataFrame): The DataFrame to check for empty dictionaries.

    Prints:
        - Number of empty dictionaries by columns.
    """
    empty_dicts = df.apply(
        lambda x: x.apply(lambda y: isinstance(y, dict) and len(y) == 0).sum()
    )

    print("--------------------")
    print("Empty dictionaries:")
    print(empty_dicts[empty_dicts > 0])

In [7]:
check_na(df)
check_empty_lists(df)
check_empty_dictionaries(df)
check_empty_strings(df)

--------------------
Nulls in dataset:
 parent_asin           0
main_category         0
title                 0
features              0
description           0
price            143604
store                 0
brand             62970
manufacturer      62970
short_query           0
long_query            0
dtype: int64
--------------------
Empty lists:
parent_asin      0
main_category    0
title            0
features         0
description      0
price            0
store            0
brand            0
manufacturer     0
short_query      0
long_query       0
dtype: int64
--------------------
Empty dictionaries:
Series([], dtype: int64)
--------------------
Empty strings:
main_category    14537
title               22
features         10974
store              659
short_query        482
long_query         482
dtype: int64


We need to to focus on products that have a Title, and every row that doesn't have it will be dropped.

In [8]:
df_empty_title = df[df["title"] == '']
df_empty_title["parent_asin"]

df = df[~df["parent_asin"].isin(df_empty_title["parent_asin"].to_list())]

The same goes for 'short_query' because it will be our label for training.

In [9]:
df_short_query = df[df["short_query"] == '']
df_short_query["parent_asin"]

df = df[~df["parent_asin"].isin(df_short_query["parent_asin"].to_list())]

In [10]:
check_empty_strings(df)

--------------------
Empty strings:
main_category    14504
features         10956
store              659
dtype: int64


Empty categories are set to 'Unknown'

In [11]:
df.loc[df['main_category'] == '', 'main_category'] = 'Unknown'

In [12]:
check_empty_strings(df)

--------------------
Empty strings:
features    10956
store         659
dtype: int64


Analyzing ordinal columns

In [13]:
def analyze_ordinal_columns(df, process_columns=None, skip_columns=None):
    """
    Analyzes specified columns in the DataFrame for unique values and occurrences.

    Parameters:
        - df (pd.DataFrame): The DataFrame to analyze.
        - process_columns (list): List of column names to process. If None, all columns except skip_columns will be processed.
        - skip_columns (list): List of column names to skip.

    Prints:
        - Number of unique values in each processed column.
        - Unique values in each processed column.
        - Number of occurrences for each unique value in each processed column.
    """
    if skip_columns is None:
        skip_columns = []
    if process_columns is None:
        process_columns = df.columns.tolist()

    # Process the specified columns
    for column in process_columns:
        if column in skip_columns:
            continue  # Skip the column if it is in the skip list
        print("--------------------")
        print(f"Analyzing column: '{column}'")
        print("--------------------")
        # Number of unique values
        num_unique = df[column].nunique()
        unique_values = df[column].unique()

        print(f"  Number of unique values: {num_unique}")
        print("--------------------")
        print(f"  Unique values: {unique_values}")
        # Count occurrences of each unique value
        value_counts = df[column].value_counts()
        print("--------------------")
        for value, count in value_counts.items():
            print(f"  Number of occurrences of '{value}': {count}")

        print()  # Newline for readability between columns

In [14]:
analyze_ordinal_columns(df, ["main_category"])

--------------------
Analyzing column: 'main_category'
--------------------
  Number of unique values: 36
--------------------
  Unique values: ['All Beauty' 'Industrial & Scientific' 'Health & Personal Care' 'Baby'
 'AMAZON FASHION' 'Amazon Home' 'Arts, Crafts & Sewing'
 'Tools & Home Improvement' 'Unknown' 'Office Products'
 'Sports & Outdoors' 'Automotive' 'Grocery' 'All Electronics'
 'Car Electronics' 'Camera & Photo' 'Pet Supplies' 'Toys & Games'
 'Digital Music' 'Handmade' 'Cell Phones & Accessories' 'Computers'
 'Musical Instruments' 'Premium Beauty' 'Appliances' 'Movies & TV'
 'Amazon Devices' 'Books' 'Video Games' 'Home Audio & Theater'
 'Collectible Coins' 'Portable Audio & Accessories' 'GPS & Navigation'
 'Sports Collectibles' 'Collectibles & Fine Art' 'Amazon Fire TV']
--------------------
  Number of occurrences of 'AMAZON FASHION': 168511
  Number of occurrences of 'All Beauty': 36412
  Number of occurrences of 'Sports & Outdoors': 22845
  Number of occurrences of 'Unknow

From dataset review, we see that the brand and manufacturer were similair columns and wanted to check how many of exact matches are there.

In [15]:
df_lower = df.astype(str).apply(lambda x: x.str.lower())
total_rows = len(df)

In [16]:
brand_store_match = (df_lower["brand"] == df_lower["store"]).sum() / total_rows * 100
brand_manufacturer_match = (df_lower["brand"] == df_lower["manufacturer"]).sum() / total_rows * 100
store_manufacturer_match = (df_lower["store"] == df_lower["manufacturer"]).sum() / total_rows * 100

In [17]:
print(f"Brand & Store match: {brand_store_match:.2f}%")
print(f"Brand & Manufacturer match: {brand_manufacturer_match:.2f}%")
print(f"Store & Manufacturer match: {store_manufacturer_match:.2f}%")

Brand & Store match: 61.35%
Brand & Manufacturer match: 89.89%
Store & Manufacturer match: 51.76%


In [18]:
null_counts  = df[df[["brand", "manufacturer"]].isnull().all(axis=1)]

In [19]:
df["brand"] = df["brand"].fillna(df["store"])

Dropping unused columns

In [20]:
df.drop(columns=["store", "manufacturer", "price"], inplace=True)

Adding images and embellished description

In [21]:
temp_ds = load_dataset("smartcat/Amazon_Sample_Metadata_2023", name="combined_description_formatted", split="train")

In [22]:
temp_df = pd.DataFrame(temp_ds)

In [23]:
result = pd.merge(df, temp_df[['parent_asin', 'images', 'text']], on='parent_asin', how='left')

In [24]:
result = result.rename(columns={'text': 'embellished_description'})

In [25]:
result = result[['parent_asin', 'main_category', 'title',  'description', 'features', 'embellished_description',
       'brand', 'images', 'short_query', 'long_query']]

In [26]:
df = result

Calculating the number of categories with low distibution so they can be saved in a list and later in the test split of the dataset so the accuracy of the model could be tested.

In [27]:
value_counts = df["main_category"].value_counts()
print("--------------------")
small_count = 0
small_counts_list = []
for value, count in value_counts.items():
    print(f"  Number of occurrences of '{value}': {count}")
    if count < 150:
        small_count += count
        small_counts_list.append(value)
print(small_count)
print(small_counts_list)

--------------------
  Number of occurrences of 'AMAZON FASHION': 168513
  Number of occurrences of 'All Beauty': 36412
  Number of occurrences of 'Sports & Outdoors': 22845
  Number of occurrences of 'Unknown': 14504
  Number of occurrences of 'Health & Personal Care': 5218
  Number of occurrences of 'Amazon Home': 3369
  Number of occurrences of 'Tools & Home Improvement': 1882
  Number of occurrences of 'Automotive': 972
  Number of occurrences of 'Industrial & Scientific': 644
  Number of occurrences of 'Office Products': 482
  Number of occurrences of 'Arts, Crafts & Sewing': 336
  Number of occurrences of 'Pet Supplies': 332
  Number of occurrences of 'Toys & Games': 297
  Number of occurrences of 'Cell Phones & Accessories': 246
  Number of occurrences of 'All Electronics': 188
  Number of occurrences of 'Premium Beauty': 159
  Number of occurrences of 'Grocery': 158
  Number of occurrences of 'Baby': 154
  Number of occurrences of 'Computers': 117
  Number of occurrences of 'Ca

In [28]:
# Percentage of category counts under 150
print(small_count * 100 /len(df),"%")

0.13071539446093516 %


In [29]:
df_small_counts = df[df["main_category"].isin(small_counts_list)]

In [30]:
df_small_counts = df_small_counts.sample(frac=1, random_state=42)

In [31]:
df_without_small = df[~df["parent_asin"].isin(df_small_counts["parent_asin"].to_list())]

In [32]:
df_without_small = df_without_small.sample(frac=1, random_state=42)

Making the train, validation and test split using the 80-10-10 split where, 80% of the dataset will be taken for the train, 10% of dataset for the validation and the last 10% (including the small distibution categories) will be taken for the test split.

In [33]:
# Compute exact train, validation, and test sizes
total_rows = len(df)
train_len = int(total_rows * 0.8)  # 80% of total rows
validation_len = int(total_rows * 0.1)  # 10% of total rows

# Ensure all rows are allocated by assigning remaining rows to test
test_len = total_rows - (train_len + validation_len + small_count)  # Remaining rows go to test

print(train_len, validation_len, test_len + small_count)


205637 25704 25706


In [34]:
train_df = df_without_small[:train_len]

In [35]:
validation_df = df_without_small[train_len:train_len + validation_len]

In [36]:
test_df = df_without_small[(train_len + validation_len):]

In [37]:
test_df = pd.concat([test_df, df_small_counts])

In [38]:
train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

Converting data and splits (train, validation, test) into Dataset from pandas and making one final dataset with those 3 splits

In [39]:
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

In [40]:
final_dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

Finally dataset is ready for uploading on hugging face.

In [43]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
final_dataset.push_to_hub("smartcat/Amazon-2023-GenQ")