In [1]:
import os
import pandas as pd

#Here we are importing the Amazon Data set

# ✅ Set working directory to repo root
os.chdir("/Users/arvidsson/Desktop/Predicting Customer Dissatisfaction/Safeify/summer-2025-safeify")

# ✅ Define correct file path relative to repo root
file_path = "Data/amazon_meta.json"

# ✅ Confirm and load the data
print("🛠️ Now working from:", os.getcwd())
print("✅ Exists?", os.path.exists(file_path))

print("📥 Loading Amazon metadata...")
amazon_df = pd.read_json(file_path, lines=True)
print("✅ Loaded", len(amazon_df), "rows")
print("📦 Columns:", amazon_df.columns.tolist())
print("🧪 Sample:")
print(amazon_df.head(3))


🛠️ Now working from: /Users/arvidsson/Desktop/Predicting Customer Dissatisfaction/Safeify/summer-2025-safeify
✅ Exists? True
📥 Loading Amazon metadata...
✅ Loaded 633883 rows
📦 Columns: ['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details']
🧪 Sample:
                                  category tech1  \
0  [Toys & Games, Puzzles, Jigsaw Puzzles]         
1                                       []         
2                                       []         

                                         description fit  \
0  [Three Dr. Suess' Puzzles: Green Eggs and Ham,...       
1  [<b>Prepare to be Afraid!</b><br /><br />The B...       
2                                                 []       

                                            title also_buy tech2  \
0  Dr. Suess 19163 Dr. Seuss Puzzle 3 Pack Bundle       []         
1     Pa

In [2]:
# Importing all Recall Data, combining the three files

def load_clean_csv(path):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # ✅ Find the first line that contains all expected headers (starts with 'Report No.')
    header_index = next(i for i, line in enumerate(lines) if 'Report No.' in line)

    # ✅ Load CSV from that line forward
    return pd.read_csv(path, skiprows=header_index)

# ✅ Paths to all three files
recall_files = [
    "Data/Current Version of Toys Incidence+Recall/Toysandchildren_ArtsandCrafts.csv",
    "Data/Current Version of Toys Incidence+Recall/Toysandchildren_Riding_Toys.csv",
    "Data/Current Version of Toys Incidence+Recall/Toysandchildren_Toys.csv"
]

# ✅ Load all cleanly
recall_dfs = [load_clean_csv(path) for path in recall_files]

# ✅ Merge into single df
recalls_df = pd.concat(recall_dfs, ignore_index=True)

# ✅ Preview
print("✅ Combined recall records:", len(recalls_df))
print("📦 Columns:", recalls_df.columns.tolist())
recalls_df.head(3)


✅ Combined recall records: 2514
📦 Columns: ['Report No.', 'Report Date', 'Sent to Manufacturer / Importer / Private Labeler', 'Publication Date', 'Category of Submitter', 'Product Description', 'Product Category', 'Product Sub Category', 'Product Type', 'Product Code', 'Manufacturer / Importer / Private Labeler Name', 'Brand', 'Model Name or Number', 'Serial Number', 'UPC', 'Date Manufactured', 'Manufacturer Date Code', 'Retailer', 'Retailer State', 'Purchase Date', 'Purchase Date Is Estimate', 'Incident Description', 'City', 'State', 'ZIP', 'Location', '(Primary) Victim Severity', "(Primary) Victim's Sex", 'My Relation To The (Primary) Victim', "(Primary) Victim's Age (years)", 'Submitter Has Product', 'Product Was Damaged Before Incident', 'Damage Description', 'Damage Repaired', 'Product Was Modified Before Incident', 'Have You Contacted The Manufacturer', 'If Not Do You Plan To', 'Answer Explanation', 'Company Comments', 'Associated Report Numbers']


Unnamed: 0,Report No.,Report Date,Sent to Manufacturer / Importer / Private Labeler,Publication Date,Category of Submitter,Product Description,Product Category,Product Sub Category,Product Type,Product Code,...,Submitter Has Product,Product Was Damaged Before Incident,Damage Description,Damage Repaired,Product Was Modified Before Incident,Have You Contacted The Manufacturer,If Not Do You Plan To,Answer Explanation,Company Comments,Associated Report Numbers
0,20231002-D13C7-2147344911,10/2/2023,10/27/2023,11/20/2023,Consumer,Slime globe with colored spheres which resembl...,Toys & Children,Arts & Crafts,Molding Compounds (1376),1376,...,,,,,,,,,TOYSMITH: Thank you for the opportunity to rev...,
1,20180126-BBF18-2147393362,1/26/2018,4/13/2018,4/27/2018,Consumer,Slime kit from Nickelodeon by Cra-Z-Art,Toys & Children,Arts & Crafts,Molding Compounds (1376),1376,...,Yes,No,,,No,Yes,,I already have explained.,We at Cra-Z-Art® are very sorry that the cons...,
2,20141231-7C153-2147437145,12/31/2014,1/9/2015,1/26/2015,Consumer,Lalaloopsy Color Me Doll ( Squiggles N. Shapes...,Toys & Children,Arts & Crafts,Crayons or Chalk (5010),5010,...,,,,,,No,No,,The chalk markers included with this item are ...,


In [3]:
# Features we can use for fuzzy matching.

print(amazon_df.columns)
print(recalls_df.columns.tolist())

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')
['Report No.', 'Report Date', 'Sent to Manufacturer / Importer / Private Labeler', 'Publication Date', 'Category of Submitter', 'Product Description', 'Product Category', 'Product Sub Category', 'Product Type', 'Product Code', 'Manufacturer / Importer / Private Labeler Name', 'Brand', 'Model Name or Number', 'Serial Number', 'UPC', 'Date Manufactured', 'Manufacturer Date Code', 'Retailer', 'Retailer State', 'Purchase Date', 'Purchase Date Is Estimate', 'Incident Description', 'City', 'State', 'ZIP', 'Location', '(Primary) Victim Severity', "(Primary) Victim's Sex", 'My Relation To The (Primary) Victim', "(Primary) Victim's Age (years)", 'Submitter Has Product', 'Product Was Damaged Before Incident', 'Damage Description', 'Damage Repaired

The code below creates a dictionary with each entry being a tuple containing two dataframes 
(recalls_df['Brand'], matched_amazon_by_brand) 
Here matched_amazon_by_brand is a Dataframe containing all Amazon entries that Fuzzy match 'brand' with 'Brand' above a certain threshold, here it is 90. To save time, and if you like to run this as an experiment, the block below uses a sample of 500 brands and 30 000 Amazon entries. The full dataset is explored at the end of this notebook.

In [4]:
from rapidfuzz import fuzz
import pandas as pd

# Step 1: Clean and sample brand names from recalls_df
sampled_brands = pd.Series(recalls_df['Brand'].dropna().unique()[:500])
sampled_brands = sampled_brands.astype(str).str.lower().str.strip()

# Step 2: Sample and clean Amazon data
amazon_sample = amazon_df.sample(30000, random_state=42).copy()
amazon_sample['brand'] = amazon_sample['brand'].astype(str).str.lower().str.strip()

# Step 3: Clean 'Brand' in recalls_df for matching
recalls_df['Brand'] = recalls_df['Brand'].astype(str).str.lower().str.strip()

# Step 4: Prepare dictionary to store tuples (Amazon matches, recall row(s))
matched_amazon_by_brand = {}

# Step 5: For each brand, store Amazon matches + associated recall row(s)
for recall_brand in sampled_brands:
    # Find Amazon entries that match this brand (fuzzy match ≥ 90)
    matched_rows = amazon_sample[amazon_sample['brand'].apply(
        lambda x: fuzz.token_sort_ratio(x, recall_brand) >= 90
    )]

    # Get corresponding recall row(s) for this brand
    recall_rows = recalls_df[recalls_df['Brand'] == recall_brand]

    # Only store if there are matched Amazon entries
    if not matched_rows.empty:
        matched_amazon_by_brand[recall_brand] = (recall_rows.copy(), matched_rows.copy())

# ✅ matched_amazon_by_brand now maps each brand → (recall_rows_df, matched_amazon_rows_df)

# Optional: Print summary
print(f"✅ Found matches for {len(matched_amazon_by_brand)} recall brand entries.")


✅ Found matches for 130 recall brand entries.


The two blocks below do dictionaries based on 'Product Description' from the recall_df and fuzzy much with title and description respectively. You can skip them, they don't give great matches. We should probably include these two steps with a high fuzz ratio in the end result.

In [8]:
#Function matching Amazon titles to recall product descriptions and creating a dictionary of matches
#This code does the same as the above but for title and description
from rapidfuzz import fuzz
import pandas as pd

# Step 1: Clean and sample Product Descriptions from recalls_df
sampled_descriptions = pd.Series(recalls_df['Product Description'].dropna().unique()[:500])
sampled_descriptions = sampled_descriptions.astype(str).str.lower().str.strip()

# Step 2: Sample and clean Amazon data
amazon_sample['title'] = amazon_sample['title'].astype(str).str.lower().str.strip()

# Step 3: Clean 'Product Description' in recalls_df for matching
recalls_df['Product Description'] = recalls_df['Product Description'].astype(str).str.lower().str.strip()

# Step 4: Prepare dictionary to store tuples (Amazon matches, recall row(s))
matched_amazon_by_description = {}

# Step 5: For each description, store Amazon matches + associated recall row(s)
for recall_description in sampled_descriptions:
    # Find Amazon entries where title fuzzily matches description (≥ 90)
    matched_rows = amazon_sample[amazon_sample['title'].apply(
        lambda x: fuzz.token_sort_ratio(x, recall_description) >= 80
    )]

    # Get corresponding recall row(s) for this description
    recall_rows = recalls_df[recalls_df['Product Description'] == recall_description]

    # Only store if there are matched Amazon entries
    if not matched_rows.empty:
        matched_amazon_by_description[recall_description] = (recall_rows.copy(), matched_rows.copy())

# ✅ matched_amazon_by_description now maps each product description → (recall_rows_df, matched_amazon_rows_df)

# Optional: Print summary
print(f"✅ Found matches for {len(matched_amazon_by_description)} recall product description entries.")


✅ Found matches for 3 recall product description entries.


In [9]:
#Function matching Amazon titles to recall product descriptions and creating a dictionary of matches
# This code does the same as the above but for feature and description
from rapidfuzz import fuzz
import pandas as pd

# Step 1: Clean and sample Product Descriptions from recalls_df
sampled_descriptions = pd.Series(recalls_df['Product Description'].dropna().unique()[:500])
sampled_descriptions = sampled_descriptions.astype(str).str.lower().str.strip()

# Step 2: Sample and clean Amazon data
amazon_sample['feature'] = amazon_sample['feature'].astype(str).str.lower().str.strip()

# Step 3: Clean 'Product Description' in recalls_df for matching
recalls_df['Product Description'] = recalls_df['Product Description'].astype(str).str.lower().str.strip()

# Step 4: Prepare dictionary to store tuples (Amazon matches, recall row(s))
matched_amazon_by_feature = {}

# Step 5: For each description, store Amazon matches + associated recall row(s)
for recall_description in sampled_descriptions:
    # Find Amazon entries where title fuzzily matches description (≥ 90)
    matched_rows = amazon_sample[amazon_sample['feature'].apply(
        lambda x: fuzz.token_sort_ratio(x, recall_description) >= 80
    )]

    # Get corresponding recall row(s) for this description
    recall_rows = recalls_df[recalls_df['Product Description'] == recall_description]

    # Only store if there are matched Amazon entries
    if not matched_rows.empty:
        matched_amazon_by_feature[recall_description] = (recall_rows.copy(), matched_rows.copy())

# ✅ matched_amazon_by_description now maps each product description → (recall_rows_df, matched_amazon_rows_df)

# Optional: Print summary
print(f"✅ Found matches for {len(matched_amazon_by_feature)} recall product description entries.")


✅ Found matches for 1 recall product description entries.


Now we continue with the Brand matched dictionary, we filter it for 'Product Description' and 'title' match. The matches are much more reliable on this subset where brands are already matched. 

In [11]:
# 🔢 Stats on number of Amazon listings matched per recall brand
match_counts = [len(amazon_df) for _, amazon_df in matched_amazon_by_brand.values()]

if match_counts:
    avg_matches = sum(match_counts) / len(match_counts)
    min_matches = min(match_counts)
    max_matches = max(match_counts)

    print(f"📊 Average # of Amazon listings matched per brand: {avg_matches:.2f}")
    print(f"🔽 Minimum # of matches: {min_matches}")
    print(f"🔼 Maximum # of matches: {max_matches}")
else:
    print("⚠️ No matches found to compute stats.")


📊 Average # of Amazon listings matched per brand: 29.73
🔽 Minimum # of matches: 1
🔼 Maximum # of matches: 529


In [12]:
# This is the number of unique Amazon listings matched by brand to the brand category in the recall data set.
# Collect all matched Amazon listings across all brands
all_amazon_matches = pd.concat(
    [amazon_df_matches for _, amazon_df_matches in matched_amazon_by_brand.values()],
    ignore_index=True
)

# Drop duplicates by ASIN to count unique listings
total_unique_amazon_matches = all_amazon_matches.drop_duplicates(subset='asin').shape[0]

print(f"📦 Total unique asin nr Amazon listings matched by brand: {total_unique_amazon_matches}")


📦 Total unique asin nr Amazon listings matched by brand: 3456


Now the idea is to do Fuzzy matching on each entry of the Dictionary we have created, using 'Product Description' from the recall data and some interesting category from the Amazon data. Let's have a look on 'title', 'feature' and 'description'.

In [None]:
i = 10 #choosing brand nr 10 from dictionary, change i to see different brands
brand = sampled_brands.iloc[i]
print(f"\n🔍 Brand selected (index {i}): '{brand}'")

if brand in matched_amazon_by_brand:
    recall_df, amazon_df_matches = matched_amazon_by_brand[brand]

    # Display the dataframes (normal truncated view)
    display(recall_df)
    display(amazon_df_matches) 

    # Display full text of selected columns
    print("\n📌 Product Descriptions (Recall):")
    print(recall_df['Product Description'].to_string(index=False))

    print("\n📌 Amazon 'description':")
    print(amazon_df_matches['description'].to_string(index=False))

    print("\n📌 Amazon 'title':")
    print(amazon_df_matches['title'].to_string(index=False))

    print("\n📌 Amazon 'feature':")
    print(amazon_df_matches['feature'].to_string(index=False))
else:
    print(f"No matches found for brand: {brand}")



After some inspection and experimentation it seems like the 'title' category is the best option. Below we use a fuzzy match of 80, which on the brand -- matched dataframes seem to do the right thing, look at the code below. I originally used 70, which might have been to low. 

In [13]:
from rapidfuzz import fuzz
import pandas as pd

all_matched_asins = set()  # ✅ Declare outside the loop to accumulate globally

# Loop over all brands in sampled_brands
for i, brand in enumerate(sampled_brands):
    if brand in matched_amazon_by_brand:
        recall_df, amazon_df_matches = matched_amazon_by_brand[brand]

        # Clean and prepare text columns
        recall_df['Product Description'] = recall_df['Product Description'].astype(str).fillna('').str.lower().str.strip()
        amazon_df_matches['title'] = amazon_df_matches['title'].astype(str).fillna('').str.lower().str.strip()

        matches = []

        # Compare each product description in recall_df to each Amazon title
        for recall_idx, recall_desc in recall_df['Product Description'].items():
            for amazon_idx, amazon_title in amazon_df_matches['title'].items():
                score = fuzz.token_set_ratio(recall_desc, amazon_title)
                if score >= 70:
                    matches.append({
                        'recall_idx': recall_idx,
                        'amazon_idx': amazon_idx,
                        'recall_desc': recall_desc,
                        'amazon_title': amazon_title,
                        'score': score
                    })

        if matches:
            matched_desc_df = pd.DataFrame(matches).sort_values(by='score', ascending=False)
            print(f"\n🔍 [{i}] Brand: '{brand}'")
            print(f"✅ Found {len(matched_desc_df)} title matches")
            display(matched_desc_df.head(2))  # Limit display to top 2
            unique_asins = amazon_df_matches.loc[matched_desc_df['amazon_idx'], 'asin'].nunique()
            print(f"🔢 Total unique matched ASINs: {unique_asins}")
            all_matched_asins.update(amazon_df_matches.loc[matched_desc_df['amazon_idx'], 'asin'].dropna().unique())

# ✅ Final summary after all brands
print(f"\n🧮 Total unique ASINs across all matched brands: {len(all_matched_asins)}")



🔍 [4] Brand: 'nickelodeon'
✅ Found 7 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,1914,334226,childs bed tent with push light,dora the explorer bed tent with push light fea...,87.272727
5,2142,270893,"teenage mutant ninja turtles teepee, pillow & ...",teenage mutant ninja turtles foot soldier,81.15942


🔢 Total unique matched ASINs: 5

🔍 [7] Brand: 'crayola'
✅ Found 7 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,8,486730,crayon,"crayola tin crayon box, holds 64 crayons, 6&qu...",100.0
1,8,431986,crayon,crayola &quot;metallic magic&quot; 16 count cr...,100.0


🔢 Total unique matched ASINs: 7

🔍 [12] Brand: 'playskool'
✅ Found 4 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,2033,2327,playskool mr. potato head pirate spud,playskool mrs. potato head,89.361702
1,2033,11926,playskool mr. potato head pirate spud,playskool mrs. potato head,89.361702


🔢 Total unique matched ASINs: 3

🔍 [25] Brand: 'radio flyer'
✅ Found 9 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
7,1467,448525,radio flyer little red wagon,radio flyer little red wagon 12-1/4 in. x 7-1/...,100.0
4,292,4208,radio flyer 10” classic red tricycle,radio flyer classic red scooter,85.185185


🔢 Total unique matched ASINs: 4

🔍 [27] Brand: 'fisher price'
✅ Found 53 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
27,924,481744,fisher-price little people yellow school bus (...,fisher-price little people artist,88.135593
19,753,70889,fisher price laugh & learn learning kitchen,fisher price laugh &amp; learn learning gift s...,86.842105


🔢 Total unique matched ASINs: 32

🔍 [35] Brand: 'kiddieland'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,52,509042,toy train thomas foot-to-floor ride-on,my first thomas the train ride-on,75.471698


🔢 Total unique matched ASINs: 1

🔍 [36] Brand: 'kiddieland'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,52,509042,toy train thomas foot-to-floor ride-on,my first thomas the train ride-on,75.471698


🔢 Total unique matched ASINs: 1

🔍 [49] Brand: 'little people'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,331,14941,fisher price little people firetruck toy from ...,fisher price little people fun park,85.245902


🔢 Total unique matched ASINs: 1

🔍 [62] Brand: 'kiddieland'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,52,509042,toy train thomas foot-to-floor ride-on,my first thomas the train ride-on,75.471698


🔢 Total unique matched ASINs: 1

🔍 [65] Brand: 'disney'
✅ Found 80 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
76,2020,374231,disney doc mcstuffins 6-pc. figure setheads of...,disney doc mcstuffins drum,89.361702
68,1859,154238,disney mickey mouse clubhouse magic reveal gam...,mickey mouse clubhouse bingo,88.0


🔢 Total unique matched ASINs: 60

🔍 [80] Brand: 'best choice products'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,242,628490,best choice products 12v kids ride on truck ca...,best choice products ride on fire truck speeds...,74.782609


🔢 Total unique matched ASINs: 1

🔍 [81] Brand: 'fisher-price'
✅ Found 35 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
30,1849,543884,fisher price chatter telephone,fisher-price chatter telephone,96.666667
34,2431,314146,fisher-price laugh & learn remix record player...,fisher-price laugh &amp; learning music player,93.023256


🔢 Total unique matched ASINs: 17

🔍 [84] Brand: 'hot wheels'
✅ Found 4 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
1,838,585065,hot wheels track builder system,hot wheels track builder system power booster kit,100.0
0,838,367023,hot wheels track builder system,hot wheels track builder essentials launch pack,87.272727


🔢 Total unique matched ASINs: 2

🔍 [89] Brand: 'little tikes'
✅ Found 27 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,153,39581,little tikes cozy coupe,little tikes dora the explorer cozy coupe,100.0
5,153,11651,little tikes cozy coupe,little tikes cozy coupe,100.0


🔢 Total unique matched ASINs: 9

🔍 [91] Brand: 'bright starts'
✅ Found 4 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,472,176574,bright starts take n shake cow,bright starts bright starts take n' shake stro...,88.888889
0,118,218338,bright starts having a ball pop and roll roads...,bright starts having a ball roll and chase bum...,80.898876


🔢 Total unique matched ASINs: 2

🔍 [97] Brand: 'little tikes'
✅ Found 27 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,153,39581,little tikes cozy coupe,little tikes dora the explorer cozy coupe,100.0
5,153,11651,little tikes cozy coupe,little tikes cozy coupe,100.0


🔢 Total unique matched ASINs: 9

🔍 [113] Brand: 'power wheels'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,221,545660,barbie power wheels camper,power wheels batman dune racer,71.428571


🔢 Total unique matched ASINs: 1

🔍 [115] Brand: 'vtech'
✅ Found 5 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,642,155219,vtech yellow count and learn school busbrand v...,vtech - count and learn school bus,90.322581
0,583,359617,vtech helicopter toy,french language vtech explore helicopter,88.888889


🔢 Total unique matched ASINs: 4

🔍 [119] Brand: 'rockin’ rider'
✅ Found 2 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
1,161,503845,rockin’ rider buttons jr. rocking horse,rockin' rider ribbons jr. rocking horse ride on,81.395349
0,161,416884,rockin’ rider buttons jr. rocking horse,ranger rocking horse,78.787879


🔢 Total unique matched ASINs: 2

🔍 [124] Brand: 'best choice products'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,242,628490,best choice products 12v kids ride on truck ca...,best choice products ride on fire truck speeds...,74.782609


🔢 Total unique matched ASINs: 1

🔍 [128] Brand: 'fisher price'
✅ Found 53 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
27,924,481744,fisher-price little people yellow school bus (...,fisher-price little people artist,88.135593
19,753,70889,fisher price laugh & learn learning kitchen,fisher price laugh &amp; learn learning gift s...,86.842105


🔢 Total unique matched ASINs: 32

🔍 [129] Brand: 'tonka'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,173,21589,tonka mighty dump truck by dynacraft yellow an...,tonka mighty blaze,80.0


🔢 Total unique matched ASINs: 1

🔍 [136] Brand: 'barbie'
✅ Found 8 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,2052,375564,hula hair barbie,barbie hair challenge board game,81.481481
3,2052,433494,hula hair barbie,"barbie long hair doll, brunette",81.481481


🔢 Total unique matched ASINs: 8

🔍 [140] Brand: 'melissa & doug'
✅ Found 19 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
9,2474,398108,melissa & doug ms. rachel puzzle,melissa &amp; doug 500-piece square meals heal...,74.509804
10,2474,139179,melissa & doug ms. rachel puzzle,melissa &amp; doug - 13268 - puzzle - fish col...,74.509804


🔢 Total unique matched ASINs: 19

🔍 [141] Brand: 'power wheels'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,221,545660,barbie power wheels camper,power wheels batman dune racer,71.428571


🔢 Total unique matched ASINs: 1

🔍 [150] Brand: 'radio flyer'
✅ Found 9 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
7,1467,448525,radio flyer little red wagon,radio flyer little red wagon 12-1/4 in. x 7-1/...,100.0
4,292,4208,radio flyer 10” classic red tricycle,radio flyer classic red scooter,85.185185


🔢 Total unique matched ASINs: 4

🔍 [160] Brand: 'power wheels'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,221,545660,barbie power wheels camper,power wheels batman dune racer,71.428571


🔢 Total unique matched ASINs: 1

🔍 [165] Brand: 'fisher price'
✅ Found 53 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
27,924,481744,fisher-price little people yellow school bus (...,fisher-price little people artist,88.135593
19,753,70889,fisher price laugh & learn learning kitchen,fisher price laugh &amp; learn learning gift s...,86.842105


🔢 Total unique matched ASINs: 32

🔍 [167] Brand: 'fisher price'
✅ Found 53 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
27,924,481744,fisher-price little people yellow school bus (...,fisher-price little people artist,88.135593
19,753,70889,fisher price laugh & learn learning kitchen,fisher price laugh &amp; learn learning gift s...,86.842105


🔢 Total unique matched ASINs: 32

🔍 [184] Brand: 'radio flyer'
✅ Found 9 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
7,1467,448525,radio flyer little red wagon,radio flyer little red wagon 12-1/4 in. x 7-1/...,100.0
4,292,4208,radio flyer 10” classic red tricycle,radio flyer classic red scooter,85.185185


🔢 Total unique matched ASINs: 4

🔍 [192] Brand: 'hape'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,741,585181,"wooden pole with handle, spinning butterflies ...",award winning hape racing stripes wooden push ...,70.212766


🔢 Total unique matched ASINs: 1

🔍 [204] Brand: 'radio flyer'
✅ Found 9 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
7,1467,448525,radio flyer little red wagon,radio flyer little red wagon 12-1/4 in. x 7-1/...,100.0
4,292,4208,radio flyer 10” classic red tricycle,radio flyer classic red scooter,85.185185


🔢 Total unique matched ASINs: 4

🔍 [223] Brand: 'hexbug'
✅ Found 5 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,322,339517,hexbug ant,hexbug aquabot single,75.0
1,322,490702,hexbug ant,hexbug kids vex power racers kit,75.0


🔢 Total unique matched ASINs: 5

🔍 [250] Brand: 'manhattan toy'
✅ Found 3 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,385,619519,manhattan toy stacker rocket baby and toddler ...,manhattan toy wooden rattle and baby bead toy,82.857143
0,362,596010,"manhattan toy feeding set for baby stella, inc...",manhattan toy wee baby stella peach 12&quot; s...,71.428571


🔢 Total unique matched ASINs: 2

🔍 [252] Brand: 'manhattan toy'
✅ Found 3 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,385,619519,manhattan toy stacker rocket baby and toddler ...,manhattan toy wooden rattle and baby bead toy,82.857143
0,362,596010,"manhattan toy feeding set for baby stella, inc...",manhattan toy wee baby stella peach 12&quot; s...,71.428571


🔢 Total unique matched ASINs: 2

🔍 [255] Brand: 'toy story'
✅ Found 2 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
1,366,136353,talking buzz lightyear,disney toy story signature collection buzz lig...,100.0
0,366,6445,talking buzz lightyear,toy story electronic buzz lightyear,77.777778


🔢 Total unique matched ASINs: 2

🔍 [265] Brand: 'manhatten toy'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,377,619533,manhatten toy soft book find a bear,manhattan toy soft baby photo book,72.463768


🔢 Total unique matched ASINs: 1

🔍 [266] Brand: 'disney'
✅ Found 80 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
76,2020,374231,disney doc mcstuffins 6-pc. figure setheads of...,disney doc mcstuffins drum,89.361702
68,1859,154238,disney mickey mouse clubhouse magic reveal gam...,mickey mouse clubhouse bingo,88.0


🔢 Total unique matched ASINs: 60

🔍 [278] Brand: 'hasbro'
✅ Found 4 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,1450,109833,hasbro star wars titanium series die cast vehi...,star wars,100.0
3,1450,43447,hasbro star wars titanium series die cast vehi...,hasbro titanium series star wars ultra x-wing,83.116883


🔢 Total unique matched ASINs: 4

🔍 [294] Brand: 'vtech'
✅ Found 5 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,642,155219,vtech yellow count and learn school busbrand v...,vtech - count and learn school bus,90.322581
0,583,359617,vtech helicopter toy,french language vtech explore helicopter,88.888889


🔢 Total unique matched ASINs: 4

🔍 [298] Brand: 'star wars'
✅ Found 7 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,1187,12461,star wars lightsaber forge darth maul double-b...,star wars episode 1 darth maul electronic doub...,91.803279
0,1187,284234,star wars lightsaber forge darth maul double-b...,star wars darth maul double bladed fx lightsaber,78.481013


🔢 Total unique matched ASINs: 6

🔍 [307] Brand: 'kidoozie'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,2054,348327,kidoozie “my first purse” by epoch everlasting...,kidoozie my first purse,93.023256


🔢 Total unique matched ASINs: 1

🔍 [313] Brand: 'nerf'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,1948,288119,nerf rival apollo xv-700 blaster (blue),nerf n-strike jolt blaster (blue),73.076923


🔢 Total unique matched ASINs: 1

🔍 [315] Brand: 'imaginarium'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,1242,471731,imaginarium city central train set / table fro...,imaginarium 6v express train,75.555556


🔢 Total unique matched ASINs: 1

🔍 [317] Brand: 'sesame street'
✅ Found 2 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,439,229306,elmo with a guitar,sesame street let's rock elmo with bonus guitar,94.117647
1,668,98079,red elmo plush toy,sesame street plush elmo doll in overalls - toy,87.5


🔢 Total unique matched ASINs: 2

🔍 [336] Brand: 'munchkin'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,462,475401,bobble head bee infant toy by munchkin toy com...,munchkin bobble bee suction toy - 2 count,71.875


🔢 Total unique matched ASINs: 1

🔍 [337] Brand: 'disney'
✅ Found 80 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
76,2020,374231,disney doc mcstuffins 6-pc. figure setheads of...,disney doc mcstuffins drum,89.361702
68,1859,154238,disney mickey mouse clubhouse magic reveal gam...,mickey mouse clubhouse bingo,88.0


🔢 Total unique matched ASINs: 60

🔍 [344] Brand: 'american girl'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,2189,204848,american girl doll #79 has toxic fumes and dan...,american girl kit's aviator doll,72.0


🔢 Total unique matched ASINs: 1

🔍 [350] Brand: 'plantoys'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,792,127076,plantoys vegetable set. item #3601 toy wooden ...,plantoys chef play set,70.588235


🔢 Total unique matched ASINs: 1

🔍 [352] Brand: 'baby einstein'
✅ Found 6 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,1225,521112,this product is called baby einstein take alon...,baby einstein carry along caterpillar toy,92.105263
4,1225,127243,this product is called baby einstein take alon...,baby einstein music mirror,84.444444


🔢 Total unique matched ASINs: 3

🔍 [358] Brand: 'vtech'
✅ Found 5 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,642,155219,vtech yellow count and learn school busbrand v...,vtech - count and learn school bus,90.322581
0,583,359617,vtech helicopter toy,french language vtech explore helicopter,88.888889


🔢 Total unique matched ASINs: 4

🔍 [385] Brand: 'ambi toys'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,524,13854,"ambi toys one man band made in dongguan, china...",ambi toys one man band musical toy,78.571429


🔢 Total unique matched ASINs: 1

🔍 [391] Brand: 'orange tree toys'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,532,501871,animal train puzzle,orange tree toys peter rabbit wooden puzzle train,77.419355


🔢 Total unique matched ASINs: 1

🔍 [393] Brand: 'bright starts'
✅ Found 4 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
2,472,176574,bright starts take n shake cow,bright starts bright starts take n' shake stro...,88.888889
0,118,218338,bright starts having a ball pop and roll roads...,bright starts having a ball roll and chase bum...,80.898876


🔢 Total unique matched ASINs: 2

🔍 [403] Brand: 'kidoozie'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,2054,348327,kidoozie “my first purse” by epoch everlasting...,kidoozie my first purse,93.023256


🔢 Total unique matched ASINs: 1

🔍 [405] Brand: 'fisher-price'
✅ Found 35 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
30,1849,543884,fisher price chatter telephone,fisher-price chatter telephone,96.666667
34,2431,314146,fisher-price laugh & learn remix record player...,fisher-price laugh &amp; learning music player,93.023256


🔢 Total unique matched ASINs: 17

🔍 [414] Brand: 'fisher price'
✅ Found 53 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
27,924,481744,fisher-price little people yellow school bus (...,fisher-price little people artist,88.135593
19,753,70889,fisher price laugh & learn learning kitchen,fisher price laugh &amp; learn learning gift s...,86.842105


🔢 Total unique matched ASINs: 32

🔍 [445] Brand: 'dandee'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,599,456327,musical plush toy- rudolph the red nose reindeer,2014 rudolph the red nosed reindeer 20&quot; p...,76.923077


🔢 Total unique matched ASINs: 1

🔍 [460] Brand: 'batman'
✅ Found 8 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,621,114422,batman action figure 30 to 31,batman the animated series sky dive batman 5&q...,81.632653
1,621,45981,batman action figure 30 to 31,batman returns &gt; deep dive batman action fi...,81.632653


🔢 Total unique matched ASINs: 8

🔍 [467] Brand: 'nerf'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,1948,288119,nerf rival apollo xv-700 blaster (blue),nerf n-strike jolt blaster (blue),73.076923


🔢 Total unique matched ASINs: 1

🔍 [469] Brand: 'vtech'
✅ Found 5 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,642,155219,vtech yellow count and learn school busbrand v...,vtech - count and learn school bus,90.322581
0,583,359617,vtech helicopter toy,french language vtech explore helicopter,88.888889


🔢 Total unique matched ASINs: 4

🔍 [473] Brand: 'baby einstein'
✅ Found 6 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
3,1225,521112,this product is called baby einstein take alon...,baby einstein carry along caterpillar toy,92.105263
4,1225,127243,this product is called baby einstein take alon...,baby einstein music mirror,84.444444


🔢 Total unique matched ASINs: 3

🔍 [480] Brand: 'hexbug'
✅ Found 5 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,322,339517,hexbug ant,hexbug aquabot single,75.0
1,322,490702,hexbug ant,hexbug kids vex power racers kit,75.0


🔢 Total unique matched ASINs: 5

🔍 [494] Brand: 'sesame street'
✅ Found 2 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,439,229306,elmo with a guitar,sesame street let's rock elmo with bonus guitar,94.117647
1,668,98079,red elmo plush toy,sesame street plush elmo doll in overalls - toy,87.5


🔢 Total unique matched ASINs: 2

🔍 [496] Brand: 'munchkin'
✅ Found 1 title matches


Unnamed: 0,recall_idx,amazon_idx,recall_desc,amazon_title,score
0,462,475401,bobble head bee infant toy by munchkin toy com...,munchkin bobble bee suction toy - 2 count,71.875


🔢 Total unique matched ASINs: 1

🧮 Total unique ASINs across all matched brands: 210


We see that on a sample of 30 000 listings a full 3456 was matched by brand to the recall data, after that only 210 got matched further by title and description. 

We will now have a look at the entries in the recall data that has an empty or irrelevand Brand entry.

In [14]:
# Any brands missing?
print("Missing brands in Amazon Meta Data:", amazon_df['brand'].isna().sum())

Missing brands in Amazon Meta Data: 0


In [15]:
# Any brands missing in Recall data?
print("Missing brands in Recall Data:", recall_df['Brand'].isna().sum())

Missing brands in Recall Data: 0


In [16]:
# Show 20 sample entries from the 'Brand' column in recalls_df
print("Sample 'Brand' values in recalls_df:")
print(recalls_df['Brand'].head(20).to_list())

# Check for empty strings or obviously irrelevant values
empty_or_irrelevant = recalls_df['Brand'].isin(['', ' ', 'nan', 'NaN', 'none', 'None', None])
num_empty_or_irrelevant = empty_or_irrelevant.sum()
print(f"\nNumber of empty or irrelevant 'Brand' values: {num_empty_or_irrelevant}")

# Optionally, show those rows if any exist
if num_empty_or_irrelevant > 0:
    print("\nRows with empty or irrelevant 'Brand' values:")
    display(recalls_df.loc[empty_or_irrelevant, ['Brand', 'Product Description', 'Product Type', 'Model Name or Number']])


Sample 'Brand' values in recalls_df:
['polka drop slime', 'nickledodeon slime', 'lalaloopsy color me ( squiggles n. shapes )', 'play doh', 'nickelodeon', 'moon dough', 'amaco', 'crayola', 'crayola', 'crayola', 'block crayon 6020', 'block crayon', 'creative kids', 'castle molds from wabafun.com  2014waba fun, llc', 'crayola', 'play doh', 'playskool', 'nan', 'flarp', 'crayola twistables']

Number of empty or irrelevant 'Brand' values: 150

Rows with empty or irrelevant 'Brand' values:


Unnamed: 0,Brand,Product Description,Product Type,Model Name or Number
17,,my son received a goody bag from school with p...,Molding Compounds (1376),BSJIXI0618
32,,disney princess activity ride-onfeel like a re...,"Wheeled Riding Toys, Unpowered (1398)",
55,,pink barbie camper hot wheel ride on toy.,Powered Riding Toys (1330),
89,,mickey mouse clubhouse 4-in-1 ride on - minnie...,"Nonwheeled Riding Toys, Unpowered (1327)",Mickey Mouse Clubhouse 4-in-1 Ride On - Minnie...
102,,peanuts flying ace ride-on toys. wing and whe...,"Wheeled Riding Toys, Unpowered (1398)",BCHTAR614A13-0515
...,...,...,...,...
2320,,5 magnetic balls (see picture),Building Sets (1345),
2321,,bucky balls,Building Sets (1345),
2386,,small plastic bell open sides with a small bal...,Baby Rattles (1517),
2434,,"the product was an ~6 diameter, plastic circul...","Toys, Not Elsewhere Classified (1381)",


In the code below we do a computation heavy Fuzzy Match for each non relevant brand entry in the recalls data. For those entries we Fuzzy match the 'Product Description' to a string consisting of 'brand' and 'Description'. This does not give good matches for fuzz_token_set_ratio (90) and no matches at all for fuzz_token_sort_ratio (80). 

In [17]:
from rapidfuzz import fuzz

# Prepare to store matches
brand_desc_matches = []

# Get all unique brands from empty_or_irrelevant rows
brands_to_check = recalls_df.loc[empty_or_irrelevant, 'Brand'].dropna().unique()

# Loop over those brands
for brand in brands_to_check:
    # Get recall rows for this brand from empty_or_irrelevant
    recall_rows = recalls_df[(recalls_df['Brand'] == brand) & empty_or_irrelevant]
    recall_rows = recall_rows.copy()
    recall_rows['Clean_Desc'] = recall_rows['Product Description'].astype(str).str.lower().str.strip()

    # Loop over recall rows
    for _, recall_row in recall_rows.iterrows():
        recall_str = recall_row['Clean_Desc']
        # Fuzzy match with Amazon sample
        for idx, amazon_row in amazon_sample.iterrows():
            amazon_str = f"{amazon_row['brand']} {amazon_row['title']}".lower().strip()
            score = fuzz.token_sort_ratio(recall_str, amazon_str)
            if score >= 80:
                brand_desc_matches.append({
                    'recall_brand': brand,
                    'recall_desc': recall_row['Product Description'],
                    'amazon_idx': idx,
                    'amazon_brand': amazon_row['brand'],
                    'amazon_title': amazon_row['title'],
                    'score': score
                })

# Convert to DataFrame for inspection
brand_desc_matches_df = pd.DataFrame(brand_desc_matches)
print(f"✅ Found {len(brand_desc_matches_df)} matches.")
display(brand_desc_matches_df.head(10))


✅ Found 0 matches.


Below we take an approach where we do the loser fuzz_token_set_ratio (88) but we first filter out all entries with Product Description less than or equal to three words. Now matches seem better but still not great, on the sample there is only 4 and only one seems like a real match. 

In [18]:
# Here we take another approach and ignore recalls without Brand and 
# with Product Descriptions consisting of at most three words. 
# Prepare to store matches
brand_desc_matches = []

# Get all unique brands from empty_or_irrelevant rows
brands_to_check = recalls_df.loc[empty_or_irrelevant, 'Brand'].dropna().unique()

# Loop over those brands
for brand in brands_to_check:
    # Get recall rows for this brand from empty_or_irrelevant
    recall_rows = recalls_df[(recalls_df['Brand'] == brand) & empty_or_irrelevant].copy()
    recall_rows['Clean_Desc'] = recall_rows['Product Description'].astype(str).str.lower().str.strip()

    # Filter out descriptions with 3 words or fewer
    recall_rows = recall_rows[recall_rows['Clean_Desc'].str.split().str.len() > 3]

    # Loop over recall rows
    for _, recall_row in recall_rows.iterrows():
        recall_str = recall_row['Clean_Desc']
        # Fuzzy match with Amazon sample
        for idx, amazon_row in amazon_sample.iterrows():
            amazon_str = f"{amazon_row['brand']} {amazon_row['title']}".lower().strip()
            score = fuzz.token_set_ratio(recall_str, amazon_str)
            if score >= 88:
                brand_desc_matches.append({
                    'recall_brand': brand,
                    'recall_desc': recall_row['Product Description'],
                    'amazon_idx': idx,
                    'amazon_brand': amazon_row['brand'],
                    'amazon_title': amazon_row['title'],
                    'score': score
                })

# Convert to DataFrame for inspection
brand_desc_matches_df = pd.DataFrame(brand_desc_matches)
print(f"✅ Found {len(brand_desc_matches_df)} matches.")
display(brand_desc_matches_df.head(10))


✅ Found 4 matches.


Unnamed: 0,recall_brand,recall_desc,amazon_idx,amazon_brand,amazon_title,score
0,,mickey mouse clubhouse 4-in-1 ride on - minnie...,200129,disney,mickey mouse 4-in-1 ride on,88.52459
1,,plastic pump and spray watergun from the dolla...,345707,greenbrier international,greenbrier international plastic skull,91.428571
2,,hasbro star wars light sabre toytoy # ce c-325...,109833,hasbro,star wars,100.0
3,,micro cessna 781 2ch electric rtf remote contr...,366095,amazing tech depot,cessna 781 infrared rc airplane micro 2ch flig...,93.793103


Below we work on the whole Data, you only need to run the two first cells on the top of the notebook to run this.

I will now run the code matching first on brand and then on title on the full Amazon data and using the full recall data, this took me 8 min 45s. 

In [19]:
# How many products do we have in the Amazon dataset?
print(f"📦 Total products in the dataset: {len(amazon_df)}")

📦 Total products in the dataset: 633883


The code below creates a dictionary matched on brand for our whole dataset (8 min 45s).

In [20]:
from rapidfuzz import fuzz
import pandas as pd
# This creates a Dictionary that maps each brand in the recall data set to a tuple of (recall_rows, amazon_matches).
# To run this you only need to run first two cells of note book
# Ensure brand fields are cleaned
amazon_df['brand'] = amazon_df['brand'].astype(str).str.lower().str.strip()
recalls_df['Brand'] = recalls_df['Brand'].astype(str).str.lower().str.strip()

# 🧼 Create the list of all unique brands from the entire recalls_df
all_recall_brands = pd.Series(recalls_df['Brand'].dropna().unique())

# Prepare dictionary to store (recall_rows, amazon_matches)
matched_amazon_by_brand = {}

# Loop over all brands from the full recall list
for recall_brand in all_recall_brands:
    # Find Amazon entries where the brand fuzzy-matches (≥ 90)
    matched_rows = amazon_df[amazon_df['brand'].apply(
        lambda x: fuzz.token_sort_ratio(x, recall_brand) >= 90
    )]

    # Get recall rows with that brand
    recall_rows = recalls_df[recalls_df['Brand'] == recall_brand]

    if not matched_rows.empty:
        matched_amazon_by_brand[recall_brand] = (recall_rows.copy(), matched_rows.copy())

# ✅ Summary
print(f"✅ Found matches for {len(matched_amazon_by_brand)} recall brand entries.")


✅ Found matches for 586 recall brand entries.


When matching with 70, I got around 6000 matches, below we do for 80

In [None]:
# Step 2: Tag Amazon entries in the above created dictionary that also match product descriptions

# Initialize column to 0
amazon_df['Is match'] = 0

# Store indices of matched rows
matched_amazon_indices = set()

# Loop over matched brands
for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    # Clean relevant columns
    recall_df['Product Description'] = recall_df['Product Description'].astype(str).fillna('').str.lower().str.strip()
    amazon_df_matches['title'] = amazon_df_matches['title'].astype(str).fillna('').str.lower().str.strip()

    for recall_desc in recall_df['Product Description']:
        for idx, title in amazon_df_matches['title'].items():
            score = fuzz.token_set_ratio(recall_desc, title)
            if score >= 80:
                matched_amazon_indices.add(idx)

# Update original amazon_df with match indicator
amazon_df.loc[amazon_df.index.isin(matched_amazon_indices), 'Is match'] = 1

# ✅ Summary
print(f"✅ Total matched Amazon listings: {len(matched_amazon_indices)}")
print("🧾 Is match counts:\n", amazon_df['Is match'].value_counts())


✅ Total matched Amazon listings: 2153
🧾 Is match counts:
 Is match
0    631730
1      2153
Name: count, dtype: int64


In [23]:
# This is the number of unique Amazon listings matched by brand to the brand category in the recall data set.
# Collect all matched Amazon listings across all brands
all_amazon_matches = pd.concat(
    [amazon_df_matches for _, amazon_df_matches in matched_amazon_by_brand.values()],
    ignore_index=True
)

# Drop duplicates by ASIN to count unique listings
total_unique_amazon_matches = all_amazon_matches.drop_duplicates(subset='asin').shape[0]

print(f"📦 Total unique asin nr Amazon listings matched by brand: {total_unique_amazon_matches}")

📦 Total unique asin nr Amazon listings matched by brand: 131192


In [24]:
from rapidfuzz import fuzz
import pandas as pd
import random

# Collect all valid matches first
example_matches = []

for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    recall_df['Product Description'] = recall_df['Product Description'].astype(str).fillna('').str.lower().str.strip()
    amazon_df_matches['title'] = amazon_df_matches['title'].astype(str).fillna('').str.lower().str.strip()

    for recall_desc in recall_df['Product Description']:
        for idx, title in amazon_df_matches['title'].items():
            if idx in matched_amazon_indices:  # Only show already matched indices
                score = fuzz.token_set_ratio(recall_desc, title)
                if score >= 80:
                    example_matches.append({
                        'brand': brand,
                        'amazon_index': idx,
                        'amazon_title': title,
                        'recall_description': recall_desc,
                        'score': score
                    })

# 🎯 Pick 15 random matches from all collected examples
random_examples = random.sample(example_matches, min(15, len(example_matches)))

# Convert to DataFrame for display
examples_df = pd.DataFrame(random_examples)
print("\n📦 15 Random example matches:")
print(examples_df[['brand', 'amazon_title', 'recall_description', 'score']])



📦 15 Random example matches:
                 brand                                       amazon_title  \
0           hot wheels  hot wheels track builder exploding shed stunt ...   
1          radio flyer  radio flyer little red wagon 12-1/4 in. x 7-1/...   
2            playskool                   playskool musical sit &amp; spin   
3                 syma  balance bar for the double horse 9101 gyro hel...   
4         fisher price                             fisher price choo choo   
5                 syma  syma s009g ah-64 apache 3 channel indoor helic...   
6         fisher-price              fisher-price little people dump truck   
7   pacific play tents                   pacific play tents fun zone tent   
8              crayola  crayola 52-3281 large washable crayons assorte...   
9               disney                            disney minnie mouse cup   
10           infantino                            infantino activity ball   
11                syma  the new syma double ho

In [25]:
# Collect examples to show
example_matches = []

for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    recall_df['Product Description'] = recall_df['Product Description'].astype(str).fillna('').str.lower().str.strip()
    amazon_df_matches['title'] = amazon_df_matches['title'].astype(str).fillna('').str.lower().str.strip()

    for recall_desc in recall_df['Product Description']:
        for idx, title in amazon_df_matches['title'].items():
            if idx in matched_amazon_indices:  # Only show already matched indices
                score = fuzz.token_set_ratio(recall_desc, title)
                if score >= 80:
                    example_matches.append({
                        'brand': brand,
                        'amazon_index': idx,
                        'amazon_title': title,
                        'recall_description': recall_desc,
                        'score': score
                    })
                if len(example_matches) >= 10:
                    break
        if len(example_matches) >= 10:
            break
    if len(example_matches) >= 10:
        break

# Convert to DataFrame for display
examples_df = pd.DataFrame(example_matches)
print("\n📦 First 10 example matches:")
print(examples_df[['brand', 'amazon_title', 'recall_description', 'score']])



📦 First 10 example matches:
         brand                                       amazon_title  \
0     play doh  play doh disney princess design a dress boutiq...   
1  nickelodeon  1 x turtles teenage mutant ninja turtles mutag...   
2  nickelodeon  nickelodeon teenage mutant ninja turtle heli b...   
3  nickelodeon  dora the explorer bed tent with push light fea...   
4  nickelodeon              teenage mutant ninja turtles splinter   
5  nickelodeon              teenage mutant ninja turtles leonardo   
6  nickelodeon              teenage mutant ninja turtles shredder   
7  nickelodeon          teenage mutant ninja turtles foot soldier   
8  nickelodeon            teenage mutant ninja turtles cups 4 set   
9  nickelodeon             teenage mutant ninja turtles kickboard   

                                  recall_description       score  
0                                           play doh  100.000000  
1  playmate's teenage mutant ninja turtle nickelo...   84.033613  
2  playmat

Below we match 'Product Description' with 'Description' also for only the already matched by brand dictionary. Not super useful

In [None]:
# Step 2: Tag Amazon entries in the brand dictionary that match product descriptions with features

# Initialize column to 0
amazon_df['Is match'] = 0

# Store indices of matched rows
matched_amazon_indices = set()

# Loop over matched brands
for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    # Clean relevant columns
    recall_df['Product Description'] = recall_df['Product Description'].astype(str).fillna('').str.lower().str.strip()
    amazon_df_matches['feature'] = amazon_df_matches['feature'].astype(str).fillna('').str.lower().str.strip()

    for recall_desc in recall_df['Product Description']:
        for idx, feature in amazon_df_matches['feature'].items():
            score = fuzz.token_set_ratio(recall_desc, feature)
            if score >= 80:
                matched_amazon_indices.add(idx)

# Update original amazon_df with match indicator
amazon_df.loc[amazon_df.index.isin(matched_amazon_indices), 'Is match'] = 1

# ✅ Summary
print(f"✅ Total matched Amazon listings: {len(matched_amazon_indices)}")
print("🧾 Is match counts:\n", amazon_df['Is match'].value_counts())


✅ Total matched Amazon listings: 805
🧾 Is match counts:
 Is match
0    633078
1       805
Name: count, dtype: int64


In [27]:
from rapidfuzz import fuzz
import pandas as pd
import random

# Collect all valid matches first
example_matches = []

for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    recall_df['Product Description'] = recall_df['Product Description'].astype(str).fillna('').str.lower().str.strip()
    amazon_df_matches['feature'] = amazon_df_matches['feature'].astype(str).fillna('').str.lower().str.strip()

    for recall_desc in recall_df['Product Description']:
        for idx, feature in amazon_df_matches['feature'].items():
            if idx in matched_amazon_indices:  # Only show already matched indices
                score = fuzz.token_set_ratio(recall_desc, feature)
                if score >= 90:
                    example_matches.append({
                        'brand': brand,
                        'amazon_index': idx,
                        'amazon_feature': feature,
                        'recall_description': recall_desc,
                        'score': score
                    })

# 🎯 Pick 15 random matches from all collected examples
random_examples = random.sample(example_matches, min(15, len(example_matches)))

# Convert to DataFrame for display
examples_df = pd.DataFrame(random_examples)
print("\n📦 15 Random example matches:")
print(examples_df[['brand', 'amazon_feature', 'recall_description', 'score']])



📦 15 Random example matches:
      brand                                     amazon_feature  \
0   crayola  ['convenient, reusable, stackable plastic tray...   
1   crayola  ['includes :', 'color wonder sprayer with 4 ca...   
2   crayola  ['extra bright, smooth and blendable education...   
3   crayola  ["sidewalk chalk that won't roll away: the ant...   
4   crayola  ['72 chalk sticks, 48 unique & different color...   
5      brio  ['brio magnetic bell signal', 'the bell rings ...   
6   crayola  ['my first colored pencils: strong, extra-thic...   
7   crayola  ['expands customization options to jumbo size ...   
8   crayola  ['crayola provides unique worry-free solutions...   
9   crayola  ['easy to mold, easy to clean: resealable plas...   
10  crayola  ['color variety with 30 colors', "kid's love t...   
11  crayola  ['large size easel: crayola easel pad fits on ...   
12  crayola  ['metal', 'made in usa or imported', 'melt and...   
13  crayola  ['n/a', 'made in usa or imported'