In [6]:
import pandas as pd

In [10]:
# Location of the Amazon product-metadata dump.
file_path = 'Data/amazon_meta.json'
# The file is read as line-delimited JSON (one record per line), uncompressed.
amazon_df = pd.read_json(path_or_buf=file_path, compression=None, lines=True)

In [11]:
# Report how many product records were loaded.
print(f"📦 Total products in the dataset: {amazon_df.shape[0]}")

📦 Total products in the dataset: 633883


## Incidents Data

In [15]:
# Load the three report exports. The number paired with each path is how many
# leading lines are skipped before the header row (2 for Arts & Crafts, 1 for
# the other two files).
df_arts, df_riding, df_toys = (
    pd.read_csv(csv_path, skiprows=rows_to_skip)
    for csv_path, rows_to_skip in (
        ('Data/Toysandchildren_ArtsandCrafts.csv', 2),
        ('Data/Toysandchildren_Riding_Toys.csv', 1),
        ('Data/Toysandchildren_Toys.csv', 1),
    )
)

In [17]:
# Stack the three report tables into one frame with a fresh 0..n-1 index.
recalls_df = pd.concat((df_arts, df_riding, df_toys), ignore_index=True)

# Preview: row count, column names, and the first three rows.
print("Combined recall records:", recalls_df.shape[0])
print("Columns:", list(recalls_df.columns))
recalls_df.head(3)

Combined recall records: 2514
Columns: ['Report No.', 'Report Date', 'Sent to Manufacturer / Importer / Private Labeler', 'Publication Date', 'Category of Submitter', 'Product Description', 'Product Category', 'Product Sub Category', 'Product Type', 'Product Code', 'Manufacturer / Importer / Private Labeler Name', 'Brand', 'Model Name or Number', 'Serial Number', 'UPC', 'Date Manufactured', 'Manufacturer Date Code', 'Retailer', 'Retailer State', 'Purchase Date', 'Purchase Date Is Estimate', 'Incident Description', 'City', 'State', 'ZIP', 'Location', '(Primary) Victim Severity', "(Primary) Victim's Sex", 'My Relation To The (Primary) Victim', "(Primary) Victim's Age (years)", 'Submitter Has Product', 'Product Was Damaged Before Incident', 'Damage Description', 'Damage Repaired', 'Product Was Modified Before Incident', 'Have You Contacted The Manufacturer', 'If Not Do You Plan To', 'Answer Explanation', 'Company Comments', 'Associated Report Numbers']


Unnamed: 0,Report No.,Report Date,Sent to Manufacturer / Importer / Private Labeler,Publication Date,Category of Submitter,Product Description,Product Category,Product Sub Category,Product Type,Product Code,...,Submitter Has Product,Product Was Damaged Before Incident,Damage Description,Damage Repaired,Product Was Modified Before Incident,Have You Contacted The Manufacturer,If Not Do You Plan To,Answer Explanation,Company Comments,Associated Report Numbers
0,20231002-D13C7-2147344911,10/2/2023,10/27/2023,11/20/2023,Consumer,Slime globe with colored spheres which resembl...,Toys & Children,Arts & Crafts,Molding Compounds (1376),1376,...,,,,,,,,,TOYSMITH: Thank you for the opportunity to rev...,
1,20180126-BBF18-2147393362,1/26/2018,4/13/2018,4/27/2018,Consumer,Slime kit from Nickelodeon by Cra-Z-Art,Toys & Children,Arts & Crafts,Molding Compounds (1376),1376,...,Yes,No,,,No,Yes,,I already have explained.,We at Cra-Z-Art® are very sorry that the cons...,
2,20141231-7C153-2147437145,12/31/2014,1/9/2015,1/26/2015,Consumer,Lalaloopsy Color Me Doll ( Squiggles N. Shapes...,Toys & Children,Arts & Crafts,Crayons or Chalk (5010),5010,...,,,,,,No,No,,The chalk markers included with this item are ...,


In [19]:
from rapidfuzz import fuzz
import pandas as pd

# Ensure brand fields are cleaned (lower-case, trimmed) while keeping missing
# values missing. A bare .astype(str) turns NaN into the text 'nan', which
# .dropna() can no longer remove and which would then be fuzzy-matched as if it
# were a real brand name.
def _clean_brand(column: pd.Series) -> pd.Series:
    """Return the column as lower-cased, stripped text with NaN left as NaN."""
    cleaned = column.astype(str).str.lower().str.strip()
    return cleaned.where(column.notna())


amazon_df['brand'] = _clean_brand(amazon_df['brand'])
recalls_df['Brand'] = _clean_brand(recalls_df['Brand'])

# 🧼 Create the list of all unique brands from the entire recalls_df
# (missing and blank brands carry no information, so they are left out)
recall_brand_values = recalls_df['Brand'].dropna()
all_recall_brands = pd.Series(recall_brand_values[recall_brand_values != ''].unique())

# The fuzzy score depends only on the brand text, so score every distinct
# Amazon brand once per recall brand instead of every Amazon row.
amazon_brand_values = amazon_df['brand'].dropna()
unique_amazon_brands = amazon_brand_values[amazon_brand_values != ''].unique()

# Prepare dictionary to store (recall_rows, amazon_matches)
matched_amazon_by_brand = {}

# Loop over all brands from the full recall list
for recall_brand in all_recall_brands:
    # Amazon brands that fuzzy-match this recall brand (≥ 90)
    similar_brands = [
        amazon_brand for amazon_brand in unique_amazon_brands
        if fuzz.token_sort_ratio(amazon_brand, recall_brand) >= 90
    ]
    if not similar_brands:
        continue

    # Amazon entries carrying one of those brands
    matched_rows = amazon_df[amazon_df['brand'].isin(similar_brands)]

    # Get recall rows with that brand
    recall_rows = recalls_df[recalls_df['Brand'] == recall_brand]

    matched_amazon_by_brand[recall_brand] = (recall_rows.copy(), matched_rows.copy())

# Summary
print(f" Found matches for {len(matched_amazon_by_brand)} recall brand entries.")

✅ Found matches for 586 recall brand entries.


In [20]:
# Step 2: Tag Amazon entries that match product descriptions

# Initialize column to 0
amazon_df['Is match'] = 0

# Store indices of matched rows
matched_amazon_indices = set()

# Loop over matched brands
for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    # Clean relevant columns. fillna() has to run BEFORE astype(str): once NaN
    # has been converted to the text 'nan' there is nothing left to fill, and a
    # missing description would then score 100 against a missing title.
    recall_df['Product Description'] = recall_df['Product Description'].fillna('').astype(str).str.lower().str.strip()
    amazon_df_matches['title'] = amazon_df_matches['title'].fillna('').astype(str).str.lower().str.strip()

    # Blank text is no evidence of a match, so it is left out of the comparison
    recall_descs = [desc for desc in recall_df['Product Description'] if desc]

    for idx, title in amazon_df_matches['title'].items():
        # A listing that is already tagged does not need to be scored again
        if not title or idx in matched_amazon_indices:
            continue
        # any() stops at the first description that reaches the threshold
        if any(fuzz.token_set_ratio(recall_desc, title) >= 80 for recall_desc in recall_descs):
            matched_amazon_indices.add(idx)

# Update original amazon_df with match indicator
amazon_df.loc[amazon_df.index.isin(matched_amazon_indices), 'Is match'] = 1

# ✅ Summary
print(f"✅ Total matched Amazon listings: {len(matched_amazon_indices)}")
print("🧾 Is match counts:\n", amazon_df['Is match'].value_counts())

✅ Total matched Amazon listings: 2153
🧾 Is match counts:
 Is match
0    631730
1      2153
Name: count, dtype: int64


### Write these as Functions

In [25]:
def match_amazon_by_brand(
    amazon_df: pd.DataFrame,
    recalls_df: pd.DataFrame,
    score_threshold: int = 90
) -> dict[str, tuple[pd.DataFrame, pd.DataFrame]]:
    """
    For each unique brand in recalls_df, find all amazon_df rows whose
    'brand' fuzzy-matches at or above score_threshold.

    Brands are compared lower-cased and stripped, using
    fuzz.token_sort_ratio. Missing (NaN/None) and blank brands on either
    side are never matched. Neither input frame is modified.

    Parameters
    ----------
    amazon_df : DataFrame with a 'brand' column.
    recalls_df : DataFrame with a 'Brand' column.
    score_threshold : minimum score (inclusive) for two brands to match.

    Returns
    -------
    dict mapping recall_brand -> (recall_rows_df, matching_amazon_rows_df).
    Recall brands without any matching Amazon row are omitted.
    """
    def _clean(column: pd.Series) -> pd.Series:
        # astype(str) alone would turn NaN into the text 'nan', which would
        # then be matched like a real brand; restore the missing marker and
        # treat blank strings the same way.
        text = column.astype(str).str.lower().str.strip()
        return text.where(column.notna() & text.ne(""))

    # work on copies so the caller's frames are left untouched
    amazon = amazon_df.copy()
    recalls = recalls_df.copy()

    # clean brand columns
    amazon['brand'] = _clean(amazon['brand'])
    recalls['Brand'] = _clean(recalls['Brand'])

    # The score depends only on the brand text, so each distinct Amazon brand
    # is scored once per recall brand rather than once per Amazon row.
    amazon_brands = amazon['brand'].dropna().unique()

    matched = {}
    for recall_brand in recalls['Brand'].dropna().unique():
        similar = [
            brand for brand in amazon_brands
            if fuzz.token_sort_ratio(brand, recall_brand) >= score_threshold
        ]
        if not similar:
            continue

        amazon_matches = amazon[amazon['brand'].isin(similar)]
        recall_rows = recalls[recalls['Brand'] == recall_brand]
        matched[recall_brand] = (recall_rows.copy(), amazon_matches.copy())

    print(f"✅ Found matches for {len(matched)} recall-brand entries.")
    return matched


def tag_amazon_by_description(
    amazon_df: pd.DataFrame,
    matched_by_brand: dict[str, tuple[pd.DataFrame, pd.DataFrame]],
    desc_score_threshold: int = 80
) -> pd.DataFrame:
    """
    Given an amazon_df and the output of match_amazon_by_brand(),
    tag each amazon row with 'Is match' = 1 if its title fuzzy-matches
    any recall product description of the same brand group
    (fuzz.token_set_ratio ≥ desc_score_threshold).

    Text is compared lower-cased and stripped. Missing or blank titles and
    descriptions are never matched. amazon_df itself is not modified.

    Returns a new DataFrame with the added 'Is match' column (0 or 1).
    """
    def _clean_text(column: pd.Series) -> pd.Series:
        # fillna() must come before astype(str): afterwards NaN is already the
        # text 'nan' and a missing description would match a missing title.
        return column.fillna("").astype(str).str.lower().str.strip()

    amazon = amazon_df.copy()
    amazon['Is match'] = 0
    matched_indices = set()

    for recall_rows, amazon_matches in matched_by_brand.values():
        descs = [desc for desc in _clean_text(recall_rows['Product Description']) if desc]
        titles = _clean_text(amazon_matches['title'])

        for idx, title in titles.items():
            # skip blank titles and listings that are already tagged
            if not title or idx in matched_indices:
                continue
            # any() stops at the first description that reaches the threshold
            if any(fuzz.token_set_ratio(desc, title) >= desc_score_threshold for desc in descs):
                matched_indices.add(idx)

    amazon.loc[amazon.index.isin(matched_indices), 'Is match'] = 1
    total = len(matched_indices)
    counts = amazon['Is match'].value_counts()
    print(f" Total matched Amazon listings: {total}")
    print(" Is match counts:\n", counts.to_dict())
    return amazon

In [29]:
# Stage 1: group Amazon listings under each recall brand they fuzzy-match.
matched_by_brand = match_amazon_by_brand(amazon_df, recalls_df, score_threshold=90)
# Stage 2: flag the listings whose title also matches a recall description.
amazon_df = tag_amazon_by_description(amazon_df, matched_by_brand, desc_score_threshold=80)

✅ Found matches for 586 recall-brand entries.
 Total matched Amazon listings: 2153
 Is match counts:
 {0: 631730, 1: 2153}


### Rewrite these functions so that recall indices are saved

In [43]:
def match_amazon_by_brand(
    amazon_df: pd.DataFrame,
    recalls_df: pd.DataFrame,
    score_threshold: int = 90
) -> dict[str, tuple[pd.DataFrame, pd.DataFrame, list[int]]]:
    """
    For each unique brand in recalls_df, find all amazon_df rows whose
    'brand' fuzzy-matches at or above score_threshold.

    Brands are compared lower-cased and stripped, using
    fuzz.token_sort_ratio. Missing (NaN/None) and blank brands on either
    side are never matched. Neither input frame is modified.

    Returns a dict mapping
    recall_brand -> (recall_rows_df, matching_amazon_rows_df, global_recall_indices),
    where global_recall_indices are the recalls_df index labels of
    recall_rows_df, in the same order. Recall brands without any matching
    Amazon row are omitted.
    """
    def _clean(column: pd.Series) -> pd.Series:
        # astype(str) alone would turn NaN into the text 'nan', which would
        # then be matched like a real brand; restore the missing marker and
        # treat blank strings the same way.
        text = column.astype(str).str.lower().str.strip()
        return text.where(column.notna() & text.ne(""))

    amazon = amazon_df.copy()
    recalls = recalls_df.copy()

    amazon['brand'] = _clean(amazon['brand'])
    recalls['Brand'] = _clean(recalls['Brand'])

    # The score depends only on the brand text, so each distinct Amazon brand
    # is scored once per recall brand rather than once per Amazon row.
    amazon_brands = amazon['brand'].dropna().unique()

    matched = {}

    for recall_brand in recalls['Brand'].dropna().unique():
        similar = [
            brand for brand in amazon_brands
            if fuzz.token_sort_ratio(brand, recall_brand) >= score_threshold
        ]
        if not similar:
            continue

        amazon_matches = amazon[amazon['brand'].isin(similar)]
        recall_rows = recalls[recalls['Brand'] == recall_brand]

        # ✅ capture the *original recall indices*
        global_indices = recall_rows.index.tolist()
        matched[recall_brand] = (recall_rows.copy(), amazon_matches.copy(), global_indices)

    print(f"✅ Found matches for {len(matched)} recall-brand entries.")
    return matched

In [47]:
def tag_amazon_by_description(
    amazon_df: pd.DataFrame,
    matched_by_brand: dict[str, tuple[pd.DataFrame, pd.DataFrame, list[int]]],
    desc_score_threshold: int = 80
) -> pd.DataFrame:
    """
    Tag each amazon row with:
    - 'Is match' = 1 if title matches any recall description
    - 'matched_recall_indices' = list of original recalls_df indices

    A title matches a description when fuzz.token_set_ratio of the
    lower-cased, stripped texts is ≥ desc_score_threshold. Only the
    (recall rows, amazon rows) pairs grouped in matched_by_brand are
    compared. Missing or blank titles and descriptions are never matched.
    amazon_df itself is not modified; a tagged copy is returned.
    """
    def _clean_text(column: pd.Series) -> pd.Series:
        # fillna() must come before astype(str): afterwards NaN is already the
        # text 'nan' and a missing description would match a missing title.
        return column.fillna("").astype(str).str.lower().str.strip()

    amazon = amazon_df.copy()
    amazon['Is match'] = 0
    # one independent list per row (a shared list would be mutated for all rows)
    amazon['matched_recall_indices'] = [[] for _ in range(len(amazon))]

    for recall_rows, amazon_matches, global_recall_indices in matched_by_brand.values():
        recall_descs = _clean_text(recall_rows['Product Description'])
        amazon_titles = _clean_text(amazon_matches['title'])
        amazon_titles = amazon_titles[amazon_titles != ""]

        # global_recall_indices is aligned with the rows of recall_rows, so
        # zipping pairs every description with its original recalls_df index
        for global_recall_i, recall_desc in zip(global_recall_indices, recall_descs):
            if not recall_desc:
                continue
            for amazon_i, title in amazon_titles.items():
                if fuzz.token_set_ratio(recall_desc, title) >= desc_score_threshold:
                    amazon.at[amazon_i, 'Is match'] = 1
                    amazon.at[amazon_i, 'matched_recall_indices'].append(global_recall_i)

    total = amazon['Is match'].sum()
    print(f"Total matched Amazon listings: {total}")
    print("Match counts:", amazon['Is match'].value_counts().to_dict())
    return amazon


In [49]:
# Re-run both stages with the versions that also record recall indices.
matched_by_brand = match_amazon_by_brand(amazon_df, recalls_df, score_threshold=90)
# The tagged copy replaces amazon_df and gains 'Is match' and 'matched_recall_indices'.
amazon_df = tag_amazon_by_description(amazon_df, matched_by_brand, desc_score_threshold=80)

✅ Found matches for 586 recall-brand entries.
Total matched Amazon listings: 2153
Match counts: {0: 631730, 1: 2153}


Print a few example matches to check that we get exactly the same results.

In [54]:
# Collect examples to show
example_columns = ['brand', 'amazon_index', 'amazon_title', 'recall_description', 'score']
example_matches = []

for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    # fillna() has to run before astype(str); afterwards NaN is already the
    # text 'nan' and would be compared like real text.
    recall_df['Product Description'] = recall_df['Product Description'].fillna('').astype(str).str.lower().str.strip()
    amazon_df_matches['title'] = amazon_df_matches['title'].fillna('').astype(str).str.lower().str.strip()

    for recall_desc in recall_df['Product Description']:
        for idx, title in amazon_df_matches['title'].items():
            if idx not in matched_amazon_indices:  # Only show already matched indices
                continue
            score = fuzz.token_set_ratio(recall_desc, title)
            if score >= 80:
                example_matches.append({
                    'brand': brand,
                    'amazon_index': idx,
                    'amazon_title': title,
                    'recall_description': recall_desc,
                    'score': score
                })
            if len(example_matches) >= 10:
                break
        if len(example_matches) >= 10:
            break
    if len(example_matches) >= 10:
        break

# Convert to DataFrame for display. Passing the column names explicitly keeps
# the selection below valid even when no example was collected.
examples_df = pd.DataFrame(example_matches, columns=example_columns)
print("\n📦 First 10 example matches:")
print(examples_df[['brand', 'amazon_title', 'recall_description', 'score']])


📦 First 10 example matches:
         brand                                       amazon_title  \
0     play doh  play doh disney princess design a dress boutiq...   
1  nickelodeon  1 x turtles teenage mutant ninja turtles mutag...   
2  nickelodeon  nickelodeon teenage mutant ninja turtle heli b...   
3  nickelodeon  dora the explorer bed tent with push light fea...   
4  nickelodeon              teenage mutant ninja turtles splinter   
5  nickelodeon              teenage mutant ninja turtles leonardo   
6  nickelodeon              teenage mutant ninja turtles shredder   
7  nickelodeon          teenage mutant ninja turtles foot soldier   
8  nickelodeon            teenage mutant ninja turtles cups 4 set   
9  nickelodeon             teenage mutant ninja turtles kickboard   

                                  recall_description       score  
0                                           play doh  100.000000  
1  playmate's teenage mutant ninja turtle nickelo...   84.033613  
2  playmat

## Analyze the results:

In [63]:
# Show the column names of the tagged frame.
amazon_df.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details',
       'Is match', 'matched_recall_indices'],
      dtype='object')

In [65]:
# Show the recall indices recorded for every listing flagged as a match.
amazon_df.loc[amazon_df['Is match'].eq(1), ['matched_recall_indices']]

Unnamed: 0,matched_recall_indices
1755,[1442]
1758,[1442]
1773,[1442]
1907,[1442]
1971,"[1094, 1149, 1254, 1331, 1938, 2391]"
...,...
628711,[2052]
629091,[2249]
633736,[2274]
633860,[1753]


#### Is a single Amazon listing matched with more than one recall record?

In [69]:
# Look up the recall indices stored for the listing with index label 1971.
amazon_df.loc[1971, ['matched_recall_indices']]

matched_recall_indices    [1094, 1149, 1254, 1331, 1938, 2391]
Name: 1971, dtype: object

In [78]:
# Print brand, title and feature list for the selected listings.
for i in [1971]:
    # 1971 is an index *label* (it is used with .loc elsewhere in this notebook),
    # so look it up by label; positional .iloc only agrees with it while the
    # frame still has its default 0..n-1 index.
    meta_row = amazon_df.loc[i]

    print("🔹 META")
    print(f"Brand: {meta_row['brand']}")
    print(f"Title: {meta_row['title']}")
    print(f"Feature: {meta_row['feature']}")

    print("-" * 80)

🔹 META
Brand: learning resources
Title: Learning Resources Tool Set
Feature: ['For the young builder', '13 piece set', 'Stores in plastic tool box']
--------------------------------------------------------------------------------


In [153]:
# NOTE(review): in file order this call comes before the cell that defines
# print_metadata, so it raises NameError on a fresh Restart & Run All.
print_metadata(amazon_df, [1971])

🔹 META
Brand:   learning resources
Title:   Learning Resources Tool Set
Feature: ['For the young builder', '13 piece set', 'Stores in plastic tool box']
--------------------------------------------------------------------------------


In [157]:
def print_metadata(df, indices):
    """
    Print the brand, title and feature list of selected rows of ``df``.

    Parameters
    ----------
    df : DataFrame
        Rows are looked up by position (``iloc``). 'brand', 'title' and
        'feature' are read with ``.get`` and shown as 'N/A' when absent.
    indices : iterable of int
        Row positions to print. Negative positions count from the end.
        A position outside the frame prints a warning and is skipped.

    NOTE(review): positions equal index labels only while ``df`` has its
    default 0..n-1 index — confirm before passing labels from a filtered
    or re-indexed frame.
    """
    row_count = len(df)

    for i in indices:
        # iloc accepts -row_count .. row_count - 1; anything else raises IndexError
        if not -row_count <= i < row_count:
            print(f"⚠️ Index {i} is out of bounds.")
            continue

        meta_row = df.iloc[i]

        print("🔹 META")
        print(f"Brand:   {meta_row.get('brand', 'N/A')}")
        print(f"Title:   {meta_row.get('title', 'N/A')}")
        print(f"Feature: {meta_row.get('feature', 'N/A')}")
        print("-" * 80)

def print_recall_data(df, indices):
    """
    Print the brand and product description of selected rows of ``df``.

    Parameters
    ----------
    df : DataFrame
        Rows are looked up by position (``iloc``). 'Brand' and
        'Product Description' are read with ``.get`` and shown as 'N/A'
        when absent.
    indices : iterable of int
        Row positions to print. Negative positions count from the end.
        A position outside the frame prints a warning and is skipped.

    NOTE(review): positions equal index labels only while ``df`` has its
    default 0..n-1 index — confirm before passing labels from a filtered
    or re-indexed frame.
    """
    row_count = len(df)

    for i in indices:
        # iloc accepts -row_count .. row_count - 1; anything else raises IndexError
        if not -row_count <= i < row_count:
            print(f"⚠️ Index {i} is out of bounds.")
            continue

        recall_row = df.iloc[i]

        print("🔹 REPORT")
        print(f"Brand:               {recall_row.get('Brand', 'N/A')}")
        print(f"Product Description: {recall_row.get('Product Description', 'N/A')}")
        print("-" * 80)

In [155]:
# Show the Amazon listing at row 1971.
print_metadata(amazon_df, [1971])

🔹 META
Brand:   learning resources
Title:   Learning Resources Tool Set
Feature: ['For the young builder', '13 piece set', 'Stores in plastic tool box']
--------------------------------------------------------------------------------


In [161]:
# Show the six recall records listed in matched_recall_indices for listing 1971.
print_recall_data(recalls_df, [1094, 1149, 1254, 1331, 1938, 2391])

🔹 REPORT
Brand:               learning resources
Product Description: Learning Resources magnet movers toy. Comes with multiple magnets encased in plastic. the issue was with the orange sphere
--------------------------------------------------------------------------------
🔹 REPORT
Brand:               learning resources
Product Description: Learning Resources Snap and Learn Rainbow Owls. The two-piece owls pop apart and snap back together, use to learn gross motor skills or sorting skills.The 10 owls are 3-in-1  color toys, number toys, and sorting toys. They are each about 2x3.
--------------------------------------------------------------------------------
🔹 REPORT
Brand:               learning resources
Product Description: Learning Resources Spike The Fine Motor Hedgehoghttps://www.learningresources.com/spike-the-fine-motor-hedgehogtm
--------------------------------------------------------------------------------
🔹 REPORT
Brand:               learning resources
Product Descriptio

#### How many Amazon items are matched with more than one recall?

In [111]:
# Initialise the per-listing match counter. The name uses underscores so that
# it is the same 'number_of_matches' column the following cells fill in and
# query; a name with spaces would create a separate column nothing else reads.
amazon_df['number_of_matches'] = 0

In [119]:
# Count how many recall records each listing was linked to.
amazon_df['number_of_matches'] = amazon_df['matched_recall_indices'].map(len)

In [123]:
# Total number of (listing, recall) links across all listings.
amazon_df['number_of_matches'].sum()

2844

In [139]:
# Listings linked to more than one recall record, most-linked first.
(
    amazon_df
    .loc[amazon_df['number_of_matches'].gt(1), ['number_of_matches']]
    .sort_values('number_of_matches', ascending=False)
)

Unnamed: 0,number_of_matches
38108,38
400047,13
460809,11
538004,10
596066,10
...,...
360268,2
364448,2
367023,2
368382,2


In [147]:
# Number of listings linked to more than one recall record.
print(f"Length with greater than one matches: {(amazon_df['number_of_matches'] > 1).sum()}")

Length with greater than one matches: 331


In [35]:
from rapidfuzz import fuzz
from rapidfuzz.process import cdist

# 1) BRANDS
UNMATCHABLE = "__MISSING__"

# Remember which rows have no brand BEFORE the placeholder is filled in. The
# same placeholder is used on both sides, so without this a recall with no
# brand would score 100 against every Amazon listing with no brand.
# NOTE: this only detects real NaN/None values; brands that were already
# converted to the text 'nan' by an earlier astype(str) look like normal text.
recall_brand_missing = recalls_df['Brand'].isna().to_numpy()
amazon_brand_missing = amazon_df['brand'].isna().to_numpy()

# build aligned lists (position i <-> row i of the respective frame)
brand_queries = (
    recalls_df['Brand']
    .fillna(UNMATCHABLE)
    .astype(str)
    .str.lower().str.strip()
    .tolist()
)
brand_choices = (
    amazon_df['brand']
    .fillna(UNMATCHABLE)
    .astype(str)
    .str.lower().str.strip()
    .tolist()
)

# run cdist to score every recall brand against every Amazon brand
similarity_brand_matrix2 = cdist(
    brand_queries,         # rows = reports
    brand_choices,         # cols = df_meta
    scorer=fuzz.token_sort_ratio,
    processor=None,        # already pre-processed
    score_cutoff=90,       # scores below 90 are reported as 0
    workers=-1             # use all available CPU cores
)
# cdist returns a dense 2-D NumPy array:
# similarity_brand_matrix2[i, j] is the score between brand_queries[i]
# (recalls_df row i) and brand_choices[j] (amazon_df row j).

# A missing brand must never count as a match on either side.
similarity_brand_matrix2[recall_brand_missing, :] = 0
similarity_brand_matrix2[:, amazon_brand_missing] = 0

# 2) DESCRIPTIONS
desc_queries = (
    recalls_df['Product Description']
    .fillna("")
    .astype(str)
    .str.lower().str.strip()
    .tolist()
)
title_choices = (
    amazon_df['title']
    .fillna("")
    .astype(str)
    .str.lower().str.strip()
    .tolist()
)

# run cdist for desc vs title
# NOTE(review): each result holds len(queries) x len(choices) scores; with the
# row counts printed earlier in this notebook (2,514 x 633,883) that is about
# 1.6 billion cells per matrix — confirm the machine has enough memory.
similarity_desc_matrix2 = cdist(
    desc_queries,
    title_choices,
    scorer=fuzz.token_set_ratio,
    processor=None,
    score_cutoff=80,       # scores below 80 are reported as 0
    workers=-1             # use all available CPU cores
)
# similarity_desc_matrix2[i, j] is the score between desc_queries[i]
# (recalls_df row i) and title_choices[j] (amazon_df row j).


In [41]:
import numpy as np

# Get positions where brand similarity is >= 90. The comparison is inclusive to
# agree with the score_cutoff passed to cdist and with the ">= 90" rule used by
# the loop-based matching; a strict "> 90" would drop scores of exactly 90.
brand_indices = np.where(similarity_brand_matrix2 >= 90)
brand_rows, brand_cols = brand_indices

# Look up the description/title score of those pairs in one vectorised step and
# keep the pairs that also reach the description threshold (>= 80, inclusive).
desc_ok = similarity_desc_matrix2[brand_rows, brand_cols] >= 80

# List of (i, j): recalls_df row i matches amazon_df row j on brand AND description
matches = list(zip(brand_rows[desc_ok].tolist(), brand_cols[desc_ok].tolist()))
