# Documentation for `matching_recalls.ipynb`

## Purpose

This notebook matches Amazon product listings to official product recall records by comparing product titles. It helps identify which Amazon products have been subject to recalls.


## Overview

- We first load the Amazon product metadata and the recall titles from the raw data files.
- We then clean the recall titles by removing generic/banned words (e.g., "recall", "hazard", "product", etc.) to focus on the core product name.
- We then use fuzzy matching (`RapidFuzz`, `fuzz.token_sort_ratio`) to compare each cleaned recall title to each Amazon product title. We record a match if the returned score is at least 80.
- We then save the updated dataframe to `../Data/amazon_meta_with_recall_matches.csv`.

## Input Files

- `../Data/metadata_raw.pkl`  
- `../Data/recalls/*.csv`  

## Output Files

- `../Data/amazon_meta_with_recall_matches.csv`  

In [19]:
import pandas as pd
import os


In [20]:
# Load the Amazon product metadata from the pickled file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
file_path = '../Data/metadata_raw.pkl'  
amazon_df = pd.read_pickle(file_path)

In [21]:
# Collect the 'Title' column from every recall CSV in the recalls folder.

folder_path = '../Data/recalls'
# NOTE(review): os.listdir returns entries in arbitrary order, so the row order
# (and therefore the integer index) of toysrecall_df may differ between machines.
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

titles = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    try:
        # Skip the first line of the file; the second line is then used as the header
        df = pd.read_csv(file_path, skiprows=1, encoding='utf-8', on_bad_lines='skip', dtype=str)

        if 'Title' not in df.columns:
            print(f"⚠️ Skipped {file}: No 'Title' column found in row 2.")
            continue

        df = df[['Title']]  # Keep only the 'Title' column
        titles.append(df)

    except Exception as e:
        # Best-effort loading: report the unreadable file and carry on with the rest
        print(f"❌ Error reading {file}: {e}")

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list, so fail with a message that names the actual problem instead.
if not titles:
    raise ValueError(f"No recall CSV with a 'Title' column could be loaded from {folder_path!r}.")

# Combine all 'Title' columns into one frame with a fresh 0..n-1 index
toysrecall_df = pd.concat(titles, ignore_index=True)

In [22]:
# Sanity check of the combined recall titles: row count, column names, and a preview.
print("Combined recall records:", len(toysrecall_df))
print("Columns:", toysrecall_df.columns.tolist())
toysrecall_df.head(20)

Combined recall records: 846
Columns: ['Title']


Unnamed: 0,Title
0,Toy Truck Gifts with Purchase Recalled by Happ...
1,Tumblekins Toys Recalled by International Play...
2,Schylling Recalls Police Press and Go Toy Vehi...
3,Green Toys Recalls Mini Vehicles Due To Chokin...
4,Cycle Gear Recalls Semi Truck and Motorcycle T...
5,LM Import & Export Recalls Toy Cars Due to Vio...
6,Dollar General Recalls Construction Truck Toy ...
7,Family Dollar Stores Recall Tough Treadz Auto ...
8,Family Dollar Stores Recalls Remote Controlled...
9,Infantino Recalls Toy Activity Trucks Due to C...


In [23]:
# Print the column index of the combined recall frame.
print(toysrecall_df.columns)

Index(['Title'], dtype='object')


In [24]:
from rapidfuzz import fuzz, process

In [25]:
from rapidfuzz import fuzz
import pandas as pd
import re

def clean_recall_title(title: str) -> str:
    """Strip generic recall vocabulary from a recall title.

    Every whole-word, case-insensitive occurrence of a banned word is deleted,
    runs of whitespace are collapsed to a single space, and the result is
    trimmed at both ends.

    Args:
        title: Recall title text.

    Returns:
        The title without the banned words; may be an empty string when the
        title consisted only of banned words.
    """
    stop_terms = (
        'recall', 'recalls', 'recalled', 'choking', 'choked',
        'hazard', 'hazards', 'hazardous', 'title', 'danger',
        'due', 'to', 'for', 'announce', 'announced', 'announcement',
        'alert', 'alerts', 'warning', 'warnings', 'safety', 'unsafe',
        'product', 'products', 'item', 'items', 'may', 'cause',
        'risk', 'risks', 'injury', 'injuries', 'harm', 'damages',
        'defect', 'defective', 'faulty', 'fault', 'issue', 'issues',
        'problem', 'problems', 'dangerous', 'dangerously',
    )
    # \b on both sides keeps longer words such as "recalling" intact.
    stop_regex = re.compile(r'\b(?:' + '|'.join(stop_terms) + r')\b', flags=re.IGNORECASE)
    without_terms = stop_regex.sub('', title)
    # Deleted words leave their neighbouring spaces behind; squeeze them.
    single_spaced = re.sub(r'\s+', ' ', without_terms)
    return single_spaced.strip()

def match_amazon_to_recalls(amazon_df, recall_df, score_threshold=90, recall_index_offset=2514):
    """Flag Amazon products whose title fuzzily matches a cleaned recall title.

    Titles on both sides are lower-cased and stripped. Each unique recall title
    is passed through ``clean_recall_title`` and compared with every Amazon
    title using ``fuzz.token_sort_ratio``; a score greater than or equal to
    ``score_threshold`` counts as a match. Rows with a missing title (on either
    side) never take part in matching.

    Side effect: ``amazon_df`` is modified in place. The columns ``is_match``
    (0 or 1) and ``recall_index`` (a list of ints per row) are added or
    overwritten on the caller's frame.

    NOTE(review): a later cell in this notebook defines a function with the same
    name that stores a single scalar in ``recall_index``; whichever cell runs
    last is the one that is actually called.

    Args:
        amazon_df: DataFrame with a ``title`` column.
        recall_df: DataFrame with a ``Title`` column.
        score_threshold: Minimum ``token_sort_ratio`` score that counts as a match.
        recall_index_offset: Value added to the recall row's index label before
            it is stored in ``recall_index``. NOTE(review): presumably the
            position at which these recall rows start in a larger combined
            recall table - confirm against the consumer of the CSV.

    Returns:
        ``(matched, amazon_df)``. ``matched`` maps each normalized recall title
        that had at least one match to a tuple of (copy of the recall rows with
        that title, copy of the matching Amazon rows with normalized titles).
        ``amazon_df`` is the same object that was passed in.
    """
    # Drop missing titles BEFORE astype(str): astype(str) turns NaN into the
    # literal string "nan", which would then be fuzzy-matched like a real title
    # (and would make a later dropna() a no-op). Index labels are preserved.
    amazon = amazon_df[amazon_df['title'].notna()].copy()
    recalls = recall_df[recall_df['Title'].notna()].copy()

    amazon['title'] = amazon['title'].astype(str).str.lower().str.strip()
    recalls['Title'] = recalls['Title'].astype(str).str.lower().str.strip()

    # Initialize columns (one independent list per row)
    amazon_df['is_match'] = 0
    amazon_df['recall_index'] = [[] for _ in range(len(amazon_df))]

    matched = {}

    for recall_title in recalls['Title'].unique():
        cleaned = clean_recall_title(recall_title)
        if not cleaned:
            # Title consisted only of banned words; nothing left to compare
            continue

        scores = amazon['title'].map(lambda x: fuzz.token_sort_ratio(x, cleaned))
        amazon_matches = amazon[scores >= score_threshold]
        if amazon_matches.empty:
            continue

        recall_rows = recalls[recalls['Title'] == recall_title]
        matched[recall_title] = (recall_rows.copy(), amazon_matches.copy())
        # When several recall rows share this title, only the first is recorded
        recall_idx = recall_rows.index[0]

        for idx in amazon_matches.index:
            amazon_df.at[idx, 'is_match'] = 1
            amazon_df.at[idx, 'recall_index'].append(recall_index_offset + recall_idx)

    print(f"✅ Found matches for {len(matched)} recall-title entries.")
    print(f"📦 Unique Amazon products matched: {amazon_df['is_match'].sum()}")

    return matched, amazon_df


In [26]:
from rapidfuzz import fuzz
import pandas as pd
import re

def clean_recall_title(title: str) -> str:
    """Remove generic recall wording from a recall title.

    Banned words are deleted as whole words, case-insensitively. Whitespace runs
    left behind by the deletions are collapsed to a single space and the result
    is stripped.

    Args:
        title: Recall title text.

    Returns:
        The cleaned title; an empty string when nothing but banned words remained.
    """
    banned_words = [
        'recall', 'recalls', 'recalled', 'choking', 'choked', 'hazard', 'hazards', 'hazardous',
        'title', 'danger', 'due', 'to', 'for', 'announce', 'announced', 'announcement',
        'alert', 'alerts', 'warning', 'warnings', 'safety', 'unsafe', 'product', 'products',
        'item', 'items', 'may', 'cause', 'risk', 'risks', 'injury', 'injuries', 'harm',
        'damages', 'defect', 'defective', 'faulty', 'fault', 'issue', 'issues',
        'problem', 'problems', 'dangerous', 'dangerously'
    ]
    pattern = r'\b(?:' + '|'.join(banned_words) + r')\b'
    cleaned = re.sub(pattern, '', title, flags=re.IGNORECASE)
    # Deleting a word leaves its surrounding spaces behind ("toys  mini");
    # collapse them so the returned title is single-spaced.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

def match_amazon_to_recalls(amazon_df, recall_df, score_threshold=90, recall_index_offset=2514):
    """Flag Amazon products whose title fuzzily matches a cleaned recall title.

    Titles on both sides are lower-cased and stripped. Each unique recall title
    is passed through ``clean_recall_title`` and compared with every Amazon
    title using ``fuzz.token_sort_ratio``; a score greater than or equal to
    ``score_threshold`` counts as a match. Rows with a missing title (on either
    side) never take part in matching.

    Side effect: ``amazon_df`` is modified in place. The columns ``is_match``
    (0 or 1) and ``recall_index`` (``None`` or a single int) are added or
    overwritten on the caller's frame. When a product matches several recall
    titles, the index written last replaces the earlier ones.

    Args:
        amazon_df: DataFrame with a ``title`` column.
        recall_df: DataFrame with a ``Title`` column.
        score_threshold: Minimum ``token_sort_ratio`` score that counts as a match.
        recall_index_offset: Value added to the recall row's index label before
            it is stored in ``recall_index``. NOTE(review): presumably the
            position at which these recall rows start in a larger combined
            recall table - confirm against the consumer of the CSV.

    Returns:
        ``(matched, amazon_df)``. ``matched`` maps each normalized recall title
        that had at least one match to a tuple of (copy of the recall rows with
        that title, copy of the matching Amazon rows with normalized titles).
        ``amazon_df`` is the same object that was passed in.
    """
    # Drop missing titles BEFORE astype(str): astype(str) turns NaN into the
    # literal string "nan", which would then be fuzzy-matched like a real title
    # (and would make a later dropna() a no-op). Index labels are preserved.
    amazon = amazon_df[amazon_df['title'].notna()].copy()
    recalls = recall_df[recall_df['Title'].notna()].copy()

    amazon['title'] = amazon['title'].astype(str).str.lower().str.strip()
    recalls['Title'] = recalls['Title'].astype(str).str.lower().str.strip()

    amazon_df['is_match'] = 0
    amazon_df['recall_index'] = None

    matched = {}

    for recall_title in recalls['Title'].unique():
        cleaned = clean_recall_title(recall_title)
        if not cleaned:
            # Title consisted only of banned words; nothing left to compare
            continue

        scores = amazon['title'].map(lambda x: fuzz.token_sort_ratio(x, cleaned))
        amazon_matches = amazon[scores >= score_threshold]
        if amazon_matches.empty:
            continue

        recall_rows = recalls[recalls['Title'] == recall_title]
        matched[recall_title] = (recall_rows.copy(), amazon_matches.copy())
        # When several recall rows share this title, only the first is recorded
        recall_idx = recall_rows.index[0]

        for idx in amazon_matches.index:
            amazon_df.at[idx, 'is_match'] = 1
            # Scalar assignment: a later matching recall overwrites this value
            amazon_df.at[idx, 'recall_index'] = recall_index_offset + recall_idx

    print(f"✅ Found matches for {len(matched)} recall-title entries.")
    print(f"📦 Unique Amazon products matched: {amazon_df['is_match'].sum()}")

    return matched, amazon_df


In [27]:
# Run the matcher with a threshold of 80 rather than the function's default of 90.
# Note: match_amazon_to_recalls also writes 'is_match' and 'recall_index' onto amazon_df itself.
matched_results, updated_amazon_df = match_amazon_to_recalls(amazon_df, toysrecall_df, 80)

✅ Found matches for 20 recall-title entries.
📦 Unique Amazon products matched: 33


In [28]:
# Report every matched recall title together with the Amazon titles it matched.
for recall_title, (recall_df, amazon_df_match) in matched_results.items():
    header_lines = (f"\n⚠️ Recall Title: {recall_title}", "🛒 Matched Amazon Titles:")
    for header in header_lines:
        print(header)

    # Only the title is shown, so walk that column directly instead of whole rows
    for amazon_title in amazon_df_match['title']:
        print(f"   - {amazon_title}")



⚠️ Recall Title: green toys recalls mini vehicles due to choking hazard
🛒 Matched Amazon Titles:
   - green toys mixer vehicle
   - green toys  mini vehicle, 4-pack

⚠️ Recall Title: infantino recalls toy activity trucks due to choking hazard
🛒 Matched Amazon Titles:
   - infantino activity toy set

⚠️ Recall Title: fisher-price recalls to repair little people builders' load 'n go wagons due to laceration hazard
🛒 Matched Amazon Titles:
   - fisher-price little people builders load 'n go wagon

⚠️ Recall Title: stacking toy recalled for choking hazard risk
🛒 Matched Amazon Titles:
   - stacking tower

⚠️ Recall Title: star wars lightsaber recalled by hasbro
🛒 Matched Amazon Titles:
   - hasbro star wars light saber battle game
   - star wars ahsoka lightsaber
   - star wars lightsaber ahsoka

⚠️ Recall Title: bathtub toys recalled by munchkin due to risk of injury
🛒 Matched Amazon Titles:
   - munchkin lazy buoys bathtub toys

⚠️ Recall Title: toy mobile phones recalled for choking ha

In [None]:
# Write the Amazon metadata, including the 'is_match' and 'recall_index' columns, to CSV
# (row index is not written).
updated_amazon_df.to_csv('../Data/amazon_meta_with_recall_matches.csv', index=False, encoding='utf-8')
print("Updated Amazon DataFrame saved to 'amazon_meta_with_recall_matches.csv'.")

Updated Amazon DataFrame saved to 'amazon_meta_with_recall_matches.csv'.


In [30]:
# Print the column index of the updated frame to check that the match columns are present.
print(updated_amazon_df.columns)

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details',
       'is_match', 'recall_index'],
      dtype='object')
