In [31]:
import pandas as pd


In [32]:
# Amazon product metadata, stored with one JSON object per line and read
# without any decompression step.
file_path = '../Data/amazon_meta.json'
amazon_df = pd.read_json(
    file_path,
    lines=True,
    compression=None,
)

In [50]:
# Recall titles are loaded exactly as stored in the CSV (decoded as UTF-8,
# no extra parsing options).
file_path = "all_titles_only.csv"
toysrecall_df = pd.read_csv(
    file_path,
    encoding='utf-8',
)





In [51]:


# Sanity-check the recall table: row count, column names, then a preview
# of the first 20 rows as the cell's rich output.
print("Combined recall records:", toysrecall_df.shape[0])
print("Columns:", list(toysrecall_df.columns))
toysrecall_df.head(n=20)

Combined recall records: 709
Columns: ['Title']


Unnamed: 0,Title
0,Kids II Inc. Recalls Crib Toys Due to Choking ...
1,Discovery Toys Children's Toy Phone Recall
2,Kids Station Toys Recalls Little Tikes Toy Cel...
3,Pop Up Phone Recalled; Antenna Is Choking Hazard
4,TDT Toy Company Recalls Plastic Telephones Bec...
5,Toy Telephones Recalled by Durham
6,"Coynes, Inc. Recalls Musical Cordless Toy Tele..."
7,Toy Baby Phone Recalled by Vtech
8,Toy Phones Recalled for Choking Hazard
9,Evenflo Recalls Telephone Toys Due to Choking ...


In [52]:
# Show the column Index of the recall table (names plus dtype).
print(toysrecall_df.columns)

Index(['Title'], dtype='object')


In [60]:
from rapidfuzz import fuzz, process

In [69]:
import re

# Words that describe the recall event rather than the product itself.
_RECALL_NOISE_WORDS = (
    'recall', 'recalls', 'recalled', 'choking', 'choked',
    'hazard', 'hazards', 'hazardous', 'title', 'danger',
)

# Compiled once at definition time; each word is escaped so that a future
# entry containing regex metacharacters cannot corrupt the pattern.
_RECALL_NOISE_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in _RECALL_NOISE_WORDS) + r')\b',
    flags=re.IGNORECASE,
)


def clean_recall_title(title: str) -> str:
    """
    Strip recall-related noise words from a recall title.

    The words in ``_RECALL_NOISE_WORDS`` are removed as whole words,
    case-insensitively. Whitespace is then normalised: leading/trailing
    whitespace is dropped and every internal run of whitespace (including
    the gaps left behind by removed words) is collapsed to a single space.

    Parameters
    ----------
    title : str
        Raw recall title.

    Returns
    -------
    str
        The cleaned title; an empty string if nothing but noise words and
        whitespace was present.
    """
    without_noise = _RECALL_NOISE_PATTERN.sub('', title)
    # split() with no argument discards empty fields, so re-joining also
    # removes the doubled spaces left where a word was deleted.
    return ' '.join(without_noise.split())

def match_amazon_to_recalls(
    amazon_df: pd.DataFrame,
    recall_df: pd.DataFrame,
    score_threshold: int = 90
) -> dict[str, tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Fuzzy-match recall titles against Amazon product titles.

    Both title columns are lower-cased and stripped; rows whose title is
    missing are ignored. Each distinct recall title is cleaned with
    ``clean_recall_title`` and compared to every Amazon title using
    ``fuzz.token_sort_ratio``. Amazon rows scoring at or above
    ``score_threshold`` count as matches.

    Parameters
    ----------
    amazon_df : pd.DataFrame
        Must contain a 'title' column. Not modified.
    recall_df : pd.DataFrame
        Must contain a 'Title' column. Not modified.
    score_threshold : int, default 90
        Minimum token-sort similarity for a match.

    Returns
    -------
    dict
        Maps each normalised (lower-cased, stripped) recall title that has at
        least one match to ``(recall_rows_df, matching_amazon_rows_df)``.
        The title columns of the returned frames hold the normalised text,
        and Amazon rows keep their original relative order.
    """
    def _with_normalised_titles(frame: pd.DataFrame, column: str) -> pd.DataFrame:
        # Drop missing titles *before* the str cast: astype(str) would turn
        # NaN/None into the literal text 'nan'/'None', which dropna() can no
        # longer detect and which would then be fuzzy-matched as real titles.
        present = frame.dropna(subset=[column])
        return present.assign(
            **{column: present[column].astype(str).str.lower().str.strip()}
        )

    amazon = _with_normalised_titles(amazon_df, 'title')
    recalls = _with_normalised_titles(recall_df, 'Title')

    # Plain list so that the keys returned by process.extract are positions
    # usable with .iloc, whatever index the Amazon frame carries.
    amazon_titles = amazon['title'].tolist()

    matched = {}
    # sort=False keeps first-appearance order of the titles; grouping once
    # avoids re-filtering the recall frame for every title.
    for recall_title, recall_rows in recalls.groupby('Title', sort=False):
        cleaned_recall_title = clean_recall_title(recall_title)

        if not cleaned_recall_title:  # skip if cleaning removed all text
            continue

        # Let rapidfuzz iterate over the candidates itself instead of calling
        # a Python lambda per Amazon row; score_cutoff keeps only results
        # with score >= threshold and limit=None returns all of them.
        hits = process.extract(
            cleaned_recall_title,
            amazon_titles,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=score_threshold,
            limit=None,
        )
        if not hits:
            continue

        # extract() orders hits by score; restore the frame's row order.
        positions = sorted(position for _, _, position in hits)
        matched[recall_title] = (recall_rows.copy(), amazon.iloc[positions].copy())

    print(f"✅ Found matches for {len(matched)} recall-title entries.")
    return matched


In [73]:
# Run the matcher with a looser cutoff (80) than the function's default of 90.
matched_results = match_amazon_to_recalls(
    amazon_df=amazon_df,
    recall_df=toysrecall_df,
    score_threshold=80,
)

✅ Found matches for 7 recall-title entries.


In [74]:
def get_matched_amazon_titles_with_recall_title(results=None) -> list[tuple[str, str]]:
    """
    Flatten matcher output into (recall_title, matched_amazon_title) pairs.

    Parameters
    ----------
    results : dict, optional
        Mapping of recall title -> (recall_rows_df, amazon_matches_df), where
        the second frame has a 'title' column. When omitted, the notebook
        global ``matched_results`` is used, which keeps existing
        zero-argument calls working.

    Returns
    -------
    list[tuple[str, str]]
        One pair per matched Amazon row, in the mapping's iteration order.
    """
    if results is None:
        # Backward-compatible fallback to the notebook-level variable.
        results = matched_results

    return [
        (recall_title, amazon_title)
        for recall_title, (_, amazon_matches) in results.items()
        for amazon_title in amazon_matches['title'].tolist()
    ]


In [75]:
matches = get_matched_amazon_titles_with_recall_title()

# Emit one three-line entry (recall title, Amazon title, rule) per pair.
for recall_title, amazon_title in matches:
    print(
        f"📌 Recall Title: {recall_title}",
        f"🔗 Matched Amazon Title: {amazon_title}",
        "-" * 60,
        sep="\n",
    )


📌 Recall Title: toy mobile phones recalled for choking hazard
🔗 Matched Amazon Title: frozen toy mobile phone
------------------------------------------------------------
📌 Recall Title: wooden toy alphabet blocks recalled
🔗 Matched Amazon Title: aleph bet wooden blocks
------------------------------------------------------------
📌 Recall Title: wooden toy alphabet blocks recalled
🔗 Matched Amazon Title: wooden alphabet blocks - wagon
------------------------------------------------------------
📌 Recall Title: fisher-price recalls little people play 'n go campsite™ due to choking hazard
🔗 Matched Amazon Title: fisher-price little people play 'n go farm
------------------------------------------------------------
📌 Recall Title: fisher-price recalls little people play 'n go campsite™ due to choking hazard
🔗 Matched Amazon Title: fisher-price little people going camping playset
------------------------------------------------------------
📌 Recall Title: xylophone mallets recalled by play