In [None]:
import os
import pandas as pd
from rapidfuzz import fuzz
import pandas as pd

# Load the raw Amazon product metadata from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
amazon_df = pd.read_pickle('metadata_raw.pkl')


In [None]:
# Importing all Recall Data, combining the three files

def load_clean_csv(path, header_marker='Report No.', encoding='utf-8'):
    """Load a CSV export, skipping any preamble lines above the header row.

    The header row is taken to be the first line of the file that contains
    ``header_marker`` anywhere in it; every line before it is skipped.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.
    header_marker : str, default 'Report No.'
        Text that identifies the header row.
    encoding : str, default 'utf-8'
        Encoding used both for locating the header and for parsing the CSV.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the located row used as column names.

    Raises
    ------
    ValueError
        If no line in the file contains ``header_marker``.
    """
    # Scan lazily and stop at the first hit instead of reading the whole file
    # into memory just to find one line.
    with open(path, 'r', encoding=encoding) as f:
        header_index = next(
            (i for i, line in enumerate(f) if header_marker in line),
            None,
        )

    # A bare next() would surface as an opaque StopIteration; fail with a
    # message that names the file and the marker instead.
    if header_index is None:
        raise ValueError(
            f"No header row containing {header_marker!r} found in {path!r}"
        )

    # Load the CSV from the header line forward.
    return pd.read_csv(path, skiprows=header_index, encoding=encoding)

# Recall exports to combine: arts & crafts, riding toys, and toys.
recall_files = [
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_ArtsandCrafts.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Riding_Toys.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Toys.csv"
]

# Parse every export with the header-aware loader.
recall_dfs = list(map(load_clean_csv, recall_files))

# Stack the per-file frames into one table with a fresh 0..n-1 index.
recalls_df = pd.concat(recall_dfs, ignore_index=True)

# Sanity-check size and schema, then preview the first rows.
print("✅ Combined recall records:", recalls_df.shape[0])
print("📦 Columns:", list(recalls_df.columns))
recalls_df.head(3)


In [None]:
# Features we can use for fuzzy matching.
# Print the column labels of both tables to pick candidate fields to compare.
print(amazon_df.columns)
print(recalls_df.columns.tolist())

The code below creates a dictionary, `matched_amazon_by_brand`, keyed by recall brand. Each value is a tuple of two DataFrames, `(recall_rows, matched_rows)`: the recall rows for that brand and all Amazon entries whose `brand` fuzzy-matches the recall `Brand` at or above a certain threshold (here 90).
To save time, and if you would like to run this as an experiment, the block below uses a sample of 500 brands and 30,000 Amazon entries. The full dataset is explored at the end of this notebook.

In [None]:
# Tunable parameters for the sampled experiment.
N_SAMPLE_BRANDS = 500        # number of distinct raw recall brands to try
N_SAMPLE_LISTINGS = 30000    # number of Amazon listings to sample
BRAND_MATCH_THRESHOLD = 90   # minimum token_sort_ratio for two brands to match

# Step 1: Sample brand names from recalls_df, then normalize them.
# Missing brands are dropped BEFORE the string cast, so they never turn into
# the literal text 'nan'.
sampled_brands = pd.Series(recalls_df['Brand'].dropna().unique()[:N_SAMPLE_BRANDS])
sampled_brands = sampled_brands.astype(str).str.lower().str.strip()

# Step 2: Sample and clean Amazon metadata.
# The sample size is capped so a small amazon_df does not make .sample() raise.
amazon_sample = amazon_df.sample(
    min(N_SAMPLE_LISTINGS, len(amazon_df)), random_state=42
).copy()
amazon_brand_raw = amazon_sample['brand']
# astype(str) would turn a missing brand into 'nan'; .where(notna) puts NaN back
# so listings without a brand can never be matched to anything.
amazon_sample['brand'] = (
    amazon_brand_raw.astype(str).str.lower().str.strip()
    .where(amazon_brand_raw.notna())
)

# Step 3: Clean 'Brand' in recalls_df for matching (done in place, because the
# cells below compare recalls_df['Brand'] against the normalized brand keys).
# Missing values are kept as NaN so that re-running this cell still drops them
# in Step 1.
recall_brand_raw = recalls_df['Brand']
recalls_df['Brand'] = (
    recall_brand_raw.astype(str).str.lower().str.strip()
    .where(recall_brand_raw.notna())
)

# Step 4: Prepare dictionary: brand -> (recall rows, matching Amazon rows)
matched_amazon_by_brand = {}

# Step 5: For each brand, store associated recall row(s) + Amazon matches.
# Each DISTINCT Amazon brand is scored once per recall brand (instead of every
# listing row); the matching rows are then selected with isin(). The selected
# rows and their order are the same as with a row-by-row comparison.
amazon_brands = [b for b in amazon_sample['brand'].dropna().unique() if b]

for recall_brand in sampled_brands:
    # Amazon brands that fuzzy-match this recall brand (score >= threshold)
    close_brands = [
        amazon_brand
        for amazon_brand in amazon_brands
        if fuzz.token_sort_ratio(amazon_brand, recall_brand) >= BRAND_MATCH_THRESHOLD
    ]

    # Only store brands that have at least one matched Amazon entry
    if not close_brands:
        continue

    matched_rows = amazon_sample[amazon_sample['brand'].isin(close_brands)]

    # Get corresponding recall row(s) for this brand
    recall_rows = recalls_df[recalls_df['Brand'] == recall_brand]

    matched_amazon_by_brand[recall_brand] = (recall_rows.copy(), matched_rows.copy())

# ✅ matched_amazon_by_brand now maps each brand → (recall_rows_df, matched_amazon_rows_df)

# Optional: Print summary
print(f"✅ Found matches for {len(matched_amazon_by_brand)} recall brand entries.")


In [None]:
# Inspect the first stored (recall rows, matched Amazon rows) tuple.
list(matched_amazon_by_brand.values())[0]

In [None]:
# Recall rows whose 'Brand' equals the first brand key in the match dictionary.
recalls_df.loc[recalls_df['Brand'] == list(matched_amazon_by_brand.keys())[0]]

In [None]:
# Show the sampled Amazon listings used in the experiment above.
amazon_sample

In [None]:
# Summary statistics: how many Amazon listings were matched to each recall brand.
# The loop variable is deliberately not called `amazon_df`, to avoid confusion
# with the full Amazon table of that name.
match_counts = [len(listings) for _recalls, listings in matched_amazon_by_brand.values()]

if not match_counts:
    print("⚠️ No matches found to compute stats.")
else:
    avg_matches = sum(match_counts) / len(match_counts)
    min_matches, max_matches = min(match_counts), max(match_counts)

    print(f"📊 Average # of Amazon listings matched per brand: {avg_matches:.2f}")
    print(f"🔽 Minimum # of matches: {min_matches}")
    print(f"🔼 Maximum # of matches: {max_matches}")


Now the idea is to do fuzzy matching within each entry of the dictionary we have created, comparing 'Product Description' from the recall data with a suitable column from the Amazon data. Let's have a look at 'title', 'feature' and 'description'.

In [None]:
# Position in sampled_brands (not in the match dictionary) of the brand to
# inspect; change i to look at a different brand.
i = 10
brand = sampled_brands.iloc[i]
print(f"\n🔍 Brand selected (index {i}): '{brand}'")

if brand not in matched_amazon_by_brand:
    # Sampled brands without any Amazon match were never stored.
    print(f"No matches found for brand: {brand}")
else:
    recall_df, amazon_df_matches = matched_amazon_by_brand[brand]

    # Rich (truncated) view of both frames
    display(recall_df)
    display(amazon_df_matches)

    # Untruncated text of the candidate columns, one section per column
    recall_text = recall_df['Product Description'].to_string(index=False)
    print("\n📌 Product Descriptions (Recall):")
    print(recall_text)

    description_text = amazon_df_matches['description'].to_string(index=False)
    print("\n📌 Amazon 'description':")
    print(description_text)

    title_text = amazon_df_matches['title'].to_string(index=False)
    print("\n📌 Amazon 'title':")
    print(title_text)

    feature_text = amazon_df_matches['feature'].to_string(index=False)
    print("\n📌 Amazon 'feature':")
    print(feature_text)



After some inspection and experimentation, it seems that the 'title' column is the best option. Below we use a fuzzy-match threshold of 80, which seems to do the right thing on the brand-matched DataFrames; see the code below. I originally used 70, which might have been too low.

In [None]:
from rapidfuzz import fuzz
import pandas as pd

# Minimum token_set_ratio for a recall description and an Amazon title to match.
TITLE_MATCH_THRESHOLD = 80

# Loop over all brands in sampled_brands; i is the brand's position in that
# Series and is shown in the output so a brand can be looked up again.
for i, brand in enumerate(sampled_brands):
    if brand not in matched_amazon_by_brand:
        continue

    recall_df, amazon_df_matches = matched_amazon_by_brand[brand]

    # Clean and prepare text columns. Missing values must be filled BEFORE the
    # string cast: astype(str) turns NaN into the text 'nan', after which
    # fillna('') does nothing and two missing texts would score 100.
    recall_df['Product Description'] = (
        recall_df['Product Description'].fillna('').astype(str).str.lower().str.strip()
    )
    amazon_df_matches['title'] = (
        amazon_df_matches['title'].fillna('').astype(str).str.lower().str.strip()
    )

    matches = []

    # Compare each product description in recall_df to each Amazon title
    for recall_idx, recall_desc in recall_df['Product Description'].items():
        if not recall_desc:
            continue  # nothing to compare against
        for amazon_idx, amazon_title in amazon_df_matches['title'].items():
            if not amazon_title:
                continue
            score = fuzz.token_set_ratio(recall_desc, amazon_title)
            if score >= TITLE_MATCH_THRESHOLD:
                matches.append({
                    'recall_idx': recall_idx,
                    'amazon_idx': amazon_idx,
                    'recall_desc': recall_desc,
                    'amazon_title': amazon_title,
                    'score': score
                })

    if matches:
        matched_desc_df = pd.DataFrame(matches).sort_values(by='score', ascending=False)
        print(f"\n🔍 [{i}] Brand: '{brand}'")
        print(f"✅ Found {len(matched_desc_df)} title matches")
        display(matched_desc_df.head(2))  # Limit display to top 2
# The output lists, per brand, the best-scoring titles matched to a product description.
# Scroll through it to judge whether the matches look reasonable.


This looks pretty good! Let's see how many matches we have.

In [None]:

# Count how many distinct Amazon listings were matched to any recall brand:
#   1. take the Amazon half of every (recall_df, amazon_df_matches) pair,
#   2. stack them into a single DataFrame,
#   3. drop repeated 'asin' values (assuming asin uniquely identifies a listing),
#   4. count what is left.
all_amazon_matches = pd.concat(
    [pair[1] for pair in matched_amazon_by_brand.values()],
    ignore_index=True,
)

# A listing matched under several brands must only be counted once.
total_unique_amazon_matches = len(all_amazon_matches.drop_duplicates(subset=['asin']))

print(f"📦 Total unique asin nr Amazon listings matched by brand: {total_unique_amazon_matches}")


We see that, on a sample of 30,000 listings, a full 3,456 were matched to recall data by brand, and this was with only 1/4 of the brands appearing in the recall data being used.

I will now run this code on the full Amazon data, using the full recall data; this took me 8 min 45 s.

In [None]:
from rapidfuzz import fuzz
import pandas as pd

# Minimum token_sort_ratio for two brands to be considered the same.
BRAND_MATCH_THRESHOLD = 90

# Ensure brand fields are cleaned.
# astype(str) turns a missing brand into the text 'nan'; .where(notna) puts NaN
# back. Otherwise every Amazon listing without a brand would "match" every
# recall row without a brand with a perfect score.
amazon_brand_raw = amazon_df['brand']
amazon_df['brand'] = (
    amazon_brand_raw.astype(str).str.lower().str.strip()
    .where(amazon_brand_raw.notna())
)
recall_brand_raw = recalls_df['Brand']
recalls_df['Brand'] = (
    recall_brand_raw.astype(str).str.lower().str.strip()
    .where(recall_brand_raw.notna())
)

# 🧼 Create the list of all unique brands from the entire recalls_df
# (dropna is effective here because missing brands were kept as NaN above)
all_recall_brands = pd.Series(recalls_df['Brand'].dropna().unique())

# Prepare dictionary to store (recall_rows, amazon_matches)
matched_amazon_by_brand = {}

# Score each DISTINCT Amazon brand once per recall brand instead of scoring
# every listing row; the matching rows are then selected with isin(). The
# selected rows and their order are the same as with a row-by-row comparison,
# at a fraction of the cost.
amazon_brands = [b for b in amazon_df['brand'].dropna().unique() if b]

# Loop over all brands from the full recall list
for recall_brand in all_recall_brands:
    # Amazon brands that fuzzy-match this recall brand (score >= threshold)
    close_brands = [
        amazon_brand
        for amazon_brand in amazon_brands
        if fuzz.token_sort_ratio(amazon_brand, recall_brand) >= BRAND_MATCH_THRESHOLD
    ]

    # Only brands with at least one matched Amazon entry are stored
    if not close_brands:
        continue

    matched_rows = amazon_df[amazon_df['brand'].isin(close_brands)]

    # Get recall rows with that brand
    recall_rows = recalls_df[recalls_df['Brand'] == recall_brand]

    matched_amazon_by_brand[recall_brand] = (recall_rows.copy(), matched_rows.copy())

# ✅ Summary
print(f"✅ Found matches for {len(matched_amazon_by_brand)} recall brand entries.")


When matching with a threshold of 70, I got around 6,000 matches; below we use a threshold of 80.

In [None]:
# Step 2: Tag Amazon entries that match product descriptions

# Minimum token_set_ratio for a recall description and an Amazon title to match.
TITLE_MATCH_THRESHOLD = 80

# Initialize column to 0
amazon_df['Is match'] = 0

# Store index labels of matched rows
# NOTE(review): this assumes amazon_df index labels are unique — a duplicated
# label would tag every row that shares it. Confirm against the pickle.
matched_amazon_indices = set()

# Loop over matched brands
for brand, (recall_df, amazon_df_matches) in matched_amazon_by_brand.items():
    # Clean relevant columns. Missing values must be filled BEFORE the string
    # cast: astype(str) turns NaN into the text 'nan', after which fillna('')
    # does nothing and a missing description would match a missing title
    # with a score of 100.
    recall_df['Product Description'] = (
        recall_df['Product Description'].fillna('').astype(str).str.lower().str.strip()
    )
    amazon_df_matches['title'] = (
        amazon_df_matches['title'].fillna('').astype(str).str.lower().str.strip()
    )

    # Empty descriptions carry no information to match on.
    recall_descs = [desc for desc in recall_df['Product Description'] if desc]
    if not recall_descs:
        continue

    for idx, title in amazon_df_matches['title'].items():
        # Skip empty titles and listings that are already tagged (possibly via
        # another brand); any() also stops at the first description that matches.
        if not title or idx in matched_amazon_indices:
            continue
        if any(
            fuzz.token_set_ratio(recall_desc, title) >= TITLE_MATCH_THRESHOLD
            for recall_desc in recall_descs
        ):
            matched_amazon_indices.add(idx)

# Update original amazon_df with match indicator
amazon_df.loc[amazon_df.index.isin(matched_amazon_indices), 'Is match'] = 1

# ✅ Summary
print(f"✅ Total matched Amazon listings: {len(matched_amazon_indices)}")
print("🧾 Is match counts:\n", amazon_df['Is match'].value_counts())


In [None]:
# Size of the full Amazon table, for comparison with the number of matched listings.
print(f"📦 Total products in the dataset: {len(amazon_df)}")

In [None]:
# Show a handful of concrete (recall description, Amazon title) matches.
MAX_EXAMPLES = 10            # how many example pairs to collect
TITLE_MATCH_THRESHOLD = 80   # minimum token_set_ratio for a pair to count


def _iter_title_matches(matches_by_brand, tagged_indices, threshold):
    """Yield one record per (recall description, tagged Amazon title) pair scoring >= threshold.

    Pairs are produced lazily in dictionary order, so the caller can stop as
    soon as it has enough examples without scoring the rest.
    """
    for brand, (recall_df, amazon_df_matches) in matches_by_brand.items():
        # Fill missing values BEFORE the string cast; astype(str) would turn
        # NaN into the text 'nan' and make fillna('') a no-op.
        descriptions = (
            recall_df['Product Description'].fillna('').astype(str).str.lower().str.strip()
        )
        titles = amazon_df_matches['title'].fillna('').astype(str).str.lower().str.strip()

        for recall_desc in descriptions:
            if not recall_desc:
                continue
            for idx, title in titles.items():
                # Only show listings that were already tagged as matches
                if not title or idx not in tagged_indices:
                    continue
                score = fuzz.token_set_ratio(recall_desc, title)
                if score >= threshold:
                    yield {
                        'brand': brand,
                        'amazon_index': idx,
                        'amazon_title': title,
                        'recall_description': recall_desc,
                        'score': score
                    }


# Collect examples to show; a single break replaces the three nested ones.
example_matches = []
for example in _iter_title_matches(
    matched_amazon_by_brand, matched_amazon_indices, TITLE_MATCH_THRESHOLD
):
    example_matches.append(example)
    if len(example_matches) >= MAX_EXAMPLES:
        break

# Convert to DataFrame for display
examples_df = pd.DataFrame(example_matches)
if examples_df.empty:
    # An empty frame has no columns, so selecting them below would raise KeyError.
    print("⚠️ No example matches to show.")
else:
    print(f"\n📦 First {MAX_EXAMPLES} example matches:")
    display(examples_df[['brand', 'amazon_title', 'recall_description', 'score']])


The match is definitely not perfect, but maybe good enough?