Importing the data frame (I'm using the six categories with dropped asin)

In [51]:
import pandas as pd

# The CSV sits in the Data/ directory one level above this notebook.
DATA_PATH = '../Data/amazon_top6_unique.csv'
df = pd.read_csv(DATA_PATH)

# Sanity check: show (rows, columns) of the loaded frame.
print(df.shape)

(109011, 22)


  df = pd.read_csv('../Data/amazon_top6_unique.csv')


In [52]:
# Draw a fixed-seed random subset so repeated runs pick the same rows.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42
sample_df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

In [53]:
from transformers import pipeline
from tqdm import tqdm

In [54]:
import torch
# Print the installed PyTorch version so the run environment is recorded in the output.
print(torch.__version__)

2.7.1


In [55]:
# Shared zero-shot classification pipeline used by every scoring cell below.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)


Device set to use mps:0


In [56]:
# Smoke test: score one hand-written sentence against a few candidate labels.
# multi_label=True scores every label independently of the others.
text = "FDA certified and ISO approved non-toxic product."
labels = [
    "certified",
    "non-toxic",
    "hypoallergenic",
    "safe to use",
]

result = classifier(text, candidate_labels=labels, multi_label=True)
print(result)


{'sequence': 'FDA certified and ISO approved non-toxic product.', 'labels': ['safe to use', 'non-toxic', 'certified', 'hypoallergenic'], 'scores': [0.9976987838745117, 0.9969680905342102, 0.9933708310127258, 0.6317756175994873]}


In [57]:
# Candidate labels handed to the zero-shot classifier when looking for
# certification / regulatory-compliance claims in listing text.
# The exact strings matter: they are passed to the model as-is.
certification_labels = [
    "FDA approved",
    "FDA certified",
    "FDA registered",
    "FDA compliant",
    "ISO certified",
    "CE marked",
    "CE certified",
    "GMP certified",
    "UL certified",
    "CPSIA compliant",
    "ASTM F963 compliant",
    # NOTE(review): "Childrens" has no apostrophe; left unchanged because
    # editing a label string may change the classifier's scores.
    "Childrens Product Certificate (CPC)",
    "EN 71 compliant",
    "REACH compliant",
    "Declaration of Conformity (DoC)",
    "FCC compliant",
    "CCC Mark",
    "ISO 8124 compliant",
    "UKCA compliant",
    "Canada Consumer Product Safety Act (CCPSA) compliant",
    "ST Mark compliant",
    "RoHS compliant",
    "WEEE compliant"
]

# Candidate labels for the "non-toxic" feature: testing claims plus
# "<substance> free" claims.
non_toxic_labels = [
    "third-party tested",
    "lab tested",
    "non-toxic",
    "PVC free",
    "BPA free",
    "phthalate free",
    "lead free",
    "formaldehyde free",
    "latex free",
]

# Placeholder: currently empty and not referenced by the code visible in this
# part of the notebook (age detection below is done with regular expressions).
age_appropriate_labels = []



In [58]:
def get_all_certifications_with_score(text, labels=certification_labels, threshold=0.97):
    """
    Score ``text`` against candidate labels with the zero-shot ``classifier``.

    Parameters
    ----------
    text : object
        Text to classify. Anything that is not a ``str`` (for example ``None``
        or the float NaN pandas uses for a missing CSV cell) is treated as
        empty instead of raising on ``.strip()``.
    labels : list of str
        Candidate labels; defaults to ``certification_labels``.
    threshold : float
        A label is kept only when its score is strictly greater than this.

    Returns
    -------
    tuple
        ``(labels_above_threshold, max_score)``. The first element is ``None``
        when no label passes the threshold. For empty input the classifier is
        not called and ``(None, 0.0)`` is returned.
    """
    # Missing values arrive as non-strings; calling .strip() on them would
    # raise AttributeError and abort a whole DataFrame.apply run.
    if not isinstance(text, str):
        return (None, 0.0)

    # '[]' is treated as empty too (looks like a stringified empty list --
    # presumably how the dataset encodes "no value"; confirm against the CSV).
    stripped = text.strip()
    if not stripped or stripped == '[]':
        return (None, 0.0)

    result = classifier(text, labels, multi_label=True)
    high_conf_labels = [
        label for label, score in zip(result['labels'], result['scores']) if score > threshold
    ]
    max_score = max(result['scores']) if result['scores'] else 0.0

    return (high_conf_labels if high_conf_labels else None, max_score)


In [59]:
def get_cert_label_if_either_matches(row):
    """
    Combine certification labels found in a row's description and feature text.

    Both ``row['description']`` and ``row['feature']`` are scored with
    ``get_all_certifications_with_score``. Returns ``(labels, max_score)``
    where ``labels`` is the de-duplicated list of labels from either field
    (description labels first, original order kept) or ``None`` when neither
    field produced a label, and ``max_score`` is the larger of the two
    per-field maximum scores.
    """
    scored_fields = [
        get_all_certifications_with_score(row[column])
        for column in ('description', 'feature')
    ]
    best_score = max(score for _, score in scored_fields)

    # Merge in field order, skipping labels that were already collected.
    merged = []
    for field_labels, _ in scored_fields:
        for label in field_labels or ():
            if label not in merged:
                merged.append(label)

    return (merged if merged else None), best_score


In [60]:
# Three hand-written inputs exercising the scorer with its default 0.97 threshold.
text_1 = "FDA and CE certified product."
text_2 = "Gentle soap for everyday use."
text_3 = ""

print(get_all_certifications_with_score(text_1))  # ✅ expect (labels, max_score) with at least one label
print(get_all_certifications_with_score(text_2))  # ❌ expect (None, max_score): no score above the 0.97 threshold
print(get_all_certifications_with_score(text_3))  # ❌ expect (None, 0.0): empty text, classifier is not called


(['CE marked', 'FDA compliant'], 0.9782139658927917)
(None, 0.7379173636436462)
(None, 0.0)


In [61]:

from tqdm import tqdm
# Register progress_apply on pandas objects so the slow model pass shows a progress bar.
tqdm.pandas()

# Every row yields a (labels, max_score) pair; unzip the pairs into two columns.
results = sample_df.progress_apply(get_cert_label_if_either_matches, axis=1)
cert_labels_col, cert_scores_col = zip(*results)
sample_df['certification_labels'] = cert_labels_col
sample_df['max_cert_score'] = cert_scores_col


100%|██████████| 1000/1000 [15:59<00:00,  1.04it/s]


In [62]:
# Size of the sample the share is computed over.
total_rows = len(sample_df)

# bool() is False for None (no label found) and True for a non-empty list.
has_cert_label = sample_df['certification_labels'].apply(bool)
non_empty_cert = has_cert_label.sum()

# Fraction of sampled rows with at least one certification label.
percentage_cert = non_empty_cert / total_rows

print(f"Number of rows with non-empty certifications: {non_empty_cert} / {total_rows}")
print(f"Percentage: {percentage_cert:.2%}")


Number of rows with non-empty certifications: 41 / 1000
Percentage: 4.10%


We now add a "non-toxic" label feature.

In [63]:
def get_non_toxic_labels_with_score(text, labels=non_toxic_labels, threshold=0.97):
    """
    Score ``text`` against the non-toxic candidate labels with ``classifier``.

    Parameters
    ----------
    text : object
        Text to classify. Anything that is not a ``str`` (for example ``None``
        or the float NaN pandas uses for a missing CSV cell) is treated as
        empty instead of raising on ``.strip()``.
    labels : list of str
        Candidate labels; defaults to ``non_toxic_labels``.
    threshold : float
        A label is kept only when its score is strictly greater than this.

    Returns
    -------
    tuple
        ``(labels_above_threshold, max_score)``. The first element is ``None``
        when no label passes the threshold. For empty input the classifier is
        not called and ``(None, 0.0)`` is returned.
    """
    # NOTE(review): same logic as get_all_certifications_with_score with a
    # different default label list; consider merging the two.

    # Missing values arrive as non-strings; calling .strip() on them would
    # raise AttributeError and abort a whole DataFrame.apply run.
    if not isinstance(text, str):
        return (None, 0.0)

    # '[]' is treated as empty too (looks like a stringified empty list --
    # presumably how the dataset encodes "no value"; confirm against the CSV).
    stripped = text.strip()
    if not stripped or stripped == '[]':
        return (None, 0.0)

    result = classifier(text, labels, multi_label=True)
    high_conf_labels = [
        label for label, score in zip(result['labels'], result['scores']) if score > threshold
    ]
    max_score = max(result['scores']) if result['scores'] else 0.0

    return (high_conf_labels if high_conf_labels else None, max_score)


In [64]:
def get_non_toxic_label_if_either_matches(row):
    """
    Combine non-toxic labels found in a row's description and feature text.

    Both ``row['description']`` and ``row['feature']`` are scored with
    ``get_non_toxic_labels_with_score``. Returns ``(labels, max_score)``
    where ``labels`` is the de-duplicated list of labels from either field
    (description labels first, original order kept) or ``None`` when neither
    field produced a label, and ``max_score`` is the larger of the two
    per-field maximum scores.
    """
    scored_fields = [
        get_non_toxic_labels_with_score(row[column])
        for column in ('description', 'feature')
    ]
    best_score = max(score for _, score in scored_fields)

    # Merge in field order, skipping labels that were already collected.
    merged = []
    for field_labels, _ in scored_fields:
        for label in field_labels or ():
            if label not in merged:
                merged.append(label)

    return (merged if merged else None), best_score


In [65]:
# Every row yields a (labels, max_score) pair; unzip the pairs into two columns.
results_toxic = sample_df.progress_apply(get_non_toxic_label_if_either_matches, axis=1)
tox_labels_col, tox_scores_col = zip(*results_toxic)
sample_df['non_toxic_labels'] = tox_labels_col
sample_df['max_non_toxic_score'] = tox_scores_col

100%|██████████| 1000/1000 [06:25<00:00,  2.60it/s]


In [66]:
# Size of the sample the share is computed over.
total_rows = len(sample_df)

# bool() is False for None (no label found) and True for a non-empty list.
has_non_toxic_label = sample_df['non_toxic_labels'].apply(bool)
non_empty_tox = has_non_toxic_label.sum()

# Fraction of sampled rows with at least one non-toxic label.
percentage_tox = non_empty_tox / total_rows

print(f"Number of rows with non-empty non toxic labels: {non_empty_tox} / {total_rows}")
print(f"Percentage: {percentage_tox:.2%}")


Number of rows with non-empty non toxic labels: 123 / 1000
Percentage: 12.30%


Lastly, we create a binary feature that indicates whether an age recommendation is present in the feature, description, or title field.

In [67]:
import re

# Phrases that count as an age recommendation. They are matched against
# lower-cased text, so they are written in lower case. Patterns that start
# with a word are anchored with \b so they cannot match inside a longer word
# (e.g. "pages 100+" must not be read as "ages 100+").
_AGE_RECOMMENDATION_PATTERNS = (
    r'\bages?\s*\d+\+',
    r'\brecommended\s*ages?:?\s*\d+',
    r'\d+\s*years?\s*\+',
    r'\bfrom\s*age\s*\d+',
    r'\bfor\s*ages?\s*\d+\+',
    r'\bsuitable\s*for\s*ages?\s*\d+',
    r'\bideal\s*for\s*ages?\s*\d+',
    r'\bnot\s*suitable\s*for\s*children\s*under\s*\d+',
    r'\bfor\s*children\s*over\s*\d+',
    r'\bage\s*range\s*\d+\s*to\s*\d+',
    r'\d+\s*months?\s*\+',
    r'\bsuitable\s*for\s*children\s*aged\s*\d+\s*(?:and up|\+)',
    r'\bgreat\s*for\s*ages?\s*\d+',
    r'\bintended\s*for\s*children\s*aged\s*\d+\s*(?:and up|\+)',
    r'\bsafe\s*for\s*children\s*\d+\s*years?\s*and\s*older',
)

# Compiled once as a single alternation instead of being rebuilt per row.
_AGE_RECOMMENDATION_RE = re.compile(
    '|'.join(f'(?:{pattern})' for pattern in _AGE_RECOMMENDATION_PATTERNS)
)


def has_age_recommendation_from_fields(row):
    """
    Flag whether a listing mentions an age recommendation.

    The ``description``, ``feature`` and ``title`` values are read with
    ``row.get`` (a missing key counts as an empty string), converted with
    ``str``, joined with spaces and lower-cased before matching.

    Parameters
    ----------
    row : mapping-like
        Any object with a ``get(key, default)`` method, e.g. a pandas Series
        row or a plain dict.

    Returns
    -------
    int
        1 if any age-recommendation pattern matches the combined text, else 0.
    """
    text = ' '.join(
        str(row.get(field, '')) for field in ('description', 'feature', 'title')
    ).lower()

    return 1 if _AGE_RECOMMENDATION_RE.search(text) else 0


In [70]:
# Flag every sampled row with 1/0 depending on whether an age recommendation is found.
sample_df['has_age_label'] = sample_df.apply(has_age_recommendation_from_fields, axis=1)
rows_with_age = sample_df['has_age_label'].sum()
print(f"Number of rows with age recommendation: {rows_with_age} / {len(sample_df)}")

Number of rows with age recommendation: 71 / 1000


Opening the output below in a "text editor" lets you see a sample of 10 listings with at least one certification or non-toxic label.

In [69]:
import pandas as pd

# Loosen pandas' display limits so long text columns are printed in full.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Keep rows that carry at least one certification label or one non-toxic label.
has_cert = sample_df['certification_labels'].apply(bool)
has_non_toxic = sample_df['non_toxic_labels'].apply(bool)
filtered = sample_df[has_cert | has_non_toxic]

preview_columns = [
    'feature',
    'description',
    'certification_labels',
    'non_toxic_labels',
    'has_age_label',
]
print(filtered[preview_columns].head(10))


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        