In [2]:
import warnings

import numpy as np
import pandas as pd

In [3]:
# Load the product metadata. The file is read as JSON Lines (one JSON object
# per line) with decompression explicitly disabled.
file_path = 'Data/amazon_meta.json'  
amazon_df = pd.read_json(file_path, lines=True, compression=None)

In [4]:
# Load the train and test splits from Parquet. Only their 'asin' column is
# used in the cells visible below.
file_path_train = 'Data/train.parquet'  
file_path_test = 'Data/test.parquet'  
df_train = pd.read_parquet(file_path_train)
df_test = pd.read_parquet(file_path_test)

In [5]:
# ASINs of both splits as one plain list (train first, then test).
# No de-duplication happens here.
all_asins = df_train['asin'].tolist() + df_test['asin'].tolist()

In [6]:
# Number of ASIN entries across train + test (duplicates, if any, included).
len(all_asins)

201370

In [7]:
# (rows, columns) of the full metadata frame before any filtering.
amazon_df.shape

(633883, 19)

In [8]:
# Keep only the metadata rows whose ASIN occurs in the train or test split.
amazon_df_filtered = amazon_df[amazon_df['asin'].isin(all_asins)]

In [9]:
# (rows, columns) after filtering; more rows than len(all_asins) means some
# ASINs appear more than once in the metadata.
amazon_df_filtered.shape

(204604, 19)

In [10]:
# Keep one metadata row per ASIN (the first occurrence wins).
# The explicit .copy() makes this an independent frame: later cells add
# label/score columns to it, and assigning columns to a slice of a slice can
# raise pandas' SettingWithCopyWarning.
amazon_df_unique = amazon_df_filtered.drop_duplicates(subset='asin').copy()

In [11]:
# (rows, columns) after de-duplication; the row count should equal the number
# of distinct ASINs that were found in the metadata.
amazon_df_unique.shape

(201370, 19)

In [12]:
# Candidate label sets for zero-shot classification of product text.
# Each list is passed as `labels=` to batched_zero_shot in later cells, and the
# strings come back verbatim as the values stored in the resulting columns.

# Certification / regulatory-compliance claims.
certification_labels = [
    "FDA approved",
    "FDA certified",
    "FDA registered",
    "FDA compliant",
    "ISO certified",
    "CE marked",
    "CE certified",
    "GMP certified",
    "UL certified",
    "CPSIA compliant",
    "ASTM F963 compliant",
    # NOTE(review): probably meant "Children's"; the string is handed to the
    # classifier as written, so confirm before changing it.
    "Childrens Product Certificate (CPC)",
    "EN 71 compliant",
    "REACH compliant",
    "Declaration of Conformity (DoC)",
    "FCC compliant",
    "CCC Mark",
    "ISO 8124 compliant",
    "UKCA compliant",
    "Canada Consumer Product Safety Act (CCPSA) compliant",
    "ST Mark compliant",
    "RoHS compliant",
    "WEEE compliant"
]

# Testing and material-safety ("free of ...") claims.
non_toxic_labels = [
    "third-party tested",
    "lab tested",
    "non-toxic",
    "PVC free",
    "BPA free",
    "phthalate free",
    "lead free",
    "formaldehyde free",
    "latex free",
]

# Empty placeholder; no cell visible in this notebook reads it.
age_appropriate_labels = []

In [47]:
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import torch

# Setup
# Use the first CUDA GPU when one is available, otherwise run on CPU
# (the transformers pipeline treats device=-1 as CPU). The computed value is
# passed to the pipeline so the notebook also runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1
# Other checkpoints, left commented out:
#classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
#classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-large-mnli", device=0)
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli", device=device)


# Row-by-row zero-shot labelling; rows are visited in chunks so that the
# progress bar advances once per chunk.
def batched_zero_shot(df, text_cols=('description', 'feature'), labels=None, threshold=0.97, batch_size=16):
    """Label every row of ``df`` from two text columns via ``process_text``.

    NOTE: a later cell in this notebook defines ``batched_zero_shot`` again;
    once that cell has run, it replaces this version.

    Args:
        df: DataFrame holding the text columns.
        text_cols: Pair of column names; both are classified for every row.
        labels: Candidate labels forwarded to ``process_text``.
        threshold: Score a label must exceed to be kept.
        batch_size: Number of rows per progress-bar step. Classification
            itself still happens one text at a time.

    Returns:
        ``(labels_per_row, scores_per_row)``: two lists with one entry per row.
        The first holds the de-duplicated labels found in either column (or
        ``None`` when there are none), the second the larger of the two
        per-column scores returned by ``process_text``.
    """
    row_labels, row_scores = [], []

    for start in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        chunk = df.iloc[start:start + batch_size]

        for _, record in chunk.iterrows():
            raw_desc = record[text_cols[0]]
            raw_feat = record[text_cols[1]]

            found_desc, top_desc = process_text(raw_desc, labels, threshold)
            found_feat, top_feat = process_text(raw_feat, labels, threshold)

            # Description labels first, then feature labels; dict.fromkeys
            # removes repeats while keeping first-seen order.
            merged = list(dict.fromkeys((found_desc or []) + (found_feat or [])))
            row_labels.append(merged or None)
            row_scores.append(max(top_desc, top_feat))

    return row_labels, row_scores

# Helper function to handle single text input
def process_text(text, labels, threshold):
    """Zero-shot classify one text value and keep the confident labels.

    Args:
        text: Raw cell value: a string, a list of fragments, ``None``/NaN, or
            any other object (which is converted with ``str``).
        labels: Candidate labels passed to the global ``classifier``.
        threshold: A label is kept only if its score is strictly greater.

    Returns:
        ``(kept_labels, best_score)``. ``kept_labels`` is a list of labels
        above the threshold, or ``None`` if there are none. ``best_score`` is
        the highest score over all candidate labels. Missing or empty text, and
        texts for which classification fails, yield ``(None, 0.0)``.
    """
    # None / NaN mean "no text". Passing them through str() would produce the
    # literal words "None" / "nan" and send those to the model.
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return None, 0.0

    if isinstance(text, list):
        text = " ".join(str(x) for x in text if x)
    elif not isinstance(text, str):
        text = str(text)
    text = text.strip()

    # '[]' is what an empty list looks like once it has been stringified.
    if not text or text == '[]':
        return None, 0.0

    try:
        result = classifier(text, labels, multi_label=True)
        scores = result["scores"]
        high_conf = [label for label, score in zip(result["labels"], scores) if score > threshold]
        return (high_conf or None), max(scores)
    except Exception as exc:
        # Stay best-effort (one bad text must not abort a long run), but make
        # the failure visible instead of silently reporting "no labels".
        warnings.warn(
            f"zero-shot classification failed ({type(exc).__name__}: {exc}); "
            "treating the text as unlabelled",
            RuntimeWarning,
            stacklevel=2,
        )
        return None, 0.0


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Run the certification classifier on sample_df and store the kept labels and
# the best score per row as two new columns.
# NOTE(review): sample_df is not defined in any cell visible in this notebook,
# so this cell fails with a NameError on a fresh kernel. The recorded output
# (125 batches of 8) suggests it held about 1000 rows; confirm how it was built.
sample_df['certification_labels'], sample_df['max_cert_score'] = batched_zero_shot(
    sample_df,
    text_cols=('description', 'feature'),
    labels=certification_labels,
    threshold=0.97,
    batch_size=8
)

Processing batches:   0%|                                                                      | 0/125 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing batches: 100%|████████████████████████████████████████████████████████████| 125/125 [08:07<00:00,  3.90s/it]


In [18]:
# Coverage of certification labels in sample_df: a row counts as labelled when
# its 'certification_labels' value is truthy (None and [] both count as empty).
non_empty_cert = sample_df['certification_labels'].apply(bool).sum()
total_rows = len(sample_df)
percentage_cert = non_empty_cert / total_rows

print(f"Number of rows with non-empty certifications: {non_empty_cert} / {total_rows}")
print(f"Percentage: {percentage_cert:.2%}")

Number of rows with non-empty certifications: 62 / 1000
Percentage: 6.20%


In [64]:
def batched_zero_shot(df, text_cols=('description', 'feature'), labels=None, threshold=0.97, batch_size=16):
    """Zero-shot classify two text columns of ``df``, one chunk of rows at a time.

    Each text is normalised with ``clean_text``. Only non-empty texts are sent
    to the global ``classifier``; empty ones are treated as "no labels" without
    a model call. The raw results are interpreted with ``parse_result``.

    Args:
        df: DataFrame holding the text columns.
        text_cols: Pair of column names; both are classified for every row.
        labels: Non-empty list of candidate labels.
        threshold: Score a label must exceed to be kept.
        batch_size: Number of rows handed to the classifier per call (and per
            progress-bar step).

    Returns:
        ``(labels_per_row, scores_per_row)``: two lists with one entry per row,
        in row order. The first holds the de-duplicated labels found in either
        column (or ``None``), the second the larger of the two per-column best
        scores (``0.0`` when neither column produced a result).

    Raises:
        ValueError: If ``labels`` is empty or ``None``. Without candidate
            labels every classifier call fails, which would otherwise show up
            only as an all-``None`` result after the full run.
    """
    if not labels:
        raise ValueError("labels must be a non-empty list of candidate labels")

    all_labels = []
    all_scores = []

    for i in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        batch_df = df.iloc[i:i+batch_size]

        # Collect cleaned text for description and feature
        desc_batch = [clean_text(value) for value in batch_df[text_cols[0]]]
        feat_batch = [clean_text(value) for value in batch_df[text_cols[1]]]

        # The two columns are classified independently, so a failure in one
        # does not discard the results of the other.
        desc_results = _classify_texts(desc_batch, labels, i)
        feat_results = _classify_texts(feat_batch, labels, i)

        for desc_res, feat_res in zip(desc_results, feat_results):
            labels_d, score_d = parse_result(desc_res, labels, threshold)
            labels_f, score_f = parse_result(feat_res, labels, threshold)

            # Description labels first, then feature labels, without repeats.
            combined = list(dict.fromkeys((labels_d or []) + (labels_f or []))) or None
            all_labels.append(combined)
            all_scores.append(max(score_d, score_f))

    return all_labels, all_scores


def _classify_texts(texts, labels, batch_start):
    """Classify the non-empty entries of ``texts``; return results aligned with ``texts``.

    Entries that are empty (or the stringified empty list ``'[]'``) keep a
    ``None`` result and never reach the classifier, because a single empty
    sequence would make the whole call fail. If the classifier call itself
    fails, a warning is emitted and every entry of this chunk keeps ``None``.
    """
    results = [None] * len(texts)
    positions = [
        pos for pos, text in enumerate(texts)
        if isinstance(text, str) and text.strip() and text.strip() != '[]'
    ]
    if not positions:
        return results

    try:
        outputs = classifier([texts[pos] for pos in positions], labels, multi_label=True)
    except Exception as exc:
        warnings.warn(
            f"zero-shot classification failed for the chunk starting at row {batch_start} "
            f"({type(exc).__name__}: {exc}); its texts are left unlabelled",
            RuntimeWarning,
            stacklevel=3,
        )
        return results

    # A single-sequence call may come back as a bare dict instead of a list.
    if isinstance(outputs, dict):
        outputs = [outputs]
    for pos, output in zip(positions, outputs):
        results[pos] = output
    return results


def clean_text(text):
    """Normalise a raw cell value to a stripped string.

    Args:
        text: A string, a list of fragments, ``None``/NaN, or any other object.

    Returns:
        The stripped string. Lists are joined with single spaces after their
        falsy items are dropped. ``None`` and NaN give ``""`` rather than the
        literal words "None" / "nan". Any other object is converted with
        ``str``.
    """
    # Missing values carry no text; str() would turn them into real words.
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    if isinstance(text, list):
        return " ".join(str(x) for x in text if x).strip()
    if not isinstance(text, str):
        return str(text).strip()
    return text.strip()


def parse_result(result, labels, threshold):
    """Extract the confident labels and the best score from one classifier result.

    Args:
        result: A result dict with parallel ``"labels"`` and ``"scores"``
            entries, or anything else (``None``, empty, non-dict) to signal
            that no result is available.
        labels: Unused; kept so the signature matches the call sites.
        threshold: A label is kept only if its score is strictly greater.

    Returns:
        ``(kept_labels, best_score)``. ``kept_labels`` is a list, or ``None``
        when no label passes. ``best_score`` is the maximum of all scores.
        A missing result yields ``(None, 0.0)``.
    """
    usable = isinstance(result, dict) and bool(result)
    if not usable:
        return None, 0.0

    confidences = result["scores"]
    kept = []
    for name, confidence in zip(result["labels"], confidences):
        if confidence > threshold:
            kept.append(name)

    best = max(confidences)
    return (kept or None), best


In [None]:
# Label every row of amazon_df_unique with certification claims and store the
# kept labels and the best score per row as two new columns.
# Long-running: the recorded progress output shows about 2.5 s per batch of 32
# and several hours in total. The columns are only assigned once the whole call
# returns, so an interrupted run leaves the frame unchanged.
amazon_df_unique['certification_labels'], amazon_df_unique['max_cert_score'] = batched_zero_shot(
    amazon_df_unique,
    text_cols=('description', 'feature'),
    labels=certification_labels,
    threshold=0.97,
    batch_size=32
)

Processing batches:  17%|█████████                                             | 1058/6293 [1:11:14<3:43:11,  2.56s/it]

In [None]:
# Coverage of certification labels in amazon_df_unique: a row counts as
# labelled when its 'certification_labels' value is truthy (None and [] both
# count as empty).
non_empty_cert = amazon_df_unique['certification_labels'].apply(bool).sum()
total_rows = len(amazon_df_unique)
percentage_cert = non_empty_cert / total_rows

print(f"Number of rows with non-empty certifications: {non_empty_cert} / {total_rows}")
print(f"Percentage: {percentage_cert:.2%}")

In [None]:
# Frequency of each distinct label combination.
# The column holds Python lists, which are unhashable, so calling
# value_counts() on it directly raises TypeError. Each list is converted to a
# sorted tuple first, so the same labels in a different order are counted
# together. Rows without labels (None) are dropped, as value_counts() would do.
(
    amazon_df_unique['certification_labels']
    .dropna()
    .map(lambda found: tuple(sorted(found)))
    .value_counts()
)

In [None]:
# Save the frame, including any label/score columns added so far, as a pickle.
# Only load pickle files that come from a trusted source.
amazon_df_unique.to_pickle("Data/amazon_df_cert_features2.pkl")

In [None]:
# Label every row of amazon_df_unique with the non-toxic / material-safety
# claims and store the kept labels and the best score per row as new columns.
# Same frame, text columns, threshold and batch size as the certification run.
amazon_df_unique['non_toxic_labels'], amazon_df_unique['max_non_toxic_score'] = batched_zero_shot(
    amazon_df_unique,
    text_cols=('description', 'feature'),
    labels=non_toxic_labels,
    threshold=0.97,
    batch_size=32
)

In [None]:
# Coverage of non-toxic labels. The 'non_toxic_labels' column is written to
# amazon_df_unique, so the summary reads from that frame; sample_df never
# receives this column in the cells shown.

# Total number of rows
total_rows = len(amazon_df_unique)

# Number of rows with non-empty labels (None and [] both count as empty)
non_empty_tox = amazon_df_unique['non_toxic_labels'].apply(lambda x: bool(x)).sum()

# Percentage
percentage_tox = non_empty_tox / total_rows

print(f"Number of rows with non-empty non toxic labels: {non_empty_tox} / {total_rows}")
print(f"Percentage: {percentage_tox:.2%}")