In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Install required packages
!pip install -q rapidfuzz xgboost joblib

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/3.1 MB[0m [31m36.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m55.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os, re, unicodedata, math
from pathlib import Path
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, classification_report, confusion_matrix, brier_score_loss
from scipy.sparse import hstack, csr_matrix
import matplotlib.pyplot as plt
import joblib
pd.set_option('display.max_colwidth', 200)
RND = 42

In [4]:
import shutil, datetime, os
src = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/processed"
dst = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/processed_backup_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
if os.path.exists(src):
    shutil.copytree(src, dst)
    print("Backed up processed ->", dst)
else:
    print("No existing processed folder to backup.")


Backed up processed -> /content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/processed_backup_20250808_195254


In [5]:
# Cell 1: Inspect top flagged ingredients occurrences (why so many positives)
import pandas as pd
from pathlib import Path
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
processed = Path(base) / "processed"

chem = pd.read_csv(processed / "chemical_labels_from_main.csv", low_memory=False)
main_dedup = pd.read_csv(processed / "main_deduped_rows.csv", low_memory=False)

# Count occurrences of each normalized ingredient and whether it's flagged
counts = main_dedup.groupby(['_chem_norm']).size().reset_index(name='occurrences')
flags = chem[['chemical_name_norm','flag_any']].drop_duplicates().rename(columns={'chemical_name_norm':'_chem_norm','flag_any':'is_flagged'})
merged = counts.merge(flags, on='_chem_norm', how='left').fillna({'is_flagged':False})
merged = merged.sort_values(['is_flagged','occurrences'], ascending=[False,False])
display(merged.head(50))
# Save for inspection
merged.to_csv(processed / "ingredient_flag_occurrence_summary.csv", index=False)
print("Saved ingredient_flag_occurrence_summary.csv")


Unnamed: 0,_chem_norm,occurrences,is_flagged
117,titanium dioxide,30123,True
109,"silica, crystalline (airborne particles of respirable size)",1282,True
40,cocamide diethanolamine,864,True
103,"retinol/retinyl esters, when in daily dosages in excess of 10,000 iu, or 3,000 retinol equivalents.",716,True
30,carbon black,696,True
78,mica,603,True
113,talc,533,True
106,retinyl palmitate,493,True
125,vitamin a palmitate,401,True
120,trade secret,399,True


Saved ingredient_flag_occurrence_summary.csv


In [6]:
# Cell 2: Build conservative flagged set using IARC (Group classification required)
# Note: you must check your IARC CSV for a column that indicates group (e.g., "Classification" or "Group")
import pandas as pd
from pathlib import Path
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
paths = Path(base)
iarc = pd.read_csv(paths / "List of Classifications – IARC Monographs on the Identification of Carcinogenic Hazards to Humans.csv", low_memory=False)

# Heuristic: try to find column with 'Group' or 'Classification' in name
group_col = None
for c in iarc.columns:
    if 'group' in c.lower() or 'classification' in c.lower() or 'class' in c.lower():
        group_col = c
        break
print("IARC group column detected (if any):", group_col)
# If group_col found, select CASs where group indicates 1, 2A, or 2B
def extract_cas_from_str(s):
    import re
    m = re.search(r'(\d{1,7}-\d{1,2}-\d)', str(s))
    return m.group(1) if m else ''

iarc['_cas'] = ''
# Try common CAS columns as fallback
possible_cas = [c for c in iarc.columns if 'cas' in c.lower() or 'casrn' in c.lower() or 'rn'==c.lower()]
if possible_cas:
    iarc['_cas'] = iarc[possible_cas[0]].astype(str).apply(lambda s: extract_cas_from_str(s))
else:
    # try to parse from any text column
    iarc['_cas'] = iarc.astype(str).apply(lambda row: ''.join(row.values), axis=1).apply(extract_cas_from_str)

# choose groups matching '1' '2A' '2B' if group_col exists
iarc_cas_strict = set()
if group_col:
    def group_match(x):
        s = str(x).lower()
        return ('group 1' in s) or ('group1' in s) or ('2a' in s) or ('2b' in s) or ('2-a' in s) or ('2-b' in s)
    mask = iarc[group_col].apply(group_match)
    iarc_cas_strict = set(iarc.loc[mask,'_cas'].dropna().unique())
    print("IARC strict CAS count:", len(iarc_cas_strict))
else:
    print("No group/class column found in IARC file; please open the CSV and tell me which column indicates the classification.")
    # fallback (do not proceed without inspection)


IARC group column detected (if any): Group
IARC strict CAS count: 363


In [7]:
# Cell 3: recompute product labels using the conservative iarc_cas_strict (if non-empty)
import pandas as pd
from pathlib import Path
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
processed = Path(base)/"processed"
main = pd.read_csv(processed / "main_deduped_rows.csv", low_memory=False)
chem_labels = pd.read_csv(processed / "chemical_labels_from_main.csv", low_memory=False)

# get conservative CAS set (from previous cell)
# If you ran the previous cell, you should have variable iarc_cas_strict in memory.
try:
    iarc_cas = iarc_cas_strict
except NameError:
    # fallback: build set from chem_labels where iarc_cas column was true
    if 'iarc_cas' in chem_labels.columns:
        iarc_cas = set(chem_labels.loc[chem_labels['iarc_cas']==True,'cas'].dropna().astype(str).unique())
    else:
        iarc_cas = set()
print("Conservative IARC CAS count (used):", len(iarc_cas))

# flag an ingredient if its CAS is in iarc_cas (CAS match only) — strict
main['_cas_norm'] = main['_cas'].astype(str).apply(lambda s: s.strip())
main['_ing_flag_strict'] = main['_cas_norm'].apply(lambda c: (c in iarc_cas) if c and c.strip()!='' else False)

# product-level agg (unique ingredients)
prod_records = []
for pid, g in main.groupby('_product_id'):
    unique_pairs = g.drop_duplicates(subset=['_chem_norm','_cas'])
    flagged_unique_count = int(unique_pairs['_cas'].apply(lambda c: (str(c) in iarc_cas) if c and str(c).strip()!='' else False).sum())
    total_unique = int(unique_pairs['_chem_norm'].nunique())
    prod_records.append({
        '_product_id': pid,
        'product_name': g['_product_name'].iloc[0],
        'brand': g['_brand'].iloc[0],
        'unique_num_ingredients': total_unique,
        'unique_flagged_count_strict': flagged_unique_count,
        'unique_flagged_fraction_strict': flagged_unique_count / total_unique if total_unique>0 else 0.0
    })

prod_strict = pd.DataFrame(prod_records)
prod_strict['product_label_strict'] = (prod_strict['unique_flagged_count_strict'] > 0).astype(int)
prod_strict.to_csv(processed / "product_level_flags_strict_iarc.csv", index=False)
print("Saved product_level_flags_strict_iarc.csv")
print("Total products:", len(prod_strict))
print("Strict positives:", prod_strict['product_label_strict'].sum(), f"({prod_strict['product_label_strict'].mean()*100:.4f}%)")
display(prod_strict['unique_flagged_count_strict'].describe())


Conservative IARC CAS count (used): 363
Saved product_level_flags_strict_iarc.csv
Total products: 34751
Strict positives: 31543 (90.7686%)


Unnamed: 0,unique_flagged_count_strict
count,34751.0
mean,0.948577
std,0.366084
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,5.0


In [8]:
# Cell 4: Create a balanced test set (if needed) and train a text-only model (honest)
import pandas as pd, joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix

base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
processed = Path(base)/"processed"
pf = pd.read_csv(processed / "product_level_flags_strict_iarc.csv")
pf['product_text'] = (pf['product_name'].fillna('') + " " + pf['brand'].fillna('')).str.strip()
y = pf['product_label_strict'].values

# If positives are extremely rare (<1%), create a balanced test by subsampling positives and negatives for evaluation
pos_idx = pf[pf['product_label_strict']==1].index.tolist()
neg_idx = pf[pf['product_label_strict']==0].index.tolist()
print("Positives:", len(pos_idx), "Negatives:", len(neg_idx))
# If positives less than 1% of dataset, sample negatives to create a balanced test set
if len(pos_idx) < 1000:
    # train/test split with stratify, then create balanced test by up/down sampling
    # First a stratified 80/20 split
    X = pf['product_text'].values
    X_tr_idx, X_te_idx = train_test_split(pf.index.values, test_size=0.20, random_state=42, stratify=y)
    # Build a balanced test sample with up to 2000 items
    test_pos = [i for i in X_te_idx if pf.loc[i,'product_label_strict']==1]
    test_neg = [i for i in X_te_idx if pf.loc[i,'product_label_strict']==0]
    # sample negatives to equal positives (or cap)
    import numpy as np
    npos = len(test_pos)
    nneg = min(len(test_neg), max(npos, 1000))  # ensure at least 1000 negatives if possible
    sampled_neg = np.random.RandomState(42).choice(test_neg, size=nneg, replace=False).tolist()
    balanced_test_idx = test_pos + sampled_neg
    print("Balanced test size:", len(balanced_test_idx), "positives:", len(test_pos), "negatives:", len(sampled_neg))
else:
    # if many positives, standard stratified split is fine
    X_tr_idx, X_te_idx = train_test_split(pf.index.values, test_size=0.20, random_state=42, stratify=y)
    balanced_test_idx = X_te_idx

# Fit TF-IDF on training product_text
train_idx = [i for i in X_tr_idx if i not in balanced_test_idx]  # ensure disjoint
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1,2), min_df=2)
tfidf.fit(pf.loc[train_idx,'product_text'].values)
X_test = tfidf.transform(pf.loc[balanced_test_idx,'product_text'].values)
y_test = pf.loc[balanced_test_idx,'product_label_strict'].values

# Train a simple logistic on training set (text-only)
X_train = tfidf.transform(pf.loc[train_idx,'product_text'].values)
y_train = pf.loc[train_idx,'product_label_strict'].values
clf = LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga', random_state=42)
clf.fit(X_train, y_train)

probs = clf.predict_proba(X_test)[:,1]
print("Evaluation on balanced test (honest):")
print("AUROC:", roc_auc_score(y_test, probs) if len(set(y_test))>1 else 'n/a')
print("AUPRC:", average_precision_score(y_test, probs) if len(set(y_test))>1 else 'n/a')
pred = (probs >= 0.5).astype(int)
print("Confusion:\n", confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, zero_division=0))

# Save vectorizer and model
joblib.dump(tfidf, processed / "../trained_models/tfidf_strict_iarc.joblib")
joblib.dump(clf, processed / "../trained_models/clf_lr_text_strict_iarc.joblib")
print("Saved TFIDF and logistic for strict labeling")


Positives: 31543 Negatives: 3208
Evaluation on balanced test (honest):
AUROC: 0.9239362350871944
AUPRC: 0.9902856618769977
Confusion:
 [[ 522  120]
 [ 716 5593]]
              precision    recall  f1-score   support

           0       0.42      0.81      0.56       642
           1       0.98      0.89      0.93      6309

    accuracy                           0.88      6951
   macro avg       0.70      0.85      0.74      6951
weighted avg       0.93      0.88      0.90      6951

Saved TFIDF and logistic for strict labeling


In [9]:
# A: Count how often normalized ingredient tokens appear inside product_name
import pandas as pd, re
from pathlib import Path
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
proc = Path(base) / "processed"
pf = pd.read_csv(proc / "product_level_flags_strict_iarc.csv", low_memory=False)
main_dedup = pd.read_csv(proc / "main_deduped_rows.csv", low_memory=False)
# Build set of normalized ingredient names (only those with length>3 to avoid tiny tokens)
ing_counts = main_dedup['_chem_norm'].value_counts()
common_ings = set(ing_counts[ing_counts >= 5].index.tolist())  # ingredients appearing ≥5 times
len_common = len(common_ings)
print("Number of common ingredient normalized names (≥5 occurrences):", len_common)

# lower-product-name text
pf['product_name_norm'] = pf['product_name'].fillna('').astype(str).str.lower()

# function to check presence of any ingredient token in the product name
def contains_any_ingredient(prodname, ing_set, max_check=2000):
    # For speed, only check top-N ingredients
    for ing in list(ing_set)[:max_check]:
        if ing and len(ing) > 3 and ing in prodname:
            return True
    return False

# compute fraction of product names that contain any common ingredient token
pf['contains_ing_token'] = pf['product_name_norm'].apply(lambda s: contains_any_ingredient(s, common_ings, max_check=2000))
frac = pf['contains_ing_token'].mean()
print(f"Fraction of products whose name contains a common ingredient token (approx): {frac:.4%}")
# Save some examples
pf[pf['contains_ing_token']].sample(20)[['product_name','brand']].to_csv(proc / "examples_product_names_with_ingredient_tokens.csv", index=False)
print("Saved examples ->", proc / "examples_product_names_with_ingredient_tokens.csv")


Number of common ingredient normalized names (≥5 occurrences): 75
Fraction of products whose name contains a common ingredient token (approx): 0.7683%
Saved examples -> /content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/processed/examples_product_names_with_ingredient_tokens.csv


In [10]:
# B: remove ingredient normalized names from product_text and retrain text-only model on balanced test
import pandas as pd, joblib, re
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
proc = Path(base) / "processed"

pf = pd.read_csv(proc / "product_level_flags_strict_iarc.csv", low_memory=False)
main_dedup = pd.read_csv(proc / "main_deduped_rows.csv", low_memory=False)

# prepare mask list: normalized ingredient tokens (top K)
ing_counts = main_dedup['_chem_norm'].value_counts()
top_ing_tokens = [t for t in ing_counts.index if isinstance(t, str) and len(t) > 3][:2000]

def mask_product_name(name, tokens):
    s = (name or "").lower()
    for t in tokens:
        if t and len(t)>3 and t in s:
            s = s.replace(t, " ")
    s = re.sub(r'\s+', ' ', s).strip()
    return s

pf['product_text_masked'] = pf['product_name'].fillna('').astype(str).apply(lambda s: mask_product_name(s, top_ing_tokens)) + " " + pf['brand'].fillna('').astype(str)
pf['product_text_unmasked'] = (pf['product_name'].fillna('') + " " + pf['brand'].fillna('')).str.strip()

# build balanced test same method as before
y = pf['product_label_strict'].values
X_tr_idx, X_te_idx = train_test_split(pf.index.values, test_size=0.20, random_state=42, stratify=y)
# form balanced_test_idx like before (only if positives are rare)
pos_in_test = [i for i in X_te_idx if pf.loc[i,'product_label_strict']==1]
neg_in_test = [i for i in X_te_idx if pf.loc[i,'product_label_strict']==0]
import numpy as np
npos = len(pos_in_test)
nneg = min(len(neg_in_test), max(npos, 1000))
sampled_neg = np.random.RandomState(42).choice(neg_in_test, size=nneg, replace=False).tolist()
balanced_test_idx = pos_in_test + sampled_neg

train_idx = [i for i in X_tr_idx if i not in balanced_test_idx]
# fit tfidf and logistic on masked text
tfidf_mask = TfidfVectorizer(max_features=2000, ngram_range=(1,2), min_df=2)
X_train = tfidf_mask.fit_transform(pf.loc[train_idx,'product_text_masked'].values)
X_test = tfidf_mask.transform(pf.loc[balanced_test_idx,'product_text_masked'].values)
y_train = pf.loc[train_idx,'product_label_strict'].values
y_test = pf.loc[balanced_test_idx,'product_label_strict'].values

clf_mask = LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga', random_state=42)
clf_mask.fit(X_train, y_train)
probs_mask = clf_mask.predict_proba(X_test)[:,1]

print("Masked-text logistic evaluation:")
print("AUROC:", roc_auc_score(y_test, probs_mask) if len(set(y_test))>1 else 'n/a')
print("AUPRC:", average_precision_score(y_test, probs_mask) if len(set(y_test))>1 else 'n/a')
pred = (probs_mask >= 0.5).astype(int)
print("Confusion:\n", confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, zero_division=0))
# Save artifacts
joblib.dump(tfidf_mask, proc.parent / "trained_models" / "tfidf_masked.joblib")
joblib.dump(clf_mask, proc.parent / "trained_models" / "clf_masked_lr.joblib")
print("Saved masked models")


Masked-text logistic evaluation:
AUROC: 0.9242450951491442
AUPRC: 0.9902886497204975
Confusion:
 [[ 525  117]
 [ 731 5578]]
              precision    recall  f1-score   support

           0       0.42      0.82      0.55       642
           1       0.98      0.88      0.93      6309

    accuracy                           0.88      6951
   macro avg       0.70      0.85      0.74      6951
weighted avg       0.93      0.88      0.89      6951

Saved masked models


In [11]:
# C: run brand-holdout split and evaluate the masked and unmasked text models
import pandas as pd, joblib
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
proc = Path(base) / "processed"
pf = pd.read_csv(proc / "product_level_flags_strict_iarc.csv", low_memory=False)

# choose brands until roughly 20% of products are in test set
import numpy as np
unique_brands = pf['brand'].fillna('').unique().tolist()
np.random.seed(42); np.random.shuffle(unique_brands)
acc = 0; selected=[]
total = len(pf)
for b in unique_brands:
    selected.append(b)
    acc += (pf['brand'].fillna('')==b).sum()
    if acc/total >= 0.20:
        break
test_mask = pf['brand'].isin(selected)
train_mask = ~test_mask
print("Brand-holdout test brands sample:", selected[:10], "...")

# load models (masked/unmasked)
tfidf = joblib.load(proc.parent / "trained_models" / "tfidf_textonly.joblib")
clf_xgb_text = joblib.load(proc.parent / "trained_models" / "clf_xgb_textonly.joblib")
tfidf_mask = joblib.load(proc.parent / "trained_models" / "tfidf_masked.joblib")
clf_mask = joblib.load(proc.parent / "trained_models" / "clf_masked_lr.joblib")

# evaluate unmasked text model
X_test_un = tfidf.transform(pf.loc[test_mask,'product_name'].fillna('') + " " + pf.loc[test_mask,'brand'].fillna(''))
y_test = pf.loc[test_mask, 'product_label_strict'].values
probs_un = clf_xgb_text.predict_proba(X_test_un)[:,1]
print("Unmasked model on brand-holdout:")
print("AUROC:", roc_auc_score(y_test, probs_un))
print("AUPRC:", average_precision_score(y_test, probs_un))

# masked model on brand holdout
X_test_mask = tfidf_mask.transform(pf.loc[test_mask,'product_name'].fillna('').apply(lambda s: s.lower()).values)  # we used masked pipeline earlier; ensure consistent
probs_mask = clf_mask.predict_proba(X_test_mask)[:,1]
print("Masked model on brand-holdout:")
print("AUROC:", roc_auc_score(y_test, probs_mask))
print("AUPRC:", average_precision_score(y_test, probs_mask))


Brand-holdout test brands sample: ['CND', 'John Paul Mitchell Systems', 'Jurlique', 'PS', 'Pure Nautica Discovery', 'VICHY', 'Erbaviva', 'Camouflage Gel', 'Nicki Minaj', 'Nail Magic'] ...
Unmasked model on brand-holdout:
AUROC: 0.5964900249919451
AUPRC: 0.9266387108588724
Masked model on brand-holdout:
AUROC: 0.8762727526050758
AUPRC: 0.9821894342652524


In [14]:
# Permutation importance — robust full cell
import os, joblib, re, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
from scipy.sparse import issparse

base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
proc = Path(base) / "processed"
models_dir = Path(base) / "trained_models"

# 1) Load product-level strict labels
pf_path = proc / "product_level_flags_strict_iarc.csv"
assert pf_path.exists(), f"Missing {pf_path}. Run strict-label generation first."
pf = pd.read_csv(pf_path, low_memory=False)

# 2) Check for masked column; if missing, rebuild it the same way used earlier
if 'product_text_masked' not in pf.columns:
    print("product_text_masked not found — recreating it now (masking top ingredient tokens).")
    # load main dedup to get ingredient norms
    main_dedup_path = proc / "main_deduped_rows.csv"
    assert main_dedup_path.exists(), f"Missing {main_dedup_path} — please run dedupe step first."
    main_dedup = pd.read_csv(main_dedup_path, low_memory=False)
    # get top ingredient tokens (same heuristic as before)
    ing_counts = main_dedup['_chem_norm'].value_counts()
    top_ing_tokens = [t for t in ing_counts.index if isinstance(t, str) and len(t) > 3][:2000]
    print(f"Using top {len(top_ing_tokens)} ingredient tokens for masking.")
    # simple masker
    def mask_product_name(name, tokens):
        s = (name or "").lower()
        for t in tokens:
            if t and len(t) > 3 and t in s:
                s = s.replace(t, " ")
        s = re.sub(r'\s+', ' ', s).strip()
        return s
    pf['product_text_masked'] = pf['product_name'].fillna('').astype(str).apply(lambda s: mask_product_name(s, top_ing_tokens)) + " " + pf['brand'].fillna('').astype(str)
    # optionally save back to disk for future cells
    pf.to_csv(pf_path, index=False)
    print("Recreated and saved product_text_masked into", pf_path)
else:
    print("product_text_masked exists in DataFrame. Proceeding.")

# 3) Load masked TF-IDF and masked classifier if available; else fit small ones
tfidf_mask_path = models_dir / "tfidf_masked.joblib"
clf_mask_path = models_dir / "clf_masked_lr.joblib"
if tfidf_mask_path.exists() and clf_mask_path.exists():
    print("Loading saved masked TF-IDF and classifier.")
    tfidf_mask = joblib.load(tfidf_mask_path)
    clf_mask = joblib.load(clf_mask_path)
else:
    print("Masked artifacts not found in trained_models. Fitting small masked TF-IDF + LR now (this may take a bit).")
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    # build a stratified train/test (20% test)
    y = pf['product_label_strict'].values
    train_idx, val_idx = train_test_split(pf.index.values, test_size=0.20, random_state=42, stratify=y)
    tfidf_mask = TfidfVectorizer(max_features=2000, ngram_range=(1,2), min_df=2)
    X_train_texts = pf.loc[train_idx, 'product_text_masked'].values
    tfidf_mask.fit(X_train_texts)
    clf_mask = LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga', random_state=42)
    X_tr = tfidf_mask.transform(X_train_texts)
    clf_mask.fit(X_tr, pf.loc[train_idx, 'product_label_strict'].values)
    # save artifacts
    models_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(tfidf_mask, tfidf_mask_path)
    joblib.dump(clf_mask, clf_mask_path)
    print("Fitted and saved masked TF-IDF and classifier to", models_dir)

# 4) Build validation set for permutation importance (use held-out stratified 20% or balanced selection)
y = pf['product_label_strict'].values
idx_tr, idx_te = train_test_split(pf.index.values, test_size=0.20, random_state=42, stratify=y)
# Use idx_te as validation (if extremely imbalanced, you may want to create a balanced subset instead)
valid_idx = idx_te
X_val = tfidf_mask.transform(pf.loc[valid_idx, 'product_text_masked'].values)
y_val = pf.loc[valid_idx, 'product_label_strict'].values

print("Validation size:", X_val.shape, "Positives:", int(y_val.sum()), "Negatives:", len(y_val)-int(y_val.sum()))

# 5) Run permutation importance
print("Running permutation importance (n_repeats=10). This may take ~1-3 minutes depending on CPU.")
# Convert sparse matrix to dense array for permutation_importance
X_val_dense = X_val.toarray()

res = permutation_importance(clf_mask, X_val_dense, y_val, n_repeats=10, random_state=42, n_jobs=2)
importances = res.importances_mean
tokens = tfidf_mask.get_feature_names_out()
# get top K important tokens
K = 40
top_idx = np.argsort(importances)[-K:][::-1]
imp_df = pd.DataFrame({
    'token': tokens[top_idx],
    'importance_mean': importances[top_idx],
    'importance_std': res.importances_std[top_idx]
})
print("\nTop tokens by permutation importance (token -> importance_mean):")
pd.set_option('display.max_rows', None)
display(imp_df.head(K))
# save to disk
outp = proc / "permutation_importance_masked_top_tokens.csv"
imp_df.to_csv(outp, index=False)
print("Saved permutation importance to:", outp)

# quick heuristic: how many of top tokens look like ingredient names
top_tokens_list = imp_df['token'].tolist()
ingredient_like = [t for t in top_tokens_list if any(len(tk)>3 and tk in t for tk in list(set(main_dedup['_chem_norm'].values) ) )]
print(f"\nOf top {K} tokens, approx {len(ingredient_like)} tokens match ingredient-normalized strings (rough heuristic).")

product_text_masked exists in DataFrame. Proceeding.
Loading saved masked TF-IDF and classifier.
Validation size: (6951, 2000) Positives: 6309 Negatives: 642
Running permutation importance (n_repeats=10). This may take ~1-3 minutes depending on CPU.

Top tokens by permutation importance (token -> importance_mean):


Unnamed: 0,token,importance_mean,importance_std
0,bath body,0.003496,0.000412
1,lumene,0.002892,0.000291
2,chi,0.002719,0.000276
3,entity,0.00246,0.000452
4,olay,0.002316,0.000318
5,avon,0.001856,0.000512
6,bioglitz,0.001798,0.000196
7,shampoo,0.001554,0.000831
8,cr me,0.001424,0.000163
9,gelish,0.001381,0.000418


Saved permutation importance to: /content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/processed/permutation_importance_masked_top_tokens.csv

Of top 40 tokens, approx 0 tokens match ingredient-normalized strings (rough heuristic).


In [15]:
# E: save a manual-review pack (50 positives and 50 negatives) for human inspection
import pandas as pd, random
from pathlib import Path
base = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics"
proc = Path(base) / "processed"
pf = pd.read_csv(proc / "product_level_flags_strict_iarc.csv", low_memory=False)

pos = pf[pf['product_label_strict']==1].sample(min(50, pf['product_label_strict'].sum()), random_state=42)
neg = pf[pf['product_label_strict']==0].sample(min(50, (len(pf)-pf['product_label_strict'].sum())), random_state=42)
sample_pack = pd.concat([pos, neg])
sample_pack.to_csv(proc / "manual_review_sample_100.csv", index=False)
print("Saved manual review sample ->", proc / "manual_review_sample_100.csv")


Saved manual review sample -> /content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/processed/manual_review_sample_100.csv
