In [None]:
# Cell 1 (updated): install packages with robust RDKit fallback (Colab)
# Try stable RDKit first; if it fails, try an alternate available version; continue regardless.
import subprocess, sys, os

def run(cmd):
    print(f"\n>>> Running: {cmd}")
    r = subprocess.run(cmd, shell=True)
    return r.returncode

# Downgrade numpy to a compatible version
run("pip install --quiet numpy<2")

# Try a stable rdkit-pypi wheel first (available on PyPI)
if run("pip install --quiet rdkit-pypi==2022.9.5") != 0:
    print("rdkit-pypi==2022.9.5 failed, trying an alternate wheel...")
    # alternate available version (beta) — try it if the stable one fails
    run("pip install --quiet rdkit-pypi==2023.3.1b1 || true")

# Install the rest (quiet)
run("pip install --quiet iterstrat imbalanced-learn xgboost optuna node2vec pubchempy shap joblib")

# Quick import checks and summary
print('\n=== Import checks ===')
try:
    import rdkit
    print("RDKit import: OK (version {})".format(rdkit.__version__))
except Exception as e:
    print("RDKit import: FAILED. You can still continue, but chemical fingerprinting will be disabled.")
    print("If you need RDKit, consider installing via conda (recommended) or uploading a precomputed cas_smiles.csv.")
try:
    import iterstrat, imblearn, xgboost, optuna, node2vec, pubchempy, shap, joblib
    print("Other packages imported OK.")
except Exception as e:
    print("Warning: one or more packages failed to import. Check the cell output above for pip errors.")


>>> Running: pip install --quiet numpy<2

>>> Running: pip install --quiet rdkit-pypi==2022.9.5

>>> Running: pip install --quiet iterstrat imbalanced-learn xgboost optuna node2vec pubchempy shap joblib

=== Import checks ===
RDKit import: OK (version 2022.09.5)


In [None]:
# Run this cell to detect & install any missing packages, then re-check imports
import importlib, subprocess, sys, traceback

# map pip package name -> python import name
pkg_map = {
    'iterstrat': 'iterstrat',
    'imbalanced-learn': 'imblearn',
    'xgboost': 'xgboost',
    'optuna': 'optuna',
    'node2vec': 'node2vec',
    'pubchempy': 'pubchempy',
    'shap': 'shap',
    'joblib': 'joblib'
}

missing_pip = []
failed_imports = {}

print("Checking imports...")
for pip_name, import_name in pkg_map.items():
    try:
        importlib.import_module(import_name)
        print(f"OK: {import_name}")
    except Exception as e:
        print(f"MISSING: {import_name} (will try to install {pip_name})")
        missing_pip.append(pip_name)
        failed_imports[import_name] = str(e)

if missing_pip:
    print("\nInstalling missing packages (this may take a minute)...")
    # install all missing packages at once
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet"] + missing_pip)
    except subprocess.CalledProcessError:
        print("Some installs failed. Attempting installs one-by-one for better diagnostics...")
        for p in missing_pip:
            try:
                print(f"Installing {p} ...")
                subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", p])
            except subprocess.CalledProcessError as e:
                print(f"Failed to install {p}: {e}")

print("\nRe-checking imports after installation...")
all_ok = True
for pip_name, import_name in pkg_map.items():
    try:
        importlib.import_module(import_name)
        print(f"OK: {import_name}")
    except Exception as e:
        all_ok = False
        print(f"FAILED: {import_name} -> {e}")
        traceback.print_exc()

if all_ok:
    print("\nAll packages imported successfully. You're good to continue to the next cell.")
else:
    print("\nSome packages still failed to import.")
    print("If failures persist, try restarting the runtime (Runtime -> Restart runtime), then re-run this cell once and continue.")
    print("If you still see errors, paste the failed import messages here and I will help resolve them.")


Checking imports...
MISSING: iterstrat (will try to install iterstrat)
OK: imblearn
OK: xgboost
MISSING: optuna (will try to install optuna)
MISSING: node2vec (will try to install node2vec)
MISSING: pubchempy (will try to install pubchempy)
OK: shap
OK: joblib

Installing missing packages (this may take a minute)...
Some installs failed. Attempting installs one-by-one for better diagnostics...
Installing iterstrat ...
Failed to install iterstrat: Command '['/usr/bin/python3', '-m', 'pip', 'install', '--quiet', 'iterstrat']' returned non-zero exit status 1.
Installing optuna ...
Installing node2vec ...
Installing pubchempy ...

Re-checking imports after installation...
FAILED: iterstrat -> No module named 'iterstrat'
OK: imblearn
OK: xgboost
OK: optuna


Traceback (most recent call last):
  File "/tmp/ipython-input-1565019117.py", line 47, in <cell line: 0>
    importlib.import_module(import_name)
  File "/usr/lib/python3.11/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1204, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1140, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'iterstrat'


FAILED: node2vec -> numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
OK: pubchempy
OK: shap
OK: joblib

Some packages still failed to import.
If failures persist, try restarting the runtime (Runtime -> Restart runtime), then re-run this cell once and continue.
If you still see errors, paste the failed import messages here and I will help resolve them.


Traceback (most recent call last):
  File "/tmp/ipython-input-1565019117.py", line 47, in <cell line: 0>
    importlib.import_module(import_name)
  File "/usr/lib/python3.11/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1204, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/usr/local/lib/python3.11/dist-packages/node2vec/__init__.py", line 1, in <module>
    from . import edges
  File "/usr/local/lib/python3.11/dist-packages/node2vec/edges.py", line 7, in <module>
    from g

In [None]:
# Fix installs for gensim/node2vec/iterative-stratification and ensure numpy ABI compatibility.
# Run this whole block in one Colab cell.
import subprocess, sys

def run(cmd):
    print(f"\n>>> Running: {cmd}")
    r = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    print(r.stdout)
    if r.stderr:
        print("STDERR:\n", r.stderr)
    return r.returncode

# 1) Uninstall problematic packages to clear any broken binaries (including numpy to allow pip to manage it)
run("pip uninstall -y gensim node2vec numpy iterative-stratification rdkit-pypi || true")

# 2) Install a specific, commonly compatible NumPy version
run("pip install --quiet numpy==1.23.4")

# 3) Install rdkit-pypi
if run("pip install --quiet rdkit-pypi==2022.9.5") != 0:
    print("rdkit-pypi==2022.9.5 failed, trying an alternate wheel...")
    # alternate available version (beta) — try it if the stable one fails
    run("pip install --quiet rdkit-pypi==2023.3.1b1 || true")

# 4) Install other packages, including those that failed before
run("pip install --quiet iterative-stratification optuna pubchempy gensim node2vec imbalanced-learn xgboost shap joblib")


print("\n=== Reinstall complete ===")


>>> Running: pip uninstall -y gensim node2vec numpy iterative-stratification rdkit-pypi || true
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Found existing installation: node2vec 0.5.0
Uninstalling node2vec-0.5.0:
  Successfully uninstalled node2vec-0.5.0
Found existing installation: numpy 1.24.3
Uninstalling numpy-1.24.3:
  Successfully uninstalled numpy-1.24.3
Found existing installation: iterative-stratification 0.1.9
Uninstalling iterative-stratification-0.1.9:
  Successfully uninstalled iterative-stratification-0.1.9
Found existing installation: rdkit-pypi 2022.9.5
Uninstalling rdkit-pypi-2022.9.5:
  Successfully uninstalled rdkit-pypi-2022.9.5


>>> Running: pip install --quiet numpy==1.23.4
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 100.8 MB/s eta 0:00:00

STDERR:
 ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of

In [1]:
import importlib, traceback

packages = {
    'iterative-stratification': 'iterstrat',  # pip name -> import name alias we'll check
    'imbalanced-learn': 'imblearn',
    'xgboost': 'xgboost',
    'optuna': 'optuna',
    'node2vec': 'node2vec',
    'pubchempy': 'pubchempy',
    'shap': 'shap',
    'joblib': 'joblib',
    'rdkit-pypi': 'rdkit'
}

ok = True
for pip_name, import_name in packages.items():
    try:
        importlib.import_module(import_name)
        print(f"OK: {import_name}")
    except Exception as e:
        ok = False
        print(f"FAILED to import {import_name}: {e}")
        traceback.print_exc()

if ok:
    print("\nAll imports OK — continue with the notebook.")
else:
    print("\nOne or more imports failed. Copy the failed import messages here and I'll help fix them.")


OK: iterstrat
OK: imblearn
OK: xgboost
OK: optuna
OK: node2vec
OK: pubchempy
OK: shap
OK: joblib
OK: rdkit

All imports OK — continue with the notebook.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
# Cell 2: imports & reproducibility
import os
import random
import time
import json
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd

# sklearn / imblearn
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, precision_recall_curve, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline as SKPipeline

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import joblib

# Graph embeddings
from node2vec import Node2Vec

# RDKit and PubChem (may not be available)
try:
    from rdkit import Chem
    from rdkit.Chem import AllChem, DataStructs
    RDKit_AVAILABLE = True
except Exception as e:
    RDKit_AVAILABLE = False
    print("RDKit not available — chemical fingerprinting will fallback to zeros. Install RDKit for full features.")

# PubChem lookup (fallback for cas -> smiles)
try:
    import pubchempy as pcp
    PUBCHEM_AVAILABLE = True
except Exception:
    PUBCHEM_AVAILABLE = False

# Optuna for hyperparameter tuning
import optuna

# SHAP (for interpretability)
import shap

# Iterative stratification for multilabel
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit

# Classifiers
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Paths (adjust if necessary)
DRIVE_ROOT = "/content/drive/MyDrive"   # change if you've mounted elsewhere
DATA_PATH = f"{DRIVE_ROOT}/ML Projects/Chemicals in Cosmetics/chemicals-in-cosmetics.csv"
PROPOSITION65_PATH = f"{DRIVE_ROOT}/ML Projects/Chemicals in Cosmetics/proposition65_clean.csv"
CAS_SMILES_PATH = f"{DRIVE_ROOT}/ML Projects/Chemicals in Cosmetics/cas_smiles.csv"  # optional

# Tunable budgets
OPTUNA_TRIALS = 40   # inner CV optimization trials; increase for better tuning
N_OUTER_FOLDS = 5    # outer CV for nested evaluation (if used)


In [8]:
# Cell 3: load csv and group by CDPHId to create product-level rows
df = pd.read_csv(DATA_PATH)
print("Raw rows:", len(df))

# Some CSVs store CAS as text per ingredient row; group by CDPHId to collect CAS numbers
# We'll build product-level DataFrame similar to your earlier cell
product_df = df.groupby('CDPHId').agg({
    'ProductName': 'first',
    'CompanyName': 'first',
    'BrandName': 'first',
    'PrimaryCategory': 'first',
    'SubCategory': 'first',
    'CasNumber': lambda x: list(x.dropna().astype(str)),  # list of CAS strings per product
}).reset_index()

print("Product-level rows:", len(product_df))
product_df.head()


Raw rows: 112870
Product-level rows: 36474


Unnamed: 0,CDPHId,ProductName,CompanyName,BrandName,PrimaryCategory,SubCategory,CasNumber
0,2,ULTRA COLOR RICH EXTRA PLUMP LIPSTICK-ALL SHADES,New Avon LLC,AVON,Makeup Products (non-permanent),"Lip Color - Lipsticks, Liners, and Pencils",[13463-67-7]
1,3,Glover's Medicated Shampoo,J. Strickland & Co.,Glover's,Hair Care Products (non-coloring),Hair Shampoos (making a cosmetic claim),"[65996-92-1, 140-67-0]"
2,4,PRECISION GLIMMER EYE LINER-ALL SHADES �,New Avon LLC,AVON,Makeup Products (non-permanent),Eyeliner/Eyebrow Pencils,[13463-67-7]
3,5,AVON BRILLIANT SHINE LIP GLOSS-ALL SHADES �,New Avon LLC,AVON,Makeup Products (non-permanent),Lip Gloss/Shine,[13463-67-7]
4,6,JILLIAN DEMPSEY FOR AVON CELESTIAL EYESHADOW-A...,New Avon LLC,AVON,Makeup Products (non-permanent),Eye Shadow,[13463-67-7]


In [12]:
import os
import pandas as pd

# Define expected path
csv_path = "/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/p65chemicalslist.csv"

# Check if file exists
if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"❌ proposition65.csv not found at: {csv_path}\n"
        "Please upload it to Google Drive in that exact location."
    )

# Load CSV
try:
    df_prop65 = pd.read_csv(csv_path)
except Exception as e:
    raise ValueError(f"❌ Failed to read proposition65.csv: {e}")

# Basic checks
required_columns = ["Chemical Name", "CAS Number", "Toxicity Type"]  # Example expected columns
missing_cols = [col for col in required_columns if col not in df_prop65.columns]

if missing_cols:
    raise ValueError(f"❌ Missing required columns in CSV: {missing_cols}")

# Check for empty rows
if df_prop65.empty:
    raise ValueError("❌ proposition65.csv is empty!")

# Show first few rows
print("✅ proposition65.csv loaded successfully!")
print(f"Total rows: {len(df_prop65)}")
print(df_prop65.head())


ValueError: ❌ Missing required columns in CSV: ['Chemical Name', 'CAS Number', 'Toxicity Type']

In [16]:
import pandas as pd

# Try reading the CSV while skipping extra rows before the header
df = pd.read_csv("/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/p65chemicalslist.csv", skiprows=11)
# Adjust skiprows=4 to however many junk rows are before the header

print(df.head())


                                     Chemical              Type of Toxicity  \
0  A-alpha-C (2-Amino-9H-pyrido[2,3-b]indole)                        cancer   
1                         Abiraterone acetate  developmental, female, male    
2                                Acetaldehyde                        cancer   
3                                   Acetamide                       cancer    
4                            Acetazolamide                  developmental     

  Listing Mechanism      CAS No. Date Listed NSRL or MADL (µg/day)a  \
0                AB   26148-68-5    1-Jan-90                      2   
1                FR  154229-18-2    8-Apr-16                    NaN   
2               SQE      75-07-0    1-Apr-88        90 (inhalation)   
3                AB      60-35-5    1-Jan-90                     10   
4                FR      59-66-5   20-Aug-99                    NaN   

   Unnamed: 6  Unnamed: 7  Unnamed: 8  
0         NaN         NaN         NaN  
1         NaN     

In [17]:
df = df.dropna(axis=1, how='all')


In [18]:
df.to_csv("/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/proposition65_clean.csv", index=False)


In [21]:
df = pd.read_csv("/content/drive/MyDrive/ML Projects/Chemicals in Cosmetics/proposition65_clean.csv")

In [23]:
df.head(20)

Unnamed: 0,Chemical,Type of Toxicity,Listing Mechanism,CAS No.,Date Listed,NSRL or MADL (µg/day)a
0,"A-alpha-C (2-Amino-9H-pyrido[2,3-b]indole)",cancer,AB,26148-68-5,1-Jan-90,2
1,Abiraterone acetate,"developmental, female, male",FR,154229-18-2,8-Apr-16,
2,Acetaldehyde,cancer,SQE,75-07-0,1-Apr-88,90 (inhalation)
3,Acetamide,cancer,AB,60-35-5,1-Jan-90,10
4,Acetazolamide,developmental,FR,59-66-5,20-Aug-99,
5,Acetochlor,cancer,SQE,34256-82-1,1-Jan-89,
6,Acetohydroxamic acid,developmental,FR,546-88-3,1-Apr-90,
7,2-Acetylaminofluorene,cancer,SQE,53-96-3,1-Jul-87,0.2
8,Acifluorfen sodium,cancer,AB,62476-59-9,1-Jan-90,
9,Acrylamide,cancer,AB,79-06-1,1-Jan-90,0.2


In [28]:
# Cell 4: Normalize CAS lists & load Proposition 65 mapping

def normalize_cas_list(cas_list):
    cleaned = []
    for c in cas_list:
        if pd.isna(c):
            continue
        s = str(c).strip()
        # Handle floats like '12345.0'
        if s.endswith('.0'):
            s = s[:-2]
        s = s.replace(' ', '')  # remove spaces inside CAS numbers
        if s:
            cleaned.append(s)
    # Deduplicate while preserving order
    return list(dict.fromkeys(cleaned))

# Apply normalization to the CasNumber column
product_df['CasNumber'] = product_df['CasNumber'].apply(normalize_cas_list)

# Load Proposition 65 mapping if available
prop65 = {}
if Path(PROPOSITION65_PATH).exists():
    propdf = pd.read_csv(PROPOSITION65_PATH)

    # Ensure columns exist and clean data
    if 'cas' in propdf.columns and 'hazard' in propdf.columns:
        for _, r in propdf.iterrows():
            cas = str(r['CAS No.']).strip()
            if pd.notna(r.get('Type of Toxicity')):
             hazards = {h.strip() for h in str(r['Type of Toxicity']).split(',') if h.strip()}
            if cas:
                prop65[cas] = hazards
        print(f"Loaded Proposition 65 mapping for {len(prop65)} CAS entries.")
    else:
        print("Proposition65 file found but missing 'cas' or 'hazard' columns.")
else:
    print(f"No proposition65.csv found at {PROPOSITION65_PATH}. You can create one with columns: cas,hazard")


Proposition65 file found but missing 'cas' or 'hazard' columns.
