# In [None]:
import os
import glob
import time
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Endpoint queried by classify_smiles(): one GET request per SMILES, with the
# SMILES passed as the `smiles` query parameter.
CLASSIFIER_URL = "https://npclassifier.gnps2.org/classify"
# Size of the thread pool created by process_chunk(), i.e. the maximum number
# of classification requests that can be in flight at the same time.
MAX_WORKERS = 64  # Tune based on your machine

def classify_smiles(smiles):
    """Classify a single SMILES string with the NPClassifier web service.

    Sends a GET request to ``CLASSIFIER_URL`` with the SMILES as the
    ``smiles`` query parameter and joins each list in the JSON reply
    (``pathway_results``, ``superclass_results``, ``class_results``) into a
    comma-separated string.

    Args:
        smiles: The SMILES string to classify. Values that are not strings
            (``None``, NaN, numbers, ...) and strings that are blank or spell
            ``"nan"`` (any case, surrounding whitespace ignored) are treated
            as missing and are not sent to the service.

    Returns:
        A 4-tuple ``(pathway, superclass, class_, smiles)``. The three
        classification strings are empty when the input is missing, when the
        service reports nothing, or when the request fails. ``smiles`` is
        always the input value, returned unchanged.

    Side effects:
        When the request or the parsing of its reply fails, the error is
        printed and the SMILES is appended to ``failed_smiles.txt`` in the
        current working directory.
    """
    # Decide up front whether there is anything to classify. Checking the type
    # first guarantees that `smiles` is a real string everywhere below, so the
    # failure handler can safely write it to the log file.
    if not isinstance(smiles, str) or smiles.strip().lower() in ("", "nan"):
        return "", "", "", smiles

    try:
        r = requests.get(CLASSIFIER_URL, params={"smiles": smiles}, timeout=10)
        r.raise_for_status()
        result = r.json()

        pathway = ', '.join(result.get('pathway_results', []))
        superclass = ', '.join(result.get('superclass_results', []))
        class_ = ', '.join(result.get('class_results', []))

        if not any([pathway, superclass, class_]):
            print(f"[WARN] Empty result for: {smiles}")

        return pathway, superclass, class_, smiles

    except Exception as e:
        # Deliberately broad: one bad SMILES (network error, HTTP error,
        # malformed reply) must not abort the rest of the batch. The failure
        # is recorded so the SMILES can be retried later.
        print(f"[ERROR] Failed SMILES: {smiles} | Reason: {e}")
        with open("failed_smiles.txt", "a", encoding="utf-8") as f:
            f.write(smiles + "\n")
        return "", "", "", smiles

def resolve_smiles_column(df):
    """Ensure ``df`` has an ``isoSmiles`` column holding SMILES as strings.

    The candidate columns ``isoSmiles``, ``primary_SMILES`` and ``smiles`` are
    checked in that order; the first one present is converted with
    ``astype(str)`` and stored under ``isoSmiles``.

    Args:
        df: DataFrame to inspect. It is modified in place.

    Returns:
        The same DataFrame object, now containing ``isoSmiles``.

    Raises:
        ValueError: If none of the candidate columns exists.
    """
    candidates = ('isoSmiles', 'primary_SMILES', 'smiles')
    source = next((name for name in candidates if name in df.columns), None)
    if source is None:
        raise ValueError("No valid SMILES column found. Expect one of: isoSmiles, primary_SMILES, smiles")

    df['isoSmiles'] = df[source].astype(str)
    return df

def process_chunk(chunk_df, chunk_index, outdir, *, classifier=None, max_workers=None):
    """Classify every SMILES in one chunk and save the chunk as a CSV file.

    The ``isoSmiles`` column of ``chunk_df`` is classified concurrently in a
    thread pool. The classification strings are stored in the new columns
    ``Pathway Results``, ``Superclass Results`` and ``Class Results``, and the
    chunk is written to ``<outdir>/output_chunk_<chunk_index>.csv``.

    Args:
        chunk_df: DataFrame with an ``isoSmiles`` column. It is modified in
            place (``isoSmiles`` is rewritten and three columns are added).
        chunk_index: Number used in the output file name.
        outdir: Existing directory that receives the chunk file.
        classifier: Optional callable taking one SMILES string and returning
            ``(pathway, superclass, class_, smiles)``. Defaults to
            ``classify_smiles``.
        max_workers: Optional thread-pool size. Defaults to ``MAX_WORKERS``.
    """
    # Resolve the defaults at call time so the module-level names are only
    # needed when the caller did not supply their own.
    if classifier is None:
        classifier = classify_smiles
    if max_workers is None:
        max_workers = MAX_WORKERS

    smiles_list = chunk_df['isoSmiles'].fillna('').astype(str).tolist()

    # Executor.map yields results in the order the inputs were submitted,
    # regardless of which request finishes first. That keeps every result on
    # the row it belongs to, including when the same SMILES appears on several
    # rows, and avoids re-sorting the results after the fact.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(classifier, smiles_list))

    # Transpose the list of 4-tuples into four columns; an empty chunk yields
    # four empty columns instead of failing to unpack.
    if results:
        pathways, superclasses, classes, smiles_out = (list(col) for col in zip(*results))
    else:
        pathways, superclasses, classes, smiles_out = [], [], [], []

    chunk_df['isoSmiles'] = smiles_out
    chunk_df['Pathway Results'] = pathways
    chunk_df['Superclass Results'] = superclasses
    chunk_df['Class Results'] = classes

    outfile = os.path.join(outdir, f"output_chunk_{chunk_index}.csv")
    chunk_df.to_csv(outfile, index=False)
    print(f"[SAVED] Chunk {chunk_index}: {outfile}")
def _chunk_sort_key(path):
    """Sort key that orders chunk files by their numeric suffix.

    Files named ``output_chunk_<number>.csv`` sort by ``<number>``; files that
    match the glob but carry a non-numeric suffix sort after them, by name.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem[len("output_chunk_"):]
    if suffix.isdecimal():
        return (0, int(suffix), "")
    return (1, 0, suffix)


def merge_chunks(outdir, final_outfile):
    """Concatenate all chunk CSVs in ``outdir`` into a single CSV file.

    Args:
        outdir: Directory containing the ``output_chunk_*.csv`` files.
        final_outfile: Path of the merged CSV to write.

    Raises:
        ValueError: If ``outdir`` contains no chunk files.
    """
    # A plain string sort would place output_chunk_10.csv before
    # output_chunk_2.csv and scramble the row order of the merged file, so the
    # files are ordered by chunk number instead.
    files = sorted(glob.glob(os.path.join(outdir, 'output_chunk_*.csv')), key=_chunk_sort_key)
    if not files:
        raise ValueError(f"No chunk files matching 'output_chunk_*.csv' found in: {outdir}")

    all_chunks = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    all_chunks.to_csv(final_outfile, index=False)
    print(f"[MERGED] Output written to: {final_outfile}")

def run_pipeline(input_file, outdir="classified_chunks", chunk_size=10000):
    """Classify every SMILES in a CSV file chunk by chunk and merge the results.

    The input is read with pandas, its SMILES column is normalised by
    ``resolve_smiles_column``, and the rows are passed to ``process_chunk`` in
    slices of ``chunk_size`` (chunks are numbered from 1). Finally
    ``merge_chunks`` combines the chunk files into
    ``<outdir>/final_npclassifier_output.csv``.

    Args:
        input_file: Path of the input CSV.
        outdir: Directory for the chunk files and the merged output; created
            if it does not exist. Any ``output_chunk_*.csv`` files already in
            it are deleted before processing starts.
        chunk_size: Number of rows per chunk; must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or if the input has
            no usable SMILES column.
    """
    # Validate before doing any I/O so a bad argument has no side effects.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    df = pd.read_csv(input_file)
    df = resolve_smiles_column(df)

    os.makedirs(outdir, exist_ok=True)

    # merge_chunks() combines every output_chunk_*.csv it finds in outdir, so
    # chunk files left over from an earlier run (different input or chunk
    # size) would otherwise end up in this run's merged output.
    stale_files = glob.glob(os.path.join(outdir, 'output_chunk_*.csv'))
    for stale in stale_files:
        os.remove(stale)
    if stale_files:
        print(f"[INFO] Removed {len(stale_files)} stale chunk file(s) from: {outdir}")

    if len(df) == 0:
        # No chunks would be written, and merging zero files is an error.
        print(f"[WARN] No rows found in {input_file}; nothing to classify.")
        return

    total_chunks = (len(df) + chunk_size - 1) // chunk_size
    print(f"[INFO] Classifying {len(df)} SMILES in {total_chunks} chunks of {chunk_size}...")

    start_time = time.time()
    for chunk_number, start in enumerate(range(0, len(df), chunk_size), start=1):
        # Copy the slice: process_chunk adds columns to the frame it is given.
        chunk_df = df.iloc[start:start + chunk_size].copy()
        t0 = time.time()
        process_chunk(chunk_df, chunk_number, outdir)
        print(f"[DONE] Chunk {chunk_number} in {time.time() - t0:.2f}s")

    merge_chunks(outdir, os.path.join(outdir, "final_npclassifier_output.csv"))
    print(f"[DONE] Full run in {time.time() - start_time:.2f}s")
# Driver cell: runs the whole pipeline on an existing CSV file.
#
# The two commented-out statements below create a one-row test CSV in the
# notebook; uncomment both to overwrite the input path with that single
# compound for a quick smoke test.
# test_smiles_df = pd.DataFrame({"isoSmiles": ["COc1ccc(/C=C/c2cc(OC)cc(=O)o2)cc1"]})
test_input_path = "2025-04-07_master_cmp_scoring_dev_w_priority.csv"
# Despite the ".csv" suffix this value is passed as `outdir`, so run_pipeline
# creates a *directory* with this name and writes the chunk files and the
# merged CSV inside it.
test_output_dir = "2025-06-03_master_cmp_with_np.csv"
#test_smiles_df.to_csv(test_input_path, index=False)

# Run
# NOTE(review): chunk_size=1 puts one row in every chunk, so each thread pool
# in process_chunk receives a single SMILES (requests run one at a time) and
# one output_chunk_<n>.csv is written per input row. Presumably left over from
# the single-compound test above -- confirm before raising it.
run_pipeline(test_input_path, outdir=test_output_dir, chunk_size=1)
