# In [None]:
import os
import glob
import time
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Endpoint that classify_smiles queries with one HTTP GET per SMILES string.
CLASSIFIER_URL = "https://npclassifier.gnps2.org/classify"
# Size of the thread pool created by process_chunk, i.e. the maximum number of
# classification requests running concurrently.
MAX_WORKERS = 64

# SMILES classifier (one input → one output tuple)
def classify_smiles(smiles):
    """Classify one SMILES string with the remote classifier service.

    Args:
        smiles: The SMILES string to classify. ``None``, NaN, empty,
            whitespace-only and the literal text ``"nan"`` (any case) are
            treated as "no structure" and are not sent to the service.

    Returns:
        A 4-tuple ``(pathway, superclass, class_, smiles)``. The first three
        items are comma-separated strings built from the ``pathway_results``,
        ``superclass_results`` and ``class_results`` lists of the JSON
        response (empty strings when nothing was classified or the call
        failed). The last item is the input value, returned unchanged so the
        caller can match the result to its input.

    This function is deliberately best-effort: any failure is logged, the
    offending input is appended to ``failed_smiles.txt`` in the current
    working directory, and an empty classification is returned instead of
    raising, so one bad row cannot abort a whole batch.
    """
    empty = ("", "", "", smiles)
    try:
        if not smiles or pd.isna(smiles):
            return empty

        # Surrounding whitespace is not part of a SMILES string; strip it so
        # blank-looking cells are skipped and padded values still classify.
        query = smiles.strip()
        if not query or query.lower() == "nan":
            return empty

        r = requests.get(CLASSIFIER_URL, params={"smiles": query}, timeout=10)
        r.raise_for_status()
        result = r.json()

        def joined(key):
            # `or []` also covers a key that is present but null.
            return ', '.join(result.get(key) or [])

        pathway = joined('pathway_results')
        superclass = joined('superclass_results')
        class_ = joined('class_results')

        if not any([pathway, superclass, class_]):
            print(f"[WARN] Empty classification for: {smiles}")

        return pathway, superclass, class_, smiles
    except Exception as e:
        print(f"[ERROR] Failed: {smiles} → {e}")
        try:
            with open("failed_smiles.txt", "a", encoding="utf-8") as f:
                # Format instead of concatenating: `smiles` may not be a str
                # here, and the handler itself must not raise.
                f.write(f"{smiles}\n")
        except OSError as log_err:
            print(f"[ERROR] Could not record failed SMILES: {log_err}")
        return empty

# Process a chunk (strict row alignment)
def process_chunk(chunk_df, chunk_index, outdir):
    """Classify every row of one chunk concurrently and save it as a CSV.

    Args:
        chunk_df: DataFrame with a ``primary_SMILES`` column. It is modified
            in place: the columns ``Pathway Results``, ``Superclass Results``
            and ``Class Results`` are added.
        chunk_index: Number used in progress messages and in the output file
            name ``output_chunk_<chunk_index>.csv``.
        outdir: Directory the chunk CSV is written to.

    Raises:
        Any exception raised inside a worker is re-raised by
        ``future.result()`` and propagates to the caller.
    """
    smiles_list = chunk_df['primary_SMILES'].fillna('').astype(str).tolist()

    # One slot per input row, filled by row position. Keying results by the
    # SMILES text instead would collapse duplicate SMILES onto one entry and
    # make the outcome depend on which request happened to finish last.
    ordered = [("", "", "", s) for s in smiles_list]

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_row = {
            executor.submit(classify_smiles, s): row
            for row, s in enumerate(smiles_list)
        }
        total = len(future_to_row)
        # as_completed yields in completion order, which keeps the progress
        # output live; the row lookup restores the original order.
        for done, future in enumerate(as_completed(future_to_row), start=1):
            ordered[future_to_row[future]] = future.result()
            if done % 50 == 0 or done == total:
                print(f"[Chunk {chunk_index}] Processed {done}/{total}")

    # Attach results
    chunk_df['Pathway Results'] = [r[0] for r in ordered]
    chunk_df['Superclass Results'] = [r[1] for r in ordered]
    chunk_df['Class Results'] = [r[2] for r in ordered]

    outfile = os.path.join(outdir, f"output_chunk_{chunk_index}.csv")
    chunk_df.to_csv(outfile, index=False)
    print(f"[SAVED] Chunk {chunk_index} → {outfile}")

# Merge chunks
def merge_chunks(outdir, final_outfile):
    """Concatenate all ``output_chunk_*.csv`` files in *outdir* into one CSV.

    Chunks are merged in numeric order of the index in their file name, so
    ``output_chunk_2.csv`` comes before ``output_chunk_10.csv`` (a plain
    string sort would put chunk 10 first and scramble the row order).

    Args:
        outdir: Directory containing the per-chunk CSV files.
        final_outfile: Path of the merged CSV to write.

    Raises:
        ValueError: If *outdir* contains no matching chunk files.
    """
    def chunk_order(path):
        stem = os.path.splitext(os.path.basename(path))[0]
        suffix = stem.rsplit('_', 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), path)
        # Files without a numeric index sort after the numbered ones, by name.
        return (1, 0, path)

    files = sorted(
        glob.glob(os.path.join(outdir, 'output_chunk_*.csv')),
        key=chunk_order,
    )
    if not files:
        raise ValueError(
            f"No chunk files matching 'output_chunk_*.csv' found in {outdir!r}"
        )

    all_chunks = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    all_chunks.to_csv(final_outfile, index=False)
    print(f"[MERGED] Output → {final_outfile}")

# Pipeline entrypoint
def run_pipeline(input_file, outdir="classified_chunks", chunk_size=10000):
    """Classify the SMILES of a CSV file chunk by chunk and merge the output.

    Args:
        input_file: CSV file that has the columns ``cmp`` and
            ``primary_SMILES``; only these two columns are kept, and rows
            with a missing value in either are dropped.
        outdir: Directory for the per-chunk CSVs and the merged file; it is
            created when it does not exist.
        chunk_size: Maximum number of rows handed to ``process_chunk`` at once.

    Returns:
        Path of the merged CSV, ``<outdir>/npclassifier_output.csv``.
    """
    os.makedirs(outdir, exist_ok=True)

    raw = pd.read_csv(input_file)
    df = raw[['cmp', 'primary_SMILES']].dropna()

    n_rows = len(df)
    # Ceiling division: a partial last chunk still counts as a chunk.
    total_chunks = (n_rows + chunk_size - 1) // chunk_size
    print(f"[INFO] Classifying {n_rows} SMILES across {total_chunks} chunks...")

    start_time = time.time()
    # Chunks are numbered from 1 in both the messages and the file names.
    for chunk_no in range(1, total_chunks + 1):
        print(f"\n[START] Chunk {chunk_no}/{total_chunks}")
        first_row = (chunk_no - 1) * chunk_size
        chunk_df = df.iloc[first_row:first_row + chunk_size].copy()
        chunk_started = time.time()
        process_chunk(chunk_df, chunk_no, outdir)
        elapsed = time.time() - chunk_started
        print(f"[DONE] Chunk {chunk_no} in {elapsed:.2f}s")

    final_csv = os.path.join(outdir, "npclassifier_output.csv")
    merge_chunks(outdir, final_csv)
    total_elapsed = time.time() - start_time
    print(f"[FINISHED] Total time: {total_elapsed:.2f}s")
    return final_csv


# Run the pipeline on the input CSV below; the per-chunk files and the merged
# result are written under `output_dir`, and `final_output` holds the merged path.
input_path = "2025-04-07_master_cmp_scoring_dev_w_priority.csv"
output_dir = "pain_np_leftover"
final_output = run_pipeline(
    input_file=input_path,
    outdir=output_dir,
    chunk_size=10000,
)
