# CATALYSTS 

In [3]:
import os
import json
import ast
import glob
from pathlib import Path
from typing import List, Tuple, Dict, Any, Optional

# ==========================
# Configuration (edit these)
# ==========================

# Glob patterns for the per-set metadata directories. The "*" component stands
# for the set directory; its name is used to label each output file.
LITERATURE_PATTERN = "/home/siddharth/nas/chemquest_literature/*/metadata"
PATENT_PATTERN = "/home/siddharth/nas/chemquest_patents/*/metadata"

# Output directory for the per-set *_literature_hits.json / *_patent_hits.json
# files. It is a relative path, so it resolves against the working directory.
RESULTS_DIR = "results"

# Search token, matched case-insensitively against category/classification items
SEARCH_TOKEN = "catalyst"


def safe_load_json(path: str) -> Optional[Dict[str, Any]]:
    """
    Safely load a JSON file whose top-level value is an object.

    Returns the parsed dict, or None when the file cannot be read, is not
    valid UTF-8 / JSON, or holds a top-level value that is not an object
    (callers use ``.get`` on the result, so a list or scalar is unusable).
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        # RecursionError: pathologically nested JSON.
        return None

    return data if isinstance(data, dict) else None


def normalize_list_or_string_field(value: Any) -> List[str]:
    """
    Coerce a metadata field into a list of trimmed strings.

    Accepted shapes:
    - a list: every element is converted with ``str`` and stripped
    - a string holding a Python list literal (e.g., "['A', 'B']"): the
      literal is parsed and its elements are converted the same way
    - any other string: returned as a single-element list (stripped)
    Anything else (dict, None, numbers, ...) yields an empty list.
    """
    if isinstance(value, list):
        return [str(entry).strip() for entry in value]

    if not isinstance(value, str):
        return []

    text = value.strip()
    try:
        literal = ast.literal_eval(text)
    except Exception:
        # Not a Python literal at all: treat the text as one plain item.
        return [text]

    if isinstance(literal, list):
        return [str(entry).strip() for entry in literal]
    # A literal that is not a list (number, tuple, ...) stays as raw text.
    return [text]


def token_match_in_items(items: List[str], token: str) -> Tuple[bool, str, str]:
    """
    Determine if token is present as exact item or substring.

    Matching is case-insensitive and ignores surrounding whitespace on both
    the token and the items. An item counts as an exact match when it equals
    the token or the token plus a trailing "s" (trivial pluralization).

    Exact matches take priority: every item is checked for an exact match
    before falling back to the first item that merely contains the token.
    An empty (or whitespace-only) token never matches, because the empty
    string is a substring of every item.

    Returns (matched, match_type, matched_text), where match_type is
    "exact_item", "substring", or "" when nothing matched.
    """
    needle = token.strip().lower()
    if not needle:
        return False, "", ""

    exact_forms = (needle, needle + "s")
    first_substring_hit: Optional[str] = None

    for item in items:
        text = item.strip()
        low = text.lower()

        if low in exact_forms:
            return True, "exact_item", text

        # Remember only the first substring hit; keep scanning for an exact one.
        if first_substring_hit is None and needle in low:
            first_substring_hit = text

    if first_substring_hit is not None:
        return True, "substring", first_substring_hit

    return False, "", ""


def scan_literature_file(path: str, token: str) -> Optional[Dict[str, Any]]:
    """
    Scan one literature JSON for token in 'categories'.

    Returns a hit record (source type, file path, id, title, match details
    and the normalized category list) when the token matches, else None.
    Files that cannot be loaded, are empty, or whose top-level JSON value is
    not an object are skipped (None).
    """
    data = safe_load_json(path)
    # Guard against list/scalar payloads: the lookups below need a dict.
    if not isinstance(data, dict) or not data:
        return None

    items = normalize_list_or_string_field(data.get("categories"))
    matched, match_type, matched_text = token_match_in_items(items, token)
    if not matched:
        return None

    return {
        "source_type": "literature",
        "file": path,
        # Fall back to the file name when neither identifier is present.
        "id": data.get("chemrxiv_id") or data.get("doi") or os.path.basename(path),
        "title": data.get("title", ""),
        "match_type": match_type,
        "matched_text": matched_text,
        "all_categories": items,
    }


def scan_patent_file(path: str, token: str) -> Optional[Dict[str, Any]]:
    """
    Scan one patent JSON for token in 'classifications'.

    The singular key 'classification' is used as a fallback when
    'classifications' is missing or empty. Returns a hit record (source
    type, file path, id, title, match details and the normalized
    classification list) when the token matches, else None. Files that
    cannot be loaded, are empty, or whose top-level JSON value is not an
    object are skipped (None).
    """
    data = safe_load_json(path)
    # Guard against list/scalar payloads: the lookups below need a dict.
    if not isinstance(data, dict) or not data:
        return None

    classifications_raw = data.get("classifications") or data.get("classification")
    items = normalize_list_or_string_field(classifications_raw)
    matched, match_type, matched_text = token_match_in_items(items, token)
    if not matched:
        return None

    return {
        "source_type": "patent",
        "file": path,
        # Fall back through the available identifiers, ending at the file name.
        "id": data.get("patent") or data.get("application_number") or data.get("title") or os.path.basename(path),
        "title": data.get("title", ""),
        "match_type": match_type,
        "matched_text": matched_text,
        "all_classifications": items,
    }


def iter_json_files(root_dir: str):
    """
    Walk root_dir recursively and yield the full path of every file whose
    name ends in ".json" (extension compared case-insensitively).
    """
    for dirpath, _subdirs, filenames in os.walk(root_dir):
        yield from (
            os.path.join(dirpath, name)
            for name in filenames
            if name.lower().endswith(".json")
        )


def extract_set_name(metadata_path: str) -> str:
    """
    Return the name of the directory that contains the metadata directory.

    Example: /home/.../chemquest_literature/set1/metadata -> set1
    """
    set_dir = Path(metadata_path).parent  # the set directory holds "metadata"
    return set_dir.name


def find_metadata_directories(pattern: str) -> List[Tuple[str, str]]:
    """
    Expand the glob pattern and keep only the matches that are directories.

    Each kept match is returned as a (set_name, full_metadata_path) tuple,
    in the order produced by glob.
    """
    return [
        (extract_set_name(candidate), candidate)
        for candidate in glob.glob(pattern)
        if os.path.isdir(candidate)
    ]


def _scan_sets_and_save(set_dirs: List[Tuple[str, str]], scan_file, kind: str) -> None:
    """Scan each set's metadata directory and write ``<set>_<kind>_hits.json``."""
    for set_name, metadata_dir in sorted(set_dirs):
        hits: List[Dict[str, Any]] = []

        for json_path in iter_json_files(metadata_dir):
            hit = scan_file(json_path, SEARCH_TOKEN)
            if hit:
                hits.append(hit)

        # One output file per set, written even when there are no matches.
        output_path = os.path.join(RESULTS_DIR, f"{set_name}_{kind}_hits.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump({
                "set_name": set_name,
                "search_token": SEARCH_TOKEN,
                "total_matches": len(hits),
                "matches": hits
            }, f, ensure_ascii=False, indent=2)

        print(f"  ✓ {set_name:<45} {len(hits):>4} matches")


def _print_nonempty_outputs(kind: str) -> None:
    """Print name, match count and size of each non-empty ``*_<kind>_hits.json``."""
    pattern = os.path.join(RESULTS_DIR, f"*_{kind}_hits.json")
    for path in sorted(glob.glob(pattern)):
        # The files are written as UTF-8 with ensure_ascii=False, so they must
        # be read back as UTF-8 rather than with the locale's default encoding.
        with open(path, "r", encoding="utf-8") as fh:
            count = json.load(fh).get("total_matches", 0)
        if count > 0:  # Only show non-empty files
            size_kb = os.path.getsize(path) / 1024
            print(f"  - {os.path.basename(path):<50} {count:>4} matches ({size_kb:>7.1f} KB)")


def main():
    """
    Scan every literature and patent set for SEARCH_TOKEN and save the hits.

    Steps:
    1. Create RESULTS_DIR and discover the metadata directories matching
       LITERATURE_PATTERN and PATENT_PATTERN.
    2. For each set, scan all JSON files and write one
       ``<set>_literature_hits.json`` / ``<set>_patent_hits.json`` file.
    3. Print a summary listing every non-empty hits file found in
       RESULTS_DIR (this includes files left there by earlier runs).
    """
    # Create results directory
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Find all metadata directories
    lit_dirs = find_metadata_directories(LITERATURE_PATTERN)
    pat_dirs = find_metadata_directories(PATENT_PATTERN)

    print("=== Multi-Set Catalyst Topic Scan ===")
    print(f"Search token: '{SEARCH_TOKEN}' (case-insensitive)")
    print(f"Literature pattern: {LITERATURE_PATTERN}")
    print(f"Patent pattern:     {PATENT_PATTERN}")
    print(f"Results directory:  {RESULTS_DIR}/")
    print()

    print(f"Found {len(lit_dirs)} literature metadata directories")
    print(f"Found {len(pat_dirs)} patent metadata directories")
    print()

    # ==========================================
    # Process LITERATURE sets (by set ID)
    # ==========================================
    print("=" * 50)
    print("PROCESSING LITERATURE SETS")
    print("=" * 50)
    _scan_sets_and_save(lit_dirs, scan_literature_file, "literature")
    print()

    # ==========================================
    # Process PATENT sets (by year)
    # ==========================================
    print("=" * 50)
    print("PROCESSING PATENT SETS (by year)")
    print("=" * 50)
    _scan_sets_and_save(pat_dirs, scan_patent_file, "patent")
    print()

    # ==========================================
    # Summary
    # ==========================================
    print("=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Literature sets processed: {len(lit_dirs)}")
    print(f"Patent sets processed:     {len(pat_dirs)}")
    print(f"Output files saved to:     {RESULTS_DIR}/")
    print()

    # List all output files organized by type
    print("Literature output files:")
    _print_nonempty_outputs("literature")

    print()
    print("Patent output files:")
    _print_nonempty_outputs("patent")


# Run the scan only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


=== Multi-Set Catalyst Topic Scan ===
Search token: 'catalyst' (case-insensitive)
Literature pattern: /home/siddharth/nas/chemquest_literature/*/metadata
Patent pattern:     /home/siddharth/nas/chemquest_patents/*/metadata
Results directory:  results/

Found 17 literature metadata directories
Found 9 patent metadata directories

PROCESSING LITERATURE SETS
  ✓ 605c72ef153207001f6470ce                         5 matches
  ✓ 605c72ef153207001f6470d1                        16 matches
  ✓ 605c72ef153207001f6470d2                        80 matches
  ✓ 605c72ef153207001f6470d3                        67 matches
  ✓ 605c72ef153207001f6470d4                       171 matches
  ✓ 605c72ef153207001f6470d5                         0 matches
  ✓ 605c72ef153207001f6470d6                         5 matches
  ✓ 605c72ef153207001f6470d7                        59 matches
  ✓ 605c72ef153207001f6470d8                        42 matches
  ✓ 605c72ef153207001f6470d9                        32 matches
  ✓ 605c72ef

### Copying matched files to a temporary directory inside the NAS

In [1]:
import os
import json
import shutil
from pathlib import Path
from typing import Dict, Any, List, Tuple

# ==========================
# CONFIGURATION
# ==========================

# Parent dirs where the original chemquest data lives.
# NOTE(review): in the code visible here these two values are only printed by
# main(); the actual source paths come from the 'file' field of each hit.
LITERATURE_PARENT = "/home/siddharth/nas/chemquest_literature"
PATENT_PARENT = "/home/siddharth/nas/chemquest_patents"

# Where the scan results JSONs ( *_literature_hits.json / *_patent_hits.json ) live
RESULTS_DIR = "results"

# Topic name
TOPIC_NAME = "catalysts"  # used in destination path only, not for matching

# Destination base dir; files for this topic go under <DEST_BASE>/<TOPIC_NAME>/
DEST_BASE = "/home/siddharth/nas/multi_collection_dataset"

# Derived destination dirs
DEST_LIT_BASE = os.path.join(DEST_BASE, TOPIC_NAME, "literatures")
DEST_PAT_BASE = os.path.join(DEST_BASE, TOPIC_NAME, "patents")

# Subdirs created under each of the two destination dirs above
SUBDIRS = ["metadata", "markdown", "json"]

# Failed IDs logs: one ID per line for hits where at least one file was not copied
FAILED_LIT_IDS_FILE = os.path.join(DEST_BASE, TOPIC_NAME, "failed_literature_ids.txt")
FAILED_PAT_IDS_FILE = os.path.join(DEST_BASE, TOPIC_NAME, "failed_patent_ids.txt")


# ==========================
# HELPERS
# ==========================

def ensure_dest_dirs():
    """
    Create destination directories (read-only with respect to source).

    Creates every SUBDIRS entry under both DEST_LIT_BASE and DEST_PAT_BASE,
    plus the parent directory of each failed-IDs log file, so both logs can
    be written even if they are configured to live in different directories.
    Existing directories are left untouched.
    """
    for base in (DEST_LIT_BASE, DEST_PAT_BASE):
        for sub in SUBDIRS:
            os.makedirs(os.path.join(base, sub), exist_ok=True)

    # Ensure the directory of BOTH failed-ID logs, not just the literature one.
    for log_file in (FAILED_LIT_IDS_FILE, FAILED_PAT_IDS_FILE):
        log_dir = os.path.dirname(log_file)
        if log_dir:  # a bare file name has no directory to create
            os.makedirs(log_dir, exist_ok=True)


def load_hits(path: str) -> Dict[str, Any]:
    """Read a single *_hits.json file (UTF-8) and return its parsed content."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def iter_result_files(pattern_suffix: str) -> List[str]:
    """
    Return the sorted paths of the entries directly inside RESULTS_DIR whose
    names end with pattern_suffix.

    Typical suffixes: '_literature_hits.json' and '_patent_hits.json'.
    """
    return sorted(
        os.path.join(RESULTS_DIR, entry)
        for entry in os.listdir(RESULTS_DIR)
        if entry.endswith(pattern_suffix)
    )


def resolve_source_paths_for_literature(src_meta_path: str) -> Tuple[Path, Path, Path]:
    """
    Derive the three source files of a literature hit from its 'file' path.

    The set directory is taken to be two levels above the given file, and
    the file stem is reused as the document id:
        <set_dir>/metadata/<id>.json
        <set_dir>/markdown/<id>.md
        <set_dir>/json/<id>.json
    Returns (metadata_path, markdown_path, json_path). Nothing is checked
    against the filesystem here.
    """
    hit_file = Path(src_meta_path)
    doc_id = hit_file.stem
    set_root = hit_file.parent.parent  # .../<set_id>/

    # Only the ".md" extension is produced for markdown sources.
    return (
        set_root.joinpath("metadata", f"{doc_id}.json"),
        set_root.joinpath("markdown", f"{doc_id}.md"),
        set_root.joinpath("json", f"{doc_id}.json"),
    )


def resolve_source_paths_for_patent(src_meta_path: str) -> Tuple[Path, Path, Path]:
    """
    Derive the three source files of a patent hit from its 'file' path.

    The set (or year) directory is taken to be two levels above the given
    file, and the file stem is reused as the document id:
        <year_or_set>/metadata/<id>.json
        <year_or_set>/markdown/<id>.md
        <year_or_set>/json/<id>.json
    Returns (metadata_path, markdown_path, json_path). Nothing is checked
    against the filesystem here.
    """
    hit_file = Path(src_meta_path)
    year_dir = hit_file.parent.parent  # .../<year_or_set>/
    doc_id = hit_file.stem

    layout = (("metadata", ".json"), ("markdown", ".md"), ("json", ".json"))
    meta_path, md_path, json_path = (
        year_dir / folder / f"{doc_id}{extension}" for folder, extension in layout
    )
    return meta_path, md_path, json_path


def safe_copy(src: Path, dst: Path) -> bool:
    """
    Copy src to dst when src is an existing regular file.

    The destination's parent directories are created as needed. Returns True
    after a copy and False (without creating anything) when src is not a
    file. The source is only read, never modified or removed.
    """
    if src.is_file():
        dst.parent.mkdir(parents=True, exist_ok=True)
        # copy2 also carries over file metadata such as timestamps
        shutil.copy2(str(src), str(dst))
        return True
    return False


def process_literature_hits():
    """
    Copy the metadata, markdown and json files of every literature hit.

    Reads each *_literature_hits.json file in RESULTS_DIR, resolves the three
    source files from the hit's 'file' path and copies them under
    DEST_LIT_BASE. Hits without a 'file' entry are skipped. A hit whose
    three files could not all be copied has its id (file stem) recorded.

    The recorded ids are written, sorted and de-duplicated, to
    FAILED_LIT_IDS_FILE. When nothing failed, a log left over from a previous
    run is removed so it cannot be mistaken for a current result.
    """
    failed_ids: List[str] = []
    dest_root = Path(DEST_LIT_BASE)

    for hits_file in iter_result_files("_literature_hits.json"):
        data = load_hits(hits_file)
        for m in data.get("matches", []):
            src_meta_path = m.get("file")
            if not src_meta_path:
                continue

            # Resolve source paths under chemquest_literature
            meta_path, md_path, json_path = resolve_source_paths_for_literature(src_meta_path)
            basename = meta_path.stem

            # Attempt all three copies even if an earlier one fails, so that
            # whatever exists is still transferred.
            ok_meta = safe_copy(meta_path, dest_root / "metadata" / f"{basename}.json")
            ok_md = safe_copy(md_path, dest_root / "markdown" / f"{basename}.md")
            ok_json = safe_copy(json_path, dest_root / "json" / f"{basename}.json")

            if not (ok_meta and ok_md and ok_json):
                failed_ids.append(basename)

    if failed_ids:
        # Write failed IDs log
        with open(FAILED_LIT_IDS_FILE, "w", encoding="utf-8") as f:
            for _id in sorted(set(failed_ids)):
                f.write(_id + "\n")
    else:
        # No failures this run: drop any stale log from an earlier run.
        try:
            os.remove(FAILED_LIT_IDS_FILE)
        except FileNotFoundError:
            pass


def process_patent_hits():
    """
    Copy the metadata, markdown and json files of every patent hit.

    Reads each *_patent_hits.json file in RESULTS_DIR, resolves the three
    source files from the hit's 'file' path and copies them under
    DEST_PAT_BASE. Hits without a 'file' entry are skipped. A hit whose
    three files could not all be copied has its id (file stem) recorded.

    The recorded ids are written, sorted and de-duplicated, to
    FAILED_PAT_IDS_FILE. When nothing failed, a log left over from a previous
    run is removed so it cannot be mistaken for a current result.
    """
    failed_ids: List[str] = []
    dest_root = Path(DEST_PAT_BASE)

    for hits_file in iter_result_files("_patent_hits.json"):
        data = load_hits(hits_file)
        for m in data.get("matches", []):
            src_meta_path = m.get("file")
            if not src_meta_path:
                continue

            # Resolve source paths under chemquest_patents
            meta_path, md_path, json_path = resolve_source_paths_for_patent(src_meta_path)
            basename = meta_path.stem

            # Attempt all three copies even if an earlier one fails, so that
            # whatever exists is still transferred.
            ok_meta = safe_copy(meta_path, dest_root / "metadata" / f"{basename}.json")
            ok_md = safe_copy(md_path, dest_root / "markdown" / f"{basename}.md")
            ok_json = safe_copy(json_path, dest_root / "json" / f"{basename}.json")

            if not (ok_meta and ok_md and ok_json):
                failed_ids.append(basename)

    if failed_ids:
        # Write failed IDs log
        with open(FAILED_PAT_IDS_FILE, "w", encoding="utf-8") as f:
            for _id in sorted(set(failed_ids)):
                f.write(_id + "\n")
    else:
        # No failures this run: drop any stale log from an earlier run.
        try:
            os.remove(FAILED_PAT_IDS_FILE)
        except FileNotFoundError:
            pass


def main():
    """
    Build the per-topic dataset: print the configuration, create the
    destination directories, copy the files of all literature and patent
    hits, then print where the results and failed-ID logs are located.
    """
    print("=== Building multi_collection_dataset for topic:", TOPIC_NAME)
    settings = (
        ("Literature parent:", LITERATURE_PARENT),
        ("Patent parent:    ", PATENT_PARENT),
        ("Results dir:      ", RESULTS_DIR),
        ("Destination base: ", DEST_BASE),
    )
    for label, value in settings:
        print(label, value)
    print()

    ensure_dest_dirs()

    # Each stage is announced right before it runs.
    stages = (
        ("Processing literature hits...", process_literature_hits),
        ("Processing patent hits...", process_patent_hits),
    )
    for announcement, run_stage in stages:
        print(announcement)
        run_stage()

    print("Done.")
    print("Destination structure:")
    for destination in (DEST_LIT_BASE, DEST_PAT_BASE):
        print("  -", destination)
    print("Failed IDs logs (if any):")
    for log_path in (FAILED_LIT_IDS_FILE, FAILED_PAT_IDS_FILE):
        print("  -", log_path)


# Run the copy step only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


=== Building multi_collection_dataset for topic: catalysts
Literature parent: /home/siddharth/nas/chemquest_literature
Patent parent:     /home/siddharth/nas/chemquest_patents
Results dir:       results
Destination base:  /home/siddharth/nas/multi_collection_dataset

Processing literature hits...
Processing patent hits...
Done.
Destination structure:
  - /home/siddharth/nas/multi_collection_dataset/catalysts/literatures
  - /home/siddharth/nas/multi_collection_dataset/catalysts/patents
Failed IDs logs (if any):
  - /home/siddharth/nas/multi_collection_dataset/catalysts/failed_literature_ids.txt
  - /home/siddharth/nas/multi_collection_dataset/catalysts/failed_patent_ids.txt
