In [1]:
import pandas as pd

In [6]:
df = pd.read_csv("./test.csv")
df

Unnamed: 0,catalyst_ac,active_me,catalyst_mw,catalyst_st,catalyst_p,catalyst_e,catalyst_b,catalyst_d,catalyst_si
0,'423.49,'506.73,ISN-1071,'541,'1201_1,'0.19,'0.24,'0.29,'0.24
1,'506.73,'423.49,ISN-1062,'407.74,ISN-1062,'0.24,'0.29,'0.68,'0.29
2,'423.49,'407.74,ISN-1071,'506.73,ISN-1071,'0.29,'0.24,'0.29,'0.68
3,'407.74,'506.73,ISN-1062,'541,'1201_1,'0.68,'0.29,'0.19,'0.29
4,'506.73,'423.49,ISN-1071,'407.74,ISN-1062,'0.29,'0.24,'0.68,'0.68
5,'423.49,'407.74,ISN-1071,'506.73,ISN-1071,'0.24,'0.68,'0.29,'0.24
6,'407.74,'506.73,ISN-1062,'423.49,ISN-1071,'0.68,'0.29,'0.24,'0.29
7,'506.73,'423.49,ISN-1071,'407.74,ISN-1062,'0.29,'0.24,'0.68,'0.68
8,'423.49,'407.74,ISN-1062,'506.73,ISN-1071,'0.24,'0.68,'0.29,'0.19
9,'407.74,'506.73,ISN-1071,'541,'1201_1,'0.68,'0.29,'0.19,'0.24


In [8]:
print(df.head())

  catalyst_ac active_me catalyst_mw catalyst_st catalyst_p catalyst_e  \
0     '423.49   '506.73    ISN-1071        '541    '1201_1      '0.19   
1     '506.73   '423.49    ISN-1062     '407.74   ISN-1062      '0.24   
2     '423.49   '407.74    ISN-1071     '506.73   ISN-1071      '0.29   
3     '407.74   '506.73    ISN-1062        '541    '1201_1      '0.68   
4     '506.73   '423.49    ISN-1071     '407.74   ISN-1062      '0.29   

  catalyst_b catalyst_d catalyst_si  
0      '0.24      '0.29       '0.24  
1      '0.29      '0.68       '0.29  
2      '0.24      '0.29       '0.68  
3      '0.29      '0.19       '0.29  
4      '0.24      '0.68       '0.68  


In [None]:


for col in df.columns:
    df[col] = df[col].astype(str).str.replace("^'", "", regex=True) 

print("\nAFTER:")
print(df.head())



AFTER:
  catalyst_ac active_me catalyst_mw catalyst_st catalyst_p catalyst_e  \
0      423.49    506.73    ISN-1071         541     1201_1       0.19   
1      506.73    423.49    ISN-1062      407.74   ISN-1062       0.24   
2      423.49    407.74    ISN-1071      506.73   ISN-1071       0.29   
3      407.74    506.73    ISN-1062         541     1201_1       0.68   
4      506.73    423.49    ISN-1071      407.74   ISN-1062       0.29   

  catalyst_b catalyst_d catalyst_si  
0       0.24       0.29        0.24  
1       0.29       0.68        0.29  
2       0.24       0.29        0.68  
3       0.29       0.19        0.29  
4       0.24       0.68        0.68  


In [3]:
#!/usr/bin/env python3
"""
Google BigQuery USPTO Patent Downloader
No API key, no registration needed (1TB free/month)
Topics: CO2 Methanation | Fischer-Tropsch | Hydrocracking
"""

from google.cloud import bigquery
from pathlib import Path
import json
from datetime import datetime
from tqdm import tqdm

# ==================== CONFIG ====================
OUTPUT_DIR = Path("./uspto_patents")
XML_DIR = OUTPUT_DIR / "xml"
TRACKING_FILE = OUTPUT_DIR / "download_tracker.json"

# CPC codes for topics
TOPIC_CPC_CODES = {
    "co2_methanation": ["B01J23/72", "B01J23/755", "C07C1/12", "B01J37"],
    "fischer_tropsch": ["C10G2/00", "B01J23/75", "C07C1/04", "C10G11/05"],
    "hydrocracking": ["C10G47", "C10G65", "B01J21/12", "B01J23/88", "C10G49/02"],
}

# BigQuery settings
PROJECT_ID = "your-project-id"  # Create free GCP project
DATASET = "patents-public-data.patents"

# ==================== TRACKING ====================
def load_tracker():
    if TRACKING_FILE.exists():
        with open(TRACKING_FILE, 'r') as f:
            return json.load(f)
    return {
        "downloaded_patents": [],
        "stats": {topic: {"patents": 0, "last_update": None} for topic in TOPIC_CPC_CODES}
    }

def save_tracker(tracker):
    tracker["last_updated"] = datetime.now().isoformat()
    with open(TRACKING_FILE, 'w') as f:
        json.dump(tracker, f, indent=2)

# ==================== BIGQUERY FUNCTIONS ====================
def query_patents_by_topic(client, topic, cpc_codes, year_from=2015):
    """Query patents from BigQuery by CPC codes"""
    
    # Build CPC filter (match any of the codes)
    cpc_conditions = " OR ".join([f"cpc.code LIKE '{code}%'" for code in cpc_codes])
    
    query = f"""
    SELECT 
        pub.publication_number,
        pub.title_localized[SAFE_OFFSET(0)].text as title,
        pub.abstract_localized[SAFE_OFFSET(0)].text as abstract,
        pub.description_localized[SAFE_OFFSET(0)].text as description,
        pub.claims_localized[SAFE_OFFSET(0)].text as claims,
        pub.publication_date,
        pub.filing_date,
        pub.assignee_harmonized,
        cpc.code as cpc_code
    FROM 
        `{DATASET}.publications` pub,
        UNNEST(cpc) as cpc
    WHERE 
        ({cpc_conditions})
        AND CAST(pub.publication_date AS INT64) >= {year_from}0101
        AND pub.country_code = 'US'
        AND pub.kind_code IN ('B1', 'B2')  -- Granted patents only
    LIMIT 50000
    """
    
    print(f"  Running BigQuery for {topic}...")
    query_job = client.query(query)
    return query_job.result()

def save_patent_xml(row, topic):
    """Save patent data as XML"""
    topic_dir = XML_DIR / topic
    topic_dir.mkdir(parents=True, exist_ok=True)
    
    patent_num = row.publication_number.replace(" ", "_")
    filepath = topic_dir / f"{patent_num}.xml"
    
    # Build simple XML structure
    xml_content = f"""<?xml version="1.0" encoding="UTF-8"?>
<us-patent-grant>
    <publication-reference>
        <document-id>
            <doc-number>{row.publication_number}</doc-number>
            <date>{row.publication_date}</date>
        </document-id>
    </publication-reference>
    <application-reference>
        <document-id>
            <date>{row.filing_date}</date>
        </document-id>
    </application-reference>
    <assignees>
        <assignee>{row.assignee_harmonized or ''}</assignee>
    </assignees>
    <invention-title>{row.title or ''}</invention-title>
    <abstract>
        <p>{row.abstract or ''}</p>
    </abstract>
    <description>
        {row.description or ''}
    </description>
    <claims>
        {row.claims or ''}
    </claims>
    <classifications-cpc>
        <classification-cpc>
            <cpc-version-indicator><date>20130101</date></cpc-version-indicator>
            <section>{row.cpc_code[0] if row.cpc_code else ''}</section>
            <class>{row.cpc_code[1:3] if len(row.cpc_code) > 2 else ''}</class>
            <subclass>{row.cpc_code[3] if len(row.cpc_code) > 3 else ''}</subclass>
            <main-group>{row.cpc_code[4:] if len(row.cpc_code) > 4 else ''}</main-group>
        </classification-cpc>
    </classifications-cpc>
</us-patent-grant>
"""
    
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(xml_content)
    
    return filepath

# ==================== MAIN ====================
def main():
    # Setup
    OUTPUT_DIR.mkdir(exist_ok=True)
    XML_DIR.mkdir(exist_ok=True)
    
    print("=" * 70)
    print("Google BigQuery USPTO Patent Downloader")
    print("Free - No API Key - No Registration")
    print("=" * 70)
    
    # Initialize BigQuery client
    print("\nüîß Initializing BigQuery client...")
    try:
        client = bigquery.Client(project=PROJECT_ID)
        print("‚úÖ Connected to BigQuery\n")
    except Exception as e:
        print(f"‚ùå Error: {e}")
        print("\nSetup required:")
        print("1. Install: pip install google-cloud-bigquery")
        print("2. Create free GCP account: https://console.cloud.google.com")
        print("3. Create a project and set PROJECT_ID in script")
        print("4. Run: gcloud auth application-default login")
        return
    
    tracker = load_tracker()
    
    # Process each topic
    for topic, cpc_codes in TOPIC_CPC_CODES.items():
        print(f"\n{'='*70}")
        print(f"üìÅ {topic.upper()}")
        print(f"{'='*70}")
        
        # Query BigQuery
        results = query_patents_by_topic(client, topic, cpc_codes)
        
        # Save patents
        count = 0
        for row in tqdm(results, desc=f"  Saving {topic} patents"):
            if row.publication_number not in tracker["downloaded_patents"]:
                save_patent_xml(row, topic)
                tracker["downloaded_patents"].append(row.publication_number)
                count += 1
        
        tracker["stats"][topic]["patents"] = count
        tracker["stats"][topic]["last_update"] = datetime.now().isoformat()
        save_tracker(tracker)
        
        print(f"‚úÖ {count} patents saved for {topic}")
    
    # Stats
    print("\n" + "=" * 70)
    print("üìä FINAL STATISTICS")
    print("=" * 70)
    for topic, stats in tracker["stats"].items():
        print(f"{topic.upper()}: {stats['patents']} patents")
    print(f"\nTotal: {len(tracker['downloaded_patents'])} patents")
    print(f"Location: {XML_DIR}")
    print("=" * 70)

if __name__ == "__main__":
    main()


USPTO Open Data Portal - Patent XML Downloader
Topics: CO2 Methanation | Fischer-Tropsch | Hydrocracking

üîç Testing API connectivity...
‚ö†Ô∏è  API responded with status 403

üìÅ Processing topic: CO2_METHANATION

üîé Searching CPC code: B01J23/72
  Fetching patents at offset 0...
Error searching patents for CPC B01J23/72: 403 Client Error: Forbidden for url: https://api.uspto.gov/patent-data/v1/patent?query=cpc%3AB01J23%2F72%2A&offset=0&limit=100&sort=filingDate+desc
  No more results for B01J23/72

üîé Searching CPC code: B01J23/755
  Fetching patents at offset 0...
Error searching patents for CPC B01J23/755: 403 Client Error: Forbidden for url: https://api.uspto.gov/patent-data/v1/patent?query=cpc%3AB01J23%2F755%2A&offset=0&limit=100&sort=filingDate+desc
  No more results for B01J23/755

üîé Searching CPC code: C07C1/12
  Fetching patents at offset 0...
Error searching patents for CPC C07C1/12: 403 Client Error: Forbidden for url: https://api.uspto.gov/patent-data/v1/patent?q

In [2]:
#!/usr/bin/env python3
"""
Google BigQuery USPTO Patents - No Authentication Required
Uses public dataset sandbox
"""

from google.cloud import bigquery
from pathlib import Path
import json
from datetime import datetime
from tqdm import tqdm

# ==================== CONFIG ====================
OUTPUT_DIR = Path("./uspto_patents")
XML_DIR = OUTPUT_DIR / "xml"
TRACKING_FILE = OUTPUT_DIR / "download_tracker.json"

# CPC codes for topics
TOPIC_CPC_CODES = {
    "co2_methanation": ["B01J23/72", "B01J23/755", "C07C1/12", "B01J37"],
    "fischer_tropsch": ["C10G2/00", "B01J23/75", "C07C1/04", "C10G11/05"],
    "hydrocracking": ["C10G47", "C10G65", "B01J21/12", "B01J23/88", "C10G49/02"],
}

DATASET = "patents-public-data.patents"

# ==================== TRACKING ====================
def load_tracker():
    if TRACKING_FILE.exists():
        with open(TRACKING_FILE, 'r') as f:
            return json.load(f)
    return {
        "downloaded_patents": [],
        "stats": {topic: {"patents": 0, "last_update": None} for topic in TOPIC_CPC_CODES}
    }

def save_tracker(tracker):
    tracker["last_updated"] = datetime.now().isoformat()
    with open(TRACKING_FILE, 'w') as f:
        json.dump(tracker, f, indent=2)

# ==================== BIGQUERY ====================
def init_bigquery_client():
    """Initialize BigQuery client without authentication"""
    # For public datasets, you can omit credentials
    # BigQuery will use anonymous access
    import os
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''  # Clear any existing
    
    client = bigquery.Client(project='patents-public-data')
    return client

def query_patents_by_topic(client, topic, cpc_codes, year_from=2015, limit=10000):
    """Query patents from BigQuery by CPC codes"""
    
    cpc_conditions = " OR ".join([f"cpc.code LIKE '{code}%'" for code in cpc_codes])
    
    query = f"""
    SELECT 
        pub.publication_number,
        pub.title_localized[SAFE_OFFSET(0)].text as title,
        pub.abstract_localized[SAFE_OFFSET(0)].text as abstract,
        pub.description_localized[SAFE_OFFSET(0)].text as description,
        pub.claims_localized[SAFE_OFFSET(0)].text as claims,
        pub.publication_date,
        pub.filing_date,
        ARRAY_TO_STRING(pub.assignee_harmonized, '; ') as assignee,
        cpc.code as cpc_code
    FROM 
        `{DATASET}.publications` pub,
        UNNEST(pub.cpc) as cpc
    WHERE 
        ({cpc_conditions})
        AND CAST(pub.publication_date AS INT64) >= {year_from}0101
        AND pub.country_code = 'US'
        AND pub.kind_code IN ('B1', 'B2')
    LIMIT {limit}
    """
    
    print(f"  Running BigQuery for {topic}...")
    print(f"  CPC codes: {', '.join(cpc_codes)}")
    
    query_job = client.query(query)
    return query_job.result()

def save_patent_xml(row, topic):
    """Save patent data as XML"""
    topic_dir = XML_DIR / topic
    topic_dir.mkdir(parents=True, exist_ok=True)
    
    patent_num = row.publication_number.replace(" ", "_").replace("/", "-")
    filepath = topic_dir / f"{patent_num}.xml"
    
    # Escape XML special characters
    def escape_xml(text):
        if not text:
            return ''
        return (str(text)
                .replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;')
                .replace("'", '&apos;'))
    
    xml_content = f"""<?xml version="1.0" encoding="UTF-8"?>
<us-patent-grant>
    <publication-reference>
        <document-id>
            <doc-number>{escape_xml(row.publication_number)}</doc-number>
            <date>{escape_xml(row.publication_date)}</date>
        </document-id>
    </publication-reference>
    <application-reference>
        <document-id>
            <date>{escape_xml(row.filing_date)}</date>
        </document-id>
    </application-reference>
    <assignees>
        <assignee>{escape_xml(row.assignee)}</assignee>
    </assignees>
    <invention-title>{escape_xml(row.title)}</invention-title>
    <abstract>
        <p>{escape_xml(row.abstract)}</p>
    </abstract>
    <description>
        <p>{escape_xml(row.description)}</p>
    </description>
    <claims>
        <claim>{escape_xml(row.claims)}</claim>
    </claims>
    <classifications-cpc>
        <classification-cpc>
            <section>{row.cpc_code[0] if row.cpc_code else ''}</section>
            <class>{row.cpc_code[1:3] if len(row.cpc_code) > 2 else ''}</class>
            <subclass>{row.cpc_code[3] if len(row.cpc_code) > 3 else ''}</subclass>
            <main-group>{row.cpc_code[4:] if len(row.cpc_code) > 4 else ''}</main-group>
        </classification-cpc>
    </classifications-cpc>
</us-patent-grant>
"""
    
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(xml_content)
    
    return filepath

# ==================== MAIN ====================
def main():
    OUTPUT_DIR.mkdir(exist_ok=True)
    XML_DIR.mkdir(exist_ok=True)
    
    print("=" * 70)
    print("Google BigQuery USPTO Patent Downloader")
    print("Public Dataset - No Authentication Required")
    print("=" * 70)
    
    # Initialize client
    print("\nüîß Initializing BigQuery client...")
    try:
        client = init_bigquery_client()
        print("‚úÖ Connected to BigQuery Public Datasets\n")
    except Exception as e:
        print(f"‚ùå Error: {e}")
        print("\nIf you get authentication errors, try:")
        print("pip install --upgrade google-cloud-bigquery")
        return
    
    tracker = load_tracker()
    
    # Process topics
    for topic, cpc_codes in TOPIC_CPC_CODES.items():
        print(f"\n{'='*70}")
        print(f"üìÅ {topic.upper()}")
        print(f"{'='*70}")
        
        try:
            results = query_patents_by_topic(client, topic, cpc_codes, year_from=2015, limit=10000)
            
            count = 0
            print(f"  Processing results...")
            for row in tqdm(results, desc=f"  Saving patents"):
                try:
                    if row.publication_number and row.publication_number not in tracker["downloaded_patents"]:
                        save_patent_xml(row, topic)
                        tracker["downloaded_patents"].append(row.publication_number)
                        count += 1
                except Exception as e:
                    print(f"  ‚ö†Ô∏è  Error saving patent: {e}")
                    continue
            
            tracker["stats"][topic]["patents"] = count
            tracker["stats"][topic]["last_update"] = datetime.now().isoformat()
            save_tracker(tracker)
            
            print(f"‚úÖ {count} patents saved for {topic}")
            
        except Exception as e:
            print(f"‚ùå Error querying {topic}: {e}")
            print(f"   This might be due to query limits on public datasets")
            continue
    
    # Stats
    print("\n" + "=" * 70)
    print("üìä FINAL STATISTICS")
    print("=" * 70)
    for topic, stats in tracker["stats"].items():
        print(f"{topic.upper()}: {stats['patents']} patents")
    print(f"\nTotal: {len(tracker['downloaded_patents'])} unique patents")
    print(f"Location: {XML_DIR}")
    print("=" * 70)

if __name__ == "__main__":
    # Install: pip install google-cloud-bigquery
    main()


Google BigQuery USPTO Patent Downloader
Public Dataset - No Authentication Required

üîß Initializing BigQuery client...
‚ùå Error: File  was not found.

If you get authentication errors, try:
pip install --upgrade google-cloud-bigquery


In [None]:
1. CO‚ÇÇ Methanation
IPC Field:


C07C1/12 OR B01J23/72 OR B01J23/755 OR C10K3/00
Combined IPC + Keywords (recommended):


IPC:(C07C1/12 OR B01J23/72 OR B01J23/755) AND (methanation OR "CO2 hydrogenation" OR Sabatier)
2. Fischer-Tropsch Synthesis
IPC Field:


C07C1/04 OR C10G2/00 OR B01J23/75 OR C10G11/05
Combined IPC + Keywords (recommended):


IPC:(C07C1/04 OR C10G2/00 OR B01J23/75) AND ("Fischer-Tropsch" OR "FT synthesis" OR "syngas conversion")
3. Hydrocracking
IPC Field:


C10G47 OR C10G65 OR B01J21/12 OR B01J23/88 OR C10G49/02
Combined IPC + Keywords (recommended):


IPC:(C10G47 OR C10G65 OR B01J21/12 OR B01J23/88) AND (hydrocracking OR hydrocracker OR "zeolite catalyst")


‚ùØ python ./literature_downloader.py
======================================================================
Parallel PDF Downloader for OpenAlex Papers
Workers: 20 | Timeout: 30s
======================================================================

Found 3 metadata files

======================================================================
üì• CO2_METHANATION
======================================================================
üìä Total papers: 4335
üîì Open access: 2718
üìã Tasks to process: 2716
üë∑ Using 20 parallel workers

  Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2716/2716 [02:21<00:00, 19.26 PDFs/s]

‚úÖ Successfully downloaded: 0
‚è≠Ô∏è  Already existed: 884
‚ùå Failed: 1832

üìä Failure reasons:
  - HTML page: 1607
  - Invalid PDF format: 179
  - HTML response: 20
  - Connection error: 15
  - Timeout: 5
  - HTTP 404: 3
  - File too small: 2
  - HTTP 403: 1

======================================================================
üì• FISCHER_TROPSCH
======================================================================
üìä Total papers: 3822
üîì Open access: 2388
üìã Tasks to process: 2388
üë∑ Using 20 parallel workers

  Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2388/2388 [01:53<00:00, 21.08 PDFs/s]

‚úÖ Successfully downloaded: 1
‚è≠Ô∏è  Already existed: 919
‚ùå Failed: 1468

üìä Failure reasons:
  - HTML page: 1255
  - Invalid PDF format: 168
  - HTML response: 16
  - Timeout: 9
  - Connection error: 8
  - HTTP 403: 5
  - HTTP 404: 3
  - HTTP 410: 1
  - Exceeded 30 redirects.: 1
  - HTTP 303: 1

======================================================================
üì• HYDROCRACKING
======================================================================
üìä Total papers: 2159
üîì Open access: 1661
üìã Tasks to process: 1661
üë∑ Using 20 parallel workers

  Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1661/1661 [01:37<00:00, 17.05 PDFs/s]

‚úÖ Successfully downloaded: 5
‚è≠Ô∏è  Already existed: 626
‚ùå Failed: 1030

üìä Failure reasons:
  - HTML page: 880
  - Invalid PDF format: 99
  - Connection error: 27
  - File too small: 7
  - HTML response: 5
  - Timeout: 5
  - HTTP 403: 2
  - HTTP 500: 2
  - Invalid URL 'www.osti.gov/servlets/purl/2530741': : 1
  - [Errno 2] No such file or directory: 'literature/p: 1

======================================================================
üìä FINAL STATISTICS
======================================================================
‚úÖ Total PDFs downloaded: 6
‚è≠Ô∏è  Already existed: 2429
‚ùå Total failed: 4330
‚è±Ô∏è  Time taken: 352.5s
üìà Download rate: 1.0 PDFs/minute

üìÅ PDFs saved to: literature/pdfs
üìã Failed URLs log: literature/failed_downloads.txt
======================================================================

In [3]:
from pymilvus import connections, Collection

connections.connect(uri="http://localhost:19530")
collection = Collection("ChemQuest_Patents_Multicollection_Hybrid_Embeddings")
collection.load()

# Query one document_metadata entry
results = collection.query(
    expr='attribute_type == "document_metadata"',
    output_fields=["patent", "title", "url"],
    limit=5
)

for r in results:
    print(f"Patent: {r['patent']}")
    print(f"  Title: {r['title'][:50]}...")
    print(f"  URL: {r['url']}")  # ‚Üê See what's actually stored
    print()

Patent: US11866388B1
  Title: Catalyst systems...
  URL: https://patents.google.com/patent/US11866388B1

Patent: US11878289B2
  Title: Hydrotreatment catalysts and process for preparing...
  URL: https://patents.google.com/patent/US11878289B2

Patent: US12227702B2
  Title: Method for conducting finishing hydrodesulphurisat...
  URL: https://patents.google.com/patent/US12227702B2

Patent: US11554365B2
  Title: Organoruthenium carbide complexes as precatalysts ...
  URL: https://patents.google.com/patent/US11554365B2

Patent: US11224853B2
  Title: Metal supported powder catalyst matrix and process...
  URL: https://patents.google.com/patent/US11224853B2

