In [2]:
"""
Load and prepare USPTO patent data for token analysis.

The goal is to analyze token counts to ensure we stay within the API limit of
8000 tokens.
"""

import os
import pandas as pd


# Directory that holds the sample 2005 USPTO data
FOLDER_YEAR = (
    r"C:\Users\Roberto\Documents\GitHub_repositories\USPTO\data"
    r"\fake_2005_folder"
)

# Read the patent table stored as a CSV file inside that directory
csv_path = os.path.join(FOLDER_YEAR, "dataframe.csv")
df = pd.read_csv(csv_path)


In [3]:
df.head(3)

Unnamed: 0,publication_title,publication_num,publication_date,application_type,classifications,inventors,abstract,descriptions,claims,publication_number,ipc_classifications,national_classifications,description,abstract_characters,abstract_tokens,claims_characters,claims_tokens,description_characters,description_tokens
0,Anthurium andreanum plant named 'Anthbondem',,20050106,utility,,"[{'last_name': 'Dijk', 'first_name': 'Jan'}]",\nA new and distinct cultivar of Anthurium and...,,['1\n. A new and distinct\nAnthurium andreanum...,20050005336,['\n07\nG11C019/08\nA01H005/00\n'],['\nUS\nPLT365000\n'],\n\nLATIN NAME OF THE GENUS AND SPECIES OF THE...,643,129,112,29,9258,2306
1,Novelty jeans,,20050106,utility,,"[{'last_name': 'Goldkind', 'first_name': 'Tina'}]",\nNovelty play jeans for children which includ...,,['1\n. A novelty jeans formed of suitable fabr...,20050000001,['\n07\nA41D001/06\n'],['\nUS\n002227000\n'],\n\nBACKGROUND OF THE INVENTION \n The inventi...,507,104,6585,1441,23051,5122
2,Garment featuring means for temporarily attach...,,20050106,utility,,"[{'last_name': 'Levy', 'first_name': 'Philippe...","\nA garment, such as a skirt, culottes, dress,...",,"[""1\n. A temporary means for attaching a cloth...",20050000002,['\n07\nA41D027/08\n'],['\nUS\n002244000\n'],\n\nFIELD OF THE INVENTION \n This invention c...,744,149,2582,541,11689,2466


In [7]:
import tiktoken


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Calculate number of tokens in a string.

    Args:
        string: Input text to tokenize
        encoding_name: Either a model name that tiktoken can map to an
            encoding (e.g. "text-embedding-3-small") or the name of a
            tiktoken encoding itself (e.g. "cl100k_base"). The model-name
            lookup is tried first.

    Returns:
        int: Number of tokens in the string

    Raises:
        KeyError: If ``encoding_name`` is neither a model name nor an
            encoding name known to tiktoken.
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError as model_error:
        # Not a model name tiktoken can map; the parameter is documented as
        # an encoding name, so try to resolve it as one before giving up.
        try:
            encoding = tiktoken.get_encoding(encoding_name)
        except ValueError:
            # Keep the exception type callers saw before the fallback existed
            raise model_error
    return len(encoding.encode(string))


TEXT_COLUMNS = ["abstract", "description", "claims"]
ENCODING_MODEL = "text-embedding-3-small"

for column in TEXT_COLUMNS:
    # Missing values are treated as empty text for both measurements
    filled_text = df[column].fillna("")

    # Tokens per document
    df[f"{column}_tokens"] = filled_text.apply(
        lambda text: num_tokens_from_string(text, ENCODING_MODEL)
    )

    # Characters per document
    df[f"{column}_characters"] = filled_text.apply(len)

    print(f"Max tokens for {column}: {df[f'{column}_tokens'].max()}")
    print(f"Max characters for {column}: {df[f'{column}_characters'].max()}")



Max tokens for abstract: 738
Max characters for abstract: 3034
Max tokens for description: 530181
Max characters for description: 1191746
Max tokens for claims: 67972
Max characters for claims: 214028


In [8]:
# Report the token-count distribution of every text column
PERCENTILES = [0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
TEXT_COLUMNS = ["abstract", "description", "claims"]

for column in TEXT_COLUMNS:
    print(f"\nPercentiles for {column}:")
    token_counts = df[f"{column}_tokens"]
    token_percentiles = token_counts.quantile(PERCENTILES)
    print(token_percentiles)

# Add up the tokens of all text columns over the whole dataset
total_tokens = 0
for col in TEXT_COLUMNS:
    total_tokens += df[f"{col}_tokens"].sum()
print(f"\nTotal number of tokens: {total_tokens:,}")



Percentiles for abstract:
0.10     66.0
0.25     94.0
0.50    130.0
0.75    168.0
0.90    207.0
0.99    320.0
Name: abstract_tokens, dtype: float64

Percentiles for description:
0.10     2768.00
0.25     4214.50
0.50     6859.00
0.75    11954.00
0.90    21289.00
0.99    70266.78
Name: description_tokens, dtype: float64

Percentiles for claims:
0.10     450.00
0.25     739.00
0.50    1170.00
0.75    1842.00
0.90    2898.20
0.99    8702.24
Name: claims_tokens, dtype: float64

Total number of tokens: 140,158,188


Token counts analysis (API limit: 8,000 tokens):
- Abstract: Token counts (max 738) are well within acceptable limits for API calls
- Description: Extremely high token counts (99th percentile ≈ 70,267; max 530,181) - the 99th percentile exceeds the API limit by ~9x
  This will require chunking/summarization before sending to API
- Claims: The 99th percentile (≈ 8,702) is already above the API limit, and the max is 67,972
  Edge cases that exceed the limit need handling



In [11]:
# Inspect the description column for missing / non-text entries

# Collect every entry that is not a Python string
non_string_elements = list(
    filter(lambda element: not isinstance(element, str), df["description"])
)

# Report how many there are and which distinct values occur
print(f"Number of non-string elements: {len(non_string_elements)}")
print(f"Unique non-string values: {set(non_string_elements)}")

# Report the overall table size
print(f"DataFrame shape: {df.shape}")

# Show a few rows whose description is NaN
print("\nSample patents with NaN descriptions:")
description_is_nan = df["description"].isna()
nan_descriptions = df.loc[description_is_nan].head(4)
print(nan_descriptions)


Number of non-string elements: 0
Unique non-string values: set()
DataFrame shape: (10979, 19)

Sample patents with NaN descriptions:
Empty DataFrame
Columns: [publication_title, publication_num, publication_date, application_type, classifications, inventors, abstract, descriptions, claims, publication_number, ipc_classifications, national_classifications, description, abstract_characters, abstract_tokens, claims_characters, claims_tokens, description_characters, description_tokens]
Index: []


In [14]:
"""Analyze patent XML data and investigate NaN values in the dataset.

This cell:
1. Sets up imports and paths
2. Parses a sample XML file
3. Analyzes publication numbers and NaN values in the dataset
"""

import os
import sys
from pathlib import Path

# Add parent directory to Python path for imports
parent_dir = Path.cwd().parent
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

from process_patent_xml import parse_xml

# Location of one sample patent XML file
xml_path = Path(
    "C:/Users/Roberto/Documents/GitHub_repositories/USPTO/data"
    "/fake_2005_folder/I20050106/UTIL0002/US20050002861A1-20050106"
    "/US20050002861A1-20050106.XML"
)

# Parse it and show the publication number it reports
parsed_xml = parse_xml(xml_path)
print(f"Publication number from XML: {parsed_xml['publication_number']}")

# Look up the DataFrame row whose publication number is 20050002861
publication_numbers = df["publication_number"]
matching_record = df.loc[publication_numbers == 20050002861]
print("\nMatching record from DataFrame:")
print(matching_record)

# Boolean masks shared by the checks below
description_is_nan = df["description"].isna()
publication_is_duplicate = publication_numbers.duplicated()

# Counts relating NaN descriptions to publication numbers
nan_desc_missing_pub = publication_numbers[description_is_nan].isna().sum()
unique_pub_count = publication_numbers.nunique()
duplicate_pub_count = publication_is_duplicate.sum()

print(f"\nRecords with NaN description and missing publication number: {nan_desc_missing_pub}")
print(f"Unique publication numbers: {unique_pub_count}")
print(f"Duplicate publication numbers: {duplicate_pub_count}")

# Check whether the duplicated publication numbers are exactly the ones
# attached to NaN descriptions
duplicated_pub_nums = set(publication_numbers[publication_is_duplicate].tolist())
nan_desc_pub_nums = set(publication_numbers[description_is_nan].tolist())

print("\nDuplicated publication numbers match NaN descriptions?")
print(duplicated_pub_nums == nan_desc_pub_nums)

Publication number from XML: 20050002861

Matching record from DataFrame:
                                      publication_title  publication_num  \
2860  Enhanced scintigraphic imaging agents for imag...              NaN   

      publication_date application_type  classifications  \
2860          20050106          utility              NaN   

                                              inventors  \
2860  [{'last_name': 'Krause', 'first_name': 'Sabine...   

                                               abstract  descriptions  \
2860  \nThe invention describes enhanced scintigraph...           NaN   

                                                 claims  publication_number  \
2860  ['1\n. A reagent comprising:\ni) a polybasic c...         20050002861   

         ipc_classifications national_classifications  \
2860  ['\n07\nA61K051/00\n']    ['\nUS\n424001690\n']   

                                            description  abstract_characters  \
2860  \n\n This application clai

In [13]:
# Preview the first 50 characters of every value in the parsed XML dictionary
for key, value in parsed_xml.items():
    # str() leaves a string unchanged, so a single code path covers strings,
    # lists and any other value type
    print(f"{key}: {str(value)[:50]}...")

title: Enhanced scintigraphic imaging agents for imaging ...
application_id: 10752096...
publication_number: 20050002861...
publication_date: 20050106...
application_type: utility...
ipc_classifications: ['\n07\nA61K051/00\n']...
national_classifications: ['\nUS\n424001690\n']...
inventors: [{'last_name': 'Krause', 'first_name': 'Sabine'}, ...
abstract: 
The invention describes enhanced scintigraphic im...
description: 

 This application claims the benefit of the fili...
claims: 1
. A reagent comprising:
i) a polybasic compound ...


In [17]:
# Pricing inputs for comparing Gemini with the OpenAI embedding model
# Sources:
# - Gemini: https://cloud.google.com/vertex-ai/pricing#gemini
# - OpenAI: https://openai.com/pricing#embedding-models

# Published prices
GEMINI_PRICE_PER_1K_CHARS = 0.0002  # $0.0002 per 1k characters
GPT4_PRICE_PER_1M_TOKENS = 0.02     # $0.02 per 1M tokens (text-embedding-3-small)

# Rough conversion used to put both prices on a per-token basis
# (assumes 1 token ≈ 4 characters)
CHARS_PER_TOKEN = 4

# Per-token prices derived from the figures above
gemini_price_per_char = GEMINI_PRICE_PER_1K_CHARS / 1_000
gemini_price_per_token = gemini_price_per_char * CHARS_PER_TOKEN
gpt4_price_per_token = GPT4_PRICE_PER_1M_TOKENS / 1_000_000

def print_price_comparison(df: pd.DataFrame) -> None:
    """Print a cost comparison of the Gemini and GPT-4 per-token prices.

    Uses the module-level ``gemini_price_per_token`` and
    ``gpt4_price_per_token`` rates together with the mean token counts of
    the three text columns.

    Args:
        df: DataFrame with ``abstract_tokens``, ``description_tokens`` and
            ``claims_tokens`` columns
    """
    # Mean tokens per document, summed over the three text columns
    token_columns = ("abstract_tokens", "description_tokens", "claims_tokens")
    total_avg_tokens = sum(df[name].mean() for name in token_columns)

    # Cost of one average document under each model
    per_document = (
        ("Gemini", total_avg_tokens * gemini_price_per_token),
        ("GPT-4", total_avg_tokens * gpt4_price_per_token),
    )

    # Header: raw per-token prices scaled to 1M tokens
    print("Price Comparison (per 1M tokens):")
    print(f"Gemini: ${1e6 * gemini_price_per_token:.2f}")
    print(f"GPT-4:  ${1e6 * gpt4_price_per_token:.2f}")

    print(f"\nAverage tokens per document: {total_avg_tokens:.0f}")

    print("\nPrice per document:")
    for model, price in per_document:
        print(f"{model}: ${price:.4f}")

    # Same per-document prices scaled up to one million documents
    print("\nPrice per 1M documents:")
    for model, price in ((name, cost * 1e6) for name, cost in per_document):
        print(f"{model}: ${price:,.2f}")

# Generate price comparison
print_price_comparison(df)

Price Comparison (per 1M tokens):
Gemini: $0.80
GPT-4:  $0.02

Average tokens per document: 12766

Price per document:
Gemini: $0.0102
GPT-4: $0.0003

Price per 1M documents:
Gemini: $10,212.82
GPT-4: $255.32


In [19]:
# Calculate total cost for Gemini API processing

# Constants
GEMINI_PRICE_PER_1K_CHARS = 0.0002  # USD per 1000 characters
TEXT_COLUMNS = ["abstract", "description", "claims"]

def calculate_gemini_costs(df: pd.DataFrame) -> None:
    """Calculate and print cost estimates for processing documents with Gemini API.

    The estimate is based on the ``<column>_characters`` columns of every
    entry in ``TEXT_COLUMNS`` and on ``GEMINI_PRICE_PER_1K_CHARS``.

    Args:
        df: DataFrame containing character counts for text columns. An empty
            DataFrame is accepted; its per-document cost is reported as 0.

    Returns:
        None. All results are printed.
    """
    # Calculate total characters across all documents and columns
    total_chars = sum(
        df[f"{col}_characters"].sum() for col in TEXT_COLUMNS
    )
    print(f"Total number of characters: {total_chars:,}")

    # Calculate costs at different scales
    total_cost = (GEMINI_PRICE_PER_1K_CHARS * total_chars) / 1000
    n_docs = len(df)
    # With no documents there is nothing to average over; report 0 instead
    # of dividing by zero (which yields nan or ZeroDivisionError).
    cost_per_doc = total_cost / n_docs if n_docs else 0.0
    cost_per_1m_docs = cost_per_doc * 1e6

    print("\nCost Estimates (Gemini API):")
    print(f"Total cost for dataset: ${total_cost:.2f}")
    print(f"Cost per document: ${cost_per_doc:.4f}")
    print(f"Cost for 1M documents: ${cost_per_1m_docs:,.2f}")

    print("\nNote: Gemini has a max input size of 2048 tokens.")
    print("Consider evaluating both Gemini and GPT-4 for embedding quality.")

# Generate cost estimates
calculate_gemini_costs(df)

Total number of characters: 604,882,066

Cost Estimates (Gemini API):
Total cost for dataset: $120.98
Cost per document: $0.0110
Cost for 1M documents: $11,018.89

Note: Gemini has a max input size of 2048 tokens.
Consider evaluating both Gemini and GPT-4 for embedding quality.


In [49]:
# Reload the patent table from disk and preview the first rows.
# NOTE(review): this path spells the folder "GitHub Repositories" while the
# first cell uses "GitHub_repositories" - confirm which one is correct.
# Rebinding `df` here also replaces the frame the earlier cells worked on.
reloaded_csv_path = r"C:\Users\Roberto\Documents\GitHub Repositories\USPTO\data\fake_2005_folder\dataframe.csv"
df = pd.read_csv(reloaded_csv_path)
df.head(3)

Unnamed: 0,publication_title,publication_num,publication_date,application_type,classifications,inventors,abstract,descriptions,claims,publication_number,ipc_classifications,national_classifications,description
0,Anthurium andreanum plant named 'Anthbondem',,20050106,utility,,"[{'last_name': 'Dijk', 'first_name': 'Jan'}]",\nA new and distinct cultivar of Anthurium and...,,['1\n. A new and distinct\nAnthurium andreanum...,20050005336,['\n07\nG11C019/08\nA01H005/00\n'],['\nUS\nPLT365000\n'],\n\nLATIN NAME OF THE GENUS AND SPECIES OF THE...
1,Novelty jeans,,20050106,utility,,"[{'last_name': 'Goldkind', 'first_name': 'Tina'}]",\nNovelty play jeans for children which includ...,,['1\n. A novelty jeans formed of suitable fabr...,20050000001,['\n07\nA41D001/06\n'],['\nUS\n002227000\n'],\n\nBACKGROUND OF THE INVENTION \n The inventi...
2,Garment featuring means for temporarily attach...,,20050106,utility,,"[{'last_name': 'Levy', 'first_name': 'Philippe...","\nA garment, such as a skirt, culottes, dress,...",,"[""1\n. A temporary means for attaching a cloth...",20050000002,['\n07\nA41D027/08\n'],['\nUS\n002244000\n'],\n\nFIELD OF THE INVENTION \n This invention c...
