In [3]:
import os
import re
import pdfplumber

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        return "\n".join(page.extract_text() for page in pdf.pages)

# Function to extract information based on file type
def extract_patent_information(file_path):
    text = extract_text_from_pdf(file_path)

    # Log raw text for debugging
    print(f"\nRaw text from {file_path}:\n{text}\n")

    # Determine patent type
    if "D0" in os.path.basename(file_path):
        patent_type = "D0"
    elif "RE" in os.path.basename(file_path):
        patent_type = "RE"
    elif "PP" in os.path.basename(file_path):
        patent_type = "PP"
    else:
        raise ValueError("Unsupported patent type")

    # Define patterns for different fields
    patterns = {
        "Patent Number": r"(?:US|USP)?\s*(\w+)",
        "Title": r"Title:\s*(.+?)\n",
        "Applicant": r"Applicant:\s*(.+?)\n",
        "Application Date": r"Application Date:\s*(.+?)\n",
        "Patent Date": r"Patent Date:\s*(.+?)\n",
        "References": r"References:\s*(.+?)\n",
        "Claims": r"Claims:\s*(.+?)\n",
        "Description": r"Description:\s*(.+)$"
    }

    # Extract fields using regex
    extracted_data = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        extracted_data[field] = match.group(1).strip() if match else "Not Found"

    return extracted_data

# Example usage
file_paths = ["PP000881.pdf", "RE023169.pdf", "D0155564.pdf"]
all_data = {}

for file_path in file_paths:
    file_path = os.path.join(os.getcwd(), file_path)  # Make the path OS-independent
    try:
        all_data[file_path] = extract_patent_information(file_path)
    except Exception as e:
        all_data[file_path] = {"Error": str(e)}

# Display extracted data
for file, data in all_data.items():
    print(f"\nExtracted data for {file}:")
    for key, value in data.items():
        print(f"{key}: {value}")



Raw text from d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\PP000881.pdf:





Raw text from d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\RE023169.pdf:




















Raw text from d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\D0155564.pdf:




Extracted data for d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\PP000881.pdf:
Patent Number: Not Found
Title: Not Found
Applicant: Not Found
Application Date: Not Found
Patent Date: Not Found
References: Not Found
Claims: Not Found
Description: Not Found

Extracted data for d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\RE023169.pdf:
Patent Number: Not Found
Title: Not Found
Applicant: Not Found
Application Date: Not Found
Patent Date: Not Found
References: Not Found
Claims: Not Found
Description: Not Found

Extracted data for d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\D0155564.pdf:
Patent Number: Not Found
Title: Not Found
Applicant: Not Found
Application Date: Not Found
Patent Date: Not Found
References: Not Found
Claims: Not Found
Description: Not Fo

In [5]:
import pdfplumber

def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = "\n".join(page.extract_text() or "Page not readable" for page in pdf.pages)
    return text

file_path = "PP000881.pdf"  # Change to the path of the file to debug
raw_text = extract_text_from_pdf(file_path)
print(f"\nRaw text from {file_path}:\n{raw_text}")



Raw text from PP000881.pdf:
Page not readable
Page not readable
Page not readable


In [7]:
import os
import re
from pdf2image import convert_from_path
from pytesseract import image_to_string

# Function to extract text from a PDF using OCR
def extract_text_with_ocr(file_path):
    pages = convert_from_path(file_path)  # Convert PDF to images
    text = ""
    for page_number, page in enumerate(pages, start=1):
        page_text = image_to_string(page)
        text += f"--- Page {page_number} ---\n{page_text}\n"
    return text

# Function to extract patent information dynamically
def extract_patent_information(file_path):
    text = extract_text_with_ocr(file_path)

    # Log raw text for debugging
    print(f"\nRaw OCR text from {file_path}:\n{text}\n")

    # Initialize extracted data dictionary
    extracted_data = {
        "Patent Number": "Not Found",
        "Title": "Not Found",
        "Applicant": "Not Found",
        "Application Date": "Not Found",
        "Patent Date": "Not Found",
        "References": "Not Found",
        "Claims": "Not Found",
        "Description": "Not Found"
    }

    # Update regex patterns based on observed format
    patterns = {
        "Patent Number": r"Patent\\s*Number[:\\-\\s]*(\\w+)",  # Example: "Patent Number: 123456"
        "Title": r"Title[:\\-\\s]*(.+)",  # Example: "Title: Invention Title"
        "Applicant": r"Applicant[:\\-\\s]*(.+)",  # Example: "Applicant: John Doe"
        "Application Date": r"Application\\s*Date[:\\-\\s]*(.+)",  # Example: "Application Date: Jan 1, 2020"
        "Patent Date": r"Patent\\s*Date[:\\-\\s]*(.+)",  # Example: "Patent Date: Feb 2, 2021"
        "References": r"References[:\\-\\s]*(.+)",  # Example: "References: Patent 123456"
        "Claims": r"Claims[:\\-\\s]*(.+)",  # Example: "Claims: 1. Claim text here."
        "Description": r"Description[:\\-\\s]*(.+)",  # Example: "Description: Detailed description."
    }

    # Extract fields using regex
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        extracted_data[field] = match.group(1).strip() if match else "Not Found"

    return extracted_data

# Example usage
file_paths = ["PP000881.pdf", "RE023169.pdf", "D0155564.pdf"]
all_data = {}

for file_path in file_paths:
    file_path = os.path.join(os.getcwd(), file_path)  # Make the path OS-independent
    try:
        all_data[file_path] = extract_patent_information(file_path)
    except Exception as e:
        all_data[file_path] = {"Error": str(e)}

# Display extracted data
for file, data in all_data.items():
    print(f"\nExtracted data for {file}:")
    for key, value in data.items():
        print(f"{key}: {value}")



Raw OCR text from d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\PP000881.pdf:
--- Page 1 ---
Oct. 18, 1949. C. D. CORLISS Plant Pat. 881

EUONYMUS FORTUNEI PLANT

Filed Dec. 19, 1947

ae


--- Page 2 ---
ki

Patented Oct. 18, 1949

Plant Pat. 881

UNITED STATES PATENT OFFICE

881

EUONYMUS FORTUNEI PLANT

Clifford D. Corliss, Magnolia, Mass., assignor to
Corliss Bros. Inc., Gloucester, Mass., a corpora-
tion of Massachusetts

Application December 19, 1947, Serial No. 792,707
(CL 47—59)

1 Claim.

1

This invention relates to a new and distinct
variety of Euonymus fortunei plant, an evergreen
originated by me from a selected seedling pro-
duced by crossing Euonymus fortunei vegeta and
Euonymus fortunei radicans.

This new variety possesses a number of desir-
able characteristics clearly distinguishing it from
its parents and other similar evergreen varieties,
and the following comparisons will suffice to
point out its particular advantages of utility for
ornamental purposes.

It is well k

In [11]:
import os
import re
from pdf2image import convert_from_path
from pytesseract import image_to_string

# Function to extract text from a PDF using OCR
def extract_text_with_ocr(file_path):
    pages = convert_from_path(file_path)  # Convert PDF to images
    text = ""
    for page_number, page in enumerate(pages, start=1):
        page_text = image_to_string(page)
        text += f"--- Page {page_number} ---\n{page_text}\n"
    return text

# Function to extract information from PP and D0 files
def extract_patent_information(file_path):
    text = extract_text_with_ocr(file_path)

    # Log raw text for debugging
    print(f"\nRaw OCR text from {file_path}:\n{text}\n")

    # Initialize extracted data dictionary
    extracted_data = {
        "Patent Number": "Not Found",
        "Title": "Not Found",
        "Applicant": "Not Found",
        "Application Date": "Not Found",
        "Patent Date": "Not Found",
        "References": "Not Found",
        "Claims": "Not Found",
        "Description": "Not Found",
    }

    # Specific patterns for extracting fields
    patterns = {
        "Patent Number": r"(?:Plant Pat\.|Des\.)?\s*(\d{1,6})",  # Example: "Plant Pat. 881" or "Des. 155,564"
        "Title": r"\n(\d{1,6})\n([A-Z][A-Z0-9 ,.\\-]*)",  # Title follows the patent number
        "Applicant": r"\n([A-Z][A-Za-z0-9 ,.\\-]*)\n(Patented|Application)",  # Name before "Patented Date" or "Application"
        "Patent Date": r"(Patented Date|Patented)[:\\s]*(\\w+ \\d{1,2}, \\d{4})",  # "Patented Date: Month Day, Year"
        "Application Date": r"(Application[:\\s]*)?(\\w+ \\d{1,2}, \\d{4})",  # "Application Month Day, Year"
        "References": r"References[:\\s]*([A-Za-z0-9 ,.;]*)",  # "References: Patent details"
        "Claims": r"Claims[:\\s]*(.*?)(?=Description|$)",  # Claims text before "Description"
        "Description": r"Description[:\\s]*(.+)$",  # Everything after "Description"
    }

    # Extract fields using regex
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            # Handle special cases where multiple groups are used
            if field == "Title":
                extracted_data[field] = match.group(2).strip()  # Title is in the second group
            elif field == "Applicant":
                extracted_data[field] = match.group(1).strip()  # Applicant is in the first group
            else:
                extracted_data[field] = match.group(1).strip()

    return extracted_data

# Example usage
file_paths = ["PP000881.pdf", "D0155564.pdf"]  # Add "RE" files as needed
all_data = {}

for file_path in file_paths:
    file_path = os.path.join(os.getcwd(), file_path)  # Make the path OS-independent
    try:
        all_data[file_path] = extract_patent_information(file_path)
    except Exception as e:
        all_data[file_path] = {"Error": str(e)}

# Display extracted data
for file, data in all_data.items():
    print(f"\nExtracted data for {file}:")
    for key, value in data.items():
        print(f"{key}: {value}")



Raw OCR text from d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\PP000881.pdf:
--- Page 1 ---
Oct. 18, 1949. C. D. CORLISS Plant Pat. 881

EUONYMUS FORTUNEI PLANT

Filed Dec. 19, 1947

ae


--- Page 2 ---
ki

Patented Oct. 18, 1949

Plant Pat. 881

UNITED STATES PATENT OFFICE

881

EUONYMUS FORTUNEI PLANT

Clifford D. Corliss, Magnolia, Mass., assignor to
Corliss Bros. Inc., Gloucester, Mass., a corpora-
tion of Massachusetts

Application December 19, 1947, Serial No. 792,707
(CL 47—59)

1 Claim.

1

This invention relates to a new and distinct
variety of Euonymus fortunei plant, an evergreen
originated by me from a selected seedling pro-
duced by crossing Euonymus fortunei vegeta and
Euonymus fortunei radicans.

This new variety possesses a number of desir-
able characteristics clearly distinguishing it from
its parents and other similar evergreen varieties,
and the following comparisons will suffice to
point out its particular advantages of utility for
ornamental purposes.

It is well k

In [12]:
all_data

{'d:\\Data\\HiWi\\Job 2- Hanh\\Hanh\\Patent\\files\\PP000881.pdf': {'Patent Number': '1',
  'Title': 'Not Found',
  'Applicant': 'Not Found',
  'Application Date': 'Not Found',
  'Patent Date': 'Not Found',
  'References': '. cited.',
  'Claims': 'Not Found',
  'Description': 'of the\nnew variety, color terminology being in accord-\nance with ordinary dictionary significance:\n\nClassification: Euonymus fortunei var. (vegeta\nX radicans),\n\nHabit: Distinctly upright without support; plant\narises from a central stem; form of shrub\novoid, well bushed at the base. The regular\nvegeta which it resembles in leaf character is\na more or less prostrate vine which grows\nupright only if supported.\n\nLeaves and branchlets: Leaves evergreen, 3-5\ncm. long, opposite, petioled, broad ovate to sub-\norbicular; margin crenate-serrate; apex emar-\nginate; dark glossy green above; light dull\ngreen beneath; veins slightly more prominent\non the upper than on the under surface; tex-\nture somewhat 

In [13]:
import pandas as pd
import re
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Function to extract text from a PDF file using OCR
def extract_text_from_pdf(pdf_path, dpi=300):
    pages = convert_from_path(pdf_path, dpi)
    text_data = ''
    for page in pages:
        text_data += pytesseract.image_to_string(page)
    return text_data

# Function to clean and structure the extracted text
def process_patent_text(text):
    # Normalize the text by removing extra spaces and line breaks
    normalized_text = re.sub(r'\s+', ' ', text.strip())

    # Extract key fields using improved regex patterns
    patent_number = re.search(r'(?:Patent(?:ed)?|Des\.)\s*(\d{1,3},\d{3})', normalized_text)  # Match patterns like 155,564
    title = re.search(r'(?<=COMBINATION\s).*?(?=\sFiled)', normalized_text, re.IGNORECASE)  # Extract title before "Filed"
    applicant = re.search(r'(?:Be it known that I, )(.*?)(?=, a citizen)', normalized_text, re.IGNORECASE)  # Match applicant name
    application_date = re.search(r'Filed (.*?),', normalized_text, re.IGNORECASE)  # Match filed date
    patent_date = re.search(r'Patented (.*?)(?=\sUNITED|Des)', normalized_text, re.IGNORECASE)  # Match patented date
    claims = re.search(r'I claim:(.*?)(?=REFERENCES|$)', normalized_text, re.IGNORECASE)  # Match claims before "REFERENCES"
    references = re.search(r'REFERENCES CITED(.*?)$', normalized_text, re.IGNORECASE)  # Match references section

    # Create a dictionary of the structured data
    structured_data = {
        "Patent Number": patent_number.group(1) if patent_number else None,
        "Title": title.group(0).strip() if title else None,
        "Applicant": applicant.group(1).strip() if applicant else None,
        "Application Date": application_date.group(1).strip() if application_date else None,
        "Patent Date": patent_date.group(1).strip() if patent_date else None,
        "Claims": claims.group(1).strip() if claims else None,
        "References": references.group(1).strip() if references else None,
        "Description": text.strip(),  # Keep the full OCR text for reference
    }

    return structured_data

# Function to convert structured data to a DataFrame
def create_dataframe_from_patent(data):
    df = pd.DataFrame([data])
    return df

# Main function to handle the entire process
def process_patent_file(pdf_path):
    raw_text = extract_text_from_pdf(pdf_path)
    structured_data = process_patent_text(raw_text)
    df = create_dataframe_from_patent(structured_data)
    return df

# Path to the patent PDF file
pdf_file_path = '02488002.pdf'

# Process the file and display the DataFrame
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
patent_df = process_patent_file(pdf_file_path)
print(patent_df)

# Save the DataFrame to a CSV file
patent_df.to_csv('patent_data_with_description_1.csv', index=False)


  Patent Number Title Applicant Application Date      Patent Date  \
0          None  None      None          Oct. 17  Nov. 15, 1949 _   

                                              Claims  \
0  I. A trailer for use in orchards and the like ...   

                                          References  \
0  The following references are of record. in: th...   

                                         Description  
0  G. CARRAHER _ 2,488,002\n\nTRAILER FRAME AND W...  


In [1]:
import os
import re
from pdf2image import convert_from_path
from pytesseract import image_to_string

# Function to extract text from a PDF using OCR
def extract_text_from_pdf(file_path, dpi=300):
    pages = convert_from_path(file_path, dpi)
    text_data = ''
    for page in pages:
        text_data += image_to_string(page)
    return text_data

# Function to process the extracted text and extract patent fields
def process_patent_text(text):
    # Normalize the text by removing extra spaces and line breaks
    normalized_text = re.sub(r'\s+', ' ', text.strip())

    # Extract key fields using refined regex patterns
    patent_number = re.search(r'(?:Plant Pat\.|Des\.|Patent(?:ed)?|Patent Number)\s*(\d{1,3}[,\.]?\d{3})', normalized_text)
    # title = re.search(r'(?<=\d{1,6}\s)([A-Z][A-Z0-9 ,.\-]+)(?=\sFiled|\sPatented|\sApplication)', normalized_text, re.IGNORECASE)
    title = re.search(r'(\d{1,6}\s)([A-Z][A-Z0-9 ,.\-]+)(?=\sFiled|\sPatented|\sApplication)', normalized_text, re.IGNORECASE)

    applicant = re.search(r'(?:Be it known that I, )(.*?)(?=, a citizen| of the)', normalized_text, re.IGNORECASE)
    application_date = re.search(r'Filed (.*?),', normalized_text, re.IGNORECASE)
    patent_date = re.search(r'Patented (.*?)(?=\sUNITED|\sDes|\n)', normalized_text, re.IGNORECASE)
    references = re.search(r'(REFERENCES CITED|no reference cited)(.*?)(?=Claims|Description|$)', normalized_text, re.IGNORECASE)
    claims = re.search(r'I claim[:\s]*(.*?)(?=REFERENCES|Description|$)', normalized_text, re.IGNORECASE)

    # Create a dictionary of the structured data
    structured_data = {
        "Patent Number": patent_number.group(1).replace(',', '').strip() if patent_number else "Not Found",
        "Title": title.group(1).strip() if title else "Not Found",
        "Applicant": applicant.group(1).strip() if applicant else "Not Found",
        "Application Date": application_date.group(1).strip() if application_date else "Not Found",
        "Patent Date": patent_date.group(1).strip() if patent_date else "Not Found",
        "References": references.group(2).strip() if references else "No References Cited",
        "Claims": claims.group(1).strip() if claims else "Not Found",
        "Description": text.strip(),  # Full OCR text as Description
    }

    return structured_data

# Main function to process a patent PDF file
def process_patent_file(file_path):
    raw_text = extract_text_from_pdf(file_path)
    structured_data = process_patent_text(raw_text)
    return structured_data

# Example usage
file_paths = ["PP000881.pdf", "D0155564.pdf",'02488002.pdf','RE023169.pdf']  # Add "RE" files as needed
all_data = {}

for file_path in file_paths:
    file_path = os.path.join(os.getcwd(), file_path)  # Make the path OS-independent
    try:
        all_data[file_path] = process_patent_file(file_path)
    except Exception as e:
        all_data[file_path] = {"Error": str(e)}

# Display extracted data
for file, data in all_data.items():
    print(f"\nExtracted data for {file}:")
    for key, value in data.items():
        print(f"{key}: {value}")



Extracted data for d:\Data\HiWi\Job 2- Hanh\Hanh\Patent\files\PP000881.pdf:
Patent Number: Not Found
Title: 1947
Applicant: Not Found
Application Date: Dec. 19
Patent Date: Oct. 18, 1949 Plant Pat. 881
References: No References Cited
Claims: A new and distinct horticultural variety of Euonymus. fortunei plant herein shown and de- scribed and characterized particularly by its erectness and shapeliness of shrub growth in the absence of support; its vigorous, sturdy and 881 4 bushy habit of growth in forming from one céfi- tral stem, a neatly rounded evergreen bush; its large, thick and glossy leaves; its hardiness and resistance to diseases; ability to stand dry con- ditions, and the absence of any showing of win- ter burn, CLIFFORD D. CORLISS. No.
Description: Oct. 18, 1949.

Cc. D. CORLISS
EUONYMUS FORTUNEI PLANT

Filed Dec. 19, 1947

Plant Pat. 881

Oe
Patented Oct. 18, 1949

Plant Pat. 881

UNITED STATES PATENT OFFICE

881
EUONYMUS FORTUNEI PLANT

Clifford D. Corliss, Magnolia, Mass

In [3]:
type(all_data)

dict

In [4]:
import csv

with open("mycsvfile.csv", "w", newline="") as f:
    w = csv.DictWriter(f, all_data.keys())
    w.writeheader()
    w.writerow(all_data)

In [5]:
import os

def select_pdfs_with_prefixes(directory, prefixes):
    """
    Select all PDF files in the given directory and its subdirectories
    that start with specific prefixes.

    Args:
        directory (str): Path to the directory to search.
        prefixes (list): List of prefixes to match at the start of file names.

    Returns:
        list: List of file paths to PDF files matching the prefixes.
    """
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.pdf') and any(file.startswith(prefix) for prefix in prefixes):
                file_paths.append(os.path.join(root, file))
    return file_paths


# Define the directory to search (current folder or specify the folder path)
directory = '.'  # Change to your desired directory path

# Define the prefixes to filter by
prefixes = ["02", "D0", "PP", "RE"]

# Get all PDF files matching the prefixes
file_paths = select_pdfs_with_prefixes(directory, prefixes)

# Create a dictionary to store data
all_data = {file: None for file in file_paths}  # Initialize with None; update with actual data as needed

# Print the results
print("PDF files matching the prefixes:")
for path in file_paths:
    print(path)


PDF files matching the prefixes:
.\02488002.pdf
.\D0155564.pdf
.\PP000881.pdf
.\RE023169.pdf


In [6]:
file_paths

['.\\02488002.pdf', '.\\D0155564.pdf', '.\\PP000881.pdf', '.\\RE023169.pdf']

In [7]:
for file_path in file_paths:
    file_path = os.path.join(os.getcwd(), file_path)  # Make the path OS-independent
    try:
        all_data[file_path] = process_patent_file(file_path)
    except Exception as e:
        all_data[file_path] = {"Error": str(e)}

# Display extracted data
for file, data in all_data.items():
    print(f"\nExtracted data for {file}:")
    for key, value in data.items():
        print(f"{key}: {value}")


Extracted data for .\02488002.pdf:


AttributeError: 'NoneType' object has no attribute 'items'

## Updating for all file types

In [8]:
import os
import re
import csv
from pdf2image import convert_from_path
from pytesseract import image_to_string


# Function to extract text from a PDF using OCR
def extract_text_from_pdf(file_path, dpi=300):
    pages = convert_from_path(file_path, dpi)
    text_data = ''
    for page in pages:
        text_data += image_to_string(page)
    return text_data


# Function to process the extracted text and extract patent fields
def process_patent_text(text):
    # Normalize the text by removing extra spaces and line breaks
    normalized_text = re.sub(r'\s+', ' ', text.strip())

    # Extract key fields using regex patterns
    patent_number = re.search(r'(?:Plant Pat\.|Des\.|Patent(?:ed)?|Patent Number)\s*(\d{1,3}[,\.]?\d{3})', normalized_text)
    title = re.search(r'(\d{1,6}\s)([A-Z][A-Z0-9 ,.\-]+)(?=\sFiled|\sPatented|\sApplication)', normalized_text, re.IGNORECASE)
    applicant = re.search(r'(?:Be it known that I, )(.*?)(?=, a citizen| of the)', normalized_text, re.IGNORECASE)
    application_date = re.search(r'Filed (.*?),', normalized_text, re.IGNORECASE)
    patent_date = re.search(r'Patented (.*?)(?=\sUNITED|\sDes|\n)', normalized_text, re.IGNORECASE)
    references = re.search(r'(REFERENCES CITED|no reference cited)(.*?)(?=Claims|Description|$)', normalized_text, re.IGNORECASE)
    claims = re.search(r'I claim[:\s]*(.*?)(?=REFERENCES|Description|$)', normalized_text, re.IGNORECASE)

    # Create a dictionary of the structured data
    structured_data = {
        "Patent Number": patent_number.group(1).replace(',', '').strip() if patent_number else "Not Found",
        "Title": title.group(2).strip() if title else "Not Found",
        "Applicant": applicant.group(1).strip() if applicant else "Not Found",
        "Application Date": application_date.group(1).strip() if application_date else "Not Found",
        "Patent Date": patent_date.group(1).strip() if patent_date else "Not Found",
        "References": references.group(2).strip() if references else "No References Cited",
        "Claims": claims.group(1).strip() if claims else "Not Found",
        "Description": text.strip(),  # Full OCR text as Description
    }

    return structured_data


# Main function to process a patent PDF file
def process_patent_file(file_path):
    raw_text = extract_text_from_pdf(file_path)
    structured_data = process_patent_text(raw_text)
    return structured_data


# Function to filter PDF files with specified prefixes
def select_pdfs_with_prefixes(directory, prefixes):
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.pdf') and any(file.startswith(prefix) for prefix in prefixes):
                file_paths.append(os.path.join(root, file))
    return file_paths





In [9]:
# Define the directory to search (folder named "files")
directory = 'data'

# Define the prefixes to filter by
prefixes = ["02", "D0", "PP", "RE"]

# Get all PDF files matching the prefixes
file_paths = select_pdfs_with_prefixes(directory, prefixes)

# Process all PDF files and collect data
all_data = {}
for file_path in file_paths:
    try:
        all_data[file_path] = process_patent_file(file_path)
    except Exception as e:
        all_data[file_path] = {"Error": str(e)}

# Save data to a CSV file
output_csv = "patent_data.csv"
csv_headers = [
    "File Path",
    "Patent Number",
    "Title",
    "Applicant",
    "Application Date",
    "Patent Date",
    "References",
    "Claims",
    "Description",
]


In [10]:

with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    writer.writeheader()

    for file, data in all_data.items():
        row = {
            "File Path": file,
            "Patent Number": data.get("Patent Number", "Error"),
            "Title": data.get("Title", "Error"),
            "Applicant": data.get("Applicant", "Error"),
            "Application Date": data.get("Application Date", "Error"),
            "Patent Date": data.get("Patent Date", "Error"),
            "References": data.get("References", "Error"),
            "Claims": data.get("Claims", "Error"),
            "Description": data.get("Description", "Error"),
        }
        writer.writerow(row)

print(f"Data has been successfully saved to {output_csv}.")

Data has been successfully saved to patent_data.csv.


In [11]:
import os
import re
import csv
from pdf2image import convert_from_path
from pytesseract import image_to_string
from concurrent.futures import ProcessPoolExecutor


# Precompiled regex patterns for efficiency
REGEX_PATTERNS = {
    "Patent Number": re.compile(r'(?:Plant Pat\.|Des\.|Patent(?:ed)?|Patent Number)\s*(\d{1,3}[,\.]?\d{3})'),
    "Title": re.compile(r'(\d{1,6}\s)([A-Z][A-Z0-9 ,.\-]+)(?=\sFiled|\sPatented|\sApplication)', re.IGNORECASE),
    "Applicant": re.compile(r'(?:Be it known that I, )(.*?)(?=, a citizen| of the)', re.IGNORECASE),
    "Application Date": re.compile(r'Filed (.*?),', re.IGNORECASE),
    "Patent Date": re.compile(r'Patented (.*?)(?=\sUNITED|\sDes|\n)', re.IGNORECASE),
    "References": re.compile(r'(REFERENCES CITED|no reference cited)(.*?)(?=Claims|Description|$)', re.IGNORECASE),
    "Claims": re.compile(r'I claim[:\s]*(.*?)(?=REFERENCES|Description|$)', re.IGNORECASE),
}


def extract_text_from_pdf(file_path, dpi=150):
    """
    Convert scanned PDF pages to images and extract text using OCR.
    Processes one page at a time to reduce memory overhead.
    """
    text_data = []
    for page in convert_from_path(file_path, dpi=dpi):
        text_data.append(image_to_string(page))
    return " ".join(text_data)


def process_patent_text(text):
    """
    Process OCR-extracted text to identify structured fields.
    """
    normalized_text = re.sub(r'\s+', ' ', text.strip())
    structured_data = {
        key: (pattern.search(normalized_text).group(1).strip() if pattern.search(normalized_text) else "Not Found")
        for key, pattern in REGEX_PATTERNS.items()
    }
    # Add full description
    structured_data["Description"] = text.strip()
    return structured_data


def process_single_pdf(file_path):
    """
    Process a single PDF file: OCR and extract structured data.
    """
    try:
        raw_text = extract_text_from_pdf(file_path)
        structured_data = process_patent_text(raw_text)
        return file_path, structured_data
    except Exception as e:
        return file_path, {"Error": str(e)}


def select_pdfs_with_prefixes(directory, prefixes):
    """
    Filter PDF files in the directory (including subdirectories) by specific prefixes.
    """
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.pdf') and any(file.startswith(prefix) for prefix in prefixes):
                file_paths.append(os.path.join(root, file))
    return file_paths






In [12]:

def save_to_csv(output_csv, data):
    """
    Save extracted data into a CSV file.
    """
    csv_headers = [
        "File Path",
        "Patent Number",
        "Title",
        "Applicant",
        "Application Date",
        "Patent Date",
        "References",
        "Claims",
        "Description",
    ]
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
        writer.writeheader()
        for file, structured_data in data.items():
            if "Error" in structured_data:
                row = {"File Path": file, "Patent Number": "Error", "Title": "Error", "Applicant": "Error",
                       "Application Date": "Error", "Patent Date": "Error", "References": "Error",
                       "Claims": "Error", "Description": structured_data["Error"]}
            else:
                row = {
                    "File Path": file,
                    **structured_data
                }
            writer.writerow(row)

In [14]:

# Define the directory containing PDFs and prefixes
directory = 'data'
prefixes = ["02", "D0", "PP", "RE"]

# Collect all matching PDF file paths
pdf_files = select_pdfs_with_prefixes(directory, prefixes)






In [15]:
pdf_files

['data\\02488002.pdf',
 'data\\02505009.pdf',
 'data\\D0155564.pdf',
 'data\\D0158007.pdf',
 'data\\PP000881.pdf',
 'data\\PP000892.pdf',
 'data\\RE023169.pdf',
 'data\\RE023171.pdf']

In [16]:
# Use multiprocessing for faster OCR and processing
print(f"Processing {len(pdf_files)} files...")


Processing 8 files...


In [18]:
results = {}
for file_path in pdf_files:
    file_path, structured_data = process_single_pdf(file_path)
    results[file_path] = structured_data

# results = {}
# with ProcessPoolExecutor() as executor:
#     for file_path, structured_data in executor.map(process_single_pdf, pdf_files):
#         results[file_path] = structured_data



In [19]:
# Save results to CSV
output_csv = "optimized_patent_data.csv"
save_to_csv(output_csv, results)
print(f"Data saved to {output_csv}!")

Data saved to optimized_patent_data.csv!


In [1]:
import os
import re
import csv
from pdf2image import convert_from_path
from pytesseract import image_to_string
from concurrent.futures import ThreadPoolExecutor  # Using ThreadPoolExecutor instead of ProcessPoolExecutor

# Precompiled regex patterns for efficiency
REGEX_PATTERNS = {
    "Patent Number": re.compile(r'(?:Plant Pat\.|Des\.|Patent(?:ed)?|Patent Number)\s*(\d{1,3}[,\.]?\d{3})'),
    "Title": re.compile(r'(\d{1,6}\s)([A-Z][A-Z0-9 ,.\-]+)(?=\sFiled|\sPatented|\sApplication)', re.IGNORECASE),
    "Applicant": re.compile(r'(?:Be it known that I, )(.*?)(?=, a citizen| of the)', re.IGNORECASE),
    "Application Date": re.compile(r'Filed (.*?),', re.IGNORECASE),
    "Patent Date": re.compile(r'Patented (.*?)(?=\sUNITED|\sDes|\n)', re.IGNORECASE),
    # "References": re.compile(r'(REFERENCES CITED|no reference cited)(.*?)(?=Claims|Description|$)', re.IGNORECASE),
    # "Claims": re.compile(r'I claim[:\s]*(.*?)(?=REFERENCES|Description|$)', re.IGNORECASE),
}

# Function to extract text from scanned PDFs using OCR
def extract_text_from_pdf(file_path, dpi=600):
    text_data = []
    for page in convert_from_path(file_path, dpi=dpi):
        text_data.append(image_to_string(page))
    return " ".join(text_data)

# Process patent text and structure it into fields
def process_patent_text(text, file_prefix):
    normalized_text = re.sub(r'\s+', ' ', text.strip())
    
    structured_data = {
        "Patent Number": "Not Found",
        "Title": "Not Found",
        "Applicant": "Not Found",
        "Application Date": "Not Found",
        "Patent Date": "Not Found",
        # "References": "Not Found",
        # "Claims": "Not Found",
        # "Description": text.strip()
    }
    
    # Apply regex patterns based on the file prefix
    if file_prefix == "D0":
        # structured_data["Patent Number"] = re.search(r"(Des\.?\s?\d+)", normalized_text).group(1) if re.search(r"(Des\.?\s?\d+)", normalized_text) else "Not Found"
        structured_data["Patent Number"] = re.search(r"(Des\.?\s?\d{1,3}(?:,\d{3})*)", normalized_text).group(1) if re.search(r"(Des\.?\s?\d{1,3}(?:,\d{3})*)", normalized_text) else "Not Found"
        # structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s+\d+[\s\n]+([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s+\d+[\s\n]+([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"
        structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s*\n+\s*\d+[,\d]*\s*\n+\s*([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s*\n+\s*\d+[,\d]*\s*\n+\s*([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"


        # structured_data["Patent Date"] = re.search(r"Patented\s(.*?)\sUNITED", normalized_text).group(1) if re.search(r"Patented\s(.*?)\sUNITED", normalized_text) else "Not Found"
        structured_data["Patent Date"] = re.search(r"Patented\s([A-Z][a-z]+\.\s\d{1,2},\s\d{4})", normalized_text).group(1) if re.search(r"Patented\s([A-Z][a-z]+\.\s\d{1,2},\s\d{4})", normalized_text) else "Not Found"

        
        # structured_data["Applicant"] = re.search(r"Be it known that I, (.*?) of", normalized_text).group(1) if re.search(r"Be it known that I, (.*?) of", normalized_text) else "Not Found"
        structured_data["Applicant"] = re.search(r"(\b[A-Z][A-Z. ]+)", normalized_text).group(1) if re.search(r"(\b[A-Z][A-Z. ]+)", normalized_text) else "Not Found"

        # structured_data["Application Date"] = re.search(r"Filed (.*?),", normalized_text).group(1) if re.search(r"Filed (.*?),", normalized_text) else "Not Found"
        structured_data["Application Date"] = re.search(r"Application\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text).group(1) if re.search(r"Application\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text) else "Not Found"

    
    elif file_prefix == "02":
        # Similar structure as 'D0' but adjusted for "02" prefix logic
        # structured_data["Patent Number"] = re.search(r"(Patent\sNumber\s\d+)", normalized_text).group(1) if re.search(r"(Patent\sNumber\s\d+)", normalized_text) else "Not Found"
        # structured_data["Patent Number"] = re.search(r"UNITED\sSTATES\sPATENT\sOFFICE.*?(Patent\sNumber\s\d{1,3}(?:,\d{3})*)", normalized_text).group(1) if re.search(r"UNITED\sSTATES\sPATENT\sOFFICE.*?(Patent\sNumber\s\d{1,3}(?:,\d{3})*)", normalized_text) else "Not Found"
        structured_data["Patent Number"] = re.search(r"UNITED\sSTATES\sPATENT\sOFFICE\s+(\d{1,3}(?:,\d{3})*)", normalized_text).group(1) if re.search(r"UNITED\sSTATES\sPATENT\sOFFICE\s+(\d{1,3}(?:,\d{3})*)", normalized_text) else "Not Found"

        # structured_data["Patent Date"] = re.search(r"Patented\s(.*?)\sUNITED", normalized_text).group(1) if re.search(r"Patented\s(.*?)\sUNITED", normalized_text) else "Not Found"
        # structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s+\d+[\s\n]+([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s+\d+[\s\n]+([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"
        structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s*\n+\s*\d+[,\d]*\s*\n+\s*([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s*\n+\s*\d+[,\d]*\s*\n+\s*([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"


        structured_data["Patent Date"] = re.search(r"Patented\s([A-Z][a-z]+\.\s\d{1,2},\s\d{4})", normalized_text).group(1) if re.search(r"Patented\s([A-Z][a-z]+\.\s\d{1,2},\s\d{4})", normalized_text) else "Not Found"
        structured_data["Application Date"] = re.search(r"Application\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text).group(1) if re.search(r"Application\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text) else "Not Found"

    elif file_prefix == "PP":
        # Plant patents start with "Plant Pat."
        structured_data["Patent Number"] = re.search(r"(Plant Pat\.\s\d+)", normalized_text).group(1) if re.search(r"(Plant Pat\.\s\d+)", normalized_text) else "Not Found"
        # structured_data["Patent Date"] = re.search(r"Patented\s(.*?)\sUNITED", normalized_text).group(1) if re.search(r"Patented\s(.*?)\sUNITED", normalized_text) else "Not Found"
        structured_data["Patent Date"] = re.search(r"Patented\s([A-Z][a-z]+\.\s\d{1,2},\s\d{4})", normalized_text).group(1) if re.search(r"Patented\s([A-Z][a-z]+\.\s\d{1,2},\s\d{4})", normalized_text) else "Not Found"
        # structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s+\d+[\s\n]+([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s+\d+[\s\n]+([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"
        structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s*\n\s*\d+[,\d]*\s*\n\s*([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s*\n\s*\d+[,\d]*\s*\n\s*([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"


        # structured_data["Title"] = re.search(r'^\d+\s+([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'^\d+\s+([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"
        # structured_data["Applicant"] = re.search(r'([A-Za-z .]+), [A-Za-z]+, [A-Za-z]+, assignor to .+?,', normalized_text).group(1) if re.search(r'([A-Za-z .]+), [A-Za-z]+, [A-Za-z]+, assignor to .+?,', normalized_text) else "Not Found"
        # structured_data["Application Date"] = re.search(r'Application\s+(\w+\s+\d{1,2},\s+\d{4})', normalized_text).group(1) if re.search(r'Application\s+(\w+\s+\d{1,2},\s+\d{4})', normalized_text) else "Not Found"
        structured_data["Application Date"] = re.search(r"Application\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text).group(1) if re.search(r"Application\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text) else "Not Found"
        
 

        
    elif file_prefix == "RE":
        # Reissued patents start with "Re."
        # structured_data["Patent Number"] = re.search(r"(Re\.\s\d+)", normalized_text).group(1) if re.search(r"(Re\.\s\d+)", normalized_text) else "Not Found"
        structured_data["Patent Number"] = re.search(r"(Re\.?\s?\d{1,3}(?:,\d{3})*)", normalized_text).group(1) if re.search(r"(Re\.?\s?\d{1,3}(?:,\d{3})*)", normalized_text) else "Not Found"
        # structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s*\d+[,\d]*\s*([\w ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s*\d+[,\d]*\s*([\w ,.\-]+)', normalized_text) else "Not Found"
        structured_data["Title"] = re.search(r'UNITED STATES PATENT OFFICE\s*\n+\s*\d+[,\d]*\s*\n+\s*([A-Z0-9 ,.\-]+)', normalized_text).group(1) if re.search(r'UNITED STATES PATENT OFFICE\s*\n+\s*\d+[,\d]*\s*\n+\s*([A-Z0-9 ,.\-]+)', normalized_text) else "Not Found"



        # structured_data["Patent Date"] = re.search(r"Reissued\s(.*?)\sApplication", normalized_text).group(1) if re.search(r"Reissued\s(.*?)\sApplication", normalized_text) else "Not Found"
        structured_data["Patent Date"] = re.search(r"Reissued\s([A-Z][a-z]+[.,]\s\d{1,2},\s\d{4})", normalized_text).group(1) if re.search(r"Reissued\s([A-Z][a-z]+[.,]\s\d{1,2},\s\d{4})", normalized_text) else "Not Found"
        structured_data["Application Date"] = (
    re.search(
        r"Application for re[-\s]?issue\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", 
        normalized_text, 
        re.IGNORECASE
    ).group(1)
    if re.search(
        r"Application for re[-\s]?issue\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", 
        normalized_text, 
        re.IGNORECASE
    )
    else "Not Found"
)


        # structured_data["Application Date"] = re.search(r"Application for reissue\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text).group(1) if re.search(r"Application for reissue\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", normalized_text) else "Not Found"
       


    # Apply common patterns for references and claims
    # structured_data["References"] = re.search(REGEX_PATTERNS["References"], normalized_text).group(1) if re.search(REGEX_PATTERNS["References"], normalized_text) else "Not Found"
    # structured_data["Claims"] = re.search(REGEX_PATTERNS["Claims"], normalized_text).group(1) if re.search(REGEX_PATTERNS["Claims"], normalized_text) else "Not Found"

    return structured_data

# Process a single PDF and return structured data
def process_single_pdf(file_path, file_prefix):
    try:
        raw_text = extract_text_from_pdf(file_path)
        structured_data = process_patent_text(raw_text, file_prefix)
        return file_path, structured_data
    except Exception as e:
        return file_path, {"Error": str(e)}

# Select PDF files based on their prefixes
def select_pdfs_with_prefixes(directory, prefixes):
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.pdf') and any(file.startswith(prefix) for prefix in prefixes):
                file_paths.append(os.path.join(root, file))
    return file_paths

# Save extracted data into CSV
def save_to_csv(output_csv, data):
    csv_headers = [
        "File Path", "Patent Number", "Title", "Applicant", "Application Date", "Patent Date", 
        # "References", "Claims", "Description"
    ]
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
        writer.writeheader()
        for file, structured_data in data.items():
            if "Error" in structured_data:
                # row = {"File Path": file, "Patent Number": "Error", "Title": "Error", "Applicant": "Error",
                #        "Application Date": "Error", "Patent Date": "Error", "References": "Error",
                #        "Claims": "Error", "Description": structured_data["Error"]}
                
                row = {"File Path": file, "Patent Number": "Error", "Title": "Error", "Applicant": "Error",
                       "Application Date": "Error", "Patent Date": "Error"}
            else:
                row = {
                    "File Path": file,
                    **structured_data
                }
            writer.writerow(row)

# Directory and file prefix setup
directory = 'data'
prefixes = ["02", "D0", "PP", "RE"]

# Collect matching PDF file paths
pdf_files = select_pdfs_with_prefixes(directory, prefixes)

# Use threading for faster OCR and processing (ThreadPoolExecutor works better in Jupyter)
print(f"Processing {len(pdf_files)} files...")

# Process files concurrently using ThreadPoolExecutor
results = {}
with ThreadPoolExecutor() as executor:
    futures = []
    for file_path in pdf_files:
        file_prefix = file_path.split(os.path.sep)[-1][:2]  # Extract prefix from filename
        futures.append(executor.submit(process_single_pdf, file_path, file_prefix))
    
    for future in futures:
        file_path, structured_data = future.result()
        results[file_path] = structured_data




Processing 8 files...


In [2]:
# Save results to CSV
output_csv = "optimized_patent_data.csv"
save_to_csv(output_csv, results)
print(f"Data saved to {output_csv}!")

Data saved to optimized_patent_data.csv!
