In [1]:
import pandas as pd
import re
from pdf2image import convert_from_path
import pytesseract
from PIL import Image



In [2]:
# Function to extract text from a PDF file using OCR
def extract_text_from_pdf(pdf_path, dpi=300):
    pages = convert_from_path(pdf_path, dpi)
    text_data = ''
    for page in pages:
        text_data += pytesseract.image_to_string(page)
    return text_data




In [3]:
# Function to clean and structure the extracted text
def process_patent_text(text):
    # Normalize the text by removing extra spaces and line breaks
    normalized_text = re.sub(r'\s+', ' ', text.strip())

    # Extract key fields using improved regex patterns
    patent_number = re.search(r'(?:Patent(?:ed)?|Des\.)\s*(\d{1,3},\d{3})', normalized_text)  # Match patterns like 155,564
    title = re.search(r'(?<=COMBINATION\s).*?(?=\sFiled)', normalized_text, re.IGNORECASE)  # Extract title before "Filed"
    applicant = re.search(r'(?:Be it known that I, )(.*?)(?=, a citizen)', normalized_text, re.IGNORECASE)  # Match applicant name
    application_date = re.search(r'Filed (.*?),', normalized_text, re.IGNORECASE)  # Match filed date
    patent_date = re.search(r'Patented (.*?)(?=\sUNITED|Des)', normalized_text, re.IGNORECASE)  # Match patented date
    claims = re.search(r'I claim:(.*?)(?=REFERENCES|$)', normalized_text, re.IGNORECASE)  # Match claims before "REFERENCES"
    references = re.search(r'REFERENCES CITED(.*?)$', normalized_text, re.IGNORECASE)  # Match references section

    # Create a dictionary of the structured data
    structured_data = {
        "Patent Number": patent_number.group(1) if patent_number else None,
        "Title": title.group(0).strip() if title else None,
        "Applicant": applicant.group(1).strip() if applicant else None,
        "Application Date": application_date.group(1).strip() if application_date else None,
        "Patent Date": patent_date.group(1).strip() if patent_date else None,
        "Claims": claims.group(1).strip() if claims else None,
        "References": references.group(1).strip() if references else None,
        "Description": text.strip(),  # Keep the full OCR text for reference
    }

    return structured_data



In [4]:
# Function to convert structured data to a DataFrame
def create_dataframe_from_patent(data):
    df = pd.DataFrame([data])
    return df



In [5]:
# Main function to handle the entire process
def process_patent_file(pdf_path):
    raw_text = extract_text_from_pdf(pdf_path)
    structured_data = process_patent_text(raw_text)
    df = create_dataframe_from_patent(structured_data)
    return df



In [6]:
# Path to the patent PDF file
pdf_file_path = '02488002.pdf'

# Process the file and display the DataFrame
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
patent_df = process_patent_file(pdf_file_path)
print(patent_df)



  Patent Number Title Applicant Application Date      Patent Date  \
0          None  None      None          Oct. 17  Nov. 15, 1949 _   

                                              Claims  \
0  I. A trailer for use in orchards and the like ...   

                                          References  \
0  The following references are of record. in: th...   

                                         Description  
0  G. CARRAHER _ 2,488,002\n\nTRAILER FRAME AND W...  


In [7]:
# Save the DataFrame to a CSV file
patent_df.to_csv(f"patent_data_with_description_{pdf_file_path}.csv", index=False)

In [30]:
import os
import platform
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import cv2
import numpy as np

# Function to set the Tesseract OCR path based on the operating system
def set_tesseract_path():
    system = platform.system()
    if system == "Windows":
        # Update this path if Tesseract is installed elsewhere on your Windows machine
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
    elif system == "Linux":
        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
    elif system == "Darwin":  # macOS
        pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
    else:
        raise EnvironmentError(f"Unsupported OS: {system}")
    print(f"Tesseract path set to: {pytesseract.pytesseract.tesseract_cmd}")

# Debugging OCR for problematic PDFs
def debug_pdf_ocr(pdf_path, output_folder="debug_pages", dpi=1200):
    os.makedirs(output_folder, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi)
    
    for idx, page in enumerate(pages):
        # Save the page for visual inspection
        image_path = os.path.join(output_folder, f"page_{idx + 1}.png")
        page.save(image_path, "PNG")
        
        # Convert to grayscale and apply preprocessing for OCR enhancement
        gray_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2GRAY)
        _, binarized_image = cv2.threshold(gray_image, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        
        # Convert back to PIL Image for OCR processing
        processed_image = Image.fromarray(binarized_image)
        
        # Extract OCR text and detailed data
        ocr_data = pytesseract.image_to_data(processed_image, output_type=pytesseract.Output.DICT)
        print(f"Processing page {idx + 1}...")
        for i in range(len(ocr_data["text"])):
            word = ocr_data["text"][i].strip()
            confidence = ocr_data["conf"][i]
            if word:  # Only print meaningful words
                print(f"Word: {word}, Confidence: {confidence}")
    print(f"All pages processed. Debug images saved to: {output_folder}")




In [31]:

# Set Tesseract path for the current system
set_tesseract_path()

# Specify the PDF file and output folder
pdf_file_path = "RE023169.pdf"  # Replace with your PDF file path
output_folder = "debug_RE023169"  # Folder to save processed images

# Run the debug OCR
debug_pdf_ocr(pdf_file_path, output_folder=output_folder)

Tesseract path set to: C:\Program Files (x86)\Tesseract-OCR\tesseract.exe




Processing page 1...
Word: Nov., Confidence: 96
Word: 22,, Confidence: 96
Word: 1949, Confidence: 96
Word: F., Confidence: 88
Word: J., Confidence: 96
Word: HORTON, Confidence: 96
Word: Re., Confidence: 80
Word: 23,169, Confidence: 95
Word: WIRE, Confidence: 96
Word: BENDING, Confidence: 95
Word: MACHINE, Confidence: 96
Word: Original, Confidence: 95
Word: Filed, Confidence: 96
Word: Jan., Confidence: 96
Word: 21., Confidence: 91
Word: 1938, Confidence: 96
Word: 9, Confidence: 92
Word: Sheets-Sheet, Confidence: 60
Word: 1, Confidence: 95
Word: x, Confidence: 51
Word: LO, Confidence: 87
Word: ME, Confidence: 45
Word: oe:, Confidence: 22
Word: i”, Confidence: 8
Word: |, Confidence: 94
Word: tt, Confidence: 4
Word: i, Confidence: 24
Word: (NVENTOR, Confidence: 65
Word: ,, Confidence: 57
Word: ora, Confidence: 0
Word: S7, Confidence: 69
Word: Oe, Confidence: 28
Word: yieeess, Confidence: 0
Word: KS,, Confidence: 73
Processing page 2...
Word: Nov., Confidence: 95
Word: 22,1949, Confidence: 

In [None]:
import os
import platform
import re
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import cv2
import numpy as np

# Function to set the Tesseract OCR path based on the operating system
def set_tesseract_path():
    system = platform.system()
    if system == "Windows":
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
    elif system == "Linux":
        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
    elif system == "Darwin":  # macOS
        pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
    else:
        raise EnvironmentError(f"Unsupported OS: {system}")
    print(f"Tesseract path set to: {pytesseract.pytesseract.tesseract_cmd}")

# Function to preprocess the image
def preprocess_image(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(thresh)

# Extract text from PDF
def extract_text_from_pdf(pdf_path, dpi=1200):
    pages = convert_from_path(pdf_path, dpi)
    text_data = ''
    for idx, page in enumerate(pages):
        print(f"Processing page {idx + 1}...")
        processed_image = preprocess_image(page)
        text_data += pytesseract.image_to_string(processed_image) + '\n'
    return text_data

# Process and structure the extracted text
def process_patent_text(text):
    # Normalize the text
    normalized_text = re.sub(r'\s+', ' ', text.strip())

    # Extract fields using regex
    patent_number = re.search(r'(?:Patent(?:ed)?|Des\.)\s*(\d{1,3},\d{3})', normalized_text)
    title = re.search(r'(?<=COMBINATION\s).*?(?=\sFiled)', normalized_text, re.IGNORECASE)
    applicant = re.search(r'(?:Be it known that I, )(.*?)(?=, a citizen)', normalized_text, re.IGNORECASE)
    application_date = re.search(r'Filed (.*?),', normalized_text, re.IGNORECASE)
    patent_date = re.search(r'Patented (.*?)(?=\sUNITED|Des)', normalized_text, re.IGNORECASE)
    claims = re.search(r'I claim:(.*?)(?=REFERENCES|$)', normalized_text, re.IGNORECASE)
    references = re.search(r'REFERENCES CITED(.*?)$', normalized_text, re.IGNORECASE)

    # Create a dictionary of structured data
    structured_data = {
        "Patent Number": patent_number.group(1) if patent_number else None,
        "Title": title.group(0).strip() if title else None,
        "Applicant": applicant.group(1).strip() if applicant else None,
        "Application Date": application_date.group(1).strip() if application_date else None,
        "Patent Date": patent_date.group(1).strip() if patent_date else None,
        "Claims": claims.group(1).strip() if claims else None,
        "References": references.group(1).strip() if references else None,
        "Description": text.strip(),  # Keep full OCR text
    }

    return structured_data

# Save structured data to a CSV file
def save_to_csv(data, output_csv):
    df = pd.DataFrame([data])
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

# Main function
def main(pdf_path, output_csv):
    # Set the Tesseract OCR path
    set_tesseract_path()

    # Extract and process the text
    raw_text = extract_text_from_pdf(pdf_path)
    structured_data = process_patent_text(raw_text)

    # Save the structured data to a CSV file
    save_to_csv(structured_data, output_csv)

# Specify file paths
pdf_file_path = "RE023169.pdf"  # Replace with your PDF file path
output_csv_path = "RE023169_data.csv"  # CSV file to save structured data

# Run the script
if __name__ == "__main__":
    main(pdf_file_path, output_csv_path)


Tesseract path set to: C:\Program Files (x86)\Tesseract-OCR\tesseract.exe
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Data saved to RE023169_data.csv


In [32]:
import os
import platform
import re
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import cv2
import numpy as np

# Function to set the Tesseract OCR path based on the operating system
def set_tesseract_path():
    system = platform.system()
    if system == "Windows":
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
    elif system == "Linux":
        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
    elif system == "Darwin":  # macOS
        pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
    else:
        raise EnvironmentError(f"Unsupported OS: {system}")
    print(f"Tesseract path set to: {pytesseract.pytesseract.tesseract_cmd}")

# Function to preprocess the image
def preprocess_image(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(thresh)

# Debugging OCR for problematic PDFs with high DPI
def debug_pdf_ocr(pdf_path, output_folder="debug_pages", dpi=1200):
    os.makedirs(output_folder, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi)
    text_data = ''
    
    for idx, page in enumerate(pages):
        # Save the page for visual inspection
        image_path = os.path.join(output_folder, f"page_{idx + 1}.png")
        page.save(image_path, "PNG")
        
        # Preprocess the image for better OCR
        processed_image = preprocess_image(page)
        
        # Extract OCR text
        text = pytesseract.image_to_string(processed_image)
        text_data += text + '\n'
        
        # Extract OCR data with confidence
        ocr_data = pytesseract.image_to_data(processed_image, output_type=pytesseract.Output.DICT)
        print(f"Processing page {idx + 1}...")
        for i in range(len(ocr_data["text"])):
            word = ocr_data["text"][i].strip()
            confidence = ocr_data["conf"][i]
            if word:  # Only print meaningful words
                print(f"Word: {word}, Confidence: {confidence}")
    
    print(f"All pages processed. Debug images saved to: {output_folder}")
    return text_data

# Process and structure the extracted text
def process_patent_text(text):
    normalized_text = re.sub(r'\s+', ' ', text.strip())

    # Extract fields using regex
    patent_number = re.search(r'(?:Patent(?:ed)?|Des\.)\s*(\d{1,3},\d{3})', normalized_text)
    title = re.search(r'(?<=COMBINATION\s).*?(?=\sFiled)', normalized_text, re.IGNORECASE)
    applicant = re.search(r'(?:Be it known that I, )(.*?)(?=, a citizen)', normalized_text, re.IGNORECASE)
    application_date = re.search(r'Filed (.*?),', normalized_text, re.IGNORECASE)
    patent_date = re.search(r'Patented (.*?)(?=\sUNITED|Des)', normalized_text, re.IGNORECASE)
    claims = re.search(r'I claim:(.*?)(?=REFERENCES|$)', normalized_text, re.IGNORECASE)
    references = re.search(r'REFERENCES CITED(.*?)$', normalized_text, re.IGNORECASE)

    # Create a dictionary of structured data
    structured_data = {
        "Patent Number": patent_number.group(1) if patent_number else None,
        "Title": title.group(0).strip() if title else None,
        "Applicant": applicant.group(1).strip() if applicant else None,
        "Application Date": application_date.group(1).strip() if application_date else None,
        "Patent Date": patent_date.group(1).strip() if patent_date else None,
        "Claims": claims.group(1).strip() if claims else None,
        "References": references.group(1).strip() if references else None,
        "Description": text.strip(),  # Keep full OCR text
    }
    return structured_data

# Save structured data to a CSV file
def save_to_csv(data, output_csv):
    df = pd.DataFrame([data])
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

# Main function
def main(pdf_path, output_folder, output_csv):
    # Set Tesseract path
    set_tesseract_path()

    # Perform OCR with debugging
    raw_text = debug_pdf_ocr(pdf_path, output_folder=output_folder)

    # Process the text to extract structured data
    structured_data = process_patent_text(raw_text)

    # Save the structured data to a CSV file
    save_to_csv(structured_data, output_csv)

# Specify file paths
pdf_file_path = "RE023169.pdf"  # Replace with your PDF file path
output_folder = "debug_RE023169"  # Folder to save debug images
output_csv_path = "RE023169_data.csv"  # CSV file to save structured data

# Run the script
if __name__ == "__main__":
    main(pdf_file_path, output_folder, output_csv_path)


Tesseract path set to: C:\Program Files (x86)\Tesseract-OCR\tesseract.exe
Processing page 1...
Word: Nov., Confidence: 96
Word: 22,, Confidence: 96
Word: 1949, Confidence: 96
Word: F., Confidence: 88
Word: J., Confidence: 96
Word: HORTON, Confidence: 96
Word: Re., Confidence: 80
Word: 23,169, Confidence: 95
Word: WIRE, Confidence: 96
Word: BENDING, Confidence: 95
Word: MACHINE, Confidence: 96
Word: Original, Confidence: 95
Word: Filed, Confidence: 96
Word: Jan., Confidence: 96
Word: 21., Confidence: 91
Word: 1938, Confidence: 96
Word: 9, Confidence: 92
Word: Sheets-Sheet, Confidence: 60
Word: 1, Confidence: 95
Word: x, Confidence: 51
Word: LO, Confidence: 87
Word: ME, Confidence: 45
Word: oe:, Confidence: 22
Word: i”, Confidence: 8
Word: |, Confidence: 94
Word: tt, Confidence: 4
Word: i, Confidence: 24
Word: (NVENTOR, Confidence: 65
Word: ,, Confidence: 57
Word: ora, Confidence: 0
Word: S7, Confidence: 69
Word: Oe, Confidence: 28
Word: yieeess, Confidence: 0
Word: KS,, Confidence: 73
P

In [None]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os

# Debugging OCR for problematic PDFs
def debug_pdf_ocr(pdf_path, output_folder="debug_pages", dpi=300):
    os.makedirs(output_folder, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi)
    
    for idx, page in enumerate(pages):
        image_path = os.path.join(output_folder, f"page_{idx + 1}.png")
        page.save(image_path, "PNG")
        print(f"Saved page {idx + 1} as {image_path}")
        
        # Extract detailed OCR data
        ocr_data = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        for i in range(len(ocr_data["text"])):
            print(f"Word: {ocr_data['text'][i]}, Confidence: {ocr_data['conf'][i]}")
            
    return pages

import cv2
import numpy as np

def preprocess_image(image):
    # Convert to grayscale
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    
    # Apply thresholding
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    return Image.fromarray(thresh)

def extract_text_with_preprocessing(pdf_path, dpi=300):
    pages = convert_from_path(pdf_path, dpi)
    text_data = ''
    for idx, page in enumerate(pages):
        print(f"Processing page {idx + 1} with preprocessing...")
        preprocessed_image = preprocess_image(page)
        text = pytesseract.image_to_string(preprocessed_image)
        print(f"Extracted text for page {idx + 1}: {text[:100]}...")
        text_data += text + '\n'
    return text_data

from PyPDF2 import PdfReader

def inspect_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text_detected = []
    for page in reader.pages:
        text_detected.append(page.extract_text())
    return text_detected





In [22]:
extract_text_with_preprocessing('PP000881.pdf')

Processing page 1 with preprocessing...
Extracted text for page 1: Oct. 18, 1949.

Cc. D. CORLISS
EVONYMUS FORTUNEI PLANT

Filed Dec. 19, 1947

Plant Pat. 881

a
...
Processing page 2 with preprocessing...
Extracted text for page 2: Patented Oct. 18, 1949

Plant Pat. 881

UNITED STATES PATENT OFFICE

881
EUONYMUS FORTUNEI PLANT

Cl...
Processing page 3 with preprocessing...
Extracted text for page 3: 3
6red with a pinkish tinge, opening loculicidal
to expose one or two seeds in each cell, each
seed ...


"Oct. 18, 1949.\n\nCc. D. CORLISS\nEVONYMUS FORTUNEI PLANT\n\nFiled Dec. 19, 1947\n\nPlant Pat. 881\n\na\n\nPatented Oct. 18, 1949\n\nPlant Pat. 881\n\nUNITED STATES PATENT OFFICE\n\n881\nEUONYMUS FORTUNEI PLANT\n\nClifford D. Corliss, Magnolia, Mass., assignor to\nCorliss Bros. Ine., Gloucester, Mass., a corpora-\ntion of Massachusetts\n\nApplication December 19, 1947, Serial No. 792,707\n\n1 Claim.\n\n1\n\nThis invention relates to a new and distinct\nvariety of Euonymus fortunei plant, an evergreen\noriginated by me from a selected seedling pro-\nduced by crossing Euonymus fortunei vegeta and\nHuonymus fortunei radicans.\n\nThis new variety possesses a number of desir-\nable characteristics clearly distinguishing it from\nits parents and other similar evergreen varieties,\nand the following comparisons will suffice to\npoint out its particular advantages of utility for\nornamental purposes.\n\nIt is well known that Euonymus fortunei varie-\nties of vegeta and radicans embody irregul