In [35]:
import openai
import pdfplumber
import pytesseract
import streamlit as st
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI
import os
from dotenv import load_dotenv


In [36]:
# Load environment variables from .env file
load_dotenv()

# Set your OpenAI API key from the .env file
openai.api_key = os.getenv("OPENAI_API_KEY")

In [37]:
# Helper function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file):
    text = ""
    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                text += page.extract_text()
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    return text

In [38]:
# OCR function for scanned PDFs
def extract_text_with_ocr(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            img = page.to_image()
            text += pytesseract.image_to_string(img)
    return text

In [40]:
# LangChain to format the extracted data into a structured summary
def generate_health_summary(extracted_text):
    # Define the prompt template for structuring the summary
    prompt_template = """
    Extract relevant information from the following veterinary medical record and generate a structured health summary in the specified format. 
    The extracted information should be categorized under 'Client Information', 'Patient Information', 'Veterinary Clinic Information', 'Vaccinations', 'Medical Information', 'Patient Alerts', 'Medications', and 'Laboratory Graphs'.
    
    Extracted Text:
    {extracted_text}

    Output format should be:

    Client Information:
    Name: 
    Phone:
    Address:
    Email:

    Patient Information:
    Name:
    Breed:
    DOB or Age:
    Gender:
    Microchip Number:

    Veterinary Clinic Information:
    Visit Date:
    Clinic Name:
    Phone Number:

    Vaccinations:
    Vaccine | Date | Manufacturer tag number

    Medical Information:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Weight

    Patient Alerts:
    Alert 1 | Alert 2

    Medications:
    Medicine Name | Strength | Number | RX description

    Laboratory graphs:
    Chemistry:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    CBC:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    Endocrinology:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    Urinalysis:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    Heartworm test:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    Fecal results:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    Other Tests:
    Marker | Date 1 | Date 2 | Date 3 | Date 4 | Date 5 | Date 6
    """

    # Format the prompt with the extracted text
    prompt = PromptTemplate(input_variables=["extracted_text"], template=prompt_template)
    
    # Initialize OpenAI LLM
    llm = OpenAI(model="text-davinci-003", openai_api_key=openai.api_key)

    # Create a chain using LangChain
    chain = LLMChain(llm=llm, prompt=prompt)

    # Run the chain and generate the summary
    structured_summary = chain.run({"extracted_text": extracted_text})

    return structured_summary

In [43]:
import fitz  # PyMuPDF

# Function to extract text using PyMuPDF
def extract_text_from_pdf_pymupdf(pdf_file):
    text = ""
    doc = fitz.open(pdf_file)
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text("text")  # Extract text from the page
    return text

# Example usage
pdf_file = "C:\\Users\\DELL\\Downloads\\sample_input.pdf"
text = extract_text_from_pdf_pymupdf(pdf_file)
print("Extracted Text:", text)

Extracted Text: 


In [48]:
import cv2
import pytesseract
from PIL import Image
import pdfplumber
import numpy as np

def preprocess_image_for_ocr(image_obj):
    # Convert the PDF image to a format suitable for OCR
    img = np.array(image_obj)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return thresh

# OCR function with preprocessing
def extract_text_with_ocr_preprocessed(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract images from each page
            for img in page.images:
                img_obj = page.to_image()
                preprocessed_img = preprocess_image_for_ocr(img_obj.original)
                text += pytesseract.image_to_string(preprocessed_img)  # Perform OCR on the preprocessed image
    return text

# Example usage
ocr_text = extract_text_with_ocr_preprocessed(pdf_file)
print("OCR Text:", ocr_text)

def extract_tables_with_bbox(pdf_file):
    tables = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract table layout with bounding boxes
            table = page.extract_tables(table_settings={"explicit_vertical_lines": True, "explicit_horizontal_lines": True})
            tables.append(table)
    return tables

# Example usage
tables = extract_tables_with_bbox(pdf_file)
print("Extracted Tables with BBox:", tables)


OCR Text: 
Extracted Tables with BBox: []


In [49]:
import cv2
import pytesseract
from PIL import Image
import pdfplumber
import numpy as np

# Preprocess image for better OCR recognition
def preprocess_image_for_ocr(image_obj):
    # Convert the PDF image to a format suitable for OCR
    img = np.array(image_obj)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.medianBlur(gray, 3)  # Reduce noise in the image
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)  # Binarize the image for better OCR
    return thresh

# OCR function with preprocessing to handle image extraction from the PDF
def extract_text_with_ocr_preprocessed(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            print(f"Processing page {page_num}...")
            for img in page.images:
                # Extract image from page and preprocess for OCR
                img_obj = page.to_image()
                im = img_obj.original
                preprocessed_img = preprocess_image_for_ocr(im)
                
                # Perform OCR on the preprocessed image
                ocr_result = pytesseract.image_to_string(preprocessed_img)
                print(f"OCR Result on Page {page_num}: {ocr_result[:100]}...")  # Print first 100 characters of OCR text
                text += ocr_result
    return text

# Function to extract tables with bounding boxes (bbox)
def extract_tables_with_bbox(pdf_file):
    tables = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            print(f"Extracting tables from page {page_num}...")
            # Try a more flexible table extraction setting
            table = page.extract_tables(table_settings={"vertical_strategy": "text", "horizontal_strategy": "text"})
            if table:
                print(f"Extracted tables from page {page_num}")
                tables.append(table)
            else:
                print(f"No tables found on page {page_num}")
    return tables

# Example usage
pdf_file = "C:\\Users\\DELL\\Downloads\\sample_input.pdf"  # Replace with the path to your PDF file

# Extract text using OCR after preprocessing
ocr_text = extract_text_with_ocr_preprocessed(pdf_file)
print("OCR Text Extracted:")
print(ocr_text)

# Extract tables with bounding boxes
tables = extract_tables_with_bbox(pdf_file)
print("Extracted Tables:")
print(tables)


OCR Text Extracted:

Extracted Tables:
[]


In [2]:
import cv2
import pytesseract
from PIL import Image
import pdfplumber
import numpy as np

# Preprocess image for better OCR recognition
def preprocess_image_for_ocr(image_obj):
    img = np.array(image_obj)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.medianBlur(gray, 3)  # Reduce noise in the image
    _, thresh = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY)  # Lower threshold value for better contrast
    return thresh

# OCR function with preprocessing to handle image extraction from the PDF
def extract_text_with_ocr_preprocessed(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            print(f"Processing page {page_num}...")
            for img in page.images:
                # Extract image from page and preprocess for OCR
                img_obj = page.to_image()
                im = img_obj.original
                preprocessed_img = preprocess_image_for_ocr(im)
                
                # Perform OCR on the preprocessed image
                ocr_result = pytesseract.image_to_string(preprocessed_img, config='--psm 6')  # Specify OCR mode
                print(f"OCR Result on Page {page_num}: {ocr_result[:100]}...")  # Print first 100 characters of OCR text
                text += ocr_result
    return text

# Function to extract tables with bounding boxes (bbox)
def extract_tables_with_bbox(pdf_file):
    tables = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            print(f"Extracting tables from page {page_num}...")
            # Try more flexible table extraction settings
            table = page.extract_tables(table_settings={"vertical_strategy": "text", "horizontal_strategy": "text"})
            if table:
                print(f"Extracted tables from page {page_num}")
                tables.append(table)
            else:
                print(f"No tables found on page {page_num}")
            
            # Try extracting words if no table found
            words = page.extract_words()
            if words:
                print(f"Extracted words from page {page_num}")
                print(words[:5])  # Display a few words for debugging
                
    return tables

# Example usage
pdf_file = "C:\\Users\\DELL\\Downloads\\sample_input.pdf"  # Replace with the path to your PDF file

# Extract text using OCR after preprocessing
ocr_text = extract_text_with_ocr_preprocessed(pdf_file)
print("OCR Text Extracted:")
print(ocr_text)

# Extract tables with bounding boxes
tables = extract_tables_with_bbox(pdf_file)
print("Extracted Tables:")
print(tables)


OCR Text Extracted:

Extracted Tables:
[]


In [1]:
import pytesseract
from pdf2image import convert_from_path

# convert to image using resolution 600 dpi 
pages = convert_from_path("C:\\Users\\DELL\\Downloads\\sample_input.pdf", 600)

# extract text
text_data = ''
for page in pages:
    text = pytesseract.image_to_string(page)
    text_data += text + '\n'
print(text_data)

Medical Chart

Animal Care Center at Stonebridge Ranch

SEX Neutered Male SREED French Bulldog POB Feb 28, 2018 “POS CBIOO Victory Ave BOE!

Blue MARK white down neck & chest Dallas, TX 75219

0 -90005 farroo | R09 1920021462501 |” Ss
Ni

OTES DIET: Yin & Qi Stew made by Dr Grand ONLY // MEDS: Triheart, Bravecto, LongDan, Epimedium, SiWuXiaoFeng

COLOR

REASON FOR VISIT iN

PROBLEMS REMINDERS

Jan 01, 2001
Jun 01, 2019
Jun 21, 2019
Dec 24, 2022
Nov 07, 2024
May 07, 2025
May 07, 2025

DO NOT VACCINATE

Rabies Vaccination

DA2P-Parvo

Comprehensive Physical Examination
Bordetella 0.4ml Dose (InterVet)
Heartworm Antigen T615

Fecal O&P T805

DoNotVacc
RvCanine
DogVacc
Exam
Bordetella
A-HW
A-Fecal

@—"__ Just Buck it.

WEIGHT HISTORY
Sep 12, 2024
May 08, 2024
May 03, 2023
Feb 28, 2023

PATIENT ALERTS (MANUAL)

YELLOW - opens doors

ACCSR MASCOT!

NO AZATHIOPRINE DO NOT VACCINATE
GI UPSET- careful with treats

26.00 Ib
25.50 Ib
20.40 Ib
21.50 Ib

ASSIGNED TO

TMG
TMG
TMG

STATUS CRITICAL DU

In [6]:
import pytesseract
from pdf2image import convert_from_path
import re

# Function to extract text from PDF and convert it to text using OCR
def extract_text_from_pdf(pdf_path):
    # Convert PDF to image using 600 dpi resolution
    pages = convert_from_path(pdf_path, 600)

    # Extract text from each page using pytesseract
    text_data = ''
    for page in pages:
        text = pytesseract.image_to_string(page)
        text_data += text + '\n'
    return text_data

# Function to summarize the extracted text based on the provided structure
def summarize_health_summary(text_data):
    # Dictionary to store extracted data
    summary = {
        "Client Information": {"Name": "", "Phone": "", "Address": "", "Email": ""},
        "Patient Information": {"Name": "", "Breed": "", "DOB or Age": "", "Gender": "", "Microchip Number": ""},
        "Veterinary Clinic Information": {"Visit Date": "", "Clinic Name": "", "Phone Number": ""},
        "Vaccinations": {"Vaccine": "", "Date": "", "Manufacturer tag number": ""},
        "Medical Information": {"Marker": "", "Dates": [], "Weight": ""},
        "Patient Alerts": {"Alert 1": "", "Alert 2": ""},
        "Medications": {"Medicine Name": "", "Strength": "", "Number": "", "RX description": ""},
        "Laboratory Graphs": {"Chemistry": {}, "CBC": {}, "Endocrinology": {}, "Urinalysis": {}, "Heartworm Test": {}, "Fecal Results": {}, "Other Tests": {}}
    }

    # Regular expressions for matching categories and extracting data
    # Client Information
    client_info_pattern = r"Client Information.*?((Name:.*?)(Phone:.*?)(Address:.*?)(Email:.*?))"
    client_info_match = re.search(client_info_pattern, text_data, re.DOTALL)
    if client_info_match:
        summary["Client Information"] = { 
            "Name": client_info_match.group(2).split(":")[1].strip(),
            "Phone": client_info_match.group(3).split(":")[1].strip(),
            "Address": client_info_match.group(4).split(":")[1].strip(),
            "Email": client_info_match.group(5).split(":")[1].strip()
        }

    # Patient Information
    patient_info_pattern = r"Patient Information.*?((Name:.*?)(Breed:.*?)(DOB or Age:.*?)(Gender:.*?)(Microchip Number:.*?))"
    patient_info_match = re.search(patient_info_pattern, text_data, re.DOTALL)
    if patient_info_match:
        summary["Patient Information"] = {
            "Name": patient_info_match.group(2).split(":")[1].strip(),
            "Breed": patient_info_match.group(3).split(":")[1].strip(),
            "DOB or Age": patient_info_match.group(4).split(":")[1].strip(),
            "Gender": patient_info_match.group(5).split(":")[1].strip(),
            "Microchip Number": patient_info_match.group(6).split(":")[1].strip()
        }

    # Veterinary Clinic Information
    clinic_info_pattern = r"Veterinary Clinic Information.*?((Visit Date:.*?)(Clinic Name:.*?)(Phone Number:.*?))"
    clinic_info_match = re.search(clinic_info_pattern, text_data, re.DOTALL)
    if clinic_info_match:
        summary["Veterinary Clinic Information"] = {
            "Visit Date": clinic_info_match.group(2).split(":")[1].strip(),
            "Clinic Name": clinic_info_match.group(3).split(":")[1].strip(),
            "Phone Number": clinic_info_match.group(4).split(":")[1].strip()
        }

    # Vaccinations
    vaccinations_pattern = r"Vaccinations.*?((Vaccine:.*?)(Date:.*?)(Manufacturer tag number:.*?))"
    vaccinations_match = re.search(vaccinations_pattern, text_data, re.DOTALL)
    if vaccinations_match:
        summary["Vaccinations"] = {
            "Vaccine": vaccinations_match.group(2).split(":")[1].strip(),
            "Date": vaccinations_match.group(3).split(":")[1].strip(),
            "Manufacturer tag number": vaccinations_match.group(4).split(":")[1].strip()
        }

    # Medical Information
    medical_info_pattern = r"Medical information.*?((Marker:.*?)(Weight:.*?)(Date \d+:.*?))"
    medical_info_match = re.search(medical_info_pattern, text_data, re.DOTALL)
    if medical_info_match:
        summary["Medical Information"] = {
            "Marker": medical_info_match.group(2).split(":")[1].strip(),
            "Weight": medical_info_match.group(3).split(":")[1].strip(),
            "Dates": [medical_info_match.group(i).split(":")[1].strip() for i in range(4, 9)]
        }

    # Patient Alerts
    alerts_pattern = r"Patient Alerts.*?((Alert 1:.*?)(Alert 2:.*?))"
    alerts_match = re.search(alerts_pattern, text_data, re.DOTALL)
    if alerts_match:
        summary["Patient Alerts"] = {
            "Alert 1": alerts_match.group(2).split(":")[1].strip(),
            "Alert 2": alerts_match.group(3).split(":")[1].strip()
        }

    # Medications
    medications_pattern = r"Medications.*?((Medicine Name:.*?)(Strength:.*?)(Number:.*?)(RX description:.*?))"
    medications_match = re.search(medications_pattern, text_data, re.DOTALL)
    if medications_match:
        summary["Medications"] = {
            "Medicine Name": medications_match.group(2).split(":")[1].strip(),
            "Strength": medications_match.group(3).split(":")[1].strip(),
            "Number": medications_match.group(4).split(":")[1].strip(),
            "RX description": medications_match.group(5).split(":")[1].strip()
        }

    # Laboratory Graphs (CBC, Chemistry, etc.)
    lab_graphs_patterns = {
        "Chemistry": r"Chemistry.*?((Marker:.*?)(Date \d+:.*?))",
        "CBC": r"CBC.*?((Marker:.*?)(Date \d+:.*?))",
        "Endocrinology": r"Endocrinology.*?((Marker:.*?)(Date \d+:.*?))",
        "Urinalysis": r"Urinalysis.*?((Marker:.*?)(Date \d+:.*?))",
        "Heartworm test": r"Heartworm test.*?((Marker:.*?)(Date \d+:.*?))",
        "Fecal results": r"Fecal results.*?((Marker:.*?)(Date \d+:.*?))",
        "Other Tests": r"Other Tests.*?((Marker:.*?)(Date \d+:.*?))"
    }

    for test, pattern in lab_graphs_patterns.items():
        match = re.search(pattern, text_data, re.DOTALL)
        if match:
            summary["Laboratory Graphs"][test] = {
                "Marker": match.group(1).split(":")[1].strip(),
                "Dates": [match.group(i).split(":")[1].strip() for i in range(2, 8)]
            }

    return summary

# Main function to extract and summarize the PDF text
pdf_path = "C:\\Users\\DELL\\Downloads\\sample_input.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
summarized_data = summarize_health_summary(extracted_text)

# Print summarized data
for section, data in summarized_data.items():
    print(f"{section}:")
    for key, value in data.items():
        print(f"  {key}: {value}")
    print("\n")


Client Information:
  Name: 
  Phone: 
  Address: 
  Email: 


Patient Information:
  Name: 
  Breed: 
  DOB or Age: 
  Gender: 
  Microchip Number: 


Veterinary Clinic Information:
  Visit Date: 
  Clinic Name: 
  Phone Number: 


Vaccinations:
  Vaccine: 
  Date: 
  Manufacturer tag number: 


Medical Information:
  Marker: 
  Dates: []
  Weight: 


Patient Alerts:
  Alert 1: 
  Alert 2: 


Medications:
  Medicine Name: 
  Strength: 
  Number: 
  RX description: 


Laboratory Graphs:
  Chemistry: {}
  CBC: {}
  Endocrinology: {}
  Urinalysis: {}
  Heartworm Test: {}
  Fecal Results: {}
  Other Tests: {}




In [None]:
# Step 1: Define Functions for Each Stage of the Pipeline

def load_data_from_file(file_path):
    """Load raw data from a text file."""
    with open(file_path, 'r') as file:
        return file.read()

def process_data(raw_data):
    """Process the raw data and return a structured dictionary."""
    data_lines = raw_data.splitlines()
    
    # Debugging: Print the data_lines to check the structure
    print("Data Lines:")
    for line in data_lines:
        print(line)
    
    if len(data_lines) < 18:  # Check if the data has enough lines
        raise ValueError("Insufficient data in the input file. Expected at least 18 lines.")

    # Process client info
    client_info = {
        'name': data_lines[0].split(":")[1].strip() if len(data_lines) > 0 else '',
        'phone': data_lines[1].split(":")[1].strip() if len(data_lines) > 1 else '',
        'address': data_lines[2].split(":")[1].strip() if len(data_lines) > 2 else '',
        'email': data_lines[3].split(":")[1].strip() if len(data_lines) > 3 else ''
    }
    
    # Process patient info
    patient_info = {
        'name': data_lines[5].split(":")[1].strip() if len(data_lines) > 5 else '',
        'breed': data_lines[6].split(":")[1].strip() if len(data_lines) > 6 else '',
        'dob': data_lines[7].split(":")[1].strip() if len(data_lines) > 7 else '',
        'gender': data_lines[8].split(":")[1].strip() if len(data_lines) > 8 else '',
        'microchip_number': data_lines[9].split(":")[1].strip() if len(data_lines) > 9 else ''
    }
    
    # Process clinic info
    clinic_info = {
        'visit_date': data_lines[11].split(":")[1].strip() if len(data_lines) > 11 else '',
        'clinic_name': data_lines[12].split(":")[1].strip() if len(data_lines) > 12 else '',
        'phone_number': data_lines[13].split(":")[1].strip() if len(data_lines) > 13 else ''
    }
    
    # Process vaccination info
    vaccinations = {
        'vaccine': data_lines[15].split(":")[1].strip() if len(data_lines) > 15 else '',
        'date': data_lines[16].split(":")[1].strip() if len(data_lines) > 16 else '',
        'manufacturer_tag': data_lines[17].split(":")[1].strip() if len(data_lines) > 17 else ''
    }
    
    return {
        'client_info': client_info,
        'patient_info': patient_info,
        'clinic_info': clinic_info,
        'vaccinations': vaccinations
    }


def generate_summary(data):
    """Generate a dynamic summary based on the processed data."""
    client_info = data['client_info']
    patient_info = data['patient_info']
    clinic_info = data['clinic_info']
    vaccinations = data['vaccinations']
    
    summary = f"""
    Client Information:
    Name: {client_info['name']}
    Phone: {client_info['phone']}
    Address: {client_info['address']}
    Email: {client_info['email']}

    Patient Information:
    Name: {patient_info['name']}
    Breed: {patient_info['breed']}
    DOB: {patient_info['dob']}
    Gender: {patient_info['gender']}
    Microchip Number: {patient_info['microchip_number']}

    Veterinary Clinic Information:
    Visit Date: {clinic_info['visit_date']}
    Clinic Name: {clinic_info['clinic_name']}
    Phone Number: {clinic_info['phone_number']}

    Vaccinations:
    Vaccine: {vaccinations['vaccine']}
    Date: {vaccinations['date']}
    Manufacturer tag number: {vaccinations['manufacturer_tag']}
    """
    
    return summary

def save_summary_to_file(summary, file_path):
    """Save the generated summary to a file."""
    with open(file_path, 'w') as file:
        file.write(summary)

# Step 2: Define the Pipeline

def health_summary_pipeline(input_file, output_file):
    """Main pipeline for loading data, processing, and generating summary."""
    raw_data = load_data_from_file(input_file)  # Load raw data
    processed_data = process_data(raw_data)    # Process the data into a dictionary
    summary = generate_summary(processed_data) # Generate the summary text
    save_summary_to_file(summary, output_file)  # Save the summary to a file
    
    return summary

# Step 3: Run the Pipeline
if __name__ == "__main__":
    input_file = "C:\\Users\\DELL\\Downloads\\\Medical_Chart.txt"   # Input file containing raw data
    output_file = 'generated_summary.txt'  # Output file where the summary will be saved

    # Execute the pipeline
    generated_summary = health_summary_pipeline(input_file, output_file)
    
    # Optionally, print the generated summary to the console
    print(generated_summary)


IndexError: list index out of range