# In [None]:
import os
from PyPDF2 import PdfReader
import openai
import re
import csv
import os

# Set your OpenAI API key through the OPENAI_API_KEY environment variable so
# the secret does not have to be stored in this file. The original placeholder
# is kept as the fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API HERE")

def _first_token_after(text, label):
    """Return the first whitespace-delimited token that follows *label*.

    Returns an empty string when *label* is absent or when nothing follows it,
    so a label at the very end of a page does not raise IndexError.
    """
    pieces = text.split(label)
    if len(pieces) < 2:
        return ""
    tokens = pieces[1].split()
    return tokens[0] if tokens else ""


def extract_pdf_data(pdf_file_path):
    """Extract the text and basic header fields from a PDF file.

    The text of every page is collected into ``detected_text``. The first
    "Chart No.:" value found fills ``chart_number`` (prefixed with "SLC" when
    it does not already start with it) and the first "Visit Date:" value found
    fills ``progress_note_date``.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        A dict with the keys ``filename``, ``chart_number``,
        ``progress_note_date`` and ``detected_text``, or ``None`` when the
        file could not be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_file_obj:
            pdf_reader = PdfReader(pdf_file_obj)
            extracted_data = {
                "filename": os.path.basename(pdf_file_path),
                "chart_number": "",
                "progress_note_date": "",
                "detected_text": ""
            }
            page_texts = []
            for page in pdf_reader.pages:
                # Defensive: treat a page that yields no text as empty rather
                # than failing the whole document on None.
                page_text = page.extract_text() or ""
                page_texts.append(page_text)

                # Keep looking on later pages until a non-empty value is found.
                if not extracted_data["chart_number"]:
                    chart_number = _first_token_after(page_text, "Chart No.:")
                    if chart_number:
                        if not chart_number.startswith("SLC"):
                            chart_number = "SLC" + chart_number
                        extracted_data["chart_number"] = chart_number
                if not extracted_data["progress_note_date"]:
                    extracted_data["progress_note_date"] = _first_token_after(
                        page_text, "Visit Date:"
                    )

            # Each page is followed by a blank line, then every blank line is
            # flattened to a single space (simple text cleaning).
            joined_text = "".join(text + '\n\n' for text in page_texts)
            extracted_data["detected_text"] = joined_text.replace('\n\n', ' ')
            return extracted_data
    except Exception as e:
        # Batch boundary: report the failure and let the caller skip this file.
        print(f"Error processing PDF: {pdf_file_path} - {e}")
        return None

def generate_openai_query(extracted_data):
    """Return the fixed instruction prompt sent to the LLM with each note.

    The prompt asks for the ICD-10, CPT, CPT II and HCPCS codes found in the
    note, for additionally created codes, and for a fixed output layout.
    ``extracted_data`` is accepted but not used: the same text is returned for
    every call.
    """
    prompt = """Read the provided medical note and extract the following information:
1. **ICD-10 Codes**: Extract all ICD-10 codes listed in the note.
2. **CPT Codes**: Extract all CPT codes listed in the note.
3. **CPT II Codes**: Extract all CPT II codes listed in the note.
4. **HCPCS Codes**: Extract all HCPCS codes listed in the note.

Additional Processing:
1. **Create ICD-10 Codes**: If there is mention of mastectomy, hysterectomy, active diagnoses of cancer, or social determinants of health, extract and list any newly created ICD-10 codes.
2. **Create CPT Codes**: Add any CPT codes not already found in the note.
3. **Create CPT II Codes**: 
    - Include one CPT II code for systolic and one for diastolic blood pressure if there is a blood pressure measurement mentioned in the note. 
    - Ensure there are always 2 CPT II codes for blood pressure, starting with 3 and ending with an F.
    - Create a CPT II code if there is a mention of Hemoglobin A1C levels.

Format the output exactly like this:

File: [Filename]
Chart Number: [Chart Number]
Date of Encounter: [Date of Encounter]
ICD-10 Codes: [ICD-10 Codes]
CPT Codes: [CPT Codes]
CPT II Codes: [CPT II Codes]
HCPCS Codes: [HCPCS Codes]
Create ICD-10 Codes: [Newly Created ICD-10 Codes]
Create CPT Codes: [Newly Created CPT Codes]
Create CPT II Codes: [Newly Created CPT II Codes]

Make sure to replace placeholders with the actual extracted data and ensure that all fields are filled out appropriately."""
    return prompt

def process_openai_response(extracted_data, response):
    """Build the text written to the per-PDF output file.

    The result is a three-line header (file name, chart number, progress note
    date taken from ``extracted_data``), followed by the content of the first
    choice in ``response`` and a dashed separator line.
    """
    header = (
        f"File: {extracted_data['filename']}\n"
        f"Chart Number: {extracted_data['chart_number']}\n"
        f"Progress Note Date: {extracted_data['progress_note_date']}\n"
    )
    reply_text = response['choices'][0]['message']['content']
    return header + reply_text + "\n--------------------\n"

# Define the directory containing PDF files
pdf_directory = "/Users/rolandomantilla/Desktop/Enc/Test/"

# Process each PDF file in the directory: extract its text, ask the model for
# the codes, and write the answer to "<pdf name>_output.txt" in the same
# directory.
for filename in os.listdir(pdf_directory):
    # Compare the extension case-insensitively so "NOTE.PDF" is not skipped.
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_file_path = os.path.join(pdf_directory, filename)
    extracted_data = extract_pdf_data(pdf_file_path)
    if not extracted_data:
        # extract_pdf_data has already printed the error for this file.
        continue
    if not extracted_data["detected_text"].strip():
        # Without any note text the request would contain only the
        # instructions, so there is nothing for the model to extract from.
        print(f"No text extracted from PDF: {pdf_file_path} - skipping")
        continue

    user_msg = extracted_data["detected_text"] + " " + generate_openai_query(extracted_data)

    # Call OpenAI API for completion
    try:
        response = openai.ChatCompletion.create(
            # Placeholder: replace with the name of a real chat model.
            model="use an open AI model",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_msg},
            ],
        )
        output_content = process_openai_response(extracted_data, response)

        # Create a distinct text file for each PDF
        output_filename = f"{os.path.splitext(filename)[0]}_output.txt"
        output_file_path = os.path.join(pdf_directory, output_filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(output_content)

    except openai.error.APIError as e:
        print(f"OpenAI API Error: {e}")
    except openai.error.APIConnectionError as e:
        print(f"OpenAI API Connection Error: {e}")
    except openai.error.OpenAIError as e:
        # Any other API failure (rate limit, invalid request, authentication,
        # ...) should skip this file instead of aborting the whole batch.
        print(f"OpenAI Error: {e}")

print("Text files generated successfully!")

# Define the directory containing text files
text_directory = "/PATH/"

# Chart number, e.g. "Chart Number: SLC12345" or "Chart No.: SCL12345".
# The PDF extraction step in this file prefixes chart numbers with "SLC", so
# that spelling is accepted in addition to the original "SCL".
chart_no_pattern = re.compile(r'[\-\*\s]*[Cc]hart\s*[Nn](?:o|umber)[\.:]*\s*[^\w]*(S(?:CL|LC)\d{5})')
# First MM-DD-YYYY style date that follows an encounter/visit label.
visit_date_pattern = re.compile(r'\b(?:Date of Encounter|Visit|Encounter)\b.*?(\d{2}-\d{2}-\d{4})', re.IGNORECASE)

# Patterns to extract the first occurrence of codes.
# The "(?<!Create )" lookbehind keeps the plain patterns from matching inside
# the "Create ... Codes" lines; otherwise an empty or missing plain line would
# silently report the created codes in the plain column.
icd10_pattern = re.compile(r'(?<!Create )ICD-10 Codes[:\*\s]*([\w,. ]+?)(?=(\n|$))', re.IGNORECASE)
cpt_pattern = re.compile(r'(?<!Create )CPT Codes[:\*\s]*([\w,. ]+?)(?=(\n|$))', re.IGNORECASE)
cptii_pattern = re.compile(r'(?<!Create )CPT II Codes[:\*\s]*([\w,. ]+?)(?=(\n|$))', re.IGNORECASE)
hcpcs_pattern = re.compile(r'(?<!Create )HCPCS Codes[:\*\s]*([\w,. ]+?)(?=(\n|$))', re.IGNORECASE)
# Patterns for the newly created codes ("Create ..." lines).
new_icd10_pattern = re.compile(r'Create ICD-10 Codes[:\*\s]*([\w,. ]+?)(?=(\n|$))', re.IGNORECASE)
new_cpt_pattern = re.compile(r'Create CPT Codes[:\*\s]*([\w,. ]+?)(?=(\n|$))', re.IGNORECASE)
# Created CPT II codes must be a comma-separated list of four digits plus "F".
new_cptii_pattern = re.compile(r'Create CPT II Codes:\s*((?:\d{4}F(?:,\s*)?)+)', re.IGNORECASE)



# Handle alternate formats for data extraction
def extract_data_from_text(text):
    """Pull one CSV row of fields out of a generated output text.

    Each module-level pattern is searched once in ``text``; the first capture
    group of a match is used, and a field with no match becomes ''.

    Returns:
        A list of nine strings in this order: chart number, visit date,
        ICD-10 codes, CPT codes, CPT II codes, HCPCS codes, created ICD-10
        codes, created CPT codes, created CPT II codes.
    """
    def first_group(pattern, strip=True):
        # First capture group of the first match, or '' when nothing matches.
        found = pattern.search(text)
        if not found:
            return ''
        value = found.group(1)
        return value.strip() if strip else value

    row = [first_group(chart_no_pattern)]
    # The date is taken exactly as captured, without stripping.
    row.append(first_group(visit_date_pattern, strip=False))
    for code_pattern in (
        icd10_pattern,
        cpt_pattern,
        cptii_pattern,
        hcpcs_pattern,
        new_icd10_pattern,
        new_cpt_pattern,
        new_cptii_pattern,
    ):
        row.append(first_group(code_pattern))
    return row

# Destination CSV: a header row, then one row per ".txt" file in text_directory
output_csv_file = "/PATH/extracted_data1.csv"
with open(output_csv_file, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([
        'Chart Number',
        'Date of Encounter',
        'ICD-10 Codes',
        'CPT Codes',
        'CPT II Codes',
        'HCPCS Codes',
        'Created ICD-10 Codes',
        'Created CPT Codes',
        'Created CPT II Codes',
    ])

    # Walk the directory and turn every text file into one CSV row
    for filename in os.listdir(text_directory):
        if not filename.endswith(".txt"):
            continue
        text_file_path = os.path.join(text_directory, filename)
        try:
            with open(text_file_path, 'r', encoding='utf-8') as text_file:
                text = text_file.read()
            extracted_data = extract_data_from_text(text)
            writer.writerow(extracted_data)
        except Exception as e:
            # A failing file is reported and skipped; the remaining files are
            # still processed.
            print(f"Error processing file {filename}: {e}")

print("Data extraction and CSV file generation completed successfully!")
