In [10]:
pip uninstall frontend -y

Found existing installation: frontend 0.0.3
Uninstalling frontend-0.0.3:
  Successfully uninstalled frontend-0.0.3
Note: you may need to restart the kernel to use updated packages.


In [18]:
import os
import pdfplumber
import pypdfium2
import pdfminer.high_level
from llama_index.core.readers import download_loader

# Define PDF path
pdf_path = "/Users/sivaguganjayachandran/Documents/python programming/Undergraduate_Chemistry/01/pdf_01.pdf"

# Verify file exists
if not os.path.exists(pdf_path):
    raise FileNotFoundError("The specified PDF file does not exist.")

# Get directory of the PDF file
output_dir = os.path.dirname(pdf_path)

# Dictionary to store extracted text
extracted_text = {}

## Unstructured (Requires setup)
try:
    UnstructuredReader = download_loader("UnstructuredReader")
    loader = UnstructuredReader()
    docs = loader.load_data(pdf_path)
    extracted_text["Unstructured"] = "\n".join([doc.text for doc in docs])
except Exception as e:
    extracted_text["Unstructured"] = f"Error: {e}"

## PDFPlumber
try:
    with pdfplumber.open(pdf_path) as pdf:
        text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
        extracted_text["PDFPlumber"] = text
except Exception as e:
    extracted_text["PDFPlumber"] = f"Error: {e}"

## PDFMiner
try:
    text = pdfminer.high_level.extract_text(pdf_path)
    extracted_text["PDFMiner"] = text
except Exception as e:
    extracted_text["PDFMiner"] = f"Error: {e}"

## PyPDFium2
try:
    pdf = pypdfium2.PdfDocument(pdf_path)
    text = "\n".join([page.get_textpage().get_text_range() for page in pdf])
    extracted_text["PyPDFium2"] = text
except Exception as e:
    extracted_text["PyPDFium2"] = f"Error: {e}"

# Save results to individual Markdown files
for loader, text in extracted_text.items():
    output_file = os.path.join(output_dir, f"{loader}.md")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(f"## {loader} Extraction\n\n{text}\n")

print(f"Extraction complete. Results saved in {output_dir}.")

  UnstructuredReader = download_loader("UnstructuredReader")


Extraction complete. Results saved in /Users/sivaguganjayachandran/Documents/python programming/Undergraduate_Chemistry/01.


In [12]:
pip install pymupdf

Note: you may need to restart the kernel to use updated packages.


In [22]:
path = "/Users/sivaguganjayachandran/Documents/python programming/Undergraduate_Chemistry/01/"

print(os.listdir(path))

['.Rhistory', 'PDFPlumber.md', 'PyPDFium2.md', 'Unstructured.md', 'PDFMiner.md', 'pdf_01.pdf']


325321

In [23]:
import os
import openai

# Set your OpenAI API key
#openai.api_key = "123"  # Replace with your actual API key

# Define directory path
path = "/Users/sivaguganjayachandran/Documents/python programming/Undergraduate_Chemistry/01/"

# Identify the PDF and Markdown files
pdf_file = os.path.join(path, "pdf_01.pdf")
md_files = [f for f in os.listdir(path) if f.endswith(".md")]

# Read the original PDF content
with open(pdf_file, "rb") as f:
    pdf_content = f.read()  # Keeping it as raw binary (GPT-4 can't read PDFs directly)


In [42]:
import os
import openai

# Set up OpenAI client with API key
client = openai.OpenAI(api_key=key)  # Replace with your actual API key

# Define directory path
path = "/Users/sivaguganjayachandran/Documents/python programming/Undergraduate_Chemistry/01/"

# Identify the PDF and Markdown files
pdf_file = os.path.join(path, "pdf_01.pdf")
md_files = [f for f in os.listdir(path) if f.endswith(".md")]

# Markdown file to store results
output_md = os.path.join(path, "GPT4_PDF_Conversion_Assessment.md")

# Set a limit on characters to avoid exceeding token limits
MAX_CHARACTERS = 5000  # Adjust if needed

# Function to read Markdown file
def read_markdown(file_path, max_chars=MAX_CHARACTERS):
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
        return content[:max_chars]  # Truncate to avoid token overflow

# Store all results in a list
results = []

# Iterate through each Markdown file and compare it with the original PDF
for md_file in md_files:
    md_content = read_markdown(os.path.join(path, md_file))

    # Create a prompt for GPT-4
    prompt = f"""
    You are an expert in document conversion quality assessment.
    Evaluate the quality of the Markdown file conversion.

    **Key aspects to evaluate (score: 0-10 for each):**
    1. **Text Accuracy:** How correctly is the text represented?
    2. **Structural Preservation:** Are headings, paragraphs, lists, and indentation well-maintained?
    3. **Content Capture:** How well are tables, formulas, and figures retained?
    4. **Readability & Formatting:** Is the Markdown output clean, structured, and easy to read?
    5. **Overall Quality Score:** Mean of the above four scores.

    **Converted Markdown ({md_file}) (Truncated to 5000 characters):**
    {md_content}

    Please assign a quality score (0-10) for each category. please only provide a score for all markdowns in a tabular format. 
    no need for any explanation
    """

    # Send to OpenAI API (Updated format)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an expert in document conversion quality assessment."},
            {"role": "user", "content": prompt}
        ]
    )

    # Extract response content
    assessment = response.choices[0].message.content

    # Print assessment
    print(f"\n### Assessment for {md_file} ###\n")
    print(assessment)

    # Save to results list
    results.append(f"## Assessment for {md_file}\n\n{assessment}\n\n")

# Save results to a Markdown file
with open(output_md, "w", encoding="utf-8") as f:
    f.writelines(results)

print(f"\nAssessment complete! Results saved to {output_md}")


### Assessment for PDFPlumber.md ###

|    Category    | Score |
| -------------- | ------ |
| Text Accuracy | 9 |
| Structural Preservation | 8 |
| Content Capture | 7 |
| Readability & Formatting | 8 |
| Overall Quality Score | 8 |

### Assessment for PyPDFium2.md ###

| Category                              | Score |
|-------------------------------------------|-------|
| Text Accuracy                              | 9.5   |
| Structural Preservation                | 9.5   |
| Content Capture                         | 7     |
| Readability & Formatting             | 9     |
| Overall Quality Score                  | 8.75  |

### Assessment for Unstructured.md ###

Without having an original document to compare with and without further information about the converted Markdown, a comprehensive assessment cannot be provided. However, as a placeholder, I can provide a general insight. 

| Aspect | Quality Score |
| ------ | ------------- |
| Text Accuracy | - |
| Structural Preservation