In [None]:
# Cleaned-up version of the notebook, focused on experimentation and documentation.
# All redundant/conflicting code and all `%%writefile app.py` cells have been removed.
#
# This version:
# - Does not overwrite app.py
# - Removes all system package install cells
# - Uses only pure-Python and API-based OCR code
# - Keeps only the package installs relevant for notebook use
# - Keeps the Streamlit pages code generation, if needed


In [None]:
# Install only if not already in requirements.txt
# %pip install pymupdf easyocr
import io
from pathlib import Path

import easyocr
import fitz  # PyMuPDF
import numpy as np
from PIL import Image

In [None]:
# PDF to image using PyMuPDF
def pdf_to_img(pdf_file, dpi=None):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is rewound with
            ``seek(0)`` and then read in full, so its current position does
            not matter.
        dpi: Optional rendering resolution forwarded to ``page.get_pixmap``.
            ``None`` (the default) calls ``get_pixmap()`` with no arguments,
            i.e. PyMuPDF's default resolution. Higher values presumably help
            OCR accuracy on small print - verify on your documents.

    Returns:
        list: One ``PIL.Image.Image`` per page, in page order.
    """
    pdf_file.seek(0)
    # Only forward ``dpi`` when the caller asked for it, so the default call
    # is exactly the argument-less ``get_pixmap()`` used before.
    pixmap_kwargs = {} if dpi is None else {"dpi": dpi}

    images = []
    # The context manager closes the document even if rendering a page fails;
    # previously the document was never closed.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(**pixmap_kwargs)
            img_bytes = pix.tobytes("png")
            # The PNG bytes are self-contained, so the image stays usable
            # after the document has been closed.
            images.append(Image.open(io.BytesIO(img_bytes)))
    return images

In [None]:
# OCR with easyocr: one shared English-language reader, created once at
# module level and reused by ocr_with_easyocr for every image.
reader = easyocr.Reader(lang_list=['en'])
def ocr_with_easyocr(image):
    """Run the shared EasyOCR ``reader`` on one image and return its text.

    Args:
        image: Either a PIL image (as produced by ``pdf_to_img``) or any
            input ``reader.readtext`` accepts directly (file path/URL,
            bytes, or numpy array). Non-PIL inputs are passed through
            unchanged.

    Returns:
        str: The recognized text fragments joined with newlines; an empty
        string when nothing is recognized.
    """
    if isinstance(image, Image.Image):
        # readtext is documented for paths, bytes and numpy arrays, not for
        # PIL images, so convert first. Forcing RGB normalizes palette,
        # grayscale and alpha modes to three channels.
        rgb = np.asarray(image.convert("RGB"))
        # NOTE(review): EasyOCR appears to treat 3-channel arrays as BGR
        # (OpenCV convention), hence the channel reversal - confirm against
        # the installed version.
        image = np.ascontiguousarray(rgb[:, :, ::-1])
    # detail=0 makes readtext return plain strings instead of
    # (box, text, confidence) tuples.
    result = reader.readtext(image, detail=0)
    return "\n".join(result)

In [None]:
# Extract text from PDF using easyocr
def extract_text_from_pdf(pdf_file):
    """OCR every page of a PDF and return the combined text.

    Args:
        pdf_file: Binary file-like object holding the PDF; it is handed
            straight to ``pdf_to_img``.

    Returns:
        str: The text of each page followed by a blank line ("\\n\\n"), in
        page order. Empty string when the PDF yields no pages.
    """
    page_images = pdf_to_img(pdf_file)
    # Each page's text keeps its own trailing blank-line separator.
    page_texts = [ocr_with_easyocr(page_image) + "\n\n" for page_image in page_images]
    return "".join(page_texts)

## Example usage
Open a PDF in binary mode and pass the file object to `extract_text_from_pdf` to get its text, as in the example below.

In [None]:
# Example: Use extract_text_from_pdf with a file
# with open('yourfile.pdf', 'rb') as f:
#     text = extract_text_from_pdf(f)
#     print(text)

## Streamlit Pages (Optional)
If needed, the cells below generate the optional Streamlit pages `pages/AboutUs.py` and `pages/Methodology.py`.

In [None]:
# Example: Write AboutUs page
# Create the target directory first so this cell also works on a fresh
# checkout where `pages/` does not exist yet (a plain open(..., 'w') would
# raise FileNotFoundError). Re-running the cell overwrites the page.
about_us_path = Path('pages/AboutUs.py')
about_us_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8 so the generated source does not depend on the platform's
# default encoding.
with about_us_path.open('w', encoding='utf-8') as f:
    f.write('''
import streamlit as st

st.title("About Us")

st.write("""
This tool extracts text from PDF documents using OCR.
Features:
- Upload multiple PDFs
- Summarize extracted text
- Chat with your documents (RAG)
""")
''')

In [None]:
# Example: Write Methodology page
# Create the target directory first so this cell also works on a fresh
# checkout where `pages/` does not exist yet (a plain open(..., 'w') would
# raise FileNotFoundError). Re-running the cell overwrites the page.
methodology_path = Path('pages/Methodology.py')
methodology_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8 so the generated source does not depend on the platform's
# default encoding.
with methodology_path.open('w', encoding='utf-8') as f:
    f.write('''
import streamlit as st
import graphviz

st.title("Methodology")

st.header("Process Flowchart")

graph = graphviz.Digraph()
graph.node("A", "User uploads PDF(s)")
graph.node("B", "Extract text using OCR")
graph.node("C", "Display extracted text")
graph.node("D", "Summarize the text")
graph.node("E", "Display summary")
graph.node("F", "Chat with documents (RAG)")
graph.edge("A", "B")
graph.edge("B", "C")
graph.edge("C", "D")
graph.edge("D", "E")
graph.edge("E", "F")
st.graphviz_chart(graph)
''')