In [1]:
# (Colab cell: change cell type to "Code" and paste)
# Install Tesseract and poppler (system packages)
!apt-get update -qq
!apt-get install -y -qq tesseract-ocr poppler-utils

# Install Python packages
!pip install -q pytesseract pdf2image Pillow


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 125081 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...


In [2]:
# (Code cell)
import os
import io
import tempfile
from typing import List
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Point pytesseract at the Tesseract executable explicitly.
# NOTE(review): /usr/bin/tesseract is assumed to be where the apt-installed
# tesseract-ocr package places the binary on Colab — adjust when running elsewhere.
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"

# Helper functions
def image_to_text(image_path: str, lang: str = "eng") -> str:
    """Extract text from an image file with Tesseract OCR.

    Args:
        image_path: Path to an image file that Pillow can open.
        lang: Tesseract language code forwarded to pytesseract.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open through a context manager so the underlying file handle is released
    # as soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=lang)

def pdf_to_text(pdf_path: str, lang: str = "eng", dpi: int = 300) -> str:
    """Convert a PDF to page images and OCR each page.

    Args:
        pdf_path: Path to the PDF file.
        lang: Tesseract language code forwarded to pytesseract.
        dpi: Resolution used when rasterizing the PDF pages.

    Returns:
        The OCR text of all pages, in page order, joined with newlines.
        An input that yields no pages produces an empty string.
    """
    # convert_from_path uses poppler-utils (installed above)
    pages = convert_from_path(pdf_path, dpi=dpi)
    texts: List[str] = []
    for page in pages:
        try:
            # pytesseract accepts a PIL image directly, so there is no need to
            # write each page to a temporary PNG and re-open it (the re-opened
            # image was never closed, leaking one file handle per page).
            texts.append(pytesseract.image_to_string(page, lang=lang))
        finally:
            # Release the page's pixel data promptly; 300-DPI pages are large.
            page.close()
    return "\n".join(texts)

def file_to_text(file_path: str, lang: str = "eng", dpi: int = 300) -> str:
    """Run OCR on a file, choosing the image or PDF path from its extension.

    Args:
        file_path: Path to the input file; only its extension is inspected here.
        lang: Tesseract language code forwarded to the OCR helper.
        dpi: Rasterization resolution, used for PDF input only.

    Returns:
        The text produced by ``image_to_text`` or ``pdf_to_text``.

    Raises:
        ValueError: If the (case-insensitive) extension is neither a supported
            image type nor ``.pdf``.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext == ".pdf":
        return pdf_to_text(file_path, lang=lang, dpi=dpi)

    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp")
    if ext in image_suffixes:
        return image_to_text(file_path, lang=lang)

    raise ValueError(f"Unsupported file type: {ext}")


In [3]:
# (Code cell)
from google.colab import files
import os

# Directory where uploaded files are stored for the OCR cells.
UPLOAD_DIR = "/content/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

print("Select one or more image/PDF files to upload (PNG, JPG, PDF, TIFF, BMP)...")
uploaded = files.upload()  # opens a file chooser

# Save uploaded files to UPLOAD_DIR; each item is written out in binary mode.
saved_paths = []
for name, data in uploaded.items():
    path = os.path.join(UPLOAD_DIR, name)
    with open(path, "wb") as f:
        f.write(data)
    saved_paths.append(path)

if saved_paths:
    print("\nSaved files:")
    for p in saved_paths:
        print(" -", p)
else:
    # A cancelled or empty upload used to print a bare "Saved files:" header,
    # which hid the fact that the following cells would have nothing to process.
    print("\nNo files were uploaded; re-run this cell and choose at least one file.")


Select one or more image/PDF files to upload (PNG, JPG, PDF, TIFF, BMP)...


Saving 2310030415_CO3_one min paper_DNA.pdf to 2310030415_CO3_one min paper_DNA.pdf

Saved files:
 - /content/uploads/2310030415_CO3_one min paper_DNA.pdf


In [4]:
# (Code cell)
from google.colab import files
import os
import zipfile

# OCR settings
LANG = "eng"  # Tesseract language code; change it if other language packs are installed
DPI = 300     # PDF rasterization resolution: higher helps OCR accuracy but is slower

# Paths of the .txt files written successfully, in processing order.
output_txts = []

for file_path in saved_paths:
    print(f"\nProcessing: {file_path}")
    try:
        text = file_to_text(file_path, lang=LANG, dpi=DPI)
    except Exception as err:
        # Best effort: report the failure and move on to the next file.
        print("  Error during OCR:", err)
    else:
        # The result is stored beside the upload, with ".txt" appended to its name.
        txt_path = file_path + ".txt"
        with open(txt_path, "w", encoding="utf-8") as sink:
            sink.write(text)
        output_txts.append(txt_path)
        print(f"  Extracted {len(text)} characters -> saved to {txt_path}")

# Trigger a browser download for every text file that was produced.
print("\nYou can download the extracted text files below:")
for result_path in output_txts:
    files.download(result_path)



Processing: /content/uploads/2310030415_CO3_one min paper_DNA.pdf
  Extracted 223 characters -> saved to /content/uploads/2310030415_CO3_one min paper_DNA.pdf.txt

You can download the extracted text files below:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
# (Code cell)
from IPython.display import Markdown, display

# Show the beginning of every extracted text file as a fenced Markdown block.
for preview_path in output_txts:
    print("\n===== Preview:", preview_path, "=====\n")
    with open(preview_path, "r", encoding="utf-8") as handle:
        snippet = handle.read(2000)  # at most the first 2000 characters
    title = os.path.basename(preview_path)
    display(Markdown(f"**{title}**\n\n```\n{snippet}\n```"))



===== Preview: /content/uploads/2310030415_CO3_one min paper_DNA.pdf.txt =====



**2310030415_CO3_one min paper_DNA.pdf.txt**

```
DNA 23100 30419

Co-3 a _—

ONE MINVTE PA PER

=> Dense Nk & Eficienk Architecture

dn hadibeonal NN )
> Cake Lage in, onl 4p the next
7

po de Dee NE

FR ae)
vTheve ,
a, 4 lp of yee L
Wy 7 Comp oxrte

[m4 % Came e ; a

 

```