In [None]:
!pip install PyMuPDF
import fitz  # PyMuPDF
from PIL import Image
import io
import os


def extract_images(file_path) -> "list[list[Image.Image]]":
    """
    Extract the images embedded in a PDF, grouped by page.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A list with one entry per page; entry ``i`` is the list of
        ``PIL.Image`` objects collected on page ``i``. An image that is
        referenced from several pages (same xref) is returned only for the
        first page that references it.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Файл {file_path} не найден")

    # The context manager closes the document even when a page fails to parse.
    with fitz.open(file_path) as doc:
        images = [[] for _ in range(len(doc))]
        seen_xrefs = set()  # xrefs already extracted, to avoid duplicates

        for page_num, page in enumerate(doc):
            for img_info in page.get_images(full=True):
                xref = img_info[0]  # first field of the image entry is its xref
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                try:
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"]))
                    images[page_num].append(image)
                except Exception as e:
                    # Best effort: report the broken image and keep going.
                    print(f"Ошибка при извлечении изображения (xref={xref}): {e}")

    return images



In [None]:
images = extract_images("/content/RNN.pdf")

# Save the images. The file name carries both the page index and the image's
# position on that page, so several images from one page no longer
# overwrite each other.
for page_idx, page_images in enumerate(images):
    for img_idx, image in enumerate(page_images):
        image.save(f"image_{page_idx}_{img_idx}.png")

In [None]:
# NOTE: pytesseract is installed and imported by the setup cell that defines
# extract_text; run that cell first, otherwise the second line raises NameError.
# VersionBind is the plain version string; parsing fitz.__doc__ left a
# trailing ':' in the printed version.
print(f"PyMuPDF version: {fitz.VersionBind}")
print(f"pytesseract version: {pytesseract.__version__}")

PyMuPDF version: 1.25.5:
pytesseract version: 0.3.13


In [6]:
# Refresh the apt package index, then install the Tesseract OCR engine,
# its development package and the language data for the "rus+eng" OCR call.
!sudo apt update
!sudo apt install tesseract-ocr -y
!sudo apt install libtesseract-dev -y
!sudo apt install tesseract-ocr-eng -y  # English language data
!sudo apt install tesseract-ocr-rus -y  # Russian language data
# Python packages: PyMuPDF (imported as fitz) and pytesseract.
%pip install PyMuPDF pytesseract
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os
import json
"""
This cell is written for Google Colab.

"""

def _first_text_span(page) -> str:
    """Return the first non-blank text span of ``page`` (stripped), or ""."""
    for block in page.get_text("dict")["blocks"]:
        # Only blocks that have "lines" and type 0 are searched.
        if "lines" not in block or block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"].strip()
                if text:
                    return text
    return ""


def extract_text(pres_path) -> None:
    """
    Extract the text of a PDF presentation and dump it to a JSON file.

    The JSON file is written to the current working directory and is named
    after the PDF (``deck.pdf`` -> ``deck.json``). Its structure is:
          "amount_slides": int - number of slides (pages),
          "titles": list - title of each page (its first non-blank text span),
          "bodies": list - page text with the first occurrence of the title removed,
          "text_images": list[list] - per page, the non-empty texts recognized
                                      (OCR) in the images on that page

    Args:
        pres_path: Path to the PDF file.

    Returns:
        None; the result is only written to disk.
    """
    pdf_filename = os.path.basename(pres_path)
    json_filename = os.path.splitext(pdf_filename)[0] + ".json"
    output = {
        "amount_slides": 0,
        "titles": [],
        "bodies": [],
        "text_images": []
    }
    # OCR is expensive and the same image (same xref) may appear on several
    # slides, so each image is recognized once and the text is reused.
    ocr_cache = {}

    # The context manager closes the document even if a page raises.
    with fitz.open(pres_path) as doc:
        output["amount_slides"] = len(doc)

        for page_num, page in enumerate(doc):
            # Title: first non-blank span of the page.
            title = _first_text_span(page)
            output["titles"].append(title)

            # Body: whole page text minus the first occurrence of the title.
            full_text = page.get_text().strip()
            body = full_text.replace(title, "", 1).strip() if title else full_text
            output["bodies"].append(body)

            # Text recognized in the images of the current slide.
            slide_images_text = []
            for img_info in page.get_images(full=True):
                xref = img_info[0]
                try:
                    if xref not in ocr_cache:
                        base_image = doc.extract_image(xref)
                        img = Image.open(io.BytesIO(base_image["image"]))
                        ocr_cache[xref] = pytesseract.image_to_string(
                            img, lang="rus+eng"
                        ).strip()
                    text = ocr_cache[xref]
                    if text:
                        slide_images_text.append(text)
                except Exception as e:
                    # Best effort: failures are not cached, so the image is
                    # retried (and reported) on every slide that uses it.
                    print(f"Ошибка в слайде {page_num + 1}: {e}")

            output["text_images"].append(slide_images_text)

    # Save as JSON (UTF-8, non-ASCII characters kept readable).
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 257 kB in 1s (183 kB/s)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
262 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources'

In [7]:
# Run the extraction on the sample deck; the result is written to RNN.json
# in the current working directory (the function itself returns None).
extract_text('/content/RNN.pdf')