In [6]:
import importlib.metadata

# Text shown in place of a version when a name has no installed distribution.
NOT_FOUND_MESSAGE = "Not found or version not available"


def package_version(dist_name: str) -> str:
    """Return the installed version of a distribution, or a placeholder text.

    Args:
        dist_name: Distribution name as published (e.g. ``"Pillow"``), which
            may differ from the name used in ``import`` statements.

    Returns:
        The version string reported by ``importlib.metadata``, or
        ``NOT_FOUND_MESSAGE`` when no such distribution is installed.
    """
    try:
        return importlib.metadata.version(dist_name)
    except importlib.metadata.PackageNotFoundError:
        return NOT_FOUND_MESSAGE


libs = [
    'PIL',           # import name of Pillow; not a distribution name, so it is reported as not found
    'transformers',
    'torch',
    'PyMuPDF',       # distribution that provides the `fitz` module
    'tqdm',
    'json',          # json / io / os are standard-library modules: no distribution metadata exists
    'io',
    'os'
]

for lib in libs:
    print(f"{lib}: {package_version(lib)}")

# Libraries whose distribution name differs from their import name.
# These lookups go through the same guarded helper, so the cell no longer
# raises PackageNotFoundError when PyMuPDF or Pillow is not installed yet.
print("\nСпециальные случаи:")
print(f"PyMuPDF (fitz): {package_version('PyMuPDF')}")
print(f"Pillow (PIL): {package_version('Pillow')}")

PIL: Not found or version not available
transformers: 4.47.0
torch: 2.5.1+cu121
PyMuPDF: 1.25.5
tqdm: 4.67.1
json: Not found or version not available
io: Not found or version not available
os: Not found or version not available

Специальные случаи:
PyMuPDF (fitz): 1.25.5
Pillow (PIL): 11.0.0


In [1]:
from PIL import Image as PILImage
from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTImageProcessor
%pip install PyMuPDF
from PIL import Image
import torch
import json
import fitz
import io
import os
import traceback
from tqdm.auto import tqdm



In [2]:
def safe_image_process(img_data: bytes) -> "PILImage.Image | None":
    """Decode raw image bytes into an RGB PIL image without raising.

    Args:
        img_data: Encoded image file contents (as extracted from the PDF).

    Returns:
        A new image converted to RGB mode, or ``None`` when the bytes cannot
        be opened or decoded. The failure reason is printed, not raised, so
        callers must check for ``None``.
    """
    try:
        with io.BytesIO(img_data) as img_buffer:
            # Context-manage the source image so it is released once the RGB
            # copy has been produced.
            with PILImage.open(img_buffer) as img:
                # Force a full decode while the buffer is still open, so that
                # corrupt or truncated data fails here rather than later.
                img.load()
                return img.convert("RGB")
    except Exception as e:
        # Best-effort by design: one bad image must not abort the whole run.
        print(f"Ошибка загрузки изображения: {e}")
        return None

# Checkpoint that supplies the image processor, the tokenizer and the model
model_name = "tuman/vit-rugpt2-image-captioning"

# Run on the GPU when torch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build all three components from the same checkpoint
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)
model = model.to(device)
model.eval()  # put the model into evaluation mode

def recognize_image(image: PILImage.Image) -> str:
    """Generate a caption for an image using the module-level model.

    Args:
        image: Picture to describe.

    Returns:
        The decoded caption with the prompt text removed and surrounding
        whitespace stripped. If any step raises, the error is printed and an
        empty string is returned instead.
    """
    # Prompt requesting a very detailed description in Russian
    task_prefix = "Опиши изображение очень подробно на русском языке:"

    # Generation settings, gathered in one place
    generation_options = dict(
        max_length=60,
        num_beams=5,
        no_repeat_ngram_size=3,
        do_sample=True,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        early_stopping=True,
    )

    try:
        # Preprocess the image and move the pixel tensor to the target device
        processed = feature_extractor(images=image, return_tensors="pt")
        pixel_values = processed.pixel_values.to(device)

        # Tokenize the prompt on the same device
        prompt_encoding = tokenizer(task_prefix, return_tensors="pt").to(device)

        # Generate without tracking gradients
        with torch.no_grad():
            generated = model.generate(
                pixel_values,
                **prompt_encoding,
                **generation_options,
            )

        # Post-processing: decode the first sequence and drop the prompt text
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        return decoded.replace(task_prefix, "").strip()

    except Exception as e:
        print(f"Ошибка: {str(e)}")
        return ""

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224-in21k",
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size"

In [3]:
# Embedded images whose raw payload is smaller than this many bytes are skipped.
# NOTE(review): presumably this filters out degenerate/placeholder images — confirm the threshold.
_MIN_IMAGE_BYTES = 100


def _extract_title_and_body(page) -> tuple:
    """Return ``(title, body)`` for one page from its ``get_text("dict")`` layout."""
    title = ""
    line_texts = []
    for block in page.get_text("dict").get("blocks", []):
        if block.get("type") != 0:  # only type-0 blocks are read, as before (0 = text in PyMuPDF's dict output)
            continue
        for line in block.get("lines", []):
            spans = line.get("spans", [])
            if not spans:
                continue
            if not title:
                # Title = text of the largest-font span on the first line that
                # yields a non-blank candidate.
                largest = max(spans, key=lambda s: s.get("size", 0))
                title = largest.get("text", "").strip()
            # Keep every span of the line. Previously only the largest span was
            # appended, which silently dropped the rest of any line containing
            # mixed formatting (bold/italic fragments, font changes).
            line_texts.append("".join(s.get("text", "") for s in spans))

    full_text = " ".join(line_texts).strip()
    # Remove the first occurrence of the title so it is not repeated in the body.
    body = full_text.replace(title, "", 1).strip()
    return title, body


def _describe_page_images(doc, page, page_number: int) -> list:
    """Return captions for the images on one page; a failing image is logged and skipped."""
    descriptions = []
    for img_info in page.get_images(full=True):
        # Bound before the try so the error message below can always refer to
        # it, even when reading the reference itself is what failed.
        xref = None
        try:
            xref = img_info[0]
            base_image = doc.extract_image(xref)
            if not base_image or "image" not in base_image:
                continue

            img_data = base_image.get("image")
            if not img_data or len(img_data) < _MIN_IMAGE_BYTES:
                continue

            img = safe_image_process(img_data)
            if img is None:
                continue

            description = recognize_image(img)
            if description:
                descriptions.append(description)
        except Exception:
            print(f"Ошибка обработки изображения {xref} на странице {page_number}:")
            print(traceback.format_exc())
    return descriptions


def extract_text(pres_path: str) -> None:
    """Extract per-slide text and image captions from a PDF and save them as JSON.

    For every page, this records a title, the remaining body text, and a list
    of captions for the page's embedded images. The result is written to
    ``<pdf base name>.json`` in the current working directory, with the keys
    ``amount_slides``, ``titles``, ``bodies`` and ``text_images``.

    Errors are reported with ``print`` rather than raised: a failure on one
    page or image is logged and processing continues. If the document cannot
    be opened or iterated, the JSON file is still written with whatever was
    collected up to that point.

    Args:
        pres_path: Path to the PDF file to process.
    """
    pdf_filename = os.path.basename(pres_path)
    json_filename = os.path.splitext(pdf_filename)[0] + ".json"
    output = {
        "amount_slides": 0,
        "titles": [],
        "bodies": [],
        "text_images": []
    }

    doc = None
    try:
        doc = fitz.open(pres_path)
        output["amount_slides"] = len(doc)

        for page_index, page in tqdm(enumerate(doc),
                                     total=len(doc),
                                     desc="Обработка слайдов"):
            # Text extraction
            try:
                title, body = _extract_title_and_body(page)
            except Exception as e:
                print(f"Ошибка извлечения текста на странице {page_index+1}: {str(e)}")
                title, body = "", ""

            output["titles"].append(title)
            output["bodies"].append(body)

            # Image captioning
            try:
                slide_images_text = _describe_page_images(doc, page, page_index + 1)
            except Exception as e:
                print(f"Ошибка получения изображений на странице {page_index+1}: {str(e)}")
                slide_images_text = []

            output["text_images"].append(slide_images_text)

    except Exception as e:
        print(f"Критическая ошибка обработки файла: {str(e)}")
    finally:
        # Close the document on both the success and the failure path.
        if doc is not None:
            doc.close()

    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

In [4]:
# Run the extraction on the sample presentation. extract_text names its output
# after the PDF's base name, so this writes "RNN.json" to the current working directory.
# NOTE(review): "/content/..." looks like a Colab-specific absolute path — adjust when running elsewhere.
extract_text('/content/RNN.pdf')

Обработка слайдов:   0%|          | 0/21 [00:00<?, ?it/s]