In [133]:
import fitz  # PyMuPDF
import io
from PIL import Image
import os
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTImage, LTFigure,LTTextBox
import pdfplumber
import PyPDF2

In [52]:
def extract_images_from_pdf(pdf_path, output_folder):
    """Write every image listed on each page of a PDF into ``output_folder``.

    Files are named ``page<P>_img<K>.<ext>`` where ``P`` and ``K`` are 1-based
    and ``<ext>`` is the format PyMuPDF reports for the extracted bytes.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Destination directory; created when it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    pdf = fitz.open(pdf_path)
    try:
        for page_num in range(pdf.page_count):
            page = pdf.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = pdf.extract_image(xref)
                if not base_image:
                    # Nothing could be extracted for this entry; skip it.
                    continue
                # The bytes are in the image's embedded format (often JPEG),
                # so use the reported extension instead of forcing ".png".
                ext = base_image.get("ext") or "png"
                image_filename = os.path.join(
                    output_folder, f"page{page_num + 1}_img{img_index + 1}.{ext}"
                )
                with open(image_filename, "wb") as image_file:
                    image_file.write(base_image["image"])
    finally:
        # Release the document even when extraction or writing fails.
        pdf.close()

def extract_text_from_pdf(pdf_path, output_folder):
    """Dump the text of each page to ``page_<N>.txt`` and return the full text.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Destination directory; created when it does not exist.

    Returns:
        str: The text of all pages concatenated in page order (an empty string
        for a document without pages). For a single-page PDF this is exactly
        that page's text.
    """
    os.makedirs(output_folder, exist_ok=True)

    page_texts = []
    pdf = fitz.open(pdf_path)
    try:
        for page_num in range(pdf.page_count):
            # Load the page and pull its plain text.
            text = pdf.load_page(page_num).get_text()

            # Persist the page text to its own file.
            text_filename = os.path.join(output_folder, f"page_{page_num + 1}.txt")
            with open(text_filename, "w", encoding="utf-8") as text_file:
                text_file.write(text)

            page_texts.append(text)
    finally:
        # Release the document even when extraction or writing fails.
        pdf.close()

    # Previously only the last page's text was returned (and the name was
    # unbound when the loop never ran); return the whole document instead.
    return "".join(page_texts)

def extract_abstract_from_pdf(pdf_path, output_folder=None):
    """Collect the "abstract" section found on each page of a PDF.

    On every page the text is searched case-insensitively for the substring
    "abstract"; when present, everything from that position up to the next
    "introduction" (or the end of the page text) is collected.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Optional directory in which to write ``abstract.txt``;
            created when it does not exist. When ``None`` nothing is written.
            (The function used to read an undefined ``output_folder`` name,
            which raised ``NameError`` unless a global happened to exist.)

    Returns:
        list[str]: One stripped excerpt per page that contained "abstract",
        in page order.
    """
    abstract_texts = []

    pdf = fitz.open(pdf_path)
    try:
        for page_num in range(pdf.page_count):
            text = pdf.load_page(page_num).get_text("text")
            lowered = text.lower()

            # Locate the 'abstract' keyword on this page.
            start_idx = lowered.find("abstract")
            if start_idx == -1:
                continue

            # The excerpt ends at the next 'introduction', or at the end of
            # the page text when that keyword is absent.
            end_idx = lowered.find("introduction", start_idx)
            if end_idx == -1:
                end_idx = len(text)

            abstract_texts.append(text[start_idx:end_idx].strip())
    finally:
        # Release the document even when extraction fails.
        pdf.close()

    if output_folder is not None and abstract_texts:
        os.makedirs(output_folder, exist_ok=True)
        text_filename = os.path.join(output_folder, "abstract.txt")
        # NOTE(review): as before, when several pages match, the file ends up
        # holding the LAST excerpt -- confirm whether the first is wanted.
        with open(text_filename, "w", encoding="utf-8") as text_file:
            text_file.write(abstract_texts[-1])

    return abstract_texts



In [129]:
def extract_images_and_text_below(pdf_path, output_folder):
    """Save each image pdfminer finds, plus the text inside its bounding box.

    pdfminer locates ``LTImage`` items nested directly in ``LTFigure``
    elements; PyMuPDF then extracts the matching image bytes and the text
    clipped to the same rectangle.

    Outputs, with ``P`` the 1-based page number and ``K`` a 1-based counter
    running over the whole document:
        ``page<P>_img<K>.<ext>``    -- image bytes in their embedded format
        ``page<P>_img<K>_text.txt`` -- text returned for the image rectangle

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Destination directory; created when it does not exist.
    """

    def overlap_area(a, b):
        """Area shared by two rectangles (0.0 when they do not intersect)."""
        shared_w = min(a.x1, b.x1) - max(a.x0, b.x0)
        shared_h = min(a.y1, b.y1) - max(a.y0, b.y0)
        return shared_w * shared_h if shared_w > 0 and shared_h > 0 else 0.0

    os.makedirs(output_folder, exist_ok=True)

    # Pass 1 (pdfminer): collect image boxes as fractions of the page size,
    # remembering each image's position among the images of its page.
    image_rects = []
    for page_num, page_layout in enumerate(extract_pages(pdf_path)):
        width, height = page_layout.width, page_layout.height
        ordinal = 0
        for element in page_layout:
            if not isinstance(element, LTFigure):
                continue
            for item in element:
                if isinstance(item, LTImage):
                    x0, y0, x1, y1 = item.bbox
                    bbox_percentage = (x0 / width, y0 / height, x1 / width, y1 / height)
                    image_rects.append((page_num, ordinal, bbox_percentage))
                    ordinal += 1

    # Pass 2 (PyMuPDF): extract the image bytes and the text for every box.
    pdf = fitz.open(pdf_path)
    try:
        per_page = {}  # page_num -> (placed image rects with xrefs, image list)
        for i, (page_num, ordinal, bbox_percentage) in enumerate(image_rects):
            page = pdf[page_num]
            if page_num not in per_page:
                placed_images = [
                    (fitz.Rect(info["bbox"]), info["xref"])
                    for info in page.get_image_info(xrefs=True)
                    if info["xref"] > 0
                ]
                per_page[page_num] = (placed_images, page.get_images(full=True))
            placed, listed = per_page[page_num]

            px0, py0, px1, py1 = bbox_percentage
            width, height = page.rect.width, page.rect.height

            # Scale the fractions to this page and flip the y axis: the boxes
            # were measured from the bottom edge, the rectangle from the top.
            rect = fitz.Rect(px0 * width, (1 - py1) * height, px1 * width, (1 - py0) * height)

            # Pick the image drawn where pdfminer saw one, instead of always
            # taking the first image of the page (which also raised IndexError
            # on pages whose image list was empty).
            best_area, xref = max(
                ((overlap_area(rect, placed_rect), placed_xref) for placed_rect, placed_xref in placed),
                default=(0.0, None),
            )
            if best_area <= 0:
                # No geometric match: fall back to the image at the same
                # position in the page's image list.
                # NOTE(review): assumes both libraries list images in the same
                # order -- confirm.
                xref = listed[ordinal][0] if ordinal < len(listed) else None

            img_info = pdf.extract_image(xref) if xref else None
            if img_info:
                # Name the file after the real format of the extracted bytes.
                ext = img_info.get("ext") or "png"
                img_filepath = os.path.join(output_folder, f"page{page_num + 1}_img{i + 1}.{ext}")
                with open(img_filepath, "wb") as f:
                    f.write(img_info["image"])

            # NOTE(review): the clip is the image's own rectangle, so this is
            # the text inside the image area, not the text below it as the
            # function name suggests -- confirm the intended region.
            text_below = page.get_textbox(rect)
            text_filepath = os.path.join(output_folder, f"page{page_num + 1}_img{i + 1}_text.txt")
            with open(text_filepath, "w", encoding="utf-8") as f:
                f.write(text_below)
    finally:
        # Release the document even when extraction or writing fails.
        pdf.close()

In [167]:
# NOTE: machine-specific absolute path; point this at your own PDF.
# Raw string: the previous literal relied on the invalid escape "\p" being
# kept verbatim, which newer Python versions warn about. The value is unchanged.
pdf_path = r"C:\Users\shifu\bio\paper\file.pdf"
output_folder = "extracted_elements"
# NOTE: extract_images_and_text_below is defined both above and below this
# cell; when run top to bottom, this call uses the definition above.
extract_images_and_text_below(pdf_path, output_folder)

In [166]:
def extract_images_and_text_below(pdf_path, output_folder):
    """Save each image of a PDF together with the nearest text block below it.

    pdfminer supplies the layout (``LTImage`` items nested directly in
    ``LTFigure`` elements, and ``LTTextBox`` elements); PyMuPDF supplies the
    image bytes.

    Outputs, with ``P`` the 1-based page number and ``K`` the 1-based index of
    the image on that page:
        ``page<P>_img<K>.<ext>``            -- image bytes in embedded format
        ``page<P>_text_below_img<K>.txt``   -- nearest text block below, if any

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Destination directory; created when it does not exist.
    """

    def overlap_area(a, b):
        """Area shared by two rectangles (0.0 when they do not intersect)."""
        shared_w = min(a.x1, b.x1) - max(a.x0, b.x0)
        shared_h = min(a.y1, b.y1) - max(a.y0, b.y0)
        return shared_w * shared_h if shared_w > 0 and shared_h > 0 else 0.0

    os.makedirs(output_folder, exist_ok=True)

    pdf = fitz.open(pdf_path)
    try:
        for page_num, page_layout in enumerate(extract_pages(pdf_path)):
            images = []
            text_blocks = []

            # Collect the positions of images and text boxes on this page.
            for element in page_layout:
                if isinstance(element, LTFigure):
                    for item in element:
                        if isinstance(item, LTImage):
                            images.append(item.bbox)
                elif isinstance(element, LTTextBox):
                    text_blocks.append((element.bbox, element.get_text()))

            if not images:
                continue

            page = pdf[page_num]
            listed = page.get_images(full=True)
            placed = [
                (fitz.Rect(info["bbox"]), info["xref"])
                for info in page.get_image_info(xrefs=True)
                if info["xref"] > 0
            ]
            # Factors mapping layout coordinates onto the PyMuPDF page size.
            scale_x = page.rect.width / page_layout.width
            scale_y = page.rect.height / page_layout.height

            for i, img_bbox in enumerate(images):
                img_x0, img_y0, img_x1, img_y1 = img_bbox

                # Text boxes lying entirely below the image (layout y grows
                # upwards, so "below" means top edge <= image bottom edge).
                # Score = vertical gap plus horizontal offset of left edges.
                candidates = [
                    ((img_y0 - text_bbox[3]) + abs(img_x0 - text_bbox[0]), text_content)
                    for text_bbox, text_content in text_blocks
                    if text_bbox[3] <= img_y0
                ]
                # min() keeps the first of equally scored candidates.
                nearest = min(candidates, key=lambda scored: scored[0], default=None)
                nearest_text = nearest[1] if nearest else None

                if nearest_text:
                    text_filepath = os.path.join(
                        output_folder, f"page{page_num + 1}_text_below_img{i + 1}.txt"
                    )
                    with open(text_filepath, "w", encoding="utf-8") as f:
                        f.write(nearest_text)

                # Convert the layout box (measured from the bottom edge) into
                # a PyMuPDF rectangle (measured from the top edge).
                rect = fitz.Rect(
                    img_x0 * scale_x,
                    (page_layout.height - img_y1) * scale_y,
                    img_x1 * scale_x,
                    (page_layout.height - img_y0) * scale_y,
                )

                # Pick the image drawn at that spot instead of always saving
                # the first image of the page for every detected image.
                best_area, xref = max(
                    ((overlap_area(rect, placed_rect), placed_xref) for placed_rect, placed_xref in placed),
                    default=(0.0, None),
                )
                if best_area <= 0:
                    # No geometric match: fall back to the image at the same
                    # position in the page's image list.
                    # NOTE(review): assumes both libraries list images in the
                    # same order -- confirm.
                    xref = listed[i][0] if i < len(listed) else None

                img_info = pdf.extract_image(xref) if xref else None
                if img_info:
                    # Name the file after the real format of the bytes.
                    ext = img_info.get("ext") or "png"
                    img_filepath = os.path.join(output_folder, f"page{page_num + 1}_img{i + 1}.{ext}")
                    with open(img_filepath, "wb") as f:
                        f.write(img_info["image"])
    finally:
        # Release the document even when extraction or writing fails.
        pdf.close()