In [12]:
import PyPDF2
import re


def extract_text_from_pdf(pdf_path, output_md_path, bottom=80, top=580):
    """Extract the body text of a PDF and write it to a UTF-8 text file.

    Every page is read with PyPDF2's ``extract_text`` using a visitor that
    keeps only the text fragments whose ``tm[5]`` value (the vertical
    component of the text matrix passed to the visitor) lies strictly
    between ``bottom`` and ``top``. Fragments outside that band, such as
    running headers and footers, are dropped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_md_path: Path of the file to write. An existing file is
            overwritten; a missing parent directory is created.
        bottom: Exclusive lower bound for a fragment's ``tm[5]`` value.
        top: Exclusive upper bound for a fragment's ``tm[5]`` value.

    Returns:
        The extracted text that was written to ``output_md_path``: the kept
        fragments of each page concatenated, each page followed by a newline.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Create the output directory up front so a missing folder does not raise
    # only after the whole PDF has already been parsed.
    out_dir = os.path.dirname(output_md_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    page_texts = []
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            parts = []

            def visitor_body(text, cm, tm, fontDict, fontSize, parts=parts):
                # `parts` is bound as a default so each visitor writes to the
                # list of the page it was created for.
                y = tm[5]
                if bottom < y < top:
                    parts.append(text)

            page.extract_text(visitor_text=visitor_body)
            page_texts.append("".join(parts))

    # One trailing newline per page, exactly as the page-by-page
    # concatenation produced before.
    extracted = "".join(page_text + "\n" for page_text in page_texts)

    with open(output_md_path, "w", encoding="utf-8") as md_file:
        md_file.write(extracted)

    return extracted

def add_blank_lines_to_paragraphs(file_path):
    """Return the content of a UTF-8 text file with paragraph breaks widened.

    A paragraph end is a period immediately followed by one or more
    newline / carriage-return characters. Each such run is kept unchanged
    and one extra ``"\\n"`` is appended after it. The file itself is only
    read, never modified.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The transformed text as a string.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    def _append_newline(hit):
        # Keep the matched "period + line break(s)" and add one more newline.
        return hit.group(0) + '\n'

    return re.sub(r'\.[\n\r]+', _append_newline, raw_text)

def identify_headings(text):
    """Prefix heading-like lines of ``text`` with the markdown marker ``## ``.

    The text is split on ``"\\n"`` and a non-blank line is treated as a
    heading when any of these rules holds:

    * the raw line starts with a section number such as ``1.``, ``1.2`` or
      ``1.2.3`` followed by whitespace, its stripped form is shorter than
      40 characters and does not end with a period;
    * its stripped form is all upper case and shorter than 30 characters;
    * its stripped form starts with ``Chapter`` and is shorter than 40
      characters.

    Heading lines are emitted as ``"## "`` plus the original (unstripped)
    line; every other line is passed through unchanged.

    Args:
        text: The text to scan.

    Returns:
        The text with heading lines marked, joined back with ``"\\n"``.
    """
    numbered_prefix = re.compile(r'^\d+(\.\d+)*\.?\s')

    def _is_heading(raw_line):
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines are never headings.
            return False
        # Numbered section title; the pattern is tested on the raw line.
        if (numbered_prefix.match(raw_line)
                and len(stripped) < 40
                and not stripped.endswith('.')):
            return True
        # Short line written entirely in capitals.
        if stripped.isupper() and len(stripped) < 30:
            return True
        # Short line starting with "Chapter".
        return stripped.startswith("Chapter") and len(stripped) < 40

    marked = [
        f"## {raw_line}" if _is_heading(raw_line) else raw_line
        for raw_line in text.split('\n')
    ]
    return '\n'.join(marked)

# Source PDF and the .qmd file that receives the converted text.
input_pdf = "documents/Probably Overthinking It.pdf"
output_md = "extracted/Probably Overthinking It2.qmd"

# Pass 1: write the text found between the bottom/top bounds to output_md.
extract_text_from_pdf(pdf_path=input_pdf, output_md_path=output_md, bottom=70, top=580)

# Pass 2: re-read that file, add a blank line after each paragraph end,
# then mark heading-like lines with a markdown "## " prefix.
formatted_content = identify_headings(add_blank_lines_to_paragraphs(output_md))

# Overwrite the raw extraction with the formatted result.
with open(output_md, mode='w', encoding='utf-8') as file:
    file.write(formatted_content)

## Header와 footer의 위치 가늠하기

In [None]:
from PyPDF2 import PdfReader

# Open the sample PDF and select the page at index 3 to probe.
reader = PdfReader("documents/conte.pdf")
page = reader.pages[3]

# Collects the text fragments accepted by visitor_body during extraction.
parts = []


def visitor_body(text, cm, tm, fontDict, fontSize, bottom=80, top=580):
    """Text visitor that keeps fragments located between ``bottom`` and ``top``.

    The fragment ``text`` is appended to the module-level ``parts`` list when
    ``tm[5]`` is strictly greater than ``bottom`` and strictly less than
    ``top``. ``cm``, ``fontDict`` and ``fontSize`` are accepted but not used.
    """
    if bottom < tm[5] < top:
        parts.append(text)


# Run the extraction with the filtering visitor. The return value is ignored;
# only the fragments gathered in `parts` are used below.
page.extract_text(visitor_text=visitor_body)
text_body = "".join(parts)

# Show the filtered text so the bottom/top bounds can be judged by eye.
print(text_body)

humanintelligence,inalldirections,andprogress"™*"throughalltimes,thediscoveryarisesofa°*greatfundamentallaw,towhichitisnecessarilysubject,andwhichhasasolidfoundationofpi'oof,bothinthefactsofourorganizationandinourhistoricalexperience.Thelawisthis:—thateachofourleadingconceptions,
I.B



In [7]:
from PyPDF2 import PdfReader
import svgwrite

# Open the sample PDF and select the page at index 3.
reader = PdfReader("documents/conte.pdf")
page = reader.pages[3]

# Drawing that the visitor functions below add shapes and text to;
# constructed with the file name "conte.svg".
dwg = svgwrite.Drawing("conte.svg", profile="tiny")


def visitor_svg_rect(op, args, cm, tm):
    """Operand visitor that mirrors PDF ``re`` operations into the drawing.

    When ``op`` equals ``b"re"``, the first four entries of ``args`` are
    converted with ``as_numeric()`` and used as x, y, width and height of a
    red-stroked rectangle added to the module-level ``dwg``. Any other
    operator is ignored. ``cm`` and ``tm`` are accepted but not used.
    """
    if op != b"re":
        return
    x, y, w, h = [args[i].as_numeric() for i in range(4)]
    outline = dwg.rect((x, y), (w, h), stroke="red", fill_opacity=0.05)
    dwg.add(outline)


def visitor_svg_text(text, cm, tm, fontDict, fontSize):
    """Text visitor that places each text fragment into the drawing.

    The fragment is added to the module-level ``dwg`` as a blue text element
    inserted at ``(tm[4], tm[5])``. ``cm``, ``fontDict`` and ``fontSize`` are
    accepted but not used.
    """
    label = dwg.text(text, insert=(tm[4], tm[5]), fill="blue")
    dwg.add(label)


# Walk the page once: visitor_svg_rect is passed as the operand-before
# visitor and visitor_svg_text as the text visitor, so both add to `dwg`.
# The returned text is not used.
page.extract_text(
    visitor_operand_before=visitor_svg_rect, visitor_text=visitor_svg_text
)
# Save the drawing that was populated by the visitors.
dwg.save()