In [22]:
import fitz  # PyMuPDF
import pdfplumber
import re

def extract_text_with_equations(pdf_path):
    """
    Extract text and equations from a PDF and return them as Markdown.

    Each page is read twice: PyMuPDF supplies the plain page text and
    pdfplumber supplies per-word font name and size. Both are handed to
    ``process_page``; its result is emitted under a ``# Page N`` heading.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One Markdown string with the pages separated by blank lines.
    """
    md_content = []

    # Open both readers as context managers so the PyMuPDF document is closed
    # as well (it was previously left open).
    with fitz.open(pdf_path) as doc, pdfplumber.open(pdf_path) as pdf:
        for page_num, (fitz_page, plumber_page) in enumerate(zip(doc, pdf.pages), 1):
            # Plain text from PyMuPDF.
            text = fitz_page.get_text()

            # Word-level layout details from pdfplumber.
            words = plumber_page.extract_words(extra_attrs=["fontname", "size"])

            # Detect equations and convert to Markdown.
            processed_text = process_page(text, words)

            md_content.append(f"# Page {page_num}\n\n{processed_text}")

    return "\n\n".join(md_content)

def process_page(text, words):
    """
    Wrap math-looking words in ``$...$`` and join the page into one string.

    A word counts as math when its font name contains "Math", when its font
    size is above 12, or when its text matches one of the regex heuristics
    below. ``text`` is accepted but not used by this function.

    Args:
        text: Plain page text (unused).
        words: Iterable of dicts with 'text', 'fontname' and 'size' keys.

    Returns:
        The words joined by single spaces, math words wrapped in dollars.
    """
    math_patterns = (
        r'\\[a-zA-Z]+',     # LaTeX command
        r'[\^\_]',          # superscript / subscript marker
        r'[\{\}]',          # grouping braces
        r'[\d\w]+\(.*?\)',  # function-call-like expression
    )

    def looks_like_math(entry):
        # Font-based check first, regex heuristics as the fallback.
        if "Math" in entry['fontname'] or entry['size'] > 12:
            return True
        return any(re.search(regex, entry['text']) for regex in math_patterns)

    return " ".join(
        f"${entry['text']}$" if looks_like_math(entry) else entry['text']
        for entry in words
    )

def pdf_to_markdown(pdf_path, output_md):
    """Convert the PDF at ``pdf_path`` to Markdown and write it to ``output_md`` as UTF-8."""
    markdown = extract_text_with_equations(pdf_path)
    with open(output_md, "w", encoding="utf-8") as out:
        out.write(markdown)
    print(f"Markdown 파일이 생성되었습니다: {output_md}")

In [24]:
# Usage example: convert one PDF and write the result to a .qmd file.
pdf_path = "documents/conte.pdf"
output_md = "extracted/conte3.qmd"

pdf_to_markdown(pdf_path, output_md)


Markdown 파일이 생성되었습니다: extracted/conte3.qmd


In [None]:
# 

In [18]:
import pdfplumber
import re
from collections import defaultdict

def extract_text_with_equations(pdf_path):
    """
    Extract page text from a PDF as Markdown, dropping header/footer bands.

    - Words whose top lies in the upper 10% of the page, or whose bottom lies
      in the lower 10%, are treated as header/footer and discarded.
    - Words made only of digits are discarded as page numbers.
    - The first-page words set in the largest font size become a level-1
      title placed before the page text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The Markdown text, or an empty string for a PDF without pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A document without pages has no first page to take a title from.
        if not pdf.pages:
            return ""

        md_content = []

        # Title: every first-page word that uses the largest font size, in
        # extraction order, so multi-word titles are kept whole.
        first_page_words = pdf.pages[0].extract_words(extra_attrs=["fontname", "size"])
        if first_page_words:
            largest_size = max(word['size'] for word in first_page_words)
            title = " ".join(
                word['text'] for word in first_page_words if word['size'] == largest_size
            )
            if title:
                md_content.append(f"# {title}\n\n")

        for page in pdf.pages:
            # Header/footer bands, relative to the page height.
            header_margin = page.height * 0.1  # top 10%
            footer_margin = page.height * 0.9  # bottom 10%

            # Words with position and font information.
            words = page.extract_words(
                extra_attrs=["fontname", "size"],
                keep_blank_chars=True
            )

            # Average font size over every word on the page (header/footer
            # words included); process_page uses it as the baseline.
            font_sizes = [word['size'] for word in words]
            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0

            # Keep body words only. Header/footer words are dropped here, so a
            # page is never skipped just because its header/footer repeats the
            # previous page's (that used to discard the page's body as well).
            # NOTE(review): the digits-only filter also removes standalone
            # numbers inside body text, not just page numbers.
            filtered_words = [
                word for word in words
                if word['top'] >= header_margin
                and word['bottom'] <= footer_margin
                and not re.fullmatch(r'^\d+$', word['text'])
            ]

            md_content.append(process_page(filtered_words, avg_font_size))

    return "\n\n".join(md_content)

def process_page(words, avg_font_size):
    """
    Convert a page's word list to Markdown text with inline math.

    A word is treated as part of an equation when its font name contains
    "Math", when its size is below 90% of ``avg_font_size``, or when its text
    contains a LaTeX command (backslash + 2 or more letters) or one of
    ``^ _ { }``. Consecutive equation words are merged into one ``$...$`` span.

    Args:
        words: Iterable of dicts with 'text', 'fontname' and 'size' keys.
        avg_font_size: Baseline font size for the page.

    Returns:
        The page text. Whitespace before ``. , ; : ! ?`` is removed and any
        run of two or more whitespace characters becomes a paragraph break.
    """
    processed = []
    equation_buffer = []

    def flush_equation():
        # Emit the buffered words as a single inline-math span. The dollars
        # must touch the expression: Pandoc/Quarto does not treat "$ x $" as
        # math, and the padding spaces used before were also turned into
        # paragraph breaks by the \s{2,} rule below.
        expression = ' '.join(equation_buffer).strip()
        if expression:
            processed.append(f"${expression}$")
        equation_buffer.clear()

    for word in words:
        text = word['text']

        # Equation heuristics: math font, small font, or LaTeX-like tokens.
        is_equation = (
            ("Math" in word['fontname']) or
            (word['size'] < avg_font_size * 0.9) or
            (re.search(r'\\[a-zA-Z]{2,}|[\^_\{\}]', text)))

        if is_equation:
            equation_buffer.append(text)
        else:
            flush_equation()
            processed.append(text)

    # Equation words left over at the end of the page.
    flush_equation()

    text = ' '.join(processed)
    text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # no space before punctuation
    text = re.sub(r'\s{2,}', '\n\n', text)       # whitespace runs -> paragraph break

    return text

def pdf_to_markdown(pdf_path, output_md):
    """Extract Markdown from ``pdf_path`` and write it to ``output_md`` as UTF-8."""
    markdown = extract_text_with_equations(pdf_path)
    with open(output_md, "w", encoding="utf-8") as out:
        out.write(markdown)
    print(f"생성된 파일: {output_md}")

In [19]:
# Convert the sample PDF with the pdf_to_markdown defined in the cell above.
input_pdf = "documents/conte.pdf"
output_md = "extracted/conte2.qmd"

pdf_to_markdown(input_pdf, output_md)

생성된 파일: extracted/conte2.qmd


In [20]:
import PyPDF2
import re

def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page, each page followed by a newline."""
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Joined while the file is still open; pages are read lazily.
        return "".join(page.extract_text() + "\n" for page in pdf_reader.pages)

def remove_headers_and_footers(text):
    """
    Drop lines that look like page headers or footers.

    A line is removed when it holds only a number, only ``Page <number>``
    (case-insensitive), or contains "©" or "Copyright". The remaining lines
    are re-joined with newlines.
    """
    def is_boilerplate(line):
        return bool(
            re.match(r"^\s*\d+\s*$", line)
            or re.match(r"^\s*Page\s+\d+\s*$", line, re.IGNORECASE)
            or "©" in line
            or "Copyright" in line
        )

    return "\n".join(
        line for line in text.splitlines() if not is_boilerplate(line)
    )

def add_markdown_headings(text):
    """
    Turn common paper section titles into Markdown level-1 headings.

    A section name standing alone on a line (any letter case, preceded by a
    newline) is replaced by ``# Name`` in title case.
    """
    # Common section names; extend or change as needed.
    section_names = (
        "ABSTRACT",
        "INTRODUCTION",
        "METHODS",
        "RESULTS",
        "DISCUSSION",
        "CONCLUSION",
        "REFERENCES",
    )

    # One substitution pass per name, in list order.
    for name in section_names:
        text = re.sub(
            r"\n\s*(" + name + r")\s*\n",
            lambda found: "\n# " + found.group(1).strip().title() + "\n",
            text,
            flags=re.IGNORECASE,
        )
    return text

def save_to_markdown(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(output_path, mode="w", encoding="utf-8") as handle:
        handle.write(text)

def main(pdf_path, output_path):
    """Run the pipeline: extract text, strip headers/footers, add headings, save."""
    markdown = add_markdown_headings(
        remove_headers_and_footers(extract_text_from_pdf(pdf_path))
    )
    save_to_markdown(markdown, output_path)
    print(f"Markdown file saved to: {output_path}")

In [21]:
# NOTE: this first input/output pair is overwritten by the second pair below
# before main() runs, so only conte.pdf is converted.
input_pdf = "documents/Counterfactuals and Causal Inference.pdf"
output_md = "extracted/Counterfactuals and Causal Inference4.qmd"

input_pdf = "documents/conte.pdf"
output_md = "extracted/conte2.qmd"

main(input_pdf, output_md)

Markdown file saved to: extracted/conte2.qmd


# Pick 2

In [12]:
import PyPDF2
import re

def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page, each page followed by a newline."""
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Joined while the file is still open; pages are read lazily.
        return "".join(page.extract_text() + "\n" for page in pdf_reader.pages)

def remove_headers_and_footers(text):
    """
    Drop lines that look like page headers or footers.

    A line is removed when it holds only a number, only ``Page <number>``
    (case-insensitive), or contains "©" or "Copyright". The remaining lines
    are re-joined with newlines.
    """
    def is_boilerplate(line):
        return bool(
            re.match(r"^\s*\d+\s*$", line)
            or re.match(r"^\s*Page\s+\d+\s*$", line, re.IGNORECASE)
            or "©" in line
            or "Copyright" in line
        )

    return "\n".join(
        line for line in text.splitlines() if not is_boilerplate(line)
    )

def add_markdown_headings(text):
    """
    Convert section titles into Markdown headings.

    - A common section name standing alone on a line (any letter case,
      preceded by a newline) becomes ``# Name`` in title case.
    - A line of the form ``<number>. <Letter>...`` (e.g. ``1. Introduction``)
      that is preceded and followed by a newline becomes ``## <line>``.

    Args:
        text: Text to scan.

    Returns:
        The text with headings inserted.
    """
    # Common section names; extend or change as needed.
    sections = ["ABSTRACT", "INTRODUCTION", "METHODS", "RESULTS", "DISCUSSION", "CONCLUSION", "REFERENCES"]

    def heading_repl(match):
        section = match.group(1).strip().title()
        return "\n# " + section + "\n"

    for section in sections:
        pattern = r"\n\s*(" + section + r")\s*\n"
        text = re.sub(pattern, heading_repl, text, flags=re.IGNORECASE)

    # Numbered headings (e.g. "1. Introduction", "2. Methods"). The trailing
    # newline is only looked at, not consumed, so it can still serve as the
    # leading newline of the next line; otherwise the second of two
    # consecutive numbered lines was silently skipped.
    text = re.sub(r"\n(\d+\.\s+[A-Za-z].*)(?=\n)", r"\n## \1", text)

    return text

def save_to_markdown(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(output_path, mode="w", encoding="utf-8") as handle:
        handle.write(text)

def main(pdf_path, output_path):
    """Run the pipeline: extract text, strip headers/footers, add headings, save."""
    markdown = add_markdown_headings(
        remove_headers_and_footers(extract_text_from_pdf(pdf_path))
    )
    save_to_markdown(markdown, output_path)
    print(f"Markdown file saved to: {output_path}")

In [13]:
# Run the PyPDF2-based pipeline defined in the cell above on the sample PDF.
input_pdf = "documents/conte.pdf"
output_md = "extracted/conte2.qmd"
main(input_pdf, output_md)

Markdown file saved to: extracted/conte2.qmd


# Pick 1

In [9]:
import pdfplumber
import re
from collections import defaultdict

def pdf_to_markdown(
    pdf_path,
    output_md_path,
    header_threshold=0.12,
    footer_threshold=0.12,
    heading_scale=1.3,
    min_heading_length=5
):
    """
    Convert a PDF to Markdown, promoting large-font lines to headings.

    The most frequent character size in the document is taken as the body
    font size. A text line becomes a heading when its average character size
    exceeds ``body size * heading_scale`` and its text is at least
    ``min_heading_length`` characters long; it is level 1 above 1.8x the body
    size and level 2 otherwise.

    Args:
        pdf_path: Path of the PDF file to read.
        output_md_path: Path of the Markdown file to write (UTF-8).
        header_threshold: Fraction of the page height, measured from the top,
            in which lines are dropped as header.
        footer_threshold: Fraction of the page height, measured from the
            bottom, in which lines are dropped as footer.
        heading_scale: Size factor over the body font for heading detection.
        min_heading_length: Minimum text length for a heading.
    """
    markdown_lines = []
    previous_line_was_heading = False

    # The file is opened once and used for both passes.
    with pdfplumber.open(pdf_path) as pdf:
        # Pass 1: count character sizes to find the body font size.
        font_size_counts = defaultdict(int)
        for page in pdf.pages:
            for char in page.chars:
                font_size_counts[char['size']] += 1

        # A PDF without characters has no body size; fall back to 0 instead of
        # letting max() fail on an empty mapping.
        body_font_size = (
            max(font_size_counts, key=font_size_counts.get)
            if font_size_counts else 0
        )

        # Pass 2: convert the text lines page by page.
        for page in pdf.pages:
            header_bound = page.height * header_threshold
            footer_bound = page.height * (1 - footer_threshold)

            for line in page.extract_text_lines():
                text = line['text'].strip()
                top = line['top']

                # Drop header/footer lines.
                if top < header_bound or top > footer_bound:
                    continue

                # Skip empty lines.
                if not text:
                    continue

                # Heading detection from the line's average character size; a
                # line without character data is never a heading.
                line_font_sizes = [char['size'] for char in line['chars']]
                avg_font_size = (
                    sum(line_font_sizes) / len(line_font_sizes)
                    if line_font_sizes else 0
                )
                is_heading = (
                    avg_font_size > body_font_size * heading_scale
                    and len(text) >= min_heading_length
                )

                # Collapse whitespace runs to single spaces. (A former
                # `-\n` substitution after this step could never match,
                # because no newline is left, and was removed; lines ending
                # in a hyphen are still written as they are.)
                text = re.sub(r'\s+', ' ', text)

                # Emit Markdown: headings get a blank line after them, and so
                # does the first body line that follows a heading.
                if is_heading:
                    heading_level = 1 if avg_font_size > body_font_size * 1.8 else 2
                    markdown_line = f"{'#' * heading_level} {text}\n\n"
                    previous_line_was_heading = True
                elif previous_line_was_heading:
                    markdown_line = f"{text}\n\n"
                    previous_line_was_heading = False
                else:
                    markdown_line = f"{text}\n"

                markdown_lines.append(markdown_line)

    # Save the result.
    with open(output_md_path, 'w', encoding='utf-8') as md_file:
        md_file.writelines(markdown_lines)


def add_blank_lines_to_paragraphs(file_path):
    """
    Read ``file_path`` (UTF-8) and return its text with an extra newline
    appended after every period that is directly followed by line breaks.

    The file itself is not modified.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        content = source.read()

    # A period followed by one or more line breaks marks a paragraph end;
    # keep the matched text and append one more newline.
    return re.sub(r'(\.[\n\r]+)', lambda found: found.group(1) + '\n', content)

In [55]:
input_pdf = "documents/The genetical theory of natural selection.pdf"
output_md = ""

# When no output path is given, derive it from the input file name:
# extracted/<input file name without extension>.qmd
import os

if not bool(output_md):
    output_md = os.path.splitext(os.path.basename(input_pdf))[0] + ".qmd"
    output_md = os.path.join("extracted", output_md)

# Convert the PDF to Markdown.
pdf_to_markdown(
    input_pdf,
    output_md,
    header_threshold=0.08,
    footer_threshold=0.08,
    heading_scale=1.3,
    min_heading_length=5,
)

# Add blank lines to separate paragraphs (reads the file just written).
formatted_content = add_blank_lines_to_paragraphs(output_md)

# Write the result back, overwriting the same file.
with open(output_md, 'w', encoding='utf-8') as file:
    file.write(formatted_content)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

## Header, footer 가늠하기

In [17]:
import pdfplumber
import re
from collections import defaultdict

# Preview which lines of a single page survive the header/footer cut, to help
# choose the two thresholds.
pdf_path = "documents/fisher-test2.pdf"
header_threshold = 0.08
footer_threshold = 0.08

# Open the PDF in a `with` block so the file is closed once the page's lines
# have been extracted (it was previously left open).
with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[3]
    page_height = page.height
    header_bound = page_height * header_threshold
    footer_bound = page_height * (1 - footer_threshold)

    lines = page.extract_text_lines()

markdown_lines = []
for line in lines:
    text = line['text'].strip()
    top = line['top']

    # Drop header/footer lines.
    if top < header_bound or top > footer_bound:
        continue

    # Skip empty lines.
    if not text:
        continue

    markdown_lines.append(text)

print("\n".join(markdown_lines))

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


a third race will be formed a most fertile source of the variation in
domesticatedanimals. Iffreelyallowed, the characters ofpureparents
will be lost, number of races thus [illegible] but differences [?] besides
the
[illegible].
Butifvarietiesdifferinginveryslightrespectsbeallowed
tocross, suchsmallvariationwillbedestroyed, atleastto oursenses
avariation just to be distinguished bylong legs will have offspring not
to be so distinguished. Free crossing great agent in producing uni-
formityinanybreed.
The proposition is an important one, marking as it docs the great
contrast between the blending and the particulate theories of in-
heritance. The following proof establishes itin biometricalterms.
Let x and y represent the deviations in any measurement of the
two parents from the specific mean ; if the measurement is affected
not only by inheritance, but by non-heritable (environmental)
factors also, x and ystand forthe heritable part of these deviations.
The amount of variability present 

In [None]:
# Scratch copy of the per-line conversion loop, run at notebook top level.
# NOTE(review): body_font_size, heading_scale, min_heading_length and
# previous_line_was_heading are read below but are not assigned at top level
# in the cells visible here - confirm they are defined before running this.
# NOTE(review): markdown_lines is appended to without being reset in this cell.
with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            page_height = page.height
            header_bound = page_height * header_threshold
            footer_bound = page_height * (1 - footer_threshold)

            lines = page.extract_text_lines()

            for line in lines:
                text = line['text'].strip()
                top = line['top']

                # Drop header/footer lines.
                if top < header_bound or top > footer_bound:
                    continue

                # Skip empty lines.
                if not text:
                    continue

                # Heading detection from the line's average character size.
                line_font_sizes = [char['size'] for char in line['chars']]
                avg_font_size = sum(line_font_sizes) / len(line_font_sizes)
                is_heading = avg_font_size > body_font_size * heading_scale
                is_heading = is_heading and len(text) >= min_heading_length

                # Normalize the text: collapse whitespace runs to one space.
                # The second substitution cannot match, since no newline
                # remains after the first one.
                text = re.sub(r'\s+', ' ', text)
                text = re.sub(r'-\n', '', text)

                # Convert to Markdown: a heading, and the first body line
                # after a heading, are followed by a blank line.
                if is_heading:
                    heading_level = 1 if avg_font_size > body_font_size * 1.8 else 2
                    markdown_line = f"{'#' * heading_level} {text}\n\n"
                    previous_line_was_heading = True
                else:
                    if previous_line_was_heading:
                        markdown_line = f"{text}\n\n"
                        previous_line_was_heading = False
                    else:
                        markdown_line = f"{text}\n"

                markdown_lines.append(markdown_line)

# 두 열로 나뉜 페이퍼

In [11]:
import pdfplumber
import re
from collections import defaultdict

def pdf_to_markdown(
    pdf_path,
    output_md_path,
    header_threshold=0.12,
    footer_threshold=0.12,
    heading_scale=1.3,
    min_heading_length=5
):
    """
    Convert a two-column PDF to Markdown, promoting large-font lines to headings.

    Every page is split at half its width; the left half is converted first,
    then the right half. The most frequent character size in the document is
    taken as the body font size. A text line becomes a heading when its
    average character size exceeds ``body size * heading_scale`` and its text
    is at least ``min_heading_length`` characters long; it is level 1 above
    1.8x the body size and level 2 otherwise.

    Args:
        pdf_path: Path of the PDF file to read.
        output_md_path: Path of the Markdown file to write (UTF-8).
        header_threshold: Fraction of the page height, measured from the top,
            in which lines are dropped as header.
        footer_threshold: Fraction of the page height, measured from the
            bottom, in which lines are dropped as footer.
        heading_scale: Size factor over the body font for heading detection.
        min_heading_length: Minimum text length for a heading.
    """
    markdown_lines = []
    previous_line_was_heading = False

    # The file is opened once and used for both passes.
    with pdfplumber.open(pdf_path) as pdf:
        # Pass 1: count character sizes to find the body font size.
        font_size_counts = defaultdict(int)
        for page in pdf.pages:
            for char in page.chars:
                font_size_counts[char['size']] += 1

        # A PDF without characters has no body size; fall back to 0 instead of
        # letting max() fail on an empty mapping.
        body_font_size = (
            max(font_size_counts, key=font_size_counts.get)
            if font_size_counts else 0
        )

        # Pass 2: convert the text lines page by page, column by column.
        for page in pdf.pages:
            header_bound = page.height * header_threshold
            footer_bound = page.height * (1 - footer_threshold)

            # Split the page into a left and a right half.
            left_col = page.within_bbox((0, 0, page.width / 2, page.height))
            right_col = page.within_bbox((page.width / 2, 0, page.width, page.height))

            for column in (left_col, right_col):
                for line in column.extract_text_lines():
                    text = line['text'].strip()
                    top = line['top']

                    # Drop header/footer lines.
                    if top < header_bound or top > footer_bound:
                        continue

                    # Skip empty lines.
                    if not text:
                        continue

                    # Heading detection from the line's average character
                    # size; a line without character data is never a heading.
                    line_font_sizes = [char['size'] for char in line['chars']]
                    avg_font_size = (
                        sum(line_font_sizes) / len(line_font_sizes)
                        if line_font_sizes else 0
                    )
                    is_heading = (
                        avg_font_size > body_font_size * heading_scale
                        and len(text) >= min_heading_length
                    )

                    # Collapse whitespace runs to single spaces. (A former
                    # `-\n` substitution after this step could never match,
                    # because no newline is left, and was removed; lines
                    # ending in a hyphen are still written as they are.)
                    text = re.sub(r'\s+', ' ', text)

                    # Emit Markdown: headings get a blank line after them,
                    # and so does the first body line that follows a heading.
                    if is_heading:
                        heading_level = 1 if avg_font_size > body_font_size * 1.8 else 2
                        markdown_line = f"{'#' * heading_level} {text}\n\n"
                        previous_line_was_heading = True
                    elif previous_line_was_heading:
                        markdown_line = f"{text}\n\n"
                        previous_line_was_heading = False
                    else:
                        markdown_line = f"{text}\n"

                    markdown_lines.append(markdown_line)

    # Save the result.
    with open(output_md_path, 'w', encoding='utf-8') as md_file:
        md_file.writelines(markdown_lines)

def add_blank_lines_to_paragraphs(file_path):
    """
    Read ``file_path`` (UTF-8) and return its text with an extra newline
    appended after every period that is directly followed by line breaks.

    The file itself is not modified.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        content = source.read()

    # A period followed by one or more line breaks marks a paragraph end;
    # keep the matched text and append one more newline.
    return re.sub(r'(\.[\n\r]+)', lambda found: found.group(1) + '\n', content)