In [1]:
import pandas as pd
import torch
import fitz

from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
from langchain_core.documents import Document

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
from pathlib import Path
from typing import List
from pydantic import BaseModel
# Device string used when moving input tensors: "cuda" if torch reports an
# available CUDA device, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Fetch NLTK's "punkt" data package; presumably required by the sent_tokenize
# calls made later in this notebook (newer NLTK releases may need "punkt_tab"
# instead — TODO confirm against the installed version).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/soncy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def processing_pdf(path:Path) :
    """Rewrite a PDF so that pages wider than the first page are split in two.

    The width of the first page is used as the reference. Every page whose
    width differs from that reference (presumably a two-page spread — confirm
    against the source PDFs) is cut vertically down the middle into a left and
    a right page. All other pages are copied unchanged. The result is saved to
    ``processing_pdf/<last two components of path>``; the parent directory is
    created when missing.

    Args:
        path: Location of the source PDF.
    """
    save_path = "processing_pdf/{}".format("/".join(path.parts[-2:]))
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    # Both documents are context-managed so their handles are released even if
    # a page fails to render or the save raises.
    with fitz.open(path) as doc, fitz.open() as output_doc :
        base_width = None
        for i in range(len(doc)) :
            page = doc.load_page(i)
            rect = page.rect
            if i == 0 :
                base_width = rect.width

            width = rect.width
            height = rect.height
            mid_x = width / 2
            if width != base_width :
                # Each half is drawn onto a fresh page of half the width; the
                # target rectangle is the same for both, only the clip differs.
                half_rect = fitz.Rect(0, 0, mid_x, height)
                right_clip = fitz.Rect(mid_x, 0, width, height)

                left_page = output_doc.new_page(width=mid_x, height=height)
                left_page.show_pdf_page(half_rect, doc, i, clip=half_rect)

                right_page = output_doc.new_page(width=mid_x, height=height)
                right_page.show_pdf_page(half_rect, doc, i, clip=right_clip)
            else :
                full_page = output_doc.new_page(width=width, height=height)
                full_page.show_pdf_page(rect, doc, i)
        output_doc.save(save_path)

In [4]:
# Load the train/test tables and normalise their PDF paths.
# "Source_path" loses its first two characters (presumably a leading "./" —
# confirm against the CSV) and becomes a Path; "processing_pdf_path" is that
# same path re-rooted under the "processing_pdf" directory.
df_train = (
    pd.read_csv("open/train.csv")
    .assign(Source_path=lambda frame: frame["Source_path"].map(lambda raw: Path(raw[2:])))
    .assign(
        processing_pdf_path=lambda frame: frame["Source_path"].map(
            lambda src: Path("processing_pdf") / src
        )
    )
)

df_test = (
    pd.read_csv("open/test.csv")
    .assign(Source_path=lambda frame: frame["Source_path"].map(lambda raw: Path(raw[2:])))
    .assign(
        processing_pdf_path=lambda frame: frame["Source_path"].map(
            lambda src: Path("processing_pdf") / src
        )
    )
)

In [5]:
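# One-off preprocessing step, kept commented out so it is not repeated on every
# run. Uncomment and run once on a fresh checkout: processing_pdf() writes its
# output under "processing_pdf/", which is where the later cells open PDFs from.
# for path in df_train["Source_path"].unique() :
#     processing_pdf(Path("open") / path)

# for path in df_test["Source_path"].unique() :
#     processing_pdf(Path("open") / path)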

In [6]:
# Assign each distinct processed PDF a short sequential id ("train_docs_0001",
# "test_docs_0001", ...) in order of first appearance in the table.
#   *_ENCODE_DIC : id        -> file stem
#   *_DECODE_DIC : file stem -> id   (the inverse of the encode mapping)
pdf_file_lst = [Path(pdf_path) for pdf_path in df_train["processing_pdf_path"].unique()]
TRAIN_ENCODE_DIC = {
    f"train_docs_{number:04d}": pdf.stem
    for number, pdf in enumerate(pdf_file_lst, start=1)
}
TRAIN_DECODE_DIC = {stem: doc_id for doc_id, stem in TRAIN_ENCODE_DIC.items()}

test_pdf_file_lst = [Path(pdf_path) for pdf_path in df_test["processing_pdf_path"].unique()]
TEST_ENCODE_DIC = {
    f"test_docs_{number:04d}": pdf.stem
    for number, pdf in enumerate(test_pdf_file_lst, start=1)
}
TEST_DECODE_DIC = {stem: doc_id for doc_id, stem in TEST_ENCODE_DIC.items()}

In [7]:
# Display the document-id -> file-stem mapping for the training PDFs.
TRAIN_ENCODE_DIC

{'train_docs_0001': '1-1 2024 주요 재정통계 1권',
 'train_docs_0002': '2024 나라살림 예산개요',
 'train_docs_0003': '재정통계해설',
 'train_docs_0004': '국토교통부_전세임대(융자)',
 'train_docs_0005': '고용노동부_청년일자리창출지원',
 'train_docs_0006': '고용노동부_내일배움카드(일반)',
 'train_docs_0007': '보건복지부_노인일자리 및 사회활동지원',
 'train_docs_0008': '중소벤처기업부_창업사업화지원',
 'train_docs_0009': '보건복지부_생계급여',
 'train_docs_0010': '국토교통부_소규모주택정비사업',
 'train_docs_0011': '국토교통부_민간임대(융자)',
 'train_docs_0012': '고용노동부_조기재취업수당',
 'train_docs_0013': '2024년도 성과계획서(총괄편)',
 'train_docs_0014': '23-3호 조세지출 연계관리',
 'train_docs_0015': '22-3호 재정융자사업',
 'train_docs_0016': '월간 나라재정 2023년 12월호'}

In [8]:
# Embedding model handed to initialize_qdrant() when the per-document vector
# stores are built or reopened.
embeddings = HuggingFaceEmbeddings(model_name="BM-K/KoSimCSE-roberta-multitask")

No sentence-transformers model found with name BM-K/KoSimCSE-roberta-multitask. Creating a new one with mean pooling.


In [9]:
# Single source of truth for the checkpoint id, so the model and its tokenizer
# are always loaded from the same repository.
LLM_MODEL_ID = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# Load the causal LM with bfloat16 weights; device_map="auto" leaves weight
# placement to the loader.
# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository — only keep it for checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)

Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00,  1.12it/s]
Some parameters are on the meta device device because they were offloaded to the cpu and disk.


In [10]:
def initialize_qdrant(collection_name,data,embeddings) :
    """Open the on-disk Qdrant collection for a document, creating it if needed.

    The collection is stored under ``db/<collection_name>``. When that path
    already exists it is reopened as-is and ``data`` is not used; otherwise
    every item of ``data`` becomes one document whose ``idx`` metadata is its
    position in ``data``, and a new collection is built from those documents.

    Args:
        collection_name: Name of the collection and of its folder under ``db/``.
        data: Iterable of text items to index when the collection is created.
        embeddings: Embedding object passed through to Qdrant.

    Returns:
        The Qdrant vector store for ``collection_name``.
    """
    storage_dir = f"db/{collection_name}"

    # Fast path: a store was persisted by an earlier run.
    if Path(storage_dir).exists() :
        return Qdrant.from_existing_collection(
            embedding=embeddings,
            path=storage_dir,
            collection_name=collection_name
        )

    documents = []
    for position, text in enumerate(data) :
        documents.append(Document(page_content=text, metadata={"idx": position}))

    return Qdrant.from_documents(
        documents,
        embeddings,
        path=storage_dir,
        collection_name=collection_name
    )

In [11]:
# For every training PDF collect (a) the text of each page, prefixed with its
# 1-based page number, and (b) a vector store over the document's sentences.
DB = {}    # document id -> Qdrant store returned by initialize_qdrant()
DOCS = {}  # document id -> list of "Page: N\n<page text>" strings
pbar = tqdm(pdf_file_lst)
for filename in pbar :
    decode_filename = TRAIN_DECODE_DIC[filename.stem]
    pbar.set_description(f"Filename: {decode_filename}")

    docs2text = []
    docs_lst = []
    # Context manager so the PDF handle is released as soon as its text has
    # been extracted, instead of staying open for the life of the kernel.
    with fitz.open(filename) as docs :
        for i in range(len(docs)) :
            page = docs.load_page(i)
            page2text = page.get_text()
            page2line = sent_tokenize(page2text)
            docs2text.extend(page2line)
            docs_lst.append(f"Page: {i+1}\n{page2text}")
    DOCS[decode_filename] = docs_lst

    # Ids are derived from file stems, so this guard only skips a rebuild when
    # two files map to the same id.
    if decode_filename not in DB :
        DB[decode_filename] = initialize_qdrant(decode_filename,docs2text,embeddings)

Filename: train_docs_0016: 100%|██████████| 16/16 [00:04<00:00,  3.76it/s]


In [12]:
def create_chat_completion(messages, max_new_tokens=128) :
    """Generate a model reply for a chat-style message list.

    The messages are rendered with the tokenizer's chat template (with the
    generation prompt appended), moved to ``DEVICE`` and passed to
    ``model.generate``.

    Args:
        messages: Chat messages in the form accepted by
            ``tokenizer.apply_chat_template`` (the caller in this notebook
            passes dicts with "role" and "content" keys).
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        The decoded text of the first returned sequence. The whole sequence is
        decoded with default decode settings, so callers should expect the
        rendered prompt (and any special tokens) as well as the reply —
        NOTE(review): confirm whether prompt stripping is wanted.
    """
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    output = model.generate(
        input_ids.to(DEVICE),
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens
    )
    return tokenizer.decode(output[0])

: 

In [13]:
# Prompt template with {filename} and {source} placeholders.
template = Path("template/table_detector.md").read_text(encoding="utf-8")

table_dic = {}
for filename in pdf_file_lst :
    decode_filename = TRAIN_DECODE_DIC[filename.stem]
    print(f"Table detecting: {decode_filename}")

    # The whole document (all pages, blank-line separated) goes into one prompt.
    full_text = "\n\n".join(DOCS[decode_filename])
    prompt = template.format(filename=filename, source=full_text)

    messages = [
        dict(role="system", content="너는 글을 보고 목차를 만들어주는 역할이야. 글의 내용을 보고, 목차에 맞춰서 페이지 할당해줘."),
        dict(role="user", content=prompt),
    ]
    response = create_chat_completion(messages)
    # Stop after the first document; table_dic is not filled in yet.
    break

Table detecting: train_docs_0001
