In [41]:
import fitz
from PIL import Image
import io
import numpy as np
import cv2
import pytesseract
import re

def clean_text(s: str):
    return re.sub(r"\s+", " ", s.replace("\n", " ").strip())

def pil_from_pix(pix):
    img_bytes = pix.tobytes("png")
    return Image.open(io.BytesIO(img_bytes))

def to_cv2(img_pil):
    img = np.array(img_pil)
    if img.ndim == 2:
        return img
    return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

def ocr_image_region(cv_img, psm=6):
    config = f"--oem 3 --psm {psm}"
    pil = Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))
    text = pytesseract.image_to_string(pil, lang="rus+eng", config=config)
    return clean_text(text)

def detect_table_regions(cv_img, scale=15, min_area=2000):
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (3,3),0)
    thr = cv2.adaptiveThreshold(gray,255,cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV,15,9)

    horizontal = thr.copy()
    vertical = thr.copy()
    horiz_size = max(10, horizontal.shape[1]//scale)
    vert_size = max(8, vertical.shape[0]//scale)
    horiz_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (horiz_size,1))
    vert_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,vert_size))
    horizontal = cv2.dilate(cv2.erode(horizontal,horiz_kernel), horiz_kernel)
    vertical = cv2.dilate(cv2.erode(vertical,vert_kernel), vert_kernel)

    mask = cv2.add(horizontal, vertical)
    mask = cv2.dilate(mask, cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)), iterations=2)

    contours,_ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    bboxes = []
    for cnt in contours:
        x,y,w,h = cv2.boundingRect(cnt)
        if w*h > min_area:
            bboxes.append((x,y,w,h))

    return sorted(bboxes, key=lambda b: b[1])  # сортировка по Y

def split_table_cells(region):
    gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)
    thr = cv2.adaptiveThreshold(gray,255,cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV,15,9)
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(2, region.shape[1]//20),1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,max(2, region.shape[0]//20)))
    horizontal = cv2.dilate(cv2.erode(thr,h_kernel), h_kernel)
    vertical = cv2.dilate(cv2.erode(thr,v_kernel), v_kernel)
    inter = cv2.bitwise_and(horizontal, vertical)
    ys, xs = np.where(inter>0)

    if len(xs)<2 or len(ys)<2:
        return None

    def cluster_coords(coords, eps=10):
        coords = sorted(coords)
        clusters=[]
        cur=[coords[0]]
        for c in coords[1:]:
            if abs(c-cur[-1])<=eps:
                cur.append(c)
            else:
                clusters.append(int(sum(cur)/len(cur)))
                cur=[c]
        clusters.append(int(sum(cur)/len(cur)))
        return clusters

    xc = cluster_coords(list(xs))
    yc = cluster_coords(list(ys))

    if len(xc)<2 or len(yc)<2:
        return None

    table=[]
    for i in range(len(yc)-1):
        row=[]
        for j in range(len(xc)-1):
            x0,y0=xc[j],yc[i]
            x1,y1=xc[j+1],yc[i+1]
            row.append((x0,y0,x1-x0,y1-y0))
        table.append(row)
    return table

def parse_pdf(path, dpi=300):
    doc = fitz.open(path)
    output=[]

    for page in doc:
        pix = page.get_pixmap(dpi=dpi)
        pil = pil_from_pix(pix)
        cv_img = to_cv2(pil)

        # detect tables
        table_bboxes = detect_table_regions(cv_img)
        used_mask = np.zeros(cv_img.shape[:2], dtype=np.uint8)

        page_blocks=[]

        for x,y,w,h in table_bboxes:
            region = cv_img[y:y+h, x:x+w]
            cells = split_table_cells(region)
            if cells:
                table_rows=[]
                for row in cells:
                    row_texts=[]
                    for cell in row:
                        cx,cy,cw,ch=cell
                        cell_img=region[cy:cy+ch, cx:cx+cw]
                        txt = ocr_image_region(cell_img)
                        row_texts.append(txt)
                    table_rows.append(row_texts)
                page_blocks.append({"type":"table","content":table_rows})
            else:
                txt=ocr_image_region(region)
                page_blocks.append({"type":"table","content":[[txt]]})
            used_mask[y:y+h, x:x+w]=255

        # OCR for remaining text
        full_text = pytesseract.image_to_string(pil, lang="rus+eng", config="--oem 3 --psm 3")
        lines = [clean_text(l) for l in full_text.split("\n") if clean_text(l)]
        for line in lines:
            page_blocks.append({"type":"text","content":line})

        output.extend(page_blocks)

    doc.close()
    return output


In [44]:
data = parse_pdf("dataset/ПНАЭ Г-7-018-89.pdf", dpi=400)

In [43]:
data

[{'type': 'table', 'content': [['ии ии ии РУ ии']]},
 {'type': 'table',
  'content': [['ии РУРУУ И {_ ии иииииииииии РУ Ив ХГ["][["|']]},
 {'type': 'text', 'content': 'Государственный комитет СССР по надзору'},
 {'type': 'text',
  'content': 'за безопасным ведением работ в атомной энергетике'},
 {'type': 'text', 'content': 'ПРАВИЛА И НОРМЫ В АТОМНОЙ ЭНЕРГЕТИКЕ'},
 {'type': 'text', 'content': 'УНИФИЦИРОВАННАЯ МЕТОДИКА КОНТРОЛЯ ОСНОВНЫХ'},
 {'type': 'text',
  'content': 'МАТЕРИАЛОВ (ПОЛУФАБРИКАТОВ), СВАРНЫХ СОЕДИНЕНИЙ И'},
 {'type': 'text', 'content': 'НАПЛАВКИ ОБОРУДОВАНИЯ И ТРУБОПРОВОДОВ АЭУ'},
 {'type': 'text', 'content': 'КАПИЛЛЯРНЫЙ КОНТРОЛЬ'},
 {'type': 'text', 'content': 'ПНАЭ Г-7-018-89'},
 {'type': 'text', 'content': 'Дата введения'},
 {'type': 'text', 'content': '01.07.90.'},
 {'type': 'text', 'content': 'Москва 1990'},
 {'type': 'text',
  'content': 'Обязательны для всех министерств, ведомств, организаций и предприятий, осуществ-'},
 {'type': 'text',
  'content': 'ляющих проек

In [21]:
import hashlib


# ============================================================
#           ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ
# ============================================================

def tokenize_len(text: str) -> int:
    """Простейший токенайзер: считает слова."""
    return len(text.split())


def hash_text(text: str) -> str:
    """MD5-хэш текста — чтобы помечать таблицы."""
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def convert_table_to_text(table):
    """
    Преобразует таблицу (list[list[str]]) в текст формата:
    | A | B | C |
    """
    lines = []
    for row in table:
        line = "| " + " | ".join(str(c) for c in row) + " |"
        lines.append(line)
    return "\n".join(lines)


# ============================================================
#      МЯГКОЕ ДЕЛЕНИЕ СЛИШКОМ БОЛЬШОГО ЛОГИЧЕСКОГО БЛОКА
# ============================================================

def split_block_soft(block, min_size, max_size):
    """Режет слишком большой логический блок по словам."""
    words = block.split()
    parts = []
    cur = []
    cur_len = 0

    for w in words:
        wl = tokenize_len(w)
        if cur_len + wl > max_size:
            parts.append(" ".join(cur))
            cur = [w]
            cur_len = wl
        else:
            cur.append(w)
            cur_len += wl

    if cur:
        parts.append(" ".join(cur))

    return parts


# ============================================================
#             ЛОГИЧЕСКОЕ ДЕЛЕНИЕ ТАБЛИЦЫ
# ============================================================

def split_table_logically(table_text, min_size, max_size):
    """
    Делит таблицу логически корректно:
    - строки с пустой первой ячейкой считаются продолжением предыдущей
    - блоки не рвутся
    - слишком большие блоки мягко режутся
    """

    lines = table_text.split("\n")

    blocks = []
    current_block = []

    def flush_block():
        nonlocal current_block
        if current_block:
            blocks.append("\n".join(current_block))
            current_block = []

    for line in lines:
        parts = [p.strip() for p in line.split("|")]

        first_cell = parts[1] if len(parts) > 1 else ""

        if not current_block:
            current_block.append(line)
            continue

        if first_cell == "":
            current_block.append(line)
        else:
            flush_block()
            current_block.append(line)

    flush_block()

    # собираем чанки
    result = []
    cur = []
    cur_len = 0

    def flush_chunk():
        nonlocal cur, cur_len
        if cur:
            result.append("\n".join(cur))
            cur = []
            cur_len = 0

    for block in blocks:
        bsize = tokenize_len(block)

        # если блок огромный — мягко режем
        if bsize > max_size:
            for small in split_block_soft(block, min_size, max_size):
                ssize = tokenize_len(small)
                if cur_len + ssize > max_size:
                    flush_chunk()
                cur.append(small)
                cur_len += ssize
            continue

        # обычный случай
        if cur_len + bsize > max_size:
            flush_chunk()

        cur.append(block)
        cur_len += bsize

    flush_chunk()
    return result


# ============================================================
#         ДЕЛЕНИЕ ОГРОМНОГО ТЕКСТА НА ЧАНКИ
# ============================================================

def split_text_by_max_size(text, min_size, max_size, strict=True):
    """
    Делит текст по словам так, чтобы каждый чанк ≤ max_size.
    """
    words = text.split()
    chunks = []
    cur = []
    cur_len = 0

    for w in words:
        wl = 1
        if cur_len + wl > max_size:
            chunks.append(" ".join(cur))
            cur = [w]
            cur_len = wl
        else:
            cur.append(w)
            cur_len += wl

    if cur:
        chunks.append(" ".join(cur))

    return chunks


# ============================================================
#                ОСНОВНАЯ ФУНКЦИЯ CHUNKING
# ============================================================

def normalize_pre_chunks(pre_chunks, min_size=50, max_size=200, strict=True):
    """
    Объединяет текст, режет большие блоки,
    логически режет таблицы, формирует конечные чанки.
    """

    result_chunks = []
    current_text = []
    current_tables = []
    current_len = 0

    def flush_chunk():
        nonlocal current_text, current_len, current_tables
        if current_text:
            result_chunks.append({
                "text": "\n\n".join(current_text),
                "chunkSize": current_len,
                "tables": current_tables.copy()
            })
            current_text = []
            current_tables = []
            current_len = 0

    # ---------------------------
    # 1. основной цикл
    # ---------------------------
    for item in pre_chunks:

        # ======================================
        # TEXT
        # ======================================
        if item["type"] == "text":
            text = item["content"]
            t_size = tokenize_len(text)

            # если большой текст — режем
            if t_size > max_size:
                pieces = split_text_by_max_size(text, min_size, max_size, strict)

                for part in pieces:
                    psize = tokenize_len(part)
                    if current_len + psize > max_size:
                        flush_chunk()
                    current_text.append(part)
                    current_len += psize
                continue

            # обычный текст
            if current_len + t_size > max_size:
                flush_chunk()

            current_text.append(text)
            current_len += t_size
            continue

        # ======================================
        # TABLE
        # ======================================
        elif item["type"] == "table":
            table_text = convert_table_to_text(item["content"])
            table_parts = split_table_logically(table_text, min_size, max_size)

            for part in table_parts:
                part_size = tokenize_len(part)
                part_hash = hash_text(part)

                if current_len + part_size > max_size:
                    flush_chunk()

                current_text.append(part)
                current_tables.append(part_hash)
                current_len += part_size

            continue

    flush_chunk()

    # ======================================
    # Финальное объединение маленьких чанков
    # ======================================
    final = []
    buffer = None

    for ch in result_chunks:
        if ch["chunkSize"] < min_size:
            if buffer is None:
                buffer = ch
            else:
                merged = {
                    "text": buffer["text"] + "\n\n" + ch["text"],
                    "chunkSize": buffer["chunkSize"] + ch["chunkSize"],
                    "tables": buffer["tables"] + ch["tables"]
                }
                buffer = merged
        else:
            if buffer:
                if buffer["chunkSize"] + ch["chunkSize"] <= max_size:
                    merged = {
                        "text": buffer["text"] + "\n\n" + ch["text"],
                        "chunkSize": buffer["chunkSize"] + ch["chunkSize"],
                        "tables": buffer["tables"] + ch["tables"]
                    }
                    final.append(merged)
                else:
                    final.append(buffer)
                    final.append(ch)
                buffer = None
            else:
                final.append(ch)

    if buffer:
        final.append(buffer)

    return final


In [None]:
{"type": ("text"/"table"), "content": (str/array)}

In [12]:
from object.LoadDOC_RTF import parse_doc_or_rtf

data_test = parse_doc_or_rtf("../dataset/НП-064-17.rtf")

Error: source file could not be loaded


PackageNotFoundError: Package not found at '/var/folders/xl/0bqpzy9s59s2vz7tlq_2f5rc0000gn/T/tmpsyamnpsc/НП-064-17.docx'

In [None]:
tess = normalize_pre_chunks(data_test, 50, 120)

In [None]:
for i in tess:
    if i["chunkSize"] > 120:
        print(i)

In [13]:
from object.LoadDOC_RTF import parse_doc_or_rtf

In [16]:
pre_chunk = parse_doc_or_rtf('./dataset/НП-064-17.rtf')

convert /Users/odner/Git/NeuroFile/ai-agent/dataset/НП-064-17.rtf as a Writer document -> /private/var/folders/xl/0bqpzy9s59s2vz7tlq_2f5rc0000gn/T/tmpa5pav0qk/НП-064-17.docx using filter : Office Open XML Text


In [26]:
tess = normalize_pre_chunks(pre_chunk, 100, 200)

In [27]:
tess

[{'text': 'ФЕДЕРАЛЬНАЯ СЛУЖБА ПО ЭКОЛОГИЧЕСКОМУ, ТЕХНОЛОГИЧЕСКОМУ И АТОМНОМУ НАДЗОРУ ПРИКАЗ от 30 ноября 2017 года N 514 Об утверждении федеральных норм и правил в области использования атомной энергии "Учет внешних воздействий природного и техногенного происхождения на объекты использования атомной энергии" В соответствии со статьей 6 Федерального закона от 21 ноября 1995 г. N 170-ФЗ "Об использовании атомной энергии" (Собрание законодательства Российской Федерации, 1995, N 48, ст.4552; 1997, N 7, ст.808; 2001, N 29, ст.2949; 2002, N 1, ст.2; N 13, ст.1180; 2003, N 46, ст.4436; 2004, N 35, ст.3607; 2006, N 52, ст.5498; 2007, N 7, ст.834; N 49, ст.6079; 2008, N 29, ст.3418; N 30, ст.3616; 2009, N 1, ст.17; N 52, ст.6450; 2011, N 29, ст.4281; N 30, ст.4590, ст.4596; N 45, ст.6333; N 48, ст.6732; N 49, ст.7025; 2012, N 26, ст.3446; 2013, N 27, ст.3451; 2016, N 14, ст.1904; N 15, ст.2066; N 27, ст.4289), подпунктом 5.2.2.1 пункта 5 Положения о Федеральной службе по экологическому, техноло

In [25]:
for i in tess:
    if i["chunkSize"] > 120:
        print(i)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "cointegrated/rubert-base-cased-nli-threeway"
local_dir = "./model/lr"

# Скачивает модель в указанную директорию
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer.save_pretrained(local_dir)
model.save_pretrained(local_dir)

print("Saved to:", local_dir)

Saved to: ./model/lr
