# Content Clean Up

In [4]:
from app.repositories.LocalFolderRepo import LocalFolderRepo
from app.repositories.HashedContentStorage import HashedContentStorage, HashedContentStatus
from app.services.ContentService import ContentService

# Wire up the content pipeline bottom-up: a repo rooted at ./hashed_files,
# a hashed-content storage built on that repo, and the service built on the storage.
local_folder = LocalFolderRepo("./hashed_files")
hashed_content_storage = HashedContentStorage(local_folder)
# `content_service` is the object the following cells use to fetch markdown.
content_service = ContentService(hashed_content_storage)



In [6]:
# Identifier of the document to load, passed to the content service below.
# NOTE(review): the name `hash` shadows the built-in hash() for the rest of the
# session; consider renaming it (e.g. `content_hash`) once it is confirmed that
# no other cell relies on this name.
hash = '4c1299d6bb20b61c4594292009d5aad77bdee147273a71cdd8616a2696f2218d'
# Fetch the markdown for that identifier and print its length as a sanity check.
raw_md = content_service.get_markdown(hash)
print(len(raw_md))

30554


In [None]:
from __future__ import annotations
import re, pathlib, textwrap
from typing import List

###############################################################################
# 1.  Basic clean -------------------------------------------------------------
###############################################################################


def clean(md: str) -> str:
    """Normalize raw markdown text before paragraph segmentation.

    Steps, in order:
      1. Convert CRLF / lone CR line endings to LF so that blank-line
         detection works regardless of where the text came from.
      2. Turn form feeds (page breaks) into newlines.
      3. Turn non-breaking spaces into ordinary spaces.
      4. Collapse any run of blank lines into exactly one blank line. Lines
         that contain only spaces/tabs count as blank, otherwise they would
         hide a paragraph break from a later ``split("\\n\\n")``.
      5. Strip leading and trailing whitespace from the whole text.

    Args:
        md: Raw text to normalize.

    Returns:
        The normalized text.
    """
    # Line endings first, so every later step only has to reason about "\n".
    md = md.replace("\r\n", "\n").replace("\r", "\n")
    md = md.replace("\f", "\n")  # page breaks
    md = md.replace("\u00a0", " ")  # NB-spaces
    # A newline followed by one or more (possibly whitespace-only) empty lines
    # becomes exactly one blank line. Indentation of the next non-empty line is
    # left untouched because the pattern must end on a newline.
    md = re.sub(r"\n(?:[ \t]*\n)+", "\n\n", md)
    return md.strip()


# Normalized text of the document loaded earlier; segmentation below splits `md`.
md = clean(raw_md)

###############################################################################
# 2.  Paragraph-level segmentation -------------------------------------------
###############################################################################

# Split on blank lines, trim every block once, and drop blocks that end up empty.
paragraphs: List[str] = [
    block for block in map(str.strip, md.split("\n\n")) if block
]

###############################################################################
# 3A.  Helper heuristics ------------------------------------------------------
###############################################################################

# Phrases (matched case-insensitively, on word boundaries) that mark a
# paragraph as legal boilerplate.
LEGAL_TRIGGERS = re.compile(
    r"\b(disclaimer|DBS accepts no liability|regulatory disclosure|restrictions on distribution)\b",
    re.I,
)
# Matches a single ASCII digit; used with findall() to count digits in a line.
NUMBERISH = re.compile(r"[0-9]")
# A line is considered table-like when it contains a run of two or more
# whitespace characters or a pipe character.
TABLE_LINE_RE = re.compile(r"\s{2,}|[|]")

# Case-insensitive keywords whose presence marks a paragraph as real content.
CONTENT_KEYWORDS = re.compile(
    r"\b(deliver(y|ies)|margin|revenue|eps|valuation|target price|risks?|outlook|overview|cash ?flow|ebit(?:da)?)\b",
    re.I,
)


# Paragraph openers such as "General disclosure" / "Gener disclosure".
# The original code passed this pattern to str.startswith(), which compares
# literal text, so the "(al)?" was never interpreted and the check never fired.
GENERAL_DISCLOSURE_PREFIX = re.compile(r"gener(?:al)? disclosure", re.I)


def is_legal(p: str) -> bool:
    """Return True when paragraph *p* looks like legal boilerplate.

    A paragraph is legal boilerplate when it either starts with a
    "general disclosure" heading (case-insensitive) or contains any of the
    phrases in ``LEGAL_TRIGGERS``.

    Args:
        p: A single paragraph of text.

    Returns:
        True if the paragraph should be treated as legal text.
    """
    # match() anchors at the start of the string, mirroring the intended
    # "starts with" semantics of the original check.
    if GENERAL_DISCLOSURE_PREFIX.match(p):
        return True
    return bool(LEGAL_TRIGGERS.search(p))


def looks_like_table(p: str) -> bool:
    """Return True when paragraph *p* looks like tabular data.

    Two independent signals are counted per line:
      * table-like: the line matches ``TABLE_LINE_RE``;
      * digit-heavy: more than 30% of the line's characters match ``NUMBERISH``.

    The paragraph is a table when at least half of its lines are table-like,
    or at least half of its lines are digit-heavy (the two counts are not
    combined).

    Args:
        p: A single paragraph of text.

    Returns:
        True if the paragraph should be treated as a table. An empty string
        is never a table.
    """
    lines = p.splitlines()
    if not lines:
        # Without this guard both counts are 0 and "0 >= 0" would report an
        # empty paragraph as a table.
        return False

    half = len(lines) / 2
    tableish = sum(1 for ln in lines if TABLE_LINE_RE.search(ln))
    if tableish >= half:
        return True
    digitish = sum(1 for ln in lines if len(NUMBERISH.findall(ln)) > len(ln) * 0.3)
    return digitish >= half


def is_content(p: str) -> bool:
    """Return True when paragraph *p* reads like substantive prose.

    A paragraph is rejected when it has fewer than 25 whitespace-separated
    words, or when it contains more digit characters than alphabetic ones.
    Otherwise it is accepted if it mentions one of ``CONTENT_KEYWORDS`` or is
    longer than 40 words.
    """
    word_count = len(p.split())
    if word_count < 25:
        # Too short to be a meaningful paragraph.
        return False

    alpha_count = len([ch for ch in p if ch.isalpha()])
    digit_count = len([ch for ch in p if ch.isdigit()])
    if digit_count > alpha_count:
        # Dominated by numbers rather than text.
        return False

    if CONTENT_KEYWORDS.search(p):
        return True
    return word_count > 40


###############################################################################
# 3B.  Classify every paragraph ----------------------------------------------
###############################################################################

# One label per paragraph, in paragraph order:
#   "junk"  - legal boilerplate or table-like text
#   "keep"  - passes the content heuristic
#   "maybe" - everything else (fallback bucket)
labels = []
for p in paragraphs:
    discard = is_legal(p) or looks_like_table(p)
    # is_content() is only consulted for paragraphs that were not discarded.
    labels.append("junk" if discard else "keep" if is_content(p) else "maybe")

###############################################################################
# 4.  Stitch neighbouring “keep” paras into bigger chunks ---------------------
###############################################################################

MAX_WORDS = 300  # change if you like
# `buf` collects consecutive "keep" paragraphs; `count` is their total word count.
chunks, buf, count = [], [], 0
for p, lab in zip(paragraphs, labels):
    if lab != "keep":
        # Any non-"keep" paragraph ends the current run and is itself dropped.
        if buf:
            chunks.append("\n\n".join(buf))
            buf, count = [], 0
        continue

    w = len(p.split())
    # Close the current chunk first if adding this paragraph would exceed the
    # word budget; an oversized paragraph still gets a chunk of its own.
    if buf and count + w > MAX_WORDS:
        chunks.append("\n\n".join(buf))
        buf, count = [], 0
    buf.append(p)
    count += w
# flush tail
if buf:
    chunks.append("\n\n".join(buf))

# Report the character length of every chunk, one per line.
for c in chunks:
    print(len(c))

1375
1684
228
1977
673
713
640
1848
1553
324
