In [1]:
# ==== One-cell Cleaner → TXT + clean PDF (OCR, icon labels, table-preserving) ====
# Upload PDFs → Convert → Download ZIP (TXT + _CLEAN.pdf)
# NOTE: Nothing is "invented": only extracted text (native or OCR) is used.
#       Icons are replaced with textual labels; text tables are rendered
#       as Markdown (so that Copilot can index them).

!apt -qq update
!apt -qq install -y poppler-utils tesseract-ocr > /dev/null
!pip -q install gradio pypdf pdf2image pytesseract unidecode reportlab

import os, re, hashlib, unicodedata, shutil, zipfile, textwrap
from pathlib import Path
from typing import List, Dict
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
from unidecode import unidecode
import gradio as gr

# --------- Paths ---------
# Working folders: uploads are copied into RAW, converted files land in OUT.
DATA_DIR = Path("data")
RAW = DATA_DIR / "raw"
OUT = DATA_DIR / "out"
# Create both folders up front so later code can assume they exist.
for p in (RAW, OUT):
    p.mkdir(parents=True, exist_ok=True)

# --------- Icon/emoji → labels (ITA) ---------
# Literal glyph -> ASCII replacement; normalize_symbols applies these with
# plain str.replace before any other cleanup.
SYM_MAP = {
    "📎": "[Allegato]",
    "↗": "[Link]",
    "→": "->",
    "✓": "[Check]",
    "●": "-",
    "•": "-",
    "▶": "-",
}
# Lower-case keyword stem -> icon label. label_for_line returns the label of
# the first key found in the line, so insertion order is significant.
ICON_MAP = {
    # Trash bin
    "cestin": "[Cestino]",
    "elimin": "[Cestino]",
    "delete": "[Cestino]",
    "trash": "[Cestino]",
    # Printer
    "stamp": "[Stampa]",
    "printer": "[Stampa]",
    "print": "[Stampa]",
    # Paper clip / attachment
    "allegat": "[Allegato]",
    "graffett": "[Allegato]",
    "attach": "[Allegato]",
    "clip": "[Allegato]",
    # Send / submit
    "inviar": "[Invia]",
    "invia": "[Invia]",
    "submit": "[Invia]",
    "send": "[Invia]",
    # Layers
    "livell": "[Livelli]",
    "layers": "[Livelli]",
}

def replace_private_use_glyphs(line: str) -> str:
    """Replace symbol-only, non-ASCII tokens of *line* with ``[Icona]``.

    The line is split on whitespace. A token is replaced when it contains no
    alphanumeric character and at least one character above code point 127;
    every other token is kept as is. Tokens are re-joined with single spaces,
    so whitespace runs collapse and leading/trailing whitespace is dropped.
    """
    def is_symbol_only(token):
        has_alnum = any(ch.isalnum() for ch in token)
        has_non_ascii = any(ord(ch) > 127 for ch in token)
        return has_non_ascii and not has_alnum

    return " ".join(
        "[Icona]" if is_symbol_only(tok) else tok for tok in line.split()
    )

def label_for_line(line: str) -> str:
    """Return the icon label for *line*, or ``[Icona]`` when none matches.

    The lower-cased line is searched for each ICON_MAP key in the dict's
    iteration order; the label of the first key found as a substring wins.
    """
    lowered = line.lower()
    matches = (tag for needle, tag in ICON_MAP.items() if needle in lowered)
    return next(matches, "[Icona]")

def normalize_symbols(text: str) -> str:
    """Return *text* with symbols mapped to labels and whitespace tidied.

    Steps, in order: apply the SYM_MAP replacements, NFKC-normalize, pass the
    result through ``unidecode``, collapse runs of spaces/tabs to one space,
    limit consecutive newlines to two, then post-process line by line with
    replace_private_use_glyphs. The final text is stripped.
    """
    for glyph, replacement in SYM_MAP.items():
        text = text.replace(glyph, replacement)
    text = unidecode(unicodedata.normalize("NFKC", text))
    text = re.sub(r"\n{3,}", "\n\n", re.sub(r"[ \t]+", " ", text))

    # NOTE(review): unidecode has already run at this point, so the
    # non-ASCII test inside replace_private_use_glyphs presumably never
    # fires here — confirm whether glyph labelling was meant to run earlier.
    cleaned = []
    for original in text.splitlines():
        candidate = replace_private_use_glyphs(original)
        # A line that is nothing but an icon placeholder or a bare bullet is
        # handed to label_for_line to look for a more specific label.
        icon_only = (
            candidate.strip() in {"[Icona]", "-"}
            and original.strip()
            and not any(ch.isalnum() for ch in original)
        )
        cleaned.append(label_for_line(original) if icon_only else candidate)
    return "\n".join(cleaned).strip()

# --------- Tabelle in Markdown (heuristic, no hallucinations) ---------
def looks_like_table_block(lines):
    """Return True when at least three of *lines* look column-separated.

    A line counts when it contains a tab or a run of two or more spaces.
    """
    columnar = sum(1 for row in lines if re.search(r"\t| {2,}", row))
    return columnar >= 3

def split_columns(ln):
    """Split one text line into its non-empty, stripped column cells.

    Tabs take precedence: if the line contains a tab it is split on tabs
    only; otherwise it is split on runs of two or more spaces.
    """
    pieces = ln.split("\t") if "\t" in ln else re.split(r" {2,}", ln)
    return [cell for cell in map(str.strip, pieces) if cell]

def block_to_markdown_table(lines):
    """Render a block of column-separated text lines as a Markdown table.

    Blank lines are skipped. Each remaining line is split with split_columns;
    the first row becomes the header, followed by a ``---`` separator row and
    the remaining rows. Short rows are padded with empty cells so every row
    has the same number of columns.

    A literal ``|`` inside a cell is escaped as ``\\|``; left as is it would be
    read as a column delimiter and shift the rest of the row.

    Returns the input lines joined with newlines when the block has no
    non-blank line.
    """
    rows = [split_columns(ln) for ln in lines if ln.strip()]
    if not rows:
        return "\n".join(lines)
    width = max(len(r) for r in rows)

    def escape(cell):
        # Skip pipes that are already backslash-escaped.
        return re.sub(r"(?<!\\)\|", r"\\|", cell)

    def to_md_row(cells):
        return "| " + " | ".join(cells) + " |"

    padded = [[escape(c) for c in r] + [""] * (width - len(r)) for r in rows]
    header, body = padded[0], padded[1:]
    out = [to_md_row(header), to_md_row(["---"] * width)]
    out.extend(to_md_row(r) for r in body)
    return "\n".join(out)

def preserve_tables_markdown(text):
    """Rewrite table-looking paragraphs of *text* as Markdown tables.

    The text is cut into blocks of consecutive non-blank lines. A block that
    looks_like_table_block accepts is rendered with block_to_markdown_table;
    any other block is kept verbatim. Every blank line in the input yields
    one empty line in the output.
    """
    rendered = []
    pending = []
    # The trailing None is a sentinel that forces a flush of the last block.
    for entry in text.splitlines() + [None]:
        if entry is not None and entry.strip() != "":
            pending.append(entry)
            continue
        if pending:
            if looks_like_table_block(pending):
                rendered.append(block_to_markdown_table(pending))
            else:
                rendered.append("\n".join(pending))
            pending = []
        if entry is not None:
            rendered.append("")
    return "\n".join(rendered)

# --------- PDF → text (native first, OCR fallback) ---------
def pdf_to_text_with_ocr(pdf_path: Path) -> str:
    """Extract text from a PDF, trying embedded text first and OCR second.

    Returns the embedded text (pages joined by a blank line) when any page
    yields non-whitespace text. Otherwise the pages are rasterized at 250 dpi
    and run through Tesseract with ``lang="ita+eng"``. Any exception in either
    stage is swallowed; the function returns ``""`` when both stages fail.
    """
    # Stage 1: text embedded in the PDF.
    try:
        pages = PdfReader(str(pdf_path)).pages
        embedded = "\n\n".join([(pg.extract_text() or "") for pg in pages])
    except Exception:
        embedded = ""
    if embedded.strip():
        return embedded

    # Stage 2: OCR on rendered page images (pdf2image relies on Poppler).
    # NOTE(review): lang="ita+eng" presumably needs both Tesseract language
    # packs installed; a missing pack would surface here as an empty result.
    try:
        rendered = convert_from_path(str(pdf_path), dpi=250)
        recognized = []
        for img in rendered:
            recognized.append(pytesseract.image_to_string(img, lang="ita+eng"))
        return "\n\n".join(recognized)
    except Exception:
        return ""

# --------- Write clean PDF with selectable text ---------
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Preformatted
from reportlab.lib.units import mm
from reportlab.lib.styles import ParagraphStyle
from reportlab.pdfbase.pdfmetrics import stringWidth

def write_clean_pdf(text:str, out_pdf:Path, max_width=A4[0]-30*mm):
    """Write *text* to *out_pdf* as a text-only A4 PDF with selectable text.

    Args:
        text: Content to render; its line breaks are preserved.
        out_pdf: Destination path; missing parent folders are created.
        max_width: Maximum line width in points. Defaults to the A4 width
            minus the two 15 mm side margins. Longer lines are wrapped.
    """
    font_name, font_size = "Courier", 10
    out_pdf.parent.mkdir(parents=True, exist_ok=True)
    doc = SimpleDocTemplate(
        str(out_pdf), pagesize=A4,
        leftMargin=15*mm, rightMargin=15*mm, topMargin=15*mm, bottomMargin=15*mm,
    )
    styles = getSampleStyleSheet()
    # A fixed-width font keeps Markdown tables and lists readable and lets the
    # line budget be expressed as a simple character count.
    mono = ParagraphStyle('mono', parent=styles['Normal'], fontName=font_name,
                          fontSize=font_size, leading=13)

    # Preformatted never wraps, so over-long lines would run off the page.
    # The page frame also pads its content (6 pt per side by default in
    # ReportLab), hence the extra 12 pt taken off the document width.
    usable = min(max_width, doc.width - 12)
    char_width = stringWidth("M", font_name, font_size)
    columns = max(1, int(usable // char_width))

    wrapped = []
    for line in text.splitlines():
        # wrap() returns [] for a blank line; keep it as an empty output line.
        wrapped.extend(
            textwrap.wrap(line, width=columns, replace_whitespace=False,
                          break_on_hyphens=False) or [""]
        )

    # Preformatted keeps the line breaks exactly as given.
    doc.build([Preformatted("\n".join(wrapped), mono)])

# --------- Pipeline ---------
def upload_pdfs(paths):
    """Copy the given files into RAW and report how many were copied.

    *paths* may be ``None`` or empty. Entries that are falsy or do not exist
    on disk are skipped. Each copy keeps the source file name, so a file
    already in RAW under the same name is overwritten.
    """
    count = 0
    for src in (paths if paths else []):
        if not src or not os.path.exists(src):
            continue
        target = RAW / Path(src).name
        shutil.copy2(src, target)
        count += 1
    return f"Uploaded {count} PDF(s)."

def convert_all():
    """Convert every PDF in RAW to TXT + clean PDF and zip the OUT folder.

    For each input: extract text (native, then OCR), turn table-looking
    blocks into Markdown, normalize symbols and whitespace, then write
    ``<stem>.txt`` and ``<stem>_CLEAN.pdf`` into OUT.

    Returns:
        A ``(report, zip_path)`` tuple: the report is a summary line followed
        by one log line per input; ``zip_path`` is the path of
        ``clean_bundle.zip`` as a string.
    """
    # Empty the output folder so the bundle only holds this run's files.
    for stale in OUT.glob("*"):
        if stale.is_file():
            stale.unlink()

    log = []
    converted_txt = converted_pdf = failures = 0

    # Sorted for a reproducible log; the suffix is compared case-insensitively
    # so that e.g. "SCAN.PDF" is not silently skipped.
    sources = sorted(
        p for p in RAW.glob("*") if p.is_file() and p.suffix.lower() == ".pdf"
    )
    for pdf in sources:
        raw = pdf_to_text_with_ocr(pdf)
        if not raw.strip():
            failures += 1
            log.append(f"❌ {pdf.name} — nessun testo estratto (anche con OCR)")
            continue

        # Table detection keys on tabs and runs of spaces, which
        # normalize_symbols collapses to single spaces. It therefore has to
        # see the raw text first; running it afterwards can never match.
        clean = preserve_tables_markdown(raw)
        clean = normalize_symbols(clean)

        # TXT
        txt_path = OUT / (pdf.stem + ".txt")
        txt_path.write_text(clean, encoding="utf-8")
        converted_txt += 1

        # Clean PDF (text only)
        pdf_out = OUT / (pdf.stem + "_CLEAN.pdf")
        try:
            write_clean_pdf(clean, pdf_out)
        except Exception as e:
            # Do not ship a partially written PDF in the bundle.
            if pdf_out.exists():
                pdf_out.unlink()
            log.append(f"⚠️ {pdf.name} → TXT ok, PDF fallito: {e}")
        else:
            converted_pdf += 1
            log.append(f"✅ {pdf.name} → {txt_path.name} + {pdf_out.name}")

    # ZIP: rebuilt from scratch with the regular files found in OUT.
    zip_path = Path("clean_bundle.zip")
    if zip_path.exists():
        zip_path.unlink()
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for f in sorted(OUT.glob("*")):
            if f.is_file():
                z.write(f, arcname=f.name)

    summary = f"TXT: {converted_txt} | PDF: {converted_pdf} | Failures: {failures}"
    return summary + "\n" + "\n".join(log), str(zip_path)

# --------- UI ---------
# Build the Gradio page: an upload area, a convert button, a log panel and a
# download slot for the generated ZIP.
with gr.Blocks() as demo:
    gr.Markdown("## KB Cleaner → TXT + clean PDF\nCarica PDF → **Converti** → Scarica ZIP (TXT + _CLEAN.pdf).")
    # Multi-file picker limited to .pdf; configured with type="filepath".
    files = gr.File(label="Upload PDF(s)", type="filepath", file_count="multiple", file_types=[".pdf"])
    up_btn = gr.Button("Upload")
    # Receives the status string returned by upload_pdfs.
    up_out = gr.Markdown()
    conv_btn = gr.Button("Converti (OCR + pulizia + tabelle Markdown)")
    # Receives the summary + per-file log returned by convert_all.
    conv_log = gr.Markdown()
    # Receives the ZIP path returned by convert_all.
    zip_out = gr.File(label="Download clean_bundle.zip")

    # Wire the buttons: upload takes the selected files, convert takes no input.
    up_btn.click(upload_pdfs, inputs=[files], outputs=[up_out])
    conv_btn.click(convert_all, outputs=[conv_log, zip_out])

# Start the app as soon as the cell runs.
demo.launch()

37 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)[0m


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hIt looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e8d5b0199473f91463.gradio.live

Th

