In [None]:
!pip -q install pymupdf pdfplumber pillow pytesseract opencv-python numpy unidecode
# Tesseract binário (Linux/Colab)
!apt -q install -y tesseract-ocr >/dev/null


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m123.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[?25h



In [None]:
import os, re, json, io, math, tempfile
from typing import List, Tuple, Dict, Any
from collections import Counter

import fitz                     # PyMuPDF
import pdfplumber
import numpy as np
import cv2
from PIL import Image
import pytesseract
from unidecode import unidecode

# Matches runs of ASCII and Latin-1 letters. The accented range is split around
# U+00D7 (×) and U+00F7 (÷): both sit inside À-ÿ but are math signs, not letters,
# and would otherwise be counted as one-character words.
WORD_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE)

# English stopwords, built by whitespace-splitting a single literal into a set.
# Many entries are contractions that contain an apostrophe.
# NOTE(review): WORD_RE's character class has no apostrophe, so a token produced by
# it can never equal an entry such as "don't"; confirm whether the fragments
# ("don", "ll", "ve", ...) should be listed as well.
STOPWORDS_EN = set('''a about above after again against all am an and any are aren't as at be because been before being below between both
but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have
haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its
itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd
she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're
they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which
while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves'''.split())

# Portuguese stopwords, built the same way. Entries are stored with their accents
# (e.g. "não", "também"); a few words appear more than once in the literal, which
# the set collapses.
STOPWORDS_PT = set('''a ao aos à às acima ainda algo alguma algumas algum alguns ambas ambos ante após aquele aquela aquelas aqueles aquilo as assim
até abaixo bem boa bom bons boas cada com como contra cujo cuja cujas cujos da das de dela delas dele deles demais depois desde desta deste disto do dos
e ela elas ele eles em enquanto entre era eram será foram foi grande grandes há isso isto já la lá lhes lhe mais mas me mesma mesmas mesmo mesmos meu
meus minha minhas muito muita muitas muitos na nas não nem nos nós o os ou onde outra outras outro outros para pela pelas pelo pelos pequena pequenas pequeno
pequenos perante pode podem por porque porém pouca poucas pouco poucos portanto próprio própria próprias próprios qual quais quando que quem se sem será seu seus
sua suas sob sobre sua suas também tão tal tais te tem têm tendo tenho temos tive tivemos tiveram tua tuas teu teus tua teu um uma umas uns vendo vos vocês'''.split())

def is_pdf(path: str) -> bool:
    """Return True when `path` ends with ".pdf", ignoring letter case.

    Only the file name is inspected; the file itself is never opened.
    """
    lowered = path.lower()
    return lowered.endswith(".pdf")

def tokenize(text: str) -> List[str]:
    """Split `text` into lowercase, accent-stripped content words.

    Words are found with WORD_RE, lowercased and then transliterated with
    `unidecode` so that accented and unaccented spellings are counted together.

    A word is dropped when it is a stopword (English or Portuguese) or when its
    normalized form has a single character. The stopword test is applied to both
    the accented and the accent-stripped form: the Portuguese list stores words
    with their accents ("não", "também"), so testing only the stripped form
    would let those through.

    Returns:
        The surviving tokens, accent-stripped, in order of appearance.
    """
    tokens = []
    for raw in WORD_RE.findall(text):
        lowered = raw.lower()
        # Light normalization so accent variants share one counter entry.
        folded = unidecode(lowered)
        if len(folded) <= 1:
            continue
        if lowered in STOPWORDS_EN or lowered in STOPWORDS_PT:
            continue
        if folded in STOPWORDS_EN or folded in STOPWORDS_PT:
            continue
        tokens.append(folded)
    return tokens

def most_frequent_words(text: str, top_k: int = 30) -> List[Tuple[str,int]]:
    """Return the `top_k` most common tokens of `text` as (word, count) pairs.

    Tokens come from `tokenize`; pairs are ordered from most to least frequent.
    """
    frequencies = Counter()
    frequencies.update(tokenize(text))
    return frequencies.most_common(top_k)

def paragraphs_from_text(text: str) -> List[str]:
    """Split `text` into stripped, non-empty paragraphs.

    Blank lines are the primary separator. When that yields at most one
    paragraph, the text is split instead wherever sentence-ending punctuation
    (. ! ?) is followed by two or more whitespace characters.
    """
    def _non_empty_pieces(pattern: str) -> List[str]:
        pieces = []
        for chunk in re.split(pattern, text):
            chunk = chunk.strip()
            if chunk:
                pieces.append(chunk)
        return pieces

    by_blank_line = _non_empty_pieces(r"\n\s*\n+")
    if len(by_blank_line) > 1:
        return by_blank_line
    # Fallback: wide gap after the end of a sentence.
    return _non_empty_pieces(r"(?<=[.!?])\s{2,}")

def looks_like_scientific_article(text: str) -> bool:
    """Heuristic check for scientific-article cues in Portuguese or English text.

    Texts that are empty or shorter than 500 characters are rejected outright.
    Otherwise each cue pattern (section headings, DOI, ISSN, university names)
    is searched case-insensitively, and the text qualifies when at least three
    distinct cues are present.

    Returns:
        True when three or more cue patterns match, False otherwise.
    """
    if not text or len(text) < 500:
        return False
    cues = [
        # [cç] so the properly accented "introdução" matches, not only "introducao".
        r"\babstract\b", r"\bresumo\b", r"\bintrodu[cç][aã]o\b", r"\bintroduction\b",
        r"\bmetodolog[ií]a\b", r"\bmethods?\b", r"\bmaterials\b", r"\bresults?\b", r"\bresultados\b",
        r"\bdiscuss[aã]o\b", r"\bdiscussion\b", r"\bconclus[aã]o\b", r"\bconclusion\b",
        r"\brefer[eê]ncias\b", r"\breferences\b", r"\bkeywords?\b", r"\bpalavras[-\s]?chave\b",
        r"\bdoi:\s*10\.\d{4,9}/\S+", r"\bissn\b", r"\buniversidade\b", r"\buniversity\b"
    ]
    # Each pattern counts once, however many times it occurs in the text.
    hits = sum(1 for pat in cues if re.search(pat, text, flags=re.IGNORECASE))
    return hits >= 3


In [None]:
def extract_pdf_text_and_paragraphs(path: str, lang: str = "por+eng") -> Tuple[str, List[str]]:
    """Extract the text of a PDF and split it into paragraphs.

    1) Reads the embedded text block by block with PyMuPDF (layout-aware),
       ordering the blocks of each page top-to-bottom, then left-to-right.
    2) When fewer than 300 characters were found (likely a scanned document),
       renders every page at 200 dpi and runs Tesseract on it. The OCR result
       is used only when it is longer than the embedded text, so a short but
       genuine text layer is never thrown away for an empty OCR pass.

    Args:
        path: Location of the PDF file.
        lang: Tesseract language specification used for the OCR fallback.

    Returns:
        (full_text, paragraphs): blocks/pages joined by blank lines, and that
        text split by `paragraphs_from_text`.
    """
    text_blocks: List[str] = []

    with fitz.open(path) as doc:
        for page in doc:
            page_texts = []
            try:
                blocks = page.get_text("blocks")  # [(x0,y0,x1,y1,text,block_no,...)]
                blocks = sorted(blocks, key=lambda b: (round(b[1], 1), round(b[0], 1)))
                for block in blocks:
                    # Per the PyMuPDF docs the 7th item is the block type (0 = text);
                    # anything else carries image metadata rather than page text.
                    if len(block) > 6 and block[6] != 0:
                        continue
                    block_text = (block[4] or "").strip()
                    if block_text:
                        page_texts.append(block_text)
            except Exception:
                # Best effort: a page that cannot be parsed contributes no text.
                page_texts = []
            text_blocks.extend(page_texts)

        joined_text = "\n\n".join(text_blocks)

        # Little or no embedded text (probably a scan): OCR every page.
        if len(joined_text) < 300:
            ocr_texts = []
            for page in doc:
                pix = page.get_pixmap(dpi=200)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_texts.append(pytesseract.image_to_string(img, lang=lang).strip())
            ocr_text = "\n\n".join(t for t in ocr_texts if t)
            if len(ocr_text) > len(joined_text):
                joined_text = ocr_text

    paragraphs = paragraphs_from_text(joined_text)
    return joined_text, paragraphs


In [None]:
def extract_image_text_and_paragraphs(path: str, lang: str = "por+eng") -> Tuple[str, List[str]]:
    """OCR an image file and return its text plus paragraph-like segments.

    The image is binarized and morphologically closed so that neighbouring text
    lines merge into blobs. Each blob's bounding box is OCR'd separately and
    treated as one paragraph, in top-to-bottom, left-to-right order. If no box
    yields text, the whole image is OCR'd and split with `paragraphs_from_text`.

    Args:
        path: Location of the image file.
        lang: Tesseract language specification passed to every OCR call.

    Returns:
        (full_text, paragraphs).
    """
    img = cv2.imread(path)
    if img is None:
        # Fallback: OpenCV could not decode the file, so try PIL. The context
        # manager closes the file handle once the pixels have been converted.
        with Image.open(path) as handle:
            pil = handle.convert("RGB")
        text = pytesseract.image_to_string(pil, lang=lang)
        return text, paragraphs_from_text(text)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Inverted adaptive threshold: dark ink becomes white foreground.
    thr = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                cv2.THRESH_BINARY_INV, 31, 15)

    # Wide, short kernel that fuses text lines into paragraph-like blocks.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    closed = cv2.morphologyEx(thr, cv2.MORPH_CLOSE, kernel, iterations=2)

    contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    h, w = closed.shape
    boxes = []
    for cnt in contours:
        x, y, bw, bh = cv2.boundingRect(cnt)
        area = bw * bh
        if area < 500 or bw < 40 or bh < 20:     # noise
            continue
        if bw > 0.98*w and bh > 0.98*h:          # almost the whole page
            continue
        boxes.append((x, y, bw, bh))

    boxes.sort(key=lambda b: (b[1], b[0]))  # top-to-bottom, left-to-right

    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    paragraphs = []
    for (x, y, bw, bh) in boxes:
        crop = pil_img.crop((x, y, x+bw, y+bh))
        txt = pytesseract.image_to_string(crop, lang=lang).strip()
        if txt:
            paragraphs.append(txt)

    full_text = "\n\n".join(paragraphs)
    if not full_text:
        # Fallback: OCR the whole image at once.
        full_text = pytesseract.image_to_string(pil_img, lang=lang)
        paragraphs = paragraphs_from_text(full_text)

    return full_text, paragraphs


In [None]:
def analyze_file(path: str) -> Dict[str, Any]:
    """Extract the text of a PDF or image and build its analysis report.

    The extractor is chosen from the file extension (`is_pdf`). The report holds
    the absolute input path, word and paragraph counts, the 50 most frequent
    words, the scientific-article heuristic, the extracted paragraphs, and
    whether the document satisfies the rule "more than 2000 words and more than
    4 paragraphs".
    """
    pdf_input = is_pdf(path)
    extractor = extract_pdf_text_and_paragraphs if pdf_input else extract_image_text_and_paragraphs
    text, paras = extractor(path)

    # The extractor produced text but no paragraph list: derive one from the text.
    if not paras and text:
        paras = paragraphs_from_text(text)

    word_count = len(WORD_RE.findall(text))
    paragraph_count = len(paras)
    compliant = word_count > 2000 and paragraph_count > 4

    return {
        "input_path": os.path.abspath(path),
        "is_pdf": pdf_input,
        "is_scientific_article": looks_like_scientific_article(text),
        "paragraph_count": paragraph_count,
        "word_count": word_count,
        "top_words": most_frequent_words(text, top_k=50),   # list of (word, count) pairs
        "paragraphs": paras,                                # extracted paragraphs
        "compliant_to_rule": compliant,
        "rule": ">2000 palavras e >4 parágrafos"
    }

def save_report(report: Dict[str, Any], out_dir: str = "/content") -> str:
    """Write `report` as pretty-printed UTF-8 JSON and return the file path.

    The file is named "<input file name without extension>_report.json".

    Args:
        report: Report dictionary; its "input_path" entry determines the file name.
        out_dir: Directory that receives the file. Defaults to Colab's
            "/content"; it is created when it does not exist yet.

    Returns:
        Path of the JSON file that was written.
    """
    base = os.path.splitext(os.path.basename(report["input_path"]))[0]
    os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, f"{base}_report.json")
    with open(out, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(report, f, ensure_ascii=False, indent=2)
    return out

def print_machine_summary(report: Dict[str, Any]):
    """Print a plain, objective summary of `report` (no LLM) for quick inspection.

    Shows the headline fields one per line, followed by the first 20 entries of
    the report's "top_words" list.
    """
    print("# Resumo técnico (sem LLM)")
    headline_fields = (
        ("- Arquivo: ", "input_path"),
        ("- PDF? ", "is_pdf"),
        ("- Parece artigo científico? ", "is_scientific_article"),
        ("- Parágrafos: ", "paragraph_count"),
        ("- Palavras: ", "word_count"),
        ("- Regra (>2000 palavras e >4 parágrafos): ", "compliant_to_rule"),
    )
    for prefix, key in headline_fields:
        print(f"{prefix}{report[key]}")
    print("\nTop palavras (base PT/EN stopwords removidas):")
    for word, count in report["top_words"][:20]:
        # Words are left-aligned in a 20-character column.
        print(f"  {word:20s} {count}")


In [None]:
from google.colab import files

print("Envie 1 ou mais arquivos (PDF/PNG/JPG):")
uploaded = files.upload()  # opens the file picker; keys are the saved file names

# `reports` keeps every successful report in memory for inspection in later cells.
reports = []
for name in uploaded:
    # Uploaded files are saved under their bare name, i.e. relative to the kernel's
    # working directory, so resolve against it instead of assuming /content.
    path = os.path.abspath(name)
    print(f"\n=== Processando: {path} ===")
    try:
        rep = analyze_file(path)
        out = save_report(rep)
    except Exception as exc:
        # One unreadable file should not abort the rest of the batch; report it and move on.
        print(f"Falha ao processar {path}: {type(exc).__name__}: {exc}")
        continue
    print_machine_summary(rep)
    print(f"\n➡ Relatório salvo em: {out}")
    reports.append(rep)


Envie 1 ou mais arquivos (PDF/PNG/JPG):


Saving images.jpg to images.jpg

=== Processando: /content/images.jpg ===
# Resumo técnico (sem LLM)
- Arquivo: /content/images.jpg
- PDF? False
- Parece artigo científico? False
- Parágrafos: 1
- Palavras: 5
- Regra (>2000 palavras e >4 parágrafos): False

Top palavras (base PT/EN stopwords removidas):
  oe                   1
  rh                   1
  ee                   1
  ro                   1

➡ Relatório salvo em: /content/images_report.json
