In [None]:
# ================================================================
# EduVision — PDF Lecture Summarizer (URL or local .pdf)
# Fixed to avoid torch>=2.6 requirement by preferring safetensors models
# Saves transcript + short/medium/long summaries + bullet notes + key phrases
# Output dir: C:\Users\sagni\Downloads\Edu Vision\outputs
# ================================================================
import os, re, json
from pathlib import Path
from typing import List
import requests

# ----------------- USER SETTINGS -----------------
PDF_SOURCE = r"C:\Users\sagni\Downloads\Edu Vision\R20CSE2202-OPERATING-SYSTEMS.pdf"  # URL or local path
OUT_DIR = Path(r"C:\Users\sagni\Downloads\Edu Vision\outputs")

# Model preference order (all have safetensors so we can avoid torch.load)
# You can reorder if you want higher quality (bart-base > t5-small), at the cost of size.
# Each entry is (model name, max input tokens); the tokenizer truncates longer inputs
# to that many tokens before generation.
MODEL_CANDIDATES = [
    # compact & reliable
    ("t5-small", 1280),              # good speed; smaller context
    # BART uses learned absolute position embeddings (1024 positions), so a larger
    # limit would make the embedding lookup fail on long chunks.
    ("facebook/bart-base", 1024),    # a bit larger; good quality
    # add more if you want (verify each model's position limit before enabling):
    # ("google/pegasus-xsum", 1024),
    # ("google/pegasus-cnn_dailymail", 1024),
]

USE_GPU = False                       # set True if you have a GPU configured
DEVICE = "cuda" if USE_GPU else "cpu"
SHORT_MAX_WORDS  = 120                # target length of the short summary (words)
MEDIUM_MAX_WORDS = 250                # target length of the medium summary (words)
LONG_MAX_WORDS   = 500                # target length of the long summary (words)
# NOTE(review): chunks are sized in words while the model limit is in tokens; a
# 1200-word chunk likely exceeds the token limits above and is then truncated —
# consider lowering CHUNK_WORDS if chunk tails matter.
CHUNK_WORDS      = 1200               # words per chunk fed to the summarizer
CHUNK_OVERLAP    = 150                # words shared between consecutive chunks
ENABLE_OCR_FALLBACK = True            # try OCR when little/no text can be extracted
OCR_DPI = 250                         # render resolution used for OCR page images

# Silence Windows symlink warning from HF hub caches
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# ----------------- Imports -----------------
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from PyPDF2 import PdfReader

# Optional OCR deps
# pdf2image is only needed by the OCR fallback. Record whether it imported so
# ocr_pdf_to_text() can report that OCR is unavailable instead of failing.
try:
    from pdf2image import convert_from_path
    have_pdf2image = True
except Exception:
    have_pdf2image = False

# ----------------- Paths & helpers -----------------
def ensure_outdir(p: Path):
    """Create directory *p* (and any missing parents); no error if it already exists."""
    p.mkdir(exist_ok=True, parents=True)
def is_url(s: str) -> bool:
    """Return True when *s* starts with ``http://`` or ``https://`` (case-insensitive)."""
    lowered = s.lower()
    return lowered.startswith("http://") or lowered.startswith("https://")

def extract_text_pypdf2(pdf_path: Path) -> str:
    """Return the embedded text of every page of *pdf_path*, joined by newlines.

    Pages for which PyPDF2 yields no text contribute an empty string. Any
    exception raised while reading is reported as a warning and results in
    an empty string being returned.
    """
    try:
        document = PdfReader(str(pdf_path))
        return "\n".join((pg.extract_text() or "") for pg in document.pages).strip()
    except Exception as err:
        print(f"[WARN] PyPDF2 failed: {err}")
    return ""

def ocr_pdf_to_text(pdf_path: Path, dpi: int = 250) -> str:
    """OCR every page of *pdf_path* and return the recognized text.

    Pages are rendered to images with pdf2image at *dpi* and passed to
    pytesseract. Returns an empty string (after printing a warning) when an
    OCR dependency is missing or the PDF cannot be rendered. A page whose
    OCR fails is skipped with a warning.
    """
    if not have_pdf2image:
        print("[WARN] pdf2image not installed. OCR fallback not available.")
        return ""

    try:
        from PIL import Image  # noqa: F401 -- imported only to confirm Pillow is installed
        import pytesseract
    except Exception:
        print("[WARN] Pillow/pytesseract not available for OCR.")
        return ""

    try:
        page_images = convert_from_path(str(pdf_path), dpi=dpi)
    except Exception as err:
        print(f"[WARN] pdf2image convert failed: {err}")
        return ""

    recognized = []
    page_no = 0
    for page_image in page_images:
        page_no += 1  # 1-based page number used in the warning below
        try:
            page_text = pytesseract.image_to_string(page_image)
        except Exception as err:
            print(f"[WARN] OCR page {page_no} failed: {err}")
        else:
            recognized.append(page_text)
    return "\n".join(recognized).strip()

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract text from *pdf_path*, falling back to OCR when little text is found.

    The PyPDF2 result is returned directly when it has more than 50 words.
    Otherwise, if ENABLE_OCR_FALLBACK is set, OCR is attempted and its
    result is returned when it has more than 20 words. In every other case
    the (possibly empty) PyPDF2 text is returned.
    """
    print("[INFO] Extracting text with PyPDF2…")
    vector_text = extract_text_pypdf2(pdf_path)
    if len(vector_text.split()) > 50:
        return vector_text

    if not ENABLE_OCR_FALLBACK:
        return vector_text

    print("[INFO] Vector text low/empty; trying OCR fallback…")
    scanned_text = ocr_pdf_to_text(pdf_path, dpi=OCR_DPI)
    enough_ocr = bool(scanned_text) and len(scanned_text.split()) > 20
    return scanned_text if enough_ocr else vector_text

def chunk_text_words(text: str, chunk_words=CHUNK_WORDS, overlap=CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Input text; words are maximal runs of non-whitespace characters.
        chunk_words: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks. Values outside
            ``0 .. chunk_words - 1`` are clamped into that range.

    Returns:
        The chunks in document order, each with its words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If *chunk_words* is not positive.
    """
    if chunk_words <= 0:
        raise ValueError("chunk_words must be a positive integer")

    # Clamp the overlap so every window advances by at least one word.
    overlap = max(0, min(overlap, chunk_words - 1))
    step = chunk_words - overlap

    words = re.findall(r"\S+", text)
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_words
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word. Stopping here is what
            # guarantees termination: stepping back by `overlap` again would
            # re-emit the same tail chunk forever.
            break
    return chunks

def words_count(s: str) -> int:
    """Return the number of whitespace-separated words in *s*."""
    return sum(1 for _ in re.finditer(r"\S+", s))

# ----------------- Summarizer loader (safetensors-first) -----------------
def load_summarizer_safetensors(candidates, device="cpu"):
    """Load the first candidate model that can be loaded from safetensors weights.

    Args:
        candidates: Iterable of ``(model_name, max_input_tokens)`` pairs,
            tried in order.
        device: When ``"cuda"`` and CUDA is available, the model is moved to
            the GPU; otherwise it stays where it was loaded.

    Returns:
        ``(tokenizer, model, max_input_tokens)`` for the first candidate that
        loads successfully.

    Raises:
        RuntimeError: If no candidate loads; the message carries the last error.
    """
    failure = None
    for name, token_limit in candidates:
        try:
            print(f"[INFO] Loading summarizer: {name} (safetensors)")
            tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
            load_options = {
                "use_safetensors": True,      # key to avoid torch.load on .bin
                "torch_dtype": torch.float32,
                "low_cpu_mem_usage": True,
            }
            summarizer = AutoModelForSeq2SeqLM.from_pretrained(name, **load_options)
            wants_gpu = device == "cuda"
            if wants_gpu and torch.cuda.is_available():
                summarizer = summarizer.to("cuda")
        except Exception as err:
            # Remember the failure and fall through to the next candidate.
            print(f"[WARN] Could not load {name} with safetensors: {err}")
            failure = err
            continue
        return tokenizer, summarizer, token_limit
    raise RuntimeError(f"Failed to load any summarizer. Last error: {failure}")

def summarize_chunk(tok, model, text: str, target_words=150, max_input_tokens=1024):
    """Summarize a single piece of text with a seq2seq model.

    Args:
        tok: Tokenizer; called on a one-element list of text and used to
            decode the generated ids.
        model: Seq2seq model exposing ``parameters()`` and ``generate()``.
        text: Text to summarize.
        target_words: Approximate desired summary length in words.
        max_input_tokens: Inputs longer than this many tokens are truncated.

    Returns:
        The decoded summary with surrounding whitespace stripped, or ``""``
        when *text* is empty or whitespace-only (the model is not invoked).
    """
    if not text or not text.strip():
        return ""

    # T5-style checkpoints are trained with a task prefix (e.g. "summarize: ")
    # which they publish in their config; other models declare none.
    config = getattr(model, "config", None)
    task_params = getattr(config, "task_specific_params", None) or {}
    prefix = (task_params.get("summarization") or {}).get("prefix") or ""

    # heuristic tokens target: a little more than one token per word, min 64
    max_new_tokens = max(64, int(target_words * 1.3))
    inputs = tok(
        [prefix + text],
        truncation=True, padding=True, return_tensors="pt",
        max_length=max_input_tokens
    )
    # Follow the model to whatever device it lives on (CPU or any GPU index).
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=4, length_penalty=2.0, early_stopping=True
        )
    return tok.decode(out[0], skip_special_tokens=True).strip()

def map_reduce_summarize(full_text: str, tok, model, target_words=200, max_input_tokens=1024):
    """Summarize arbitrarily long text by repeated chunk-and-summarize passes.

    The text is split with ``chunk_text_words``. While it spans more than one
    chunk, each chunk is summarized and the partial summaries are joined into
    a shorter text, which is chunked again. Once the text fits in a single
    chunk it is summarized once to *target_words*.

    Reducing in several passes avoids feeding one very long string of
    partial summaries to the model, where tokenizer truncation would
    silently drop everything past ``max_input_tokens``. Text that already
    fits in one chunk is summarized directly, not summarized twice.

    Args:
        full_text: Text to summarize.
        tok: Tokenizer passed through to ``summarize_chunk``.
        model: Model passed through to ``summarize_chunk``.
        target_words: Approximate length of the final summary in words.
        max_input_tokens: Token limit passed through to ``summarize_chunk``.

    Returns:
        The final summary string.
    """
    # Intermediate summaries are kept shorter than the final one.
    partial_words = max(80, target_words // 2)

    current = full_text
    while True:
        chunks = chunk_text_words(current, CHUNK_WORDS, CHUNK_OVERLAP)
        if len(chunks) <= 1:
            break
        partials = [
            summarize_chunk(tok, model, ck, partial_words, max_input_tokens)
            for ck in chunks
        ]
        reduced = " ".join(p for p in partials if p)
        # Safety valve: stop reducing if a pass failed to shrink the text,
        # otherwise the loop could never terminate.
        made_progress = words_count(reduced) < words_count(current)
        current = reduced
        if not made_progress:
            break

    return summarize_chunk(tok, model, current, target_words, max_input_tokens)

def bulletize(text: str, max_bullets=14) -> List[str]:
    """Pick up to *max_bullets* sentences from *text* to use as bullet notes.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    Each sentence is scored by how close its word count is to 18, with a
    +3 bonus when it contains one of the cue words (substring match,
    case-insensitive). The highest-scoring sentences are returned, best
    first; ties keep their order of appearance.
    """
    cue_words = {"key","main","important","note","definition","example","conclusion","therefore","because","causes","result","summary"}

    candidates = []
    for raw in re.split(r'(?<=[.!?])\s+', text):
        sentence = raw.strip()
        if sentence:
            lowered = sentence.lower()
            bonus = 3 if any(cue in lowered for cue in cue_words) else 0
            length_penalty = abs(len(sentence.split()) - 18)
            candidates.append((bonus - length_penalty, sentence))

    # sorted() is stable, so equal scores stay in document order.
    ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
    return [sentence for _, sentence in ranked[:max_bullets]]

def top_key_phrases(text: str, top_k=25) -> List[str]:
    """Return frequent 1-, 2- and 3-word phrases from *text*, best first.

    Words are lower-cased and stripped to ``a-z``, ``0-9`` and ``-``. Stop
    words, pure numbers and words of two characters or fewer are dropped.
    Each n-gram scores ``count * n``. Phrases are taken in descending score
    order, skipping any phrase that is a substring of one already selected,
    until *top_k* phrases have been collected.
    """
    from collections import Counter

    stopwords = frozenset("""
        a an the and or if in on with by for to of from that this these those as is are was were be been being have has had do does did not no yes it its it's
        at into over under between within without through about across up down out off your you we they he she them his her our their than then there here
    """.split())

    normalized = (
        re.sub(r"[^a-z0-9\-]", "", raw.lower())
        for raw in re.findall(r"\b[\w\-']+\b", text)
    )
    tokens = [
        tok for tok in normalized
        if tok and tok not in stopwords and not tok.isdigit() and len(tok) > 2
    ]

    # Weight every n-gram by its length so longer repeated phrases rank higher.
    weighted = {}
    for n in (1, 2, 3):
        grams = Counter(" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
        for phrase, count in grams.items():
            weighted[phrase] = weighted.get(phrase, 0) + count * n

    # Stable sort: equal scores keep insertion order (unigrams, bigrams, trigrams).
    ranked = sorted(weighted, key=weighted.get, reverse=True)

    selected = []
    for phrase in ranked:
        if any(phrase in kept for kept in selected):
            continue
        selected.append(phrase)
        if len(selected) >= top_k:
            break
    return selected

# ----------------- Main -----------------
def main(pdf_source: str):
    """Run the full pipeline: fetch/locate the PDF, extract text, summarize, save.

    Args:
        pdf_source: An http(s) URL or a local path to a PDF; surrounding
            whitespace and double quotes are ignored.

    Writes into OUT_DIR: the extracted text, short/medium/long summaries,
    bullet notes (Markdown) and a JSON metadata file. A downloaded PDF is
    also stored there.

    Raises:
        SystemExit: If the source is empty, the local file does not exist,
            or no text could be extracted.
    """
    ensure_outdir(OUT_DIR)
    src = pdf_source.strip().strip('"')
    if not src:
        raise SystemExit("Set PDF_SOURCE to a PDF URL or local PDF path.")

    # 1) Resolve PDF path
    if is_url(src):
        print("[INFO] Downloading PDF…")
        response = requests.get(src, timeout=60)
        response.raise_for_status()
        pdf_path = OUT_DIR / "downloaded_lecture.pdf"
        pdf_path.write_bytes(response.content)
        basename = "online_lecture"
    else:
        pdf_path = Path(src)
        if not pdf_path.exists():
            raise SystemExit(f"PDF not found: {pdf_path}")
        basename = pdf_path.stem

    def artifact(suffix: str) -> Path:
        # Every output file shares the "<basename>_" prefix inside OUT_DIR.
        return OUT_DIR / f"{basename}_{suffix}"

    # 2) Extract text
    text = extract_text_from_pdf(pdf_path).strip()
    if not text:
        raise SystemExit("Could not extract text from PDF (even with OCR fallback).")

    # Light cleanup: drop trailing blanks before newlines, collapse blank-line runs
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()

    transcript_path = artifact("text.txt")
    transcript_path.write_text(text, encoding="utf-8")
    print(f"[SAVE] Extracted text -> {transcript_path} ({words_count(text)} words)")

    # 3) Load summarizer (safetensors-first)
    tok, model, max_input_len = load_summarizer_safetensors(MODEL_CANDIDATES, DEVICE)

    # 4) Summaries, produced in short -> medium -> long order
    word_targets = (
        ("short", SHORT_MAX_WORDS),
        ("medium", MEDIUM_MAX_WORDS),
        ("long", LONG_MAX_WORDS),
    )
    summaries = {}
    for level, limit in word_targets:
        print(f"[INFO] Summarizing ({level})…")
        summaries[level] = map_reduce_summarize(
            text, tok, model, target_words=limit, max_input_tokens=max_input_len
        )

    # 5) Notes & key phrases
    bullets = bulletize(summaries["long"], max_bullets=14)
    keyphrases = top_key_phrases(text, top_k=25)

    # 6) Save outputs
    summary_paths = {level: artifact(f"summary_{level}.txt") for level, _ in word_targets}
    for level, path in summary_paths.items():
        path.write_text(summaries[level], encoding="utf-8")

    notes_path = artifact("notes.md")
    notes_body = "# Bullet Notes\n\n" + "\n".join(f"- {b}" for b in bullets)
    notes_path.write_text(notes_body, encoding="utf-8")

    meta = {
        "input": src,
        "device": DEVICE,
        "model_loaded": getattr(model.config, "name_or_path", str(MODEL_CANDIDATES[0][0])),
        "max_input_tokens": max_input_len,
        "words_text": words_count(text),
        "chunks_config": {"chunk_words": CHUNK_WORDS, "overlap": CHUNK_OVERLAP},
        "lengths": {
            "short_words":  SHORT_MAX_WORDS,
            "medium_words": MEDIUM_MAX_WORDS,
            "long_words":   LONG_MAX_WORDS
        },
        "key_phrases": keyphrases[:25],
    }
    meta_path = artifact("meta.json")
    meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")

    print("\n[DONE] Saved:")
    saved = [transcript_path, *summary_paths.values(), notes_path, meta_path]
    for path in saved:
        print(" -", path)

# ----------------- ENTRY -----------------
if __name__ == "__main__":
    # Run the pipeline only when a source is configured; otherwise tell the user.
    if not PDF_SOURCE:
        print("Please set PDF_SOURCE near the top of this script.")
    else:
        main(PDF_SOURCE)


[INFO] Extracting text with PyPDF2…
[SAVE] Extracted text -> C:\Users\sagni\Downloads\Edu Vision\outputs\R20CSE2202-OPERATING-SYSTEMS_text.txt (31347 words)
[INFO] Loading summarizer: t5-small (safetensors)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[INFO] Summarizing (short)…
