In [1]:
pip install transformers sentence-transformers pdfplumber gradio


Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata 

In [8]:
from transformers import pipeline, AutoTokenizer
import pdfplumber
import gradio as gr

# Extract text from uploaded PDF
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF, labelled with page markers.

    Args:
        file: A filesystem path (``str`` or ``os.PathLike``), or an object
            that exposes the path through a ``.name`` attribute (such as an
            uploaded-file wrapper). ``None`` means nothing was uploaded.

    Returns:
        str: The page texts concatenated in page order, each preceded by a
        ``--- Page N ---`` header surrounded by blank lines. Pages for which
        ``extract_text()`` returns nothing are skipped. An empty string is
        returned when ``file`` is ``None`` or no page yields text.
    """
    import os

    # No upload: report "no text" instead of failing on ``None.name``.
    if file is None:
        return ""

    # Plain paths are used as-is; ``Path.name`` would only be the basename,
    # so ``.name`` is consulted solely for non-path objects.
    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name

    page_sections = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                page_sections.append(f"\n\n--- Page {page_num} ---\n\n{page_text}")
    return "".join(page_sections)

# Tokenizers that have already been loaded, keyed by model name, so that
# repeated calls do not reload them.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the tokenizer for ``model_name``, loading it only on first use."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(model_name)
    return _TOKENIZER_CACHE[model_name]


# Split text into manageable token chunks
def split_into_chunks_hf(text, max_tokens=512, model_name="facebook/bart-large-cnn"):
    """Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenized with the tokenizer of ``model_name``, the token list
    is sliced into windows of ``max_tokens`` tokens, and each window is
    converted back into a string.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be >= 1.
        model_name: Name of the pretrained tokenizer to load.

    Returns:
        list[str]: The chunk strings in document order. Empty list when
        ``text`` is empty or contains only whitespace.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    # Nothing to tokenize: skip loading the tokenizer altogether.
    if not text or not text.strip():
        return []

    tokenizer = _get_tokenizer(model_name)
    tokens = tokenizer.tokenize(text)
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

# Build the summarization pipeline once at module level so it is created a
# single time rather than per request.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

# Summarize chunks
def generate_summary_with_prompt(chunk_text, max_length=150, min_length=50):
    """Summarize one chunk of text with the module-level ``summarizer``.

    Args:
        chunk_text: The text to summarize.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.

    Returns:
        str: The ``summary_text`` of the first pipeline result, or an empty
        string when ``chunk_text`` is empty or whitespace-only.
    """
    if not chunk_text or not chunk_text.strip():
        return ""

    # Short chunks (typically the last one of a document) must not be forced
    # to produce a summary longer than the input, so both bounds are clamped
    # to the tokenized input length. Full-size chunks keep the requested bounds.
    input_len = len(summarizer.tokenizer.encode(chunk_text, truncation=True))
    max_len = max(2, min(max_length, input_len))
    min_len = max(1, min(min_length, max_len // 2))

    result = summarizer(
        chunk_text,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        # Safety net: re-tokenizing a chunk string may yield more tokens than
        # the chunk was cut to; truncate rather than overflow the model input.
        truncation=True,
    )
    return result[0]['summary_text']

# Format into bullet points
def format_summary(text):
    """Turn a summary paragraph into one bullet point per sentence.

    The text is split on ``'. '``. Each non-empty fragment becomes a line
    starting with ``'• '``. A closing period is added only when the fragment
    does not already end with ``.``, ``!`` or ``?``, so the final sentence
    is not given a doubled period.

    Args:
        text: The summary text to format.

    Returns:
        str: The bullet lines joined with newlines; empty string when
        ``text`` contains no sentence content.
    """
    bullets = []
    for fragment in text.strip().split('. '):
        sentence = fragment.strip()
        # Skip empty fragments and fragments that are nothing but punctuation.
        if not sentence.strip('.!?'):
            continue
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        bullets.append(f"• {sentence}")
    return '\n'.join(bullets)

# Main logic
def summarize_pdf(pdf_file):
    """Run the full pipeline: extract text, chunk it, summarize, and format.

    Args:
        pdf_file: The uploaded PDF as passed to ``extract_text_from_pdf``,
            or ``None`` when nothing has been uploaded.

    Returns:
        tuple[str, int, str]: ``(summary, chunk_count, extracted_text)``.
        ``summary`` holds the bullet-point summaries of all chunks separated
        by blank lines. When there is no file, or no text could be extracted,
        ``summary`` is an explanatory message and ``chunk_count`` is 0.
    """
    # No upload: answer with a hint instead of raising inside the callback.
    if pdf_file is None:
        return "Please upload a PDF file before generating a summary.", 0, ""

    extracted_text = extract_text_from_pdf(pdf_file)

    # Without extractable text there is nothing to summarize; say so
    # explicitly rather than returning a blank summary.
    if not extracted_text.strip():
        return "No extractable text was found in this PDF.", 0, extracted_text

    chunks = split_into_chunks_hf(extracted_text)
    summaries = [format_summary(generate_summary_with_prompt(chunk)) for chunk in chunks]
    final_summary = "\n\n".join(summaries)
    return final_summary, len(chunks), extracted_text

# Interface UI
# Layout: a Markdown header, then one row with two columns. The narrow left
# column holds the inputs/controls; the wider right column holds the outputs.
with gr.Blocks(theme=gr.themes.Soft(), css=".gr-box {margin-bottom: 0 !important;}") as interface:
    # Header text shown at the top of the page.
    gr.Markdown("""
    # 📚 Beginner-Friendly PDF Summarizer

    Upload your PDF and receive a simplified, bullet-point summary. Perfect for students, readers, and professionals!

    🔹 **Model Used:** [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn)  
    📄 **Supports multi-page PDFs**  
    ✂️ **Text is automatically chunked for optimal summarization**  
    📌 **Summaries are beginner-friendly and in bullet-point format**
    """)

    with gr.Row():
        # Left column: PDF upload, read-only chunk counter, and the trigger button.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
            chunk_count = gr.Number(label="🔢 Number of Chunks", interactive=False)
            btn = gr.Button("🧠 Generate Summary")

        # Right column: the summary, plus the raw extracted text in an
        # accordion that starts collapsed.
        with gr.Column(scale=2):
            summary_output = gr.Textbox(label="📋 Beginner-Friendly Summary", lines=20)
            with gr.Accordion("📄 View Extracted Raw Text", open=False):
                extracted_textbox = gr.Textbox(label="📝 Raw Text Extracted", lines=15)

    # summarize_pdf returns (summary, chunk count, extracted text); the three
    # values are mapped onto the output components in this order.
    btn.click(fn=summarize_pdf,
              inputs=pdf_input,
              outputs=[summary_output, chunk_count, extracted_textbox])

# share=True requests a public share URL in addition to the local one (the
# recorded cell output shows a *.gradio.live link).
# NOTE(review): confirm that exposing the app publicly is intended.
interface.launch(share=True)


Device set to use cuda:0


* Running on local URL:  http://127.0.0.1:7866
* Running on public URL: https://95a453ae525aa0361a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


