In [5]:
!pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


In [6]:
!pip install newspaper3k



In [1]:
!pip install transformers gradio  PyMuPDF bert-extractive-summarizer


Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-p

In [7]:
import gradio as gr
from transformers import pipeline
from newspaper import Article
import fitz
from summarizer import Summarizer

In [26]:
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file: Raw PDF content as a bytes-like object, passed to
            ``fitz.open`` as an in-memory stream.

    Returns:
        str: The text of all pages joined in page order ("" when no page
        yields any text).

    Raises:
        ValueError: If ``pdf_file`` is ``None`` or empty.
    """
    if not pdf_file:
        raise ValueError("PDF content is empty.")

    # The context manager closes the document even if a page fails to parse,
    # so the underlying buffer is not kept alive between requests.
    with fitz.open(stream=pdf_file, filetype="pdf") as doc:
        # join() avoids rebuilding the string once per page
        return "".join(page.get_text() for page in doc)


In [9]:
def extract_text_from_url(url):
    """Download a web article and return its extracted body text.

    Args:
        url: Address of the article. Leading and trailing whitespace (common
            when a URL is pasted into a text box) is ignored.

    Returns:
        str: The ``text`` attribute of the downloaded and parsed ``Article``.

    Raises:
        ValueError: If ``url`` is ``None``, empty, or only whitespace.
    """
    cleaned_url = (url or "").strip()
    if not cleaned_url:
        # Fail fast with a clear message instead of attempting a download
        raise ValueError("URL is empty.")

    article = Article(cleaned_url)
    article.download()
    article.parse()
    return article.text


In [10]:
# Abstractive summarizer: a Hugging Face "summarization" pipeline, built once
# at module level and shared by every request.
_ABSTRACTIVE_MODEL_ID = "facebook/bart-large-cnn"
abstractive_summarizer = pipeline(task="summarization", model=_ABSTRACTIVE_MODEL_ID)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [18]:
def generate_abstractive_summary(text, max_length = 130, min_length = 30):
    """Summarize ``text`` with the shared abstractive pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper bound on the generated summary length, forwarded to
            the pipeline. Coerced to ``int``.
        min_length: Lower bound on the generated summary length. Coerced to
            ``int`` and capped at ``max_length`` so the two bounds can never
            contradict each other.

    Returns:
        str: The ``summary_text`` of the first pipeline result, or "" when
        ``text`` is empty or only whitespace.
    """
    if not text or not text.strip():
        return ""

    # UI sliders may deliver floats; the length bounds must be integers
    max_length = int(max_length)
    min_length = min(int(min_length), max_length)

    summary = abstractive_summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Let the tokenizer clip over-long inputs to the model's limit
        # instead of failing on them.
        truncation=True,
    )
    return summary[0]['summary_text']


In [19]:
# Shared extractive summarizer, constructed once with the library's default
# arguments and reused by generate_extractive_summary.
extractive_summarizer = Summarizer()

def generate_extractive_summary(text, ratio=0.3):
    """Summarize ``text`` with the shared extractive summarizer.

    Args:
        text: The text to summarize.
        ratio: Forwarded unchanged as the summarizer's ``ratio`` argument.

    Returns:
        The summarizer's result for ``text``, or "" when ``text`` is empty or
        only whitespace (the model is not invoked in that case).
    """
    if not text or not text.strip():
        return ""
    return extractive_summarizer(text, ratio=ratio)

In [27]:
def _truncate_at_word_boundary(text, max_chars):
    """Clip ``text`` to at most ``max_chars`` characters without splitting a word."""
    if len(text) <= max_chars:
        return text
    clipped = text[:max_chars]
    if text[max_chars].isspace():
        # The cut already falls between two words
        return clipped
    head, sep, _ = clipped.rpartition(" ")
    # Without any space there is no word boundary to fall back to
    return head if sep else clipped


def summarize_text(source_type, text, pdf, url, max_length, min_length, ratio, max_chars=2000):
    """Produce an abstractive and an extractive summary for the chosen input.

    Args:
        source_type: One of "Text", "PDF" or "URL"; selects which of the
            following three inputs is used.
        text: Raw text (used when ``source_type`` is "Text").
        pdf: PDF content (used when ``source_type`` is "PDF").
        url: Article address (used when ``source_type`` is "URL").
        max_length: Maximum length passed to the abstractive summarizer.
        min_length: Minimum length passed to the abstractive summarizer.
        ratio: Ratio passed to the extractive summarizer.
        max_chars: Character budget applied to the input before summarizing
            (default 2000, the value previously hard-coded).

    Returns:
        tuple[str, str]: ``(abstractive, extractive)`` on success, or
        ``(message, "")`` when the input is invalid or any step raises.
    """
    input_text = ""

    try:
        # Cheap validation first, so no download or PDF parse is wasted.
        max_length = int(max_length)
        min_length = int(min_length)
        if min_length > max_length:
            return "❗Min length cannot exceed max length.", ""

        if source_type == "Text" and text:
            input_text = text
        elif source_type == "PDF" and pdf is not None:
            input_text = extract_text_from_pdf(pdf)
        elif source_type == "URL" and url:
            input_text = extract_text_from_url(url)
        else:
            return "❗Please provide a valid input.", ""

        # Strip before clipping so surrounding whitespace does not consume
        # part of the character budget.
        input_text = input_text.strip()
        if len(input_text) == 0:
            return "❗Input is empty after extraction.", ""

        # 💡 Trim to prevent token overflow (Bart limit ~1024 tokens)
        input_text = _truncate_at_word_boundary(input_text, max_chars)

        abstractive = generate_abstractive_summary(input_text, max_length, min_length)
        extractive = generate_extractive_summary(input_text, ratio)

        return abstractive, extractive

    except Exception as e:
        # Deliberately broad: this is a UI callback, so failures are reported
        # in the output box rather than raised.
        return f"⚠️ Error: {str(e)}", ""


In [28]:
with gr.Blocks() as demo:
    # Page header
    gr.Markdown("## 🧠 AI Text Summarizer\nChoose input type and get both abstractive and extractive summaries.")

    # Source selector: decides which of the three input widgets is shown
    source_type = gr.Radio(choices=["Text", "PDF", "URL"], label="Select Input Source")

    # All three inputs start hidden; toggle_inputs reveals the matching one
    text_input = gr.Textbox(label="Enter Text", lines=8, visible=False)
    pdf_input = gr.File(label="Upload PDF", type="binary", visible=False)
    url_input = gr.Textbox(label="Enter URL", visible=False)

    # Tuning controls for the two summarizers
    max_length = gr.Slider(minimum=50, maximum=300, value=130, step=10, label="Max Length (Abstractive)")
    min_length = gr.Slider(minimum=20, maximum=100, value=30, step=10, label="Min Length (Abstractive)")
    ratio = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Summary Ratio (Extractive)")

    btn = gr.Button(value="Generate Summaries")

    # Result boxes
    output_ab = gr.Textbox(label="Abstractive Summary")
    output_ex = gr.Textbox(label="Extractive Summary")

    def toggle_inputs(src):
        """Return visibility updates so only the widget for ``src`` is shown."""
        # One update per output component, in the same order as the
        # `outputs` list of the change handler: text, PDF, URL.
        return tuple(
            gr.update(visible=(src == option))
            for option in ("Text", "PDF", "URL")
        )

    source_type.change(
        fn=toggle_inputs,
        inputs=[source_type],
        outputs=[text_input, pdf_input, url_input],
    )

    btn.click(
        fn=summarize_text,
        inputs=[source_type, text_input, pdf_input, url_input, max_length, min_length, ratio],
        outputs=[output_ab, output_ex],
    )

demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8ddd5630a4ce559830.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


