In [None]:
# Install required libraries
%pip install --quiet gradio transformers nltk numpy networkx sumy langchain-google-genai pypdf langchain-community

# Import necessary libraries
import gradio as gr
from transformers import pipeline
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from langchain.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain

# Fetch the NLTK resources used by the tokenizers and the stop-word filter.
# The first two downloads are silenced; the punkt_tab download keeps its log
# output and stays the last expression of the cell.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource, quiet=True)
nltk.download('punkt_tab')

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Frequency function
def frequency_based_summary(text, word_limit):
    """Build an extractive summary from the highest-scoring sentences.

    Every sentence is scored by summing the document-wide frequency of its
    lower-cased tokens; stop words contribute nothing.  Sentences are then
    visited from highest to lowest score and added while they fit in the
    word budget.

    Args:
        text: The document to summarize.
        word_limit: Maximum number of tokens (as counted by
            ``word_tokenize``) the summary may contain.

    Returns:
        The selected sentences joined by single spaces, in score order.
        An empty string when no sentence fits.
    """
    from collections import Counter

    stop_words = set(stopwords.words("english"))
    # Counter tallies all tokens in a single pass; calling list.count() once
    # per token made this step quadratic in the document length.
    freq_table = Counter(
        word for word in word_tokenize(text.lower()) if word not in stop_words
    )

    # Keyed by sentence text, so a sentence repeated verbatim is scored and
    # considered only once.
    sentence_scores = {
        sentence: sum(freq_table.get(word, 0) for word in word_tokenize(sentence.lower()))
        for sentence in sent_tokenize(text)
    }

    summary, word_count = [], 0
    for sentence in sorted(sentence_scores, key=sentence_scores.get, reverse=True):
        sentence_word_count = len(word_tokenize(sentence))

        # Skip sentences that would eat more than half of the budget alone.
        if sentence_word_count > word_limit * 0.5:
            continue
        # Stop at the first sentence that would overflow the budget.
        if word_count + sentence_word_count > word_limit:
            break
        summary.append(sentence)
        word_count += sentence_word_count
    return " ".join(summary)

# LSA function
def lsa_summary(text, word_limit):
    """Summarize ``text`` by ranking sentences on a one-component SVD.

    Sentences are turned into a term-count matrix, projected onto a single
    ``TruncatedSVD`` component, and visited from the largest projection to
    the smallest while they fit in the word budget.

    Args:
        text: The document to summarize.
        word_limit: Maximum number of tokens (as counted by
            ``word_tokenize``) the summary may contain.

    Returns:
        The selected sentences joined by single spaces,
        ``"No content to summarize."`` when no sentence is found, or an
        ``"Error generating summary with LSA: ..."`` message if any step
        raises.
    """
    try:
        sentence_list = sent_tokenize(text)
        if not sentence_list:
            return "No content to summarize."

        term_counts = CountVectorizer().fit_transform(sentence_list)
        projection = TruncatedSVD(n_components=1).fit_transform(term_counts)
        # Sentence indices ordered from highest to lowest projection.
        order = np.argsort(projection.flatten())[::-1]

        chosen = []
        used = 0
        for idx in order:
            candidate = sentence_list[idx]
            n_tokens = len(word_tokenize(candidate))
            # A sentence larger than half the budget is never selected.
            if n_tokens > word_limit * 0.5:
                continue
            # The first sentence that would overflow the budget ends the scan.
            if used + n_tokens > word_limit:
                break
            chosen.append(candidate)
            used += n_tokens

        return " ".join(chosen)
    except Exception as e:
        return f"Error generating summary with LSA: {e}"

# LexRank function
def lex_rank_summary(text, word_limit):
    """Summarize ``text`` by running PageRank over a sentence-similarity graph.

    Sentences are vectorized with term counts, re-weighted with TF-IDF, and
    compared through the product of the TF-IDF matrix with its transpose.
    That similarity matrix becomes a graph whose PageRank scores rank the
    sentences.

    Args:
        text: The document to summarize.
        word_limit: Maximum number of tokens (as counted by
            ``word_tokenize``) the summary may contain.

    Returns:
        The highest-ranked sentences that fit the budget, joined by single
        spaces, or an ``"Error generating summary with LexRank: ..."``
        message if any step raises.
    """
    try:
        sentence_list = sent_tokenize(text)
        counts = CountVectorizer().fit_transform(sentence_list)
        weighted = TfidfTransformer().fit_transform(counts)
        similarity = (weighted * weighted.T).toarray()
        rank = nx.pagerank(nx.from_numpy_array(similarity))

        # (score, sentence) pairs, best first; ties fall back to the
        # sentence text because whole tuples are compared.
        scored = [(rank[i], s) for i, s in enumerate(sentence_list)]
        scored.sort(reverse=True)

        chosen = []
        used = 0
        for _, candidate in scored:
            n_tokens = len(word_tokenize(candidate))
            # Stop as soon as the next sentence would overflow the budget.
            if used + n_tokens > word_limit:
                break
            chosen.append(candidate)
            used += n_tokens
        return " ".join(chosen)
    except Exception as e:
        return f"Error generating summary with LexRank: {e}"

# Luhn function
def luhn_summary(text, word_limit):
    """Summarize ``text`` with sumy's Luhn summarizer, capped by a word budget.

    The summarizer is asked for as many sentences as there are
    period-separated pieces in ``text``; the returned sentences are then
    accepted in the order given until the next one would exceed the budget.

    Args:
        text: The document to summarize.
        word_limit: Maximum number of tokens (as counted by
            ``word_tokenize``) the summary may contain.

    Returns:
        The accepted sentences joined by single spaces.
    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.luhn import LuhnSummarizer

    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LuhnSummarizer()

    # Upper bound on the sentence count: one per "."-separated piece.
    requested = len(text.split("."))
    candidates = summarizer(parser.document, sentences_count=requested)

    picked = []
    used = 0
    for item in candidates:
        as_text = str(item)
        n_tokens = len(word_tokenize(as_text))
        # Stop at the first sentence that would overflow the budget.
        if used + n_tokens > word_limit:
            break
        picked.append(as_text)
        used += n_tokens

    return " ".join(picked)



In [None]:
# T5 function
def t5_summary(text, word_limit):
    """Summarize ``text`` with the ``t5-small`` summarization pipeline.

    Args:
        text: The document to summarize.
        word_limit: Passed to the pipeline as ``max_length``; half of it
            (integer division) is passed as ``min_length``.
            NOTE(review): the pipeline presumably counts these in model
            tokens rather than words; confirm.

    Returns:
        The ``summary_text`` of the first pipeline result, or an
        ``"Error generating summary with T5: ..."`` message if loading or
        running the pipeline raises.
    """
    try:
        summarizer = pipeline("summarization", model="t5-small")
        outputs = summarizer(
            text,
            max_length=word_limit,
            min_length=word_limit // 2,
            do_sample=False,
        )
        first = outputs[0]
        return first['summary_text']
    except Exception as e:
        return f"Error generating summary with T5: {e}"

# Function to split large texts into chunks for BART
def chunk_text(text, max_length=500):
    """Split ``text`` into chunks of at most ``max_length`` tokens each.

    The text is tokenized with ``word_tokenize`` and the tokens are grouped
    in order; each group is re-joined with single spaces.

    The size check used to add the number of tokens in the chunk to the
    *character length* of the next token, which produced chunks much
    smaller than requested and an empty leading chunk for an over-long
    first token.  Chunks are now sized purely by token count.

    Args:
        text: The text to split.
        max_length: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        A list of chunk strings, empty when ``text`` has no tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    tokens = word_tokenize(text)
    return [
        " ".join(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]

# BART function
def bart_summary(text, word_limit):
    """Summarize ``text`` with ``facebook/bart-large-cnn``, chunk by chunk.

    The text is split with ``chunk_text`` into pieces of up to 500 tokens,
    every piece is summarized, and the per-chunk summaries are joined in
    document order.  Previously only the first chunk's summary was kept and
    the rest were discarded.

    Args:
        text: The document to summarize.
        word_limit: Passed to the pipeline as ``max_length`` for each chunk;
            half of it (integer division) is passed as ``min_length``.
            Because the limit applies per chunk, the joined summary of a
            multi-chunk document can be longer than ``word_limit``.

    Returns:
        The chunk summaries joined by single spaces, or an empty string when
        ``text`` yields no non-empty chunk.
    """
    # Drop blank chunks so the model is never asked to summarize nothing.
    chunks = [chunk for chunk in chunk_text(text, max_length=500) if chunk.strip()]
    if not chunks:
        return ""

    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    results = bart_summarizer(
        chunks,
        max_length=word_limit,
        min_length=word_limit // 2,
        do_sample=False,
        # Chunks are sized in NLTK tokens, not model tokens; let the
        # tokenizer clip any chunk that still exceeds the model's input size.
        truncation=True,
    )
    return " ".join(result['summary_text'] for result in results)

# Large Language Models
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import os
from google.colab import userdata

# Copy the 'GOOGLE_API_KEY' value from Colab's userdata store into the
# process environment; presumably this is where the Gemini client looks for
# its credentials (TODO confirm).
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import Document

def get_prompt_template():
    """Return the chat prompt used for the single-call LLM summary.

    The template has two messages: a system instruction asking for a
    concise summary in ``{num_words}`` words, and a human message holding
    the text to summarize as ``{context}``.

    Returns:
        A ``ChatPromptTemplate`` expecting the ``num_words`` and ``context``
        variables.
    """
    return ChatPromptTemplate.from_messages(
        [
            (
                "system",
                # Real newlines: the doubled backslashes used before put the
                # literal characters "\n\n" into the prompt text.
                "Write a concise summary of the following in {num_words} words:\n\n",
            ),
            ("human", "{context}"),
        ]
    )

def llm_summary(text, word_limit):
    """Summarize ``text`` with a single Gemini call.

    Args:
        text: The document to summarize; fills the prompt's ``context``.
        word_limit: Fills the prompt's ``num_words`` and is also passed to
            the model as ``max_tokens``.
            NOTE(review): ``max_tokens`` presumably caps model tokens, not
            words; confirm the two limits are meant to share one value.

    Returns:
        The ``content`` of the model's reply.
    """
    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0,
        max_tokens=word_limit,
        timeout=None,
        max_retries=2,
    )

    # Pipe the prompt into the model to form the runnable chain.
    template = get_prompt_template()
    summarizer_chain = template | model

    variables = {
        "context": text,
        "num_words": word_limit,
    }
    reply = summarizer_chain.invoke(variables)
    return reply.content

# Map reduce function
def map_reduce_summary(text, word_limit):
    """Summarize ``text`` with LangChain's ``map_reduce`` summarize chain.

    The whole text is wrapped in a single ``Document`` and handed to the
    chain, which runs on a Gemini model.

    Args:
        text: The document to summarize.
        word_limit: Passed to the model as ``max_tokens``.

    Returns:
        The chain's ``output_text``.
    """
    # ``min_tokens`` was passed here before, but it is not a parameter of
    # ChatGoogleGenerativeAI, so only the supported upper bound is kept.
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", max_tokens=word_limit)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    docs = [Document(page_content=text)]
    # Name the chain's input key explicitly instead of relying on the
    # single-input shortcut.
    summary = chain.invoke({"input_documents": docs})
    return summary['output_text']

# Iterative refinement function
def iterative_refinement_summary(text, word_limit):
    """Summarize ``text`` with LangChain's ``refine`` summarize chain.

    Args:
        text: The document to summarize; wrapped in a single ``Document``.
        word_limit: Passed to the Gemini model as ``max_tokens``.

    Returns:
        The chain's ``output_text``.
    """
    refine_chain = load_summarize_chain(
        ChatGoogleGenerativeAI(model="gemini-1.5-flash", max_tokens=word_limit),
        chain_type="refine",
    )
    result = refine_chain.invoke([Document(page_content=text)])
    return result['output_text']


In [None]:
#summarization Logic
def summarize(input_text, summarization_type, method, word_limit=None):
    """Run the summarizer selected by ``summarization_type`` and ``method``.

    Args:
        input_text: The text to summarize.
        summarization_type: One of ``"Extractive"``, ``"Abstractive"`` or
            ``"LLM"``.
        method: Name of a summarizer registered under that type.
        word_limit: Forwarded unchanged to the chosen summarizer.

    Returns:
        The summarizer's result, or one of the messages
        ``"Invalid summarization type"``, ``"Invalid method"`` or
        ``"An error occurred: ..."`` when the lookup or the summarizer fails.
    """
    # (type label, {method name: summarizer}) pairs.
    registry = (
        ("Extractive", {
            "Frequency-based": frequency_based_summary,
            "Luhn": luhn_summary,
            "LSA": lsa_summary,
            "LexRank": lex_rank_summary,
        }),
        ("Abstractive", {
            "T5": t5_summary,
            "BART": bart_summary,
        }),
        ("LLM", {
            "Basic_LLM": llm_summary,
            "Map_Reduce": map_reduce_summary,
            "Iterative_Refinement": iterative_refinement_summary,
        }),
    )

    for type_name, methods in registry:
        if summarization_type == type_name:
            summarizer_function = methods.get(method)
            break
    else:
        # No registered type matched.
        return "Invalid summarization type"

    if summarizer_function is None:
        return "Invalid method"

    try:
        return summarizer_function(input_text, word_limit)
    except Exception as e:
        return f"An error occurred: {str(e)}"

def summarize_pdf(pdf_file_path, summarization_type, method, word_limit):
    """Load a PDF, clean its text and summarize it.

    Args:
        pdf_file_path: Path of the PDF file to load.
        summarization_type: Forwarded to ``summarize``.
        method: Forwarded to ``summarize``.
        word_limit: Forwarded to ``summarize``.

    Returns:
        A ``(summary, cleaned_text)`` tuple.  If loading or summarizing
        raises, the tuple is ``("Error processing PDF: ...", "")`` so that
        callers can always unpack two values; the error path used to return
        a bare string, which broke that unpacking.
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        docs = loader.load_and_split()
        full_text = " ".join(doc.page_content for doc in docs)
        full_text_cleaned = clean_text(full_text)
        summary = summarize(full_text_cleaned, summarization_type, method, word_limit)
        return summary, full_text_cleaned
    except Exception as e:
        # Same two-element shape as the success path, with no extracted text.
        return f"Error processing PDF: {e}", ""

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as well.  ``str.split()``
    with no arguments splits on whitespace runs, so no regular expression
    is needed; the ``re`` module this function relied on before is only
    imported further down the file.

    Args:
        text: The string to normalize.

    Returns:
        The whitespace-normalized string.
    """
    return " ".join(text.split())

def run_summarization(input_text, pdf_file, method, summarization_type, limit):
    """Summarize typed text or an uploaded PDF and append word counts.

    Typed text takes precedence over the PDF when both are supplied.

    Args:
        input_text: Text entered by the user; may be empty or ``None``.
        pdf_file: Uploaded file, either an object with a ``name`` attribute
            holding its path or the path itself; may be ``None``.
        method: Name of the summarization method.
        summarization_type: ``"Extractive"``, ``"Abstractive"`` or ``"LLM"``.
        limit: Word limit; converted with ``int`` when truthy, otherwise
            ``None`` is passed on.

    Returns:
        The summary followed by the input and summary word counts,
        ``"No input provided."`` when neither input is given, or the bare
        error message when ``summarize_pdf`` reports a failure as a string.
    """
    # Without any input there is nothing to count; previously this path fell
    # through to the report below with ``input_word_count`` unassigned.
    if not input_text and not pdf_file:
        return "No input provided."

    word_limit = int(limit) if limit else None

    if input_text:
        summary = summarize(input_text, summarization_type, method, word_limit=word_limit)
        input_word_count = len(input_text.split())
    else:
        # Accept both an upload object exposing ``.name`` and a plain path.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        result = summarize_pdf(pdf_path, summarization_type, method, word_limit=word_limit)
        if isinstance(result, str):
            # A bare string is an error message, not a (summary, text) pair.
            return result
        summary, full_text_cleaned = result
        input_word_count = len(full_text_cleaned.split())

    summary_word_count = len(summary.split())
    return f"{summary}\n\nInput Words: {input_word_count}\nSummary Words: {summary_word_count}"

In [None]:
# Gradio Interface
import gradio as gr
import re

def toggle_input_fields(selection):
    """Show the input widget matching the chosen input method.

    Args:
        selection: The radio value, ``"Input Text"`` or ``"Upload PDF"``.
            Any other value hides both widgets.

    Returns:
        Two ``gr.update`` objects, for the text box and the file upload in
        that order.  Both widgets are also reset: the text box to ``""`` and
        the file upload to ``None``.
    """
    # The fallback used to call ``visible(False)``, an undefined name, where
    # the keyword argument ``visible=False`` was intended.
    show_text = selection == "Input Text"
    show_pdf = selection == "Upload PDF"
    return (
        gr.update(visible=show_text, value=""),
        gr.update(visible=show_pdf, value=None),
    )

# Stylesheet handed to gr.Blocks below; it only styles <h1> headings.
css = """
h1 {
    margin-top: 2rem;
    font-size: 2rem;
    text-align: center;
}
"""

# Two-column layout: inputs and method selection on the left, the generated
# summary on the right.
with gr.Blocks(css=css) as demo:
    gr.Markdown("<h1 style='text-align:center;'>Document Summarizer App</h1>")

    with gr.Row():
        with gr.Column():
            input_method = gr.Radio(["Input Text", "Upload PDF"], label="Select Input Method")

            # Both input widgets start hidden; toggle_input_fields updates
            # their visibility whenever the radio selection changes.
            input_text = gr.Textbox(label="Input Text", placeholder="Enter text to summarize", lines=10, visible=False)
            input_pdf = gr.File(label="Upload PDF file", visible=False)

            input_method.change(
                fn=toggle_input_fields,
                inputs=[input_method],
                outputs=[input_text, input_pdf]
            )

            # One tab per summarization type, each with its own method
            # dropdown and its own "Summarize" button.
            with gr.Tabs():
                with gr.TabItem("Extractive"):
                    extractive_methods = gr.Dropdown(
                        ["Frequency-based", "Luhn", "LSA", "LexRank"], label="Select Extractive Method"
                    )
                    summarize_button_extractive = gr.Button("Summarize", variant="primary")

                with gr.TabItem("Abstractive"):
                    abstractive_methods = gr.Dropdown(
                        ["T5", "BART" ], label="Select Abstractive Method"
                    )
                    summarize_button_abstractive = gr.Button("Summarize", variant="primary")
                with gr.TabItem("LLM"):
                    llm_methods = gr.Dropdown(
                        ["Basic_LLM","Map_Reduce", "Iterative_Refinement"], label="Select LLM Method"
                    )
                    summarize_button_llm = gr.Button("Summarize", variant="primary")

            # Shared by all three tabs; defaults to 100.
            word_limit = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label="Word Limit")

        with gr.Column():
            output_summary = gr.Textbox(label="Generated Summary", placeholder="The summary will appear here", lines=20)
    # Summarize button logic: every button sends the text box, the uploaded
    # file, its tab's dropdown value and the word limit to run_summarization,
    # with the summarization type fixed to that tab's name.
    summarize_button_extractive.click(
        fn=lambda text, pdf, method, limit: run_summarization(text, pdf, method, "Extractive", limit),
        inputs=[input_text, input_pdf, extractive_methods, word_limit],
        outputs=output_summary
    )

    summarize_button_abstractive.click(
        fn=lambda text, pdf, method, limit: run_summarization(text, pdf, method, "Abstractive", limit),
        inputs=[input_text, input_pdf, abstractive_methods, word_limit],
        outputs=output_summary
    )

    summarize_button_llm.click(
        fn=lambda text, pdf, method, limit: run_summarization(text, pdf, method, "LLM", limit),
        inputs=[input_text, input_pdf, llm_methods, word_limit],
        outputs=output_summary
    )

# Start the Gradio app.
demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ac3ecc5d1a55ed3a21.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


