In [2]:
%pip install --quiet gradio transformers nltk numpy networkx sumy langchain-google-genai
import gradio as gr
from transformers import pipeline
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the NLTK data packages this notebook relies on for the `stopwords`,
# `word_tokenize` and `sent_tokenize` helpers imported above. `quiet=True`
# keeps the download log out of the cell output.
# NOTE(review): 'punkt' and 'punkt_tab' are presumably both requested so the
# tokenizers work across NLTK versions - confirm which one the installed
# version actually needs.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [3]:
def _load_summarizer(model_name):
    """Build a Hugging Face summarization pipeline for ``model_name``.

    Args:
        model_name: Hub identifier of the checkpoint to load.

    Returns:
        The pipeline object, or ``None`` if loading raised. The error is
        printed rather than re-raised so the rest of the cell still runs.
    """
    try:
        return pipeline("summarization", model=model_name)
    except Exception as e:
        print(f"Error loading models: {e}")
        return None


# Each checkpoint is loaded in its own try/except so that a failure on one
# does not skip the other, and both names are always defined afterwards
# (None marks a model that could not be loaded).
t5_summarizer = _load_summarizer("t5-small")
bart_summarizer = _load_summarizer("facebook/bart-large-cnn")

def frequency_based_summary(text, word_limit):
    """Extractive summary that ranks sentences by summed word frequency.

    Every lower-cased token that is not an English stop word is weighted by
    how often it occurs in ``text``; a sentence's score is the sum of the
    weights of its tokens. Sentences are then taken from highest to lowest
    score until the next one would push the running token count past
    ``word_limit``.

    Args:
        text: Document to summarize.
        word_limit: Maximum number of tokens (as counted by
            ``word_tokenize``) allowed in the summary.

    Returns:
        The selected sentences joined by single spaces, in score order. The
        result is an empty string when even the top-ranked sentence is longer
        than ``word_limit``, because selection stops at the first sentence
        that does not fit.
    """
    from collections import Counter

    stop_words = set(stopwords.words("english"))

    # Counter tallies all tokens in a single pass. Calling list.count() for
    # every token would rescan the whole document each time (quadratic).
    freq_table = Counter(
        token for token in word_tokenize(text.lower()) if token not in stop_words
    )

    # Score each sentence based on word frequency. Keying by sentence text
    # means repeated identical sentences collapse into one entry.
    sentences = sent_tokenize(text)
    sentence_scores = {
        sentence: sum(freq_table.get(word, 0) for word in word_tokenize(sentence.lower()))
        for sentence in sentences
    }

    # Greedily take the best-scoring sentences while they fit the budget.
    summary, word_count = [], 0
    for sentence in sorted(sentence_scores, key=sentence_scores.get, reverse=True):
        sentence_word_count = len(word_tokenize(sentence))
        if word_count + sentence_word_count > word_limit:
            break
        summary.append(sentence)
        word_count += sentence_word_count

    return " ".join(summary)

def luhn_summary(text, word_limit):
    """Extractive summary built from sumy's ``LuhnSummarizer`` output.

    The summarizer is asked for as many sentences as there are "."-separated
    chunks in ``text``; the sentences it returns are then consumed in the
    order given until the next one would exceed ``word_limit`` tokens (as
    counted by ``word_tokenize``).

    Args:
        text: Document to summarize.
        word_limit: Maximum number of tokens allowed in the summary.

    Returns:
        The kept sentences joined by single spaces, or a string starting with
        "Error generating summary with Luhn:" if anything inside the
        summarization step raises.
    """
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.summarizers.luhn import LuhnSummarizer

    try:
        parsed = PlaintextParser.from_string(text, Tokenizer("english"))
        luhn = LuhnSummarizer()

        # Splitting on "." is only used to decide how many sentences to
        # request from the summarizer.
        requested = len(text.split("."))
        candidates = luhn(parsed.document, sentences_count=requested)

        kept = []
        used = 0
        for candidate in candidates:
            sentence_text = str(candidate)
            size = len(word_tokenize(sentence_text))
            if used + size > word_limit:
                # Stop at the first sentence that does not fit the budget.
                break
            kept.append(sentence_text)
            used += size

        return " ".join(kept)

    except Exception as e:
        return f"Error generating summary with Luhn: {e}"

def lsa_summary(text, word_limit):
    """Extractive summary that ranks sentences with a one-component SVD.

    Sentences are turned into a ``CountVectorizer`` matrix, projected onto a
    single ``TruncatedSVD`` component, and ordered from the largest projection
    value to the smallest. Sentences are taken in that order until the next
    one would exceed ``word_limit`` tokens (as counted by ``word_tokenize``).

    Args:
        text: Document to summarize.
        word_limit: Maximum number of tokens allowed in the summary.

    Returns:
        The kept sentences joined by single spaces, or a string starting with
        "Error generating summary with LSA:" if any step raises.
    """
    try:
        doc_sentences = sent_tokenize(text)

        # One row of term counts per sentence.
        term_counts = CountVectorizer().fit_transform(doc_sentences)

        # Fit a single latent component, then project every sentence onto it.
        reducer = TruncatedSVD(n_components=1)
        reducer.fit(term_counts)
        projection = reducer.transform(term_counts).flatten()

        # Sentence indices from highest to lowest projection value.
        best_first = np.argsort(projection)[::-1]

        kept, used = [], 0
        for position in best_first:
            candidate = doc_sentences[position]
            size = len(word_tokenize(candidate))
            if used + size > word_limit:
                # Stop at the first sentence that does not fit the budget.
                break
            kept.append(candidate)
            used += size

        return " ".join(kept)
    except Exception as e:
        return f"Error generating summary with LSA: {e}"

def lex_rank_summary(text, word_limit):
    """Extractive summary that ranks sentences with PageRank over similarity.

    Sentences are vectorized with ``CountVectorizer`` followed by
    ``TfidfTransformer``; the product of that matrix with its transpose is
    used as a weighted graph, and ``nx.pagerank`` scores each sentence.
    Sentences are taken from highest to lowest score until the next one would
    exceed ``word_limit`` tokens (as counted by ``word_tokenize``).

    Args:
        text: Document to summarize.
        word_limit: Maximum number of tokens allowed in the summary.

    Returns:
        The kept sentences joined by single spaces, or a string starting with
        "Error generating summary with LexRank:" if any step raises.
    """
    try:
        doc_sentences = sent_tokenize(text)

        # Sentence-by-sentence similarity from TF-IDF weighted term counts.
        term_counts = CountVectorizer().fit_transform(doc_sentences)
        tfidf = TfidfTransformer().fit_transform(term_counts)
        similarity = (tfidf * tfidf.T).toarray()

        # Node i of the graph corresponds to sentence i.
        graph = nx.from_numpy_array(similarity)
        rank = nx.pagerank(graph)

        # (score, sentence) pairs, best first; equal scores fall back to
        # comparing the sentence text.
        scored = [(rank[idx], sent) for idx, sent in enumerate(doc_sentences)]
        scored.sort(reverse=True)

        kept, used = [], 0
        for _score, candidate in scored:
            size = len(word_tokenize(candidate))
            if used + size > word_limit:
                # Stop at the first sentence that does not fit the budget.
                break
            kept.append(candidate)
            used += size

        return " ".join(kept)

    except Exception as e:
        return f"Error generating summary with LexRank: {e}"

def t5_summary(text, word_limit):
    """Abstractive summary produced by the module-level ``t5_summarizer``.

    ``word_limit`` is forwarded as ``max_length`` and half of it as
    ``min_length``.
    NOTE(review): transformers documents these as generation lengths in model
    tokens, so the result only approximates a word limit.

    Args:
        text: Document to summarize.
        word_limit: Upper bound passed to the pipeline as ``max_length``.

    Returns:
        The pipeline's ``summary_text``, or a string starting with
        "Error generating summary with T5:" if the call raises (including
        when ``word_limit`` is ``None`` or the model failed to load).
    """
    try:
        outputs = t5_summarizer(
            text,
            max_length=word_limit,
            min_length=word_limit // 2,
            do_sample=False,
            # Clip over-long documents to the model's input size instead of
            # letting the pipeline fail or warn on them.
            truncation=True,
        )
        return outputs[0]['summary_text']
    except Exception as e:
        return f"Error generating summary with T5: {e}"

def bart_summary(text, word_limit):
    """Abstractive summary produced by the module-level ``bart_summarizer``.

    ``word_limit`` is forwarded as ``max_length`` and half of it as
    ``min_length``.
    NOTE(review): transformers documents these as generation lengths in model
    tokens, so the result only approximates a word limit.

    Args:
        text: Document to summarize.
        word_limit: Upper bound passed to the pipeline as ``max_length``.

    Returns:
        The pipeline's ``summary_text``, or a string starting with
        "Error generating summary with BART:" if the call raises (including
        when ``word_limit`` is ``None`` or the model failed to load).
    """
    try:
        outputs = bart_summarizer(
            text,
            max_length=word_limit,
            min_length=word_limit // 2,
            do_sample=False,
            # Clip over-long documents to the model's input size instead of
            # letting the pipeline fail or warn on them.
            truncation=True,
        )
        return outputs[0]['summary_text']
    except Exception as e:
        return f"Error generating summary with BART: {e}"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [4]:
def summarize(text, summarization_type, method, word_limit=None):
    """Dispatch ``text`` to the summarizer chosen by type and method name.

    Args:
        text: Document to summarize.
        summarization_type: Either "Extractive" or "Abstractive".
        method: Name of a method registered under that type (for example
            "Luhn" for extractive, "BART" for abstractive).
        word_limit: Passed through unchanged as the summarizer's second
            positional argument.

    Returns:
        The summarizer's result; "Invalid summarization type" or
        "Invalid method" when the lookup fails; or a string starting with
        "An error occurred:" if the summarizer raises.
    """
    methods_by_type = (
        (
            "Extractive",
            {
                "Frequency-based": frequency_based_summary,
                "Luhn": luhn_summary,
                "LSA": lsa_summary,
                "LexRank": lex_rank_summary,
            },
        ),
        (
            "Abstractive",
            {
                "T5": t5_summary,
                "BART": bart_summary,
                "LLM": llm_summary,
            },
        ),
    )

    # Pick the method table whose type label matches, if any.
    available = next(
        (table for label, table in methods_by_type if summarization_type == label),
        None,
    )
    if available is None:
        return "Invalid summarization type"

    summarizer_function = available.get(method)
    if summarizer_function is None:
        return "Invalid method"

    try:
        return summarizer_function(text, word_limit)
    except Exception as err:
        return f"An error occurred: {str(err)}"


In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import os
from google.colab import userdata

# Copy the Gemini API key from Colab's secret store into the process
# environment so it is never hard-coded in the notebook.
# NOTE(review): presumably langchain_google_genai reads GOOGLE_API_KEY from the
# environment, and this line fails if the secret is missing - confirm both.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

def load_llm(model="gemini-1.5-pro", max_tokens=None):
    """Create a deterministic (temperature 0) Gemini chat model client.

    Args:
        model: "gemini-1.5-pro" or "gemini-1.5-flash".
        max_tokens: Output token cap forwarded to the client; ``None`` leaves
            it unset. It is applied to every supported model.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance configured for ``model``.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported_models = ("gemini-1.5-pro", "gemini-1.5-flash")
    if model not in supported_models:
        raise ValueError("Invalid model name")

    # One construction path for all models, so options such as max_tokens
    # cannot be honoured for one model and silently dropped for another.
    return ChatGoogleGenerativeAI(
        model=model,
        temperature=0,
        max_tokens=max_tokens,
        timeout=None,
        max_retries=2,
    )

def get_prompt_template():
    """Build the chat prompt used for LLM summarization.

    The template has two variables: ``num_words`` (target summary length,
    substituted into the system message) and ``context`` (the document, sent
    as the human message).

    Returns:
        The object produced by ``ChatPromptTemplate.from_messages``.
    """
    return ChatPromptTemplate.from_messages(
        [
            (
                "system",
                # Real newlines here; an escaped "\\n" would put literal
                # backslash characters into the instruction text.
                "Write a concise summary of the following in {num_words} words:\n\n",
            ),
            ("human", "{context}"),
        ]
    )

def llm_summary(text, num_words=50, model="gemini-1.5-pro"):
    """Summarize ``text`` with a Gemini chat model.

    Args:
        text: Document to summarize.
        num_words: Target summary length in words, inserted into the prompt.
            ``None`` falls back to 50 so the prompt never reads "None words".
        model: Model name accepted by ``load_llm``.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    if num_words is None:
        num_words = 50

    # The output cap is a token count, not a word count. Models typically
    # spend more than one token per word, so a cap equal to num_words can cut
    # the reply off mid-sentence; allow twice the word target as headroom and
    # let the prompt carry the actual length request.
    token_budget = num_words * 2
    llm = load_llm(model=model, max_tokens=token_budget)

    # Pipe the prompt template into the model and run it once.
    chain = get_prompt_template() | llm
    result = chain.invoke({
        "context": text,
        "num_words": num_words
    })

    return result.content


In [8]:
# Custom stylesheet for the Gradio app; it is handed to `gr.Blocks(css=css)` in
# the UI cell. It defines an orange-bordered `.container`, a centered `.title`,
# and blue `.gr-button` styling with a darker hover color.
# NOTE(review): the UI cell does not appear to assign the `.container` or
# `.title` classes to any component - confirm whether these rules take effect.
css = """
    .container {
        border: 1px solid #FFA500;  /* Light orange border */
        box-shadow: 0px 4px 10px rgba(255, 165, 0, 0.2); /* Light orange shadow */
        padding: 20px;
        border-radius: 8px;
    }
    .title {
        text-align: center;
        font-size: 24px;
        color: #333; /* Text color */
        margin-bottom: 20px;
    }
    .gr-button {
        background-color: #4A90E2; /* Button background color */
        color: white; /* Button text color */
        border: none;
        border-radius: 5px; /* Rounded corners for buttons */
    }
    .gr-button:hover {
        background-color: #357ABD; /* Button hover color */
    }
"""


In [11]:
import gradio as gr

with gr.Blocks(css=css) as demo:
    gr.Markdown("<h1 style='text-align:center;'>Document Summarizer App</h1>")

    # Remembers which tab is in front, so a single click of the shared button
    # summarizes only that tab instead of running both pipelines.
    active_tab = gr.State("Extractive")

    with gr.Tabs() as tabs:
        # Extractive Summarization Tab
        with gr.TabItem("Extractive") as extractive_tab:
            extractive_methods = gr.Dropdown(
                ["Frequency-based", "Luhn", "LSA", "LexRank"], label="Select Extractive Method"
            )
            with gr.Row():
                input_text_extractive = gr.Textbox(label="Input Text", placeholder="Enter text to summarize", lines=10)
                output_summary_extractive = gr.Textbox(label="Generated Summary", placeholder="The summary will appear here", lines=10)

        # Abstractive Summarization Tab
        with gr.TabItem("Abstractive") as abstractive_tab:
            abstractive_methods = gr.Dropdown(
                ["T5", "BART", "LLM"], label="Select Abstractive Method"
            )
            with gr.Row():
                input_text_abstractive = gr.Textbox(label="Input Text", placeholder="Enter text to summarize", lines=10)
                output_summary_abstractive = gr.Textbox(label="Generated Summary", placeholder="The summary will appear here", lines=10)

    # Keep `active_tab` in sync with the tab the user is looking at.
    extractive_tab.select(fn=lambda: "Extractive", inputs=None, outputs=active_tab)
    abstractive_tab.select(fn=lambda: "Abstractive", inputs=None, outputs=active_tab)

    word_limit = gr.Slider(minimum=30, maximum=1000, step=1, value=100, label="Word Limit")

    summarize_button = gr.Button("Summarize", variant="primary")

    def run_summarization(input_text, method, tab, limit):
        """Summarize one tab's input and append input/summary word counts.

        Args:
            input_text: Text taken from the tab's input box.
            method: Method name chosen in the tab's dropdown.
            tab: "Extractive" or "Abstractive"; forwarded to ``summarize`` as
                the summarization type.
            limit: Slider value; converted to ``int`` (``None`` if falsy).

        Returns:
            The summary followed by whitespace-split word counts of the input
            and the summary, or a short message when the tab name is unknown,
            the text is empty, or no method has been selected.
        """
        if tab not in ("Extractive", "Abstractive"):
            return "Invalid summarization type"
        # Give a clear hint instead of sending empty input to the summarizers.
        if not input_text or not input_text.strip():
            return "Please enter some text to summarize."
        if not method:
            return "Please select a summarization method."

        summary = summarize(input_text, tab, method, word_limit=int(limit) if limit else None)
        input_word_count = len(input_text.split())
        summary_word_count = len(summary.split())

        final_summary = f"{summary}\n\nInput Words: {input_word_count}\nSummary Words: {summary_word_count}"
        return final_summary

    def summarize_active_tab(tab, extractive_text, extractive_method,
                             abstractive_text, abstractive_method, limit):
        """Run only the visible tab's summarizer.

        Returns a pair ``(extractive_output, abstractive_output)``; the
        inactive tab receives ``gr.update()`` so its output box is left
        untouched.
        """
        if tab == "Abstractive":
            result = run_summarization(abstractive_text, abstractive_method, "Abstractive", limit)
            return gr.update(), result
        result = run_summarization(extractive_text, extractive_method, "Extractive", limit)
        return result, gr.update()

    # A single handler: two separate click handlers on the same button would
    # both fire on every click and overwrite the inactive tab's output.
    summarize_button.click(
        fn=summarize_active_tab,
        inputs=[
            active_tab,
            input_text_extractive,
            extractive_methods,
            input_text_abstractive,
            abstractive_methods,
            word_limit,
        ],
        outputs=[output_summary_extractive, output_summary_abstractive],
    )

demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d818444a62264d4295.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


