# Final Gradio Implementation

In [None]:
# Gradio and dependencies installation
!pip install gradio --quiet
!pip install sumy --quiet
!pip install langchain_google_genai --quiet
!pip install pypdf --quiet

# Read the GOOGLE_API_KEY entry from Colab's `userdata` and export it as an
# environment variable of this process (presumably picked up by the Gemini
# client used further down -- confirm against langchain_google_genai docs).
from google.colab import userdata
import os
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [18]:
# Importing necessary libraries
import gradio as gr
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.summarize import load_summarize_chain
from pypdf import PdfReader

# Download required NLTK data: the "punkt"/"punkt_tab" and "stopwords"
# packages (presumably what word_tokenize/sent_tokenize and
# stopwords.words("english") need at runtime).
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Frequency-based summarization
def frequency_based_summary(text, max_words):
    """Summarize ``text`` by picking the sentences with the heaviest vocabulary.

    Alphanumeric, non-stop-word tokens of the lower-cased text are counted.
    Each sentence is scored with the sum of the counts of its tokens, and
    sentences are taken from the highest score downwards until the running
    word total exceeds ``max_words``.

    Args:
        text: Document to summarize.
        max_words: Word budget; any value accepted by ``int()``.

    Returns:
        The chosen sentences joined by single spaces (best score first), or
        ``"Error: <message>"`` when any exception is raised.
    """
    try:
        budget = int(max_words)

        # Corpus-wide frequency table over the filtered tokens.
        tokens = word_tokenize(text.lower())
        ignored = set(stopwords.words("english"))
        frequencies = {}
        for token in tokens:
            if token.isalnum() and token not in ignored:
                frequencies[token] = frequencies.get(token, 0) + 1

        # A sentence only gets an entry once one of its tokens is in the table.
        scores = {}
        for sentence in sent_tokenize(text):
            for token in word_tokenize(sentence.lower()):
                weight = frequencies.get(token)
                if weight is not None:
                    scores[sentence] = scores.get(sentence, 0) + weight

        ranked = sorted(scores, key=lambda candidate: scores[candidate], reverse=True)

        picked = []
        used = 0
        for sentence in ranked:
            used += len(word_tokenize(sentence))
            if used > budget:
                break
            picked.append(sentence)

        return " ".join(picked)
    except Exception as exc:
        return f"Error: {exc!s}"

# LexRank summarization
def lexrank_summary(text, max_words):
    """Summarize ``text`` with sumy's LexRank under a word budget.

    The summary consists of the ``k`` best-ranked sentences for the largest
    ``k`` whose combined word count (per ``word_tokenize``) does not exceed
    ``max_words``.

    Args:
        text: Document to summarize.
        max_words: Word budget; any value accepted by ``int()``.

    Returns:
        The selected sentences joined by single spaces, in the order the
        summarizer returns them, or ``"Error: <message>"`` when any exception
        is raised.
    """
    try:
        # Validate the budget once, up front, like the sibling summarizers.
        max_words = int(max_words)

        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        document = parser.document
        summarizer = LexRankSummarizer()

        def top_ranked(count):
            # Best `count` sentences as strings, plus their total word count.
            chosen = [str(sentence) for sentence in summarizer(document, count)]
            return chosen, sum(len(word_tokenize(sentence)) for sentence in chosen)

        # sumy hands the selected sentences back in document order, so asking
        # for *all* sentences (as this function used to) discards the ranking
        # and yields the document prefix.  The top-k sets are nested, hence
        # their word totals grow with k and the largest k that fits the
        # budget can be found by binary search.
        best = []
        low, high = 1, len(document.sentences)
        while low <= high:
            mid = (low + high) // 2
            chosen, total = top_ranked(mid)
            if total <= max_words:
                best = chosen
                low = mid + 1
            else:
                high = mid - 1

        return " ".join(best)
    except Exception as e:
        return f"Error: {str(e)}"

# TextRank summarization
def textrank_summary(text, max_words):
    """Summarize ``text`` by running PageRank over a sentence-similarity graph.

    Sentences are reduced to their alphanumeric, non-stop-word tokens,
    vectorized with TF-IDF and compared by cosine similarity.  The similarity
    matrix is turned into a graph whose PageRank scores rank the sentences.
    Sentences are taken best-first until the next one would exceed
    ``max_words``.

    Args:
        text: Document to summarize.
        max_words: Word budget; any value accepted by ``int()``.

    Returns:
        The chosen sentences joined by single spaces (best score first), or
        ``"Error: <message>"`` when any exception is raised.
    """
    try:
        limit = int(max_words)
        originals = sent_tokenize(text)

        ignored = set(stopwords.words("english"))
        cleaned = [
            " ".join(
                token
                for token in word_tokenize(original.lower())
                if token.isalnum() and token not in ignored
            )
            for original in originals
        ]

        tfidf = TfidfVectorizer().fit_transform(cleaned)
        similarity = cosine_similarity(tfidf, tfidf)
        scores = nx.pagerank(nx.from_numpy_array(similarity))

        # (score, sentence) pairs, best first; ties fall back to the text.
        ranked = [(scores[index], original) for index, original in enumerate(originals)]
        ranked.sort(reverse=True)

        picked = []
        used = 0
        for _, original in ranked:
            length = len(word_tokenize(original))
            if used + length > limit:
                break
            picked.append(original)
            used += length

        return " ".join(picked)
    except Exception as exc:
        return f"Error: {exc!s}"

# T5 summarization
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated
# requests reuse the weights instead of loading them again on every call.
_T5_CACHE = {}


def _load_t5(checkpoint="t5-small"):
    """Return the T5 (tokenizer, model) pair for ``checkpoint``, loading it once."""
    if checkpoint not in _T5_CACHE:
        _T5_CACHE[checkpoint] = (
            T5Tokenizer.from_pretrained(checkpoint),
            T5ForConditionalGeneration.from_pretrained(checkpoint),
        )
    return _T5_CACHE[checkpoint]


def t5_summary(text, max_words):
    """Generate an abstractive summary of ``text`` with the ``t5-small`` model.

    The input is prefixed with ``"summarize: "`` and truncated to 512 tokens.
    Generation uses 4 beams.

    Args:
        text: Document to summarize.
        max_words: Any value accepted by ``int()``; it is passed to
            ``generate`` as ``max_length``, i.e. it bounds the generated
            sequence length as counted by the model, not a literal word count.

    Returns:
        The decoded summary, or ``"Error: <message>"`` when any exception is
        raised.
    """
    try:
        max_words = int(max_words)
        tokenizer, model = _load_t5()

        input_text = "summarize: " + text
        inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model.generate(
            inputs,
            max_length=max_words,
            # A fixed min_length of 30 contradicts max_length whenever the
            # caller asks for fewer than 30, so clamp it to the limit.
            min_length=min(30, max_words),
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary
    except Exception as e:
        return f"Error: {str(e)}"

# BART summarization
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated
# requests reuse the weights instead of loading them again on every call.
_BART_CACHE = {}


def _load_bart(checkpoint="facebook/bart-large-cnn"):
    """Return the BART (tokenizer, model) pair for ``checkpoint``, loading it once."""
    if checkpoint not in _BART_CACHE:
        _BART_CACHE[checkpoint] = (
            BartTokenizer.from_pretrained(checkpoint),
            BartForConditionalGeneration.from_pretrained(checkpoint),
        )
    return _BART_CACHE[checkpoint]


def bart_summary(text, max_words):
    """Generate an abstractive summary of ``text`` with ``facebook/bart-large-cnn``.

    The input is truncated to 1024 tokens.  Generation uses 4 beams.

    Args:
        text: Document to summarize.
        max_words: Any value accepted by ``int()``; it is passed to
            ``generate`` as ``max_length``, i.e. it bounds the generated
            sequence length as counted by the model, not a literal word count.

    Returns:
        The decoded summary, or ``"Error: <message>"`` when any exception is
        raised.
    """
    try:
        max_words = int(max_words)
        tokenizer, model = _load_bart()

        inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(
            inputs,
            max_length=max_words,
            # A fixed min_length of 30 contradicts max_length whenever the
            # caller asks for fewer than 30, so clamp it to the limit.
            min_length=min(30, max_words),
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary
    except Exception as e:
        return f"Error: {str(e)}"

# LLM (Gemini) summarization
def llm_summary(text, max_words):
    """Ask the Gemini chat model for a summary of ``text``.

    Args:
        text: Document to summarize.
        max_words: Any value accepted by ``int()``; used both in the prompt
            ("approximately N words") and as ``max_output_tokens``.

    Returns:
        The text of the model's reply, or ``"Error: <message>"`` when any
        exception is raised.
    """
    try:
        max_words = int(max_words)
        # NOTE(review): the other Gemini helpers in this file use
        # "gemini-1.5-flash" -- confirm that "gemini-pro" is intentional.
        # NOTE(review): max_output_tokens counts tokens, not words, so a reply
        # of ~max_words words may be cut short -- confirm the desired cap.
        llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, max_output_tokens=max_words)
        prompt = f"Summarize the following text in approximately {max_words} words:\n\n{text}"

        # invoke() accepts a plain string prompt.  The chat model answers with
        # a message object, so return its text rather than the object itself.
        response = llm.invoke(prompt)
        content = getattr(response, "content", response)
        return content if isinstance(content, str) else str(content)
    except Exception as e:
        return f"Error: {str(e)}"


# Advanced LangChain Summarization Techniques
def map_reduce_summary(text, max_words):
    """Summarize ``text`` with LangChain's map-reduce summarization chain.

    The text is split into overlapping chunks (1000 characters, 200 overlap)
    and the chunk documents are fed to a ``map_reduce`` chain built by
    ``load_summarize_chain``.

    Args:
        text: Document to summarize.
        max_words: Any value accepted by ``int()``; passed to the model as
            ``max_output_tokens``.

    Returns:
        The chain's output, or ``"Error in Map Reduce Summary: <message>"``
        when any exception is raised.
    """
    try:
        # The UI delivers this value as a string; the model setting needs an int.
        max_words = int(max_words)

        # Initialize the LLM
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3, max_output_tokens=max_words)

        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

        # Create one document per chunk.  The previous code wrapped the whole
        # text in a single `Document`, a name this file never imports, and
        # left the splitter unused.
        docs = text_splitter.create_documents([text])

        # Load map-reduce summarization chain
        chain = load_summarize_chain(
            llm,
            chain_type="map_reduce",
            verbose=True
        )

        # Run the chain
        summary = chain.run(docs)

        return summary
    except Exception as e:
        return f"Error in Map Reduce Summary: {str(e)}"

def iterative_refinement_summary(text, max_words):
    """Summarize ``text`` with LangChain's ``refine`` summarization chain.

    The text is split into overlapping chunks (1000 characters, 200 overlap).
    The first chunk is summarized with the initial prompt and every further
    chunk is used to refine the running summary.

    Args:
        text: Document to summarize.
        max_words: Any value accepted by ``int()``; passed to the model as
            ``max_output_tokens`` and quoted in the refine prompt as the word
            limit.

    Returns:
        The chain's output, or
        ``"Error in Iterative Refinement Summary: <message>"`` when any
        exception is raised.
    """
    try:
        # The UI delivers this value as a string; the model setting needs an int.
        max_words = int(max_words)

        # Initialize the LLM
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3, max_output_tokens=max_words)

        # Prompt for the first chunk
        map_prompt = PromptTemplate(
            template="""Write a concise summary of the following text:
            "{text}"
            CONCISE SUMMARY:""",
            input_variables=["text"]
        )

        # Refine prompt.  It must contain both placeholders: without
        # {existing_answer} and {text} the model never sees the running
        # summary or the new chunk.  The word limit is baked in here, so the
        # doubled braces keep the two placeholders for LangChain to fill.
        refine_prompt = PromptTemplate(
            template=f"""You are an expert summarizer.
            Existing summary:
            "{{existing_answer}}"
            New piece of text:
            "{{text}}"
            First, review the existing summary and the new piece of text.
            Then, refine the summary to include the most important information,
            ensuring it captures the key points while staying within {max_words} words.
            Maintain the word limit strictly.
            Refined Summary:""",
            input_variables=["existing_answer", "text"]
        )

        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        docs = text_splitter.create_documents([text])

        # Create the summary chain
        summary_chain = load_summarize_chain(
            llm,
            chain_type="refine",
            question_prompt=map_prompt,
            refine_prompt=refine_prompt,
            document_variable_name="text",
            initial_response_name="existing_answer",
            return_intermediate_steps=False
        )

        # Run the chain
        summary = summary_chain.run(docs)

        return summary
    except Exception as e:
        return f"Error in Iterative Refinement Summary: {str(e)}"

def pdf_summarizer(pdf_file, method, max_words):
    """Extract the text of a PDF and summarize it with the chosen method.

    NOTE: a later definition of ``pdf_summarizer`` in this file (taking four
    parameters) rebinds this name, so this three-parameter variant is only
    reachable until that definition runs.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source.
        method: One of "Map Reduce", "Iterative Refinement", "T5", "BART" or
            "LLM (Gemini)".
        max_words: Length limit forwarded to the selected summarizer.

    Returns:
        The summarizer's result, "Invalid summarization method selected." for
        an unknown ``method``, or ``"Error in PDF Summarization: <message>"``
        when any exception is raised.
    """
    try:
        # Page texts, each terminated by a newline.
        document = PdfReader(pdf_file)
        text = "".join(page.extract_text() + "\n" for page in document.pages)

        if method == "Map Reduce":
            return map_reduce_summary(text, max_words)
        if method == "Iterative Refinement":
            return iterative_refinement_summary(text, max_words)
        if method == "T5":
            return t5_summary(text, max_words)
        if method == "BART":
            return bart_summary(text, max_words)
        if method == "LLM (Gemini)":
            return llm_summary(text, max_words)
        return "Invalid summarization method selected."
    except Exception as exc:
        return f"Error in PDF Summarization: {exc!s}"

# Extractive Summarization
def extractive_summarize(text, algorithm, max_words):
    """Run the extractive summarizer named by ``algorithm``.

    Args:
        text: Document to summarize.
        algorithm: "Frequency-based", "LexRank" or "TextRank".
        max_words: Length limit forwarded to the summarizer.

    Returns:
        The summarizer's result, or
        "Invalid extractive summarization algorithm." for any other name.
    """
    # Deferred calls: a summarizer is only looked up when its label matches.
    candidates = (
        ("Frequency-based", lambda: frequency_based_summary(text, max_words)),
        ("LexRank", lambda: lexrank_summary(text, max_words)),
        ("TextRank", lambda: textrank_summary(text, max_words)),
    )
    for label, run in candidates:
        if algorithm == label:
            return run()
    return "Invalid extractive summarization algorithm."

# Abstractive Summarization
def abstractive_summarize(text, algorithm, max_words):
    """Run the abstractive summarizer named by ``algorithm``.

    Args:
        text: Document to summarize.
        algorithm: "T5" or "BART".
        max_words: Length limit forwarded to the summarizer.

    Returns:
        The summarizer's result, or
        "Invalid abstractive summarization algorithm." for any other name.
    """
    if algorithm == "T5":
        selected = t5_summary
    elif algorithm == "BART":
        selected = bart_summary
    else:
        return "Invalid abstractive summarization algorithm."
    return selected(text, max_words)

# LLM-based Summarization
def llm_summarize(text, algorithm, max_words):
    """Run the LLM-based summarizer named by ``algorithm``.

    Args:
        text: Document to summarize.
        algorithm: "LLM (Gemini)", "Iterative Refinement" or "Map Reduce".
        max_words: Length limit forwarded to the summarizer.

    Returns:
        The summarizer's result, or "Invalid LLM summarization algorithm."
        for any other name.
    """
    if algorithm == "LLM (Gemini)":
        return llm_summary(text, max_words)
    elif algorithm == "Iterative Refinement":
        return iterative_refinement_summary(text, max_words)
    elif algorithm == "Map Reduce":
        # Previously dispatched to iterative_refinement_summary by mistake.
        return map_reduce_summary(text, max_words)
    else:
        return "Invalid LLM summarization algorithm."

# PDF Summarization
def pdf_summarizer(pdf_file, summarization_type, algorithm, max_words):
    """Extract the text of a PDF and summarize it.

    Args:
        pdf_file: Source handed to ``PdfReader``; a falsy value (for example
            ``None`` when nothing was uploaded) is rejected up front.
        summarization_type: "Extractive", "Abstractive" or "LLM".
        algorithm: Algorithm name forwarded to the matching dispatcher.
        max_words: Length limit forwarded to the summarizer.

    Returns:
        The summarizer's result, "Invalid summarization type." for an unknown
        ``summarization_type``, or a string starting with
        "Error in PDF summarization: " when the file is missing, contains no
        extractable text, or any exception is raised.
    """
    try:
        if not pdf_file:
            return "Error in PDF summarization: no PDF file was provided."

        pdf_reader = PdfReader(pdf_file)
        # Separate pages with a newline so the last word of one page does not
        # fuse with the first word of the next.  `or ""` tolerates a page for
        # which no text is returned.
        text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

        if not text.strip():
            return "Error in PDF summarization: no extractable text found in the PDF."

        if summarization_type == "Extractive":
            return extractive_summarize(text, algorithm, max_words)
        elif summarization_type == "Abstractive":
            return abstractive_summarize(text, algorithm, max_words)
        elif summarization_type == "LLM":
            return llm_summarize(text, algorithm, max_words)
        else:
            return "Invalid summarization type."
    except Exception as e:
        return f"Error in PDF summarization: {str(e)}"

# Gradio Interface
# Algorithms offered for each summarization type.  Shared by the dropdown's
# initial state and by update_algorithms so the two cannot drift apart.
ALGORITHM_CHOICES = {
    "Extractive": ["Frequency-based", "LexRank", "TextRank"],
    "Abstractive": ["T5", "BART"],
    "LLM": ["LLM (Gemini)", "Map Reduce", "Iterative Refinement"],
}
DEFAULT_SUMMARIZATION_TYPE = "Extractive"

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Text Summarization App")

    with gr.Tabs():
        with gr.Tab("Summarization"):
            gr.Markdown("## Select Summarization Type and Method")

            with gr.Row():
                with gr.Column():
                    input_type = gr.Radio(
                        ["Text Input", "PDF Upload"],
                        label="Input Type",
                        value="Text Input"
                    )

                    # Text Input
                    text_input = gr.Textbox(
                        label="Input Text",
                        lines=5,
                        placeholder="Enter text here",
                        visible=True
                    )

                    # PDF Upload (hidden until "PDF Upload" is selected)
                    pdf_input = gr.File(
                        label="Upload PDF",
                        type="filepath",
                        file_types=[".pdf"],
                        visible=False
                    )

                with gr.Column():
                    summarization_type = gr.Radio(
                        list(ALGORITHM_CHOICES),
                        label="Summarization Type",
                        value=DEFAULT_SUMMARIZATION_TYPE
                    )

                    # Pre-populate with the algorithms of the preselected
                    # type: the change handler below only runs when the radio
                    # value changes, so without this the dropdown starts empty.
                    method_dropdown = gr.Dropdown(
                        choices=ALGORITHM_CHOICES[DEFAULT_SUMMARIZATION_TYPE],
                        value=ALGORITHM_CHOICES[DEFAULT_SUMMARIZATION_TYPE][0],
                        label="Algorithm",
                        visible=True
                    )

                    words = gr.Textbox(
                        label="Maximum Words",
                        value="100"
                    )

            # Output
            output = gr.Textbox(
                label="Summary",
                lines=5
            )

            # Generate Button
            button = gr.Button("Generate Summary")

            # Input Type Toggle
            def toggle_inputs(choice):
                """Show the text box or the PDF upload, whichever `choice` selects."""
                show_text = choice == "Text Input"
                return {
                    text_input: gr.update(visible=show_text),
                    pdf_input: gr.update(visible=not show_text)
                }

            input_type.change(
                toggle_inputs,
                inputs=input_type,
                outputs=[text_input, pdf_input]
            )

            # Update Algorithms Dropdown
            def update_algorithms(summarization_type):
                """Offer the algorithms of the selected type and select the first one."""
                choices = ALGORITHM_CHOICES.get(summarization_type)
                if choices is None:
                    return gr.update(visible=False)
                # Reset the value too; otherwise the algorithm chosen for the
                # previous type stays selected and is rejected as invalid.
                return gr.update(choices=choices, value=choices[0], visible=True)

            summarization_type.change(
                update_algorithms,
                inputs=summarization_type,
                outputs=method_dropdown
            )

            # Summarization Logic
            def summarize(input_type, text, pdf, summarization_type, method, max_words):
                """Route the request to the PDF pipeline or to the text summarizers."""
                if input_type != "Text Input":  # PDF Upload
                    return pdf_summarizer(pdf, summarization_type, method, max_words)

                if not text or not text.strip():
                    return "Please enter some text to summarize."

                if summarization_type == "Extractive":
                    return extractive_summarize(text, method, max_words)
                elif summarization_type == "Abstractive":
                    return abstractive_summarize(text, method, max_words)
                elif summarization_type == "LLM":
                    return llm_summarize(text, method, max_words)
                else:
                    return "Invalid summarization type."

            button.click(
                summarize,
                inputs=[input_type, text_input, pdf_input, summarization_type, method_dropdown, words],
                outputs=output
            )

demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d36279927f4896acbd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


