<a href="https://colab.research.google.com/github/springboardmentor0327/Text_Summarization_Infosys_Internship_Oct2024/blob/Sameer/FINAL_gradio_app_with_all_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
!pip install gradio --quiet
!pip install sumy



**FREQUENCY MODEL**

In [60]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [61]:
# Fetch the NLTK data used below: sentence/word tokenizers ('punkt',
# 'punkt_tab'), the stopword list, and WordNet (for lemmatization).
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [62]:
def freq_model(text, max_words=30):
    """Build an extractive summary by scoring sentences on word frequency.

    Each non-stopword token is counted case-insensitively across the whole
    text. A sentence's score is the sum of the corpus frequencies of the
    distinct content words it contains. Sentences scoring above 1.2x the
    average score are emitted in document order for as long as they fit
    within ``max_words``.

    Args:
        text: The text to summarize.
        max_words: Upper bound on the summary length, measured in tokens
            as produced by ``word_tokenize``.

    Returns:
        A ``(summary, word_count_message)`` tuple. ``summary`` is an empty
        string when the text has no scorable sentence.
    """
    stop_words = set(stopwords.words("english"))

    # Count content words in one pass. Lower-casing *before* counting makes
    # "Cats" and "cats" share a bucket; tokens that are pure punctuation are
    # dropped so they cannot inflate every sentence's score.
    word_freq = {}
    for token in word_tokenize(text):
        key = token.lower()
        if key in stop_words or not any(ch.isalnum() for ch in key):
            continue
        word_freq[key] = word_freq.get(key, 0) + 1

    sentences = sent_tokenize(text)

    # Score by whole-token membership rather than substring search, so that
    # e.g. "art" does not match inside "start". Scores are kept positionally
    # so duplicate sentences are scored independently.
    sentence_scores = []
    for sentence in sentences:
        distinct_tokens = {tok.lower() for tok in word_tokenize(sentence)}
        sentence_scores.append(sum(word_freq.get(tok, 0) for tok in distinct_tokens))

    # Only sentences that contain at least one content word take part in the
    # average; bail out early when there are none (e.g. empty input).
    positive_scores = [score for score in sentence_scores if score > 0]
    if not positive_scores:
        return "", "\nWord Count (Summary): 0"

    average = int(sum(positive_scores) / len(positive_scores))

    # Greedily keep above-threshold sentences while they fit the budget.
    summary_sentences = []
    current_word_count = 0
    for sentence, score in zip(sentences, sentence_scores):
        if score <= 1.2 * average:
            continue
        sentence_word_count = len(word_tokenize(sentence))
        if current_word_count + sentence_word_count <= max_words:
            summary_sentences.append(sentence)
            current_word_count += sentence_word_count

    summary = ' '.join(summary_sentences)
    summary_word_count = len(word_tokenize(summary))
    return summary, f"\nWord Count (Summary): {summary_word_count}"

**SUMY MODEL**

In [63]:
!pip install sumy



In [114]:
# Import required libraries
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [65]:
def sumy_model(text, max_words=50):
    """Build an extractive summary with sumy's TextRank summarizer.

    Sentences returned by the summarizer are accepted greedily for as long
    as the running word count stays within ``max_words``; a sentence that
    would overflow the budget is skipped and shorter later ones may still
    be taken.

    Args:
        text: The text to summarize.
        max_words: Upper bound on the summary length, measured in tokens
            as produced by ``word_tokenize``.

    Returns:
        A ``(summary, word_count_message)`` tuple.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()

    # Ask for every sentence and trim to the word budget ourselves.
    # NOTE(review): when all sentences are requested, sumy presumably hands
    # them back in document order rather than rank order - confirm, and
    # consider requesting fewer sentences if rank should drive selection.
    candidate_sentences = summarizer(parser.document, len(parser.document.sentences))

    summary = []
    current_word_count = 0
    for sentence in candidate_sentences:
        sentence_text = str(sentence)
        sentence_word_count = len(word_tokenize(sentence_text))
        # Skip (rather than stop at) a sentence that does not fit, so one
        # long sentence cannot block the shorter ones after it.
        if current_word_count + sentence_word_count > max_words:
            continue
        summary.append(sentence_text)
        current_word_count += sentence_word_count

    text_summary = " ".join(summary)
    summary_word_count = len(word_tokenize(text_summary))
    return text_summary, f"\nWord Count (Summary): {summary_word_count}"

**LUHN MODEL**

In [116]:
# Import required libraries
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [117]:
def luhn_model(text, max_words=30):
    """Build an extractive summary with sumy's Luhn summarizer.

    Sentences are accepted greedily while they fit within ``max_words``;
    selection stops as soon as the running total reaches
    ``max_words - 10`` (clamped at zero).

    Args:
        text: The text to summarize.
        max_words: Upper bound on the summary length, measured in tokens
            as produced by ``word_tokenize``.

    Returns:
        A ``(summary, word_count_message)`` tuple.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_luhn = LuhnSummarizer()

    # Ask for every sentence and trim to the word budget ourselves.
    # NOTE(review): when all sentences are requested, sumy presumably hands
    # them back in document order rather than rank order - confirm.
    candidate_sentences = summarizer_luhn(parser.document, len(parser.document.sentences))

    summary = []
    current_word_count = 0
    min_words = max(0, max_words - 10)  # never negative

    for sentence in candidate_sentences:
        sentence_text = str(sentence)
        sentence_word_count = len(word_tokenize(sentence_text))

        # A sentence that does not fit is skipped instead of ending the
        # scan, so an over-long first sentence no longer yields an empty
        # summary when shorter sentences follow.
        if current_word_count + sentence_word_count > max_words:
            continue

        summary.append(sentence_text)
        current_word_count += sentence_word_count

        # Good enough: we are inside the [min_words, max_words] window.
        if current_word_count >= min_words:
            break

    final_summary = ' '.join(summary)
    summary_word_count = len(word_tokenize(final_summary))
    return final_summary, f"\nWord Count (Summary): {summary_word_count}"

**T5 MODEL**

In [68]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

import glob
import pprint

# Pretty-printer instance; no code in the visible part of this notebook uses `pp`.
pp = pprint.PrettyPrinter()

In [69]:
# Load the pretrained 't5-base' tokenizer and model once at module level.
# `t5_model` reads these two globals by name, so this cell must run first.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

In [70]:
def t5_model(text, max_words=50):
    """Summarize ``text`` with the module-level T5 ``tokenizer`` and ``model``.

    The summary is generated with beam search. If it comes out longer than
    ``max_words`` the token budget is reduced in steps of 5 and generation
    is retried, until the summary fits or the budget drops below
    ``1.5 * min_words``. The last generated summary is returned even when
    it is still outside the target range.

    Args:
        text: The text to summarize.
        max_words: Target maximum number of whitespace-separated words.

    Returns:
        A ``(summary, word_count_message)`` tuple.
    """
    num_beams = 5
    max_words = int(max_words)  # UI sliders may hand over a float
    min_words = max(1, max_words - 10)
    # `generate` limits are in tokens, not words; allow ~1.5 tokens per word.
    max_tokens = max_words * 1.5

    # The encoded input does not change between retries, so build it once.
    # Inputs longer than 512 tokens are truncated.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True
    )

    summary = ""
    summary_word_count = 0
    while True:
        summary_ids = model.generate(
            inputs,
            max_length=int(max_tokens),
            min_length=min_words,
            num_beams=num_beams,
            length_penalty=2.0,
            early_stopping=True
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summary_word_count = len(summary.split())

        # Done when the summary fits. A summary that is too *short* also
        # ends the loop: lowering max_tokens can only make it shorter, so
        # retrying would just repeat an expensive generation.
        if summary_word_count <= max_words:
            break

        # Too long: tighten the token budget and try again.
        max_tokens -= 5
        if max_tokens < min_words * 1.5:
            break  # budget is impractically low; keep the last attempt

    return summary, f"\nWord Count (Summary): {summary_word_count}"

**LLM MODEL**

In [71]:
from google.colab import userdata
import os
# Copy the 'GOOGLE_API_KEY' Colab secret into the process environment.
# NOTE(review): presumably this is where the Gemini client looks for its key - confirm.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

In [72]:
%pip install --upgrade --quiet tiktoken langchain langgraph beautifulsoup4 langchain langchain-google-genai langchain-huggingface

In [73]:
from langchain_google_genai import ChatGoogleGenerativeAI

def load_llm(model="gemini-1.5-pro"):
  """Create a deterministic (temperature 0) Gemini chat client.

  Args:
    model: Either "gemini-1.5-pro" or "gemini-1.5-flash".

  Returns:
    A configured ``ChatGoogleGenerativeAI`` instance.

  Raises:
    ValueError: If ``model`` is not one of the two supported names.
  """
  supported_models = ("gemini-1.5-pro", "gemini-1.5-flash")
  if model not in supported_models:
    raise ValueError("Invalid model name")

  # Both supported models share the same client settings.
  return ChatGoogleGenerativeAI(
      model=model,
      temperature=0,
      max_tokens=None,
      timeout=None,
      max_retries=2)

In [74]:
from langchain_core.prompts import ChatPromptTemplate

def get_prompt_template():
    """Return the chat prompt used for LLM summarization.

    The template expects three variables: ``min_word_count``,
    ``max_word_count`` (both in the system message) and ``context`` (the
    text to summarize, sent as the human message).
    """
    system_instruction = "Write a concise summary of the following in {min_word_count} to {max_word_count} words. Stop once you reach the word limit:\n\n"
    messages = [
        ("system", system_instruction),
        ("human", "{context}"),
    ]
    return ChatPromptTemplate.from_messages(messages)

def llm_model(text, max_word_count=50):
    """Summarize ``text`` with Gemini 1.5 Pro inside a word budget.

    The model is asked for ``max_word_count - 5`` (at least 1) to
    ``max_word_count`` words. Because the model may ignore that request,
    the reply is hard-truncated to ``max_word_count`` words afterwards.

    Args:
        text: The text to summarize.
        max_word_count: Maximum number of whitespace-separated words.

    Returns:
        A ``(summary, word_count_message)`` tuple.
    """
    model = "gemini-1.5-pro"
    max_word_count = int(max_word_count)  # slicing below needs an int
    min_word_count = max(1, max_word_count - 5)  # never ask for <= 0 words

    llm = load_llm(model)
    prompt = get_prompt_template()
    chain = prompt | llm

    result = chain.invoke({
        "context": text,
        "min_word_count": min_word_count,
        "max_word_count": max_word_count
    })

    # NOTE(review): assumes `result.content` is a plain string - confirm for
    # the client version in use.
    output = result.content
    words = output.split()

    if len(words) > max_word_count:
        # Drop any punctuation left dangling at the cut before closing the
        # sentence, so the result never ends in ",." or "..".
        truncated = ' '.join(words[:max_word_count]).rstrip(" .,;:!?-")
        output = truncated + "."

    return output, f"\nWord Count (Summary): {len(output.split())}"


**BART MODEL**

In [76]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

In [77]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the large
# BART checkpoint is read once per kernel rather than once per request.
_BART_CACHE = {}


def _load_bart(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``."""
    if model_name not in _BART_CACHE:
        _BART_CACHE[model_name] = (
            BartTokenizer.from_pretrained(model_name),
            BartForConditionalGeneration.from_pretrained(model_name),
        )
    return _BART_CACHE[model_name]


def bart_model(article, max_words=50):
    """Summarize ``article`` with ``facebook/bart-large-cnn``.

    The summary is generated with beam search. If it comes out longer than
    ``max_words`` the token budget is reduced in steps of 5 and generation
    is retried, until the summary fits or the budget drops below
    ``1.5 * min_words``. The last generated summary is returned even when
    it is still outside the target range.

    Args:
        article: The text to summarize.
        max_words: Target maximum number of whitespace-separated words.

    Returns:
        A ``(summary, word_count_message)`` tuple.
    """
    tokenizer, model = _load_bart('facebook/bart-large-cnn')

    max_words = int(max_words)  # UI sliders may hand over a float
    min_words = max(10, max_words - 10)
    # `generate` limits are in tokens, not words; allow ~1.5 tokens per word.
    max_tokens = max_words * 1.5

    original_word_count = len(article.split())
    print(f"Original article word count: {original_word_count}")

    # Inputs longer than 1024 tokens are truncated.
    inputs = tokenizer.encode(article, return_tensors='pt', max_length=1024, truncation=True)

    summary = ""
    summary_word_count = 0
    while True:
        summary_ids = model.generate(
            inputs,
            num_beams=4,
            min_length=min_words,
            max_length=int(max_tokens),
            length_penalty=2.0,
            early_stopping=True
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summary_word_count = len(summary.split())

        # Done when the summary fits. A summary that is too *short* also
        # ends the loop: lowering max_tokens can only make it shorter.
        if summary_word_count <= max_words:
            break

        # Too long: tighten the token budget and try again.
        max_tokens -= 5
        if max_tokens < min_words * 1.5:
            break  # budget is impractically low; keep the last attempt

    return summary, f"Generated summary word count: {summary_word_count}"

**Defining functions of all models**

In [109]:
# Placeholder for input and method-specific functions
def extractive_summarize(text, method, max_words):
    """Route ``text`` to the extractive summarizer named by ``method``.

    Args:
        text: The text to summarize.
        method: "Frequency Method", "Sumy Method" or "Luhn Method".
        max_words: Word budget forwarded to the chosen summarizer.

    Returns:
        The chosen summarizer's ``(summary, word_count_message)`` tuple. For
        an unknown or missing ``method`` a ``(message, "")`` tuple is
        returned so the caller still receives two values.
    """
    if method == "Frequency Method":
        return freq_model(text, max_words)
    elif method == "Sumy Method":
        return sumy_model(text, max_words)
    elif method == "Luhn Method":
        return luhn_model(text, max_words)
    # e.g. nothing was picked in the dropdown, so `method` is None
    return "Please choose an extractive method.", ""

def abstractive_summarize(text, model, max_words):
    """Route ``text`` to the abstractive summarizer named by ``model``.

    Args:
        text: The text to summarize.
        model: "T5" or "BART".
        max_words: Word budget forwarded to the chosen summarizer.

    Returns:
        The chosen summarizer's ``(summary, word_count_message)`` tuple. For
        an unknown or missing ``model`` a ``(message, "")`` tuple is
        returned so the caller still receives two values.
    """
    if model == "T5":
        return t5_model(text, max_words)
    elif model == "BART":
        return bart_model(text, max_words)
    # e.g. nothing was picked in the dropdown, so `model` is None
    return "Please choose an abstractive model.", ""


def advance_summarize(text, model, max_words):
    """Summarize plain ``text`` with the LLM for any of the LLM-tab options.

    All three options ("LLM Model", "refine", "map_Reduce") are served by
    ``llm_model`` here; this function receives text, not a PDF path.

    Args:
        text: The text to summarize.
        model: "LLM Model", "refine" or "map_Reduce".
        max_words: Word budget forwarded to ``llm_model``.

    Returns:
        ``llm_model``'s ``(summary, word_count_message)`` tuple. For an
        unknown or missing ``model`` a ``(message, "")`` tuple is returned
        so the caller still receives two values.
    """
    if model in ("LLM Model", "refine", "map_Reduce"):
        return llm_model(text, max_words)
    # e.g. nothing was picked in the dropdown, so `model` is None
    return "Please choose an LLM method.", ""

**Text Extractor**

In [84]:
!pip install PyPDF2
from PyPDF2 import PdfReader



In [85]:
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        pdf_file: Either a path string or an upload object exposing the
            path as ``.name``.

    Returns:
        The concatenated page text; pages that yield no text contribute
        an empty string.
    """
    # Accept both upload objects (path in `.name`) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    reader = PdfReader(pdf_path)
    # `extract_text()` can return None for pages without a text layer;
    # treat that as empty instead of failing on the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

**Document Summarizer**

In [80]:
!pip install -U gradio langchain langchain-community pypdf langchain-google-genai langgraph --quiet

In [81]:
from langchain.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain

In [82]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Module-level Gemini 1.5 Flash client (temperature 0, up to 2 retries).
# `summarize_pdf` reads this `llm` global when it builds its summarize chain.
llm = ChatGoogleGenerativeAI(
      model="gemini-1.5-flash",
      temperature=0,
      max_tokens=None,
      timeout=None,
      max_retries=2
)


In [110]:
def summarize_pdf(pdf_file_path, chain_type, max_word_count):
    """Summarize a PDF with a LangChain summarize chain on the global ``llm``.

    Args:
        pdf_file_path: Path of the PDF to load.
        chain_type: "map_reduce" or "refine"; matched case-insensitively,
            so the UI value "map_Reduce" is accepted too.
        max_word_count: Maximum number of whitespace-separated words; a
            longer summary is truncated to this length.

    Returns:
        A ``(summary, word_count_message)`` tuple. For an invalid
        ``chain_type`` the first element is an error message and the second
        is an empty string.
    """
    normalized_type = str(chain_type).lower()
    if normalized_type not in ("map_reduce", "refine"):
        # Keep the two-value shape that callers unpack into two outputs.
        return "Enter valid chain_type!! (map_reduce or refine)", ""

    max_word_count = int(max_word_count)  # slicing below needs an int

    loader = PyPDFLoader(pdf_file_path)
    docs = loader.load_and_split()

    chain = load_summarize_chain(llm, chain_type=normalized_type)
    summary = chain.invoke(docs)

    summary_text = summary['output_text']
    words = summary_text.split()

    # A summary shorter than the budget is returned as-is: padding it with
    # repeated words would only add noise. A longer one is cut at the word
    # limit, dropping punctuation left dangling at the cut.
    if len(words) > max_word_count:
        summary_text = " ".join(words[:max_word_count]).rstrip(" .,;:!?-") + '.'

    return summary_text, f"\nWord Count (Summary): {len(summary_text.split())}"

    # return summary_text

**Handling Functions**

In [86]:
def extractive_handler(pdf_file, input_text, method, word_count):
    """Summarize extractively, preferring an uploaded PDF over typed text.

    When ``pdf_file`` is not None its extracted text is summarized;
    otherwise ``input_text`` is used. Returns whatever
    ``extractive_summarize`` returns.
    """
    source_text = input_text if pdf_file is None else extract_text_from_pdf(pdf_file)
    return extractive_summarize(source_text, method, word_count)

In [111]:
def abstractive_handler(pdf_file, input_text, method, word_count):
    """Summarize abstractively, preferring an uploaded PDF over typed text.

    When ``pdf_file`` is not None its extracted text is summarized;
    otherwise ``input_text`` is used. Returns whatever
    ``abstractive_summarize`` returns.
    """
    source_text = input_text if pdf_file is None else extract_text_from_pdf(pdf_file)
    return abstractive_summarize(source_text, method, word_count)

In [112]:
def advance_handler(pdf_file, input_text, method, max_words):
    """Entry point for the LLM tab.

    - No PDF: the typed text goes to ``advance_summarize``.
    - PDF with "map_Reduce" or "refine": the file goes straight to
      ``summarize_pdf``.
    - PDF with any other method: its extracted text goes to
      ``advance_summarize``.
    """
    if pdf_file is None:
        return advance_summarize(input_text, method, max_words)

    if method in ("map_Reduce", "refine"):
        return summarize_pdf(pdf_file, method, max_words)

    pdf_text = extract_text_from_pdf(pdf_file)
    return advance_summarize(pdf_text, method, max_words)

**Gradio App**

In [113]:
import gradio as gr

# Define CSS: centre the page's <h1> title and give it some top margin.
css = """
h1 {
    margin-top: 2rem;
    font-size: 2rem;
    text-align: center;
}
"""
# Standalone components created outside any `gr.Blocks` context. None of
# these four objects is passed to an event handler in this cell.
input_text = gr.Textbox(label="Input Text", lines=10)
max_words = gr.Slider(label="Max Words", minimum=10, maximum=100, step=10, value=50)
input_pdf_path = gr.File(label="Enter the PDF file path")
# chain_type = gr.Text(label="Enter Chain_type")
output_summary = gr.Text(label="Summary")


# Function to toggle input fields
def toggle_input(input_type):
    """Show the input widget matching ``input_type`` and hide the other.

    Args:
        input_type: "Text" or "PDF" (the radio selection). Any other value
            is treated like "Text".

    Returns:
        A pair of ``gr.update`` objects, for the text box and the file
        upload, in that order.
    """
    if input_type == "PDF":
        return gr.update(visible=False), gr.update(visible=True)
    # Switching (back) to text also clears the upload. Otherwise the hidden
    # file would stay set and the handlers, which prefer a PDF whenever one
    # is present, would ignore the typed text.
    return gr.update(visible=True), gr.update(visible=False, value=None)

# Gradio app structure
def _build_summary_tab(tab_title, method_choices, method_label, handler):
    """Add one summarizer tab to the `gr.Blocks` layout currently being built.

    Every tab has the same shape: inputs on the left (text/PDF switch, text
    box, PDF upload, method dropdown, word-limit slider, button) and the
    summary plus its word count on the right.

    Args:
        tab_title: Label shown on the tab.
        method_choices: Options offered in the method dropdown; the first
            one is pre-selected so the handler never receives None.
        method_label: Label of the method dropdown.
        handler: Callable invoked as
            ``handler(pdf_file, input_text, method, max_words)`` that
            returns ``(summary, word_count_message)``.
    """
    with gr.TabItem(tab_title):
        with gr.Row():
            # Left column: Inputs
            with gr.Column(scale=1):
                gr.Markdown("### Inputs")
                input_type = gr.Radio(
                    ["Text", "PDF"],
                    label="Select Input Type",
                    value="Text",
                )
                input_text = gr.Textbox(
                    label="Input Text",
                    lines=10,
                    visible=True,
                )
                input_pdf_path = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    visible=False,
                )
                method_dropdown = gr.Dropdown(
                    method_choices,
                    label=method_label,
                    value=method_choices[0],
                )
                max_words = gr.Slider(
                    label="Max Words", minimum=10, maximum=100, step=10, value=40
                )
                summarize_button = gr.Button("Summarize")

            # Right column: Outputs
            with gr.Column(scale=1):
                gr.Markdown("### Outputs")
                output_summary = gr.Textbox(label="Summary Output")
                output_word_count = gr.Textbox(label="Word Count")

        # Change visibility based on input type selection
        input_type.change(
            toggle_input,
            inputs=input_type,
            outputs=[input_text, input_pdf_path],
        )

        # Trigger summarization
        summarize_button.click(
            handler,
            inputs=[input_pdf_path, input_text, method_dropdown, max_words],
            outputs=[output_summary, output_word_count],
        )


with gr.Blocks(title="Summarizer App", css=css) as demo:
    gr.Markdown("# Summarizer App")

    with gr.Tabs():
        _build_summary_tab(
            "Extractive",
            ["Frequency Method", "Sumy Method", "Luhn Method"],
            "Choose Extractive Method",
            extractive_handler,
        )
        _build_summary_tab(
            "Abstractive",
            ["T5", "BART"],
            "Choose Abstractive Model",
            abstractive_handler,
        )
        _build_summary_tab(
            "Advance LLM's",
            ["LLM Model", "refine", "map_Reduce"],
            "Choose LLM Method",
            advance_handler,
        )

demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ad3c1f70462d0167a2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




**GRADIO APP without pdf version**

In [50]:
# import gradio as gr

# # Define your CSS
# css = """
# h1 {
#     margin-top: 2rem;
#     font-size: 2rem;
#     text-align: center;
# }
# """

# # Define input fields
# input_text = gr.Textbox(label="Input Text", lines=10)
# max_words = gr.Slider(label="Max Words", minimum=10, maximum=100, step=10, value=50)
# input_pdf_path = gr.File(label="Enter the PDF file path")
# # chain_type = gr.Text(label="Enter Chain_type")
# output_summary = gr.Text(label="Summary")

# # Gradio app structure
# with gr.Blocks(title="Summarizer App", css=css) as demo:
#     gr.Markdown("# Summarizer App")

#     with gr.Tabs():
#         with gr.TabItem("Extractive"):
#           input_type = gr.Radio(
#                         ["Text Input", "PDF Upload"],
#                         label="Input Type",
#                         value="Text Input"
#                     )

#                     # Text Input
#                     text_input = gr.Textbox(
#                         label="Input Text",
#                         lines=5,
#                         placeholder="Enter text here",
#                         visible=True
#                     )

#                     # PDF Upload
#                     pdf_input = gr.File(
#                         label="Upload PDF",
#                         type="filepath",
#                         file_types=[".pdf"],
#                         visible=False
#                     )
#             extractive_dropdown = gr.Dropdown(
#                 ["Frequency Method", "Sumy Method", "Luhn Method"],
#                 label="Choose Extractive Method",
#             )
#             gr.Interface(
#                 fn=extractive_summarize,
#                 inputs=[input_text, extractive_dropdown, max_words],
#                 # outputs=["text","text"],
#                 outputs=[
#                     gr.Textbox(label="Summary Output"),
#                     gr.Textbox(label="Word Count"),
#                 ],
#                 flagging_mode="never",
#                 live=False,
#             )
#         with gr.TabItem("Abstractive"):
#             abstractive_dropdown = gr.Dropdown(
#                 ["T5", "BART", "LLM Model"],
#                 label="Choose Abstractive Model",
#             )
#             gr.Interface(
#                 fn=abstractive_summarize,
#                 inputs=[input_text, abstractive_dropdown, max_words],
#                 # outputs="text",
#                 outputs=[
#                     gr.Textbox(label="Summary Output"),
#                     gr.Textbox(label="Word Count"),
#                 ],
#                 flagging_mode="never",
#                 live=False,
#             )
#         with gr.TabItem("PDF"):
#               chain_type = gr.Dropdown(
#                 ["refine", "map_reduce"],
#                 label="Choose Chain_type",
#             )
#               gr.Interface(fn=summarize_pdf,
#                         inputs=[input_pdf_path,chain_type,max_words],
#                         # outputs=[output_summary],
#                         outputs=[
#                            gr.Textbox(label="Summary Output"),
#                            gr.Textbox(label="Word Count"),
#                         ],
#                         flagging_mode='never',
#                         submit_btn='Generate',
#                         live=False,
#             )

# # Launch app
# demo.launch(debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://32fd7eede5f6584f5c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://398bfb5f005e57f8a6.gradio.live
Killing tunnel 127.0.0.1:7861 <> https://47d09e27313f8b35b6.gradio.live
Killing tunnel 127.0.0.1:7862 <> https://1aeaac70e42cab9d0c.gradio.live
Killing tunnel 127.0.0.1:7863 <> https://8f4afd0fcf5843f558.gradio.live
Killing tunnel 127.0.0.1:7864 <> https://e84aff949ed92b0c7b.gradio.live
Killing tunnel 127.0.0.1:7865 <> https://f47c88a44b9d72899d.gradio.live
Killing tunnel 127.0.0.1:7866 <> https://13a33e76687569c288.gradio.live
Killing tunnel 127.0.0.1:7867 <> https://7597e00ceb5c430b5e.gradio.live
Killing tunnel 127.0.0.1:7868 <> https://14abe7c142fdfa7865.gradio.live
Killing tunnel 127.0.0.1:7869 <> https://32fd7eede5f6584f5c.gradio.live


