<a href="https://colab.research.google.com/github/springboardmentor0327/Text_Summarization_Infosys_Internship_Oct2024/blob/BandariRohith/Week_4_gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gradio --quiet
!pip install sumy --quiet
import os
import gradio as gr
from google.colab import userdata

# Set up Google API Key for LangChain LLM.
# The value is read from the Colab user-data entry named 'API' and exported
# as the GOOGLE_API_KEY environment variable (presumably the variable that
# langchain_google_genai reads its credentials from -- confirm against its docs).
os.environ["GOOGLE_API_KEY"] = userdata.get('API')

# Import necessary libraries
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline, AutoTokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import heapq

# Fetch the NLTK resources the extractive summarizers rely on: the Punkt
# sentence-tokenizer data (used by sent_tokenize / word_tokenize) and the
# English stop-word list.
# Newer NLTK releases (3.8.2+) load Punkt from the 'punkt_tab' resource instead
# of the pickled 'punkt' package, so both are requested. nltk.download() just
# returns False for a resource id the installed version does not know about,
# so asking for both is safe on old and new versions alike.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Define LLM loader function
def load_llm(model="gemini-1.5-pro"):
    """Build the Gemini chat model used for LLM-based summarization.

    Args:
        model: Name of the model to load; "gemini-1.5-pro" or
            "gemini-1.5-flash".

    Returns:
        A ``ChatGoogleGenerativeAI`` instance created with temperature 0,
        ``max_tokens=None``, ``timeout=None`` and ``max_retries=2``.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    supported_models = ("gemini-1.5-pro", "gemini-1.5-flash")
    if model not in supported_models:
        raise ValueError("Invalid model name")

    # Both supported models are constructed with the same settings.
    return ChatGoogleGenerativeAI(
        model=model,
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

# Define prompt template for the LLM
def get_prompt_template():
    """Return the chat prompt used to request a summary from the LLM.

    The template has two placeholders: ``{num_words}`` in the system message
    and ``{context}`` in the human message.
    """
    system_message = ("system", "Write a concise summary of the following in {num_words} words:\n\n")
    human_message = ("human", "{context}")
    return ChatPromptTemplate.from_messages([system_message, human_message])

# Function to use LLM for summarization with size handling
def abstractive_summarization_llm(txt, num_words, model="gemini-1.5-pro", max_input_words=1024):
    """Summarize ``txt`` with a Gemini chat model.

    Args:
        txt: Text to summarize.
        num_words: Target summary length, substituted into the prompt.
        model: Model name forwarded to ``load_llm``.
        max_input_words: Maximum number of whitespace-separated words sent to
            the model; longer inputs are cut to their first
            ``max_input_words`` words.

    Returns:
        A ``(summary, input_word_count, summary_word_count)`` tuple.
        ``input_word_count`` is always the word count of the text as supplied
        by the caller (before any truncation). Empty or whitespace-only input
        yields ``("", 0, 0)`` without calling the model. If loading or
        invoking the model fails, ``summary`` is ``"Error: <message>"`` and
        ``summary_word_count`` is 0.
    """
    words = txt.split() if txt else []
    input_word_count = len(words)
    if not words:
        # Nothing to summarize; avoid a pointless (and billable) API call.
        return "", 0, 0

    # Keep the request within the configured size; shorter inputs are sent
    # untouched so their original whitespace/layout is preserved.
    if input_word_count > max_input_words:
        txt = ' '.join(words[:max_input_words])

    try:
        llm = load_llm(model)
        prompt = get_prompt_template()
        chain = prompt | llm
        result = chain.invoke({"context": txt, "num_words": num_words})
        summary_text = result.content
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error: {str(e)}", input_word_count, 0

    return summary_text, input_word_count, len(summary_text.split())

# Extractive Summarization Functions

def extractive_summarization_frequency(txt, n):
    """Build an extractive summary from word-frequency sentence scores.

    Alphabetic, non-stop-word tokens are counted across the whole text and
    the counts are normalized by the highest count. Each sentence with fewer
    than 30 space-separated words is scored as the sum of the normalized
    frequencies of its words, and the ``n`` highest-scoring sentences are
    joined with spaces (highest score first).

    Args:
        txt: Text to summarize.
        n: Maximum number of sentences in the summary.

    Returns:
        A ``(summary, input_word_count, summary_word_count)`` tuple. Empty or
        whitespace-only input yields ``("", 0, 0)``; input without any
        scorable word (e.g. only stop words or punctuation) yields an empty
        summary.
    """
    if not txt or not txt.strip():
        return "", 0, 0

    input_word_count = len(txt.split())
    stop_words = set(stopwords.words('english'))

    # Count every alphabetic, non-stop-word token (case-insensitively).
    word_frequencies = {}
    for token in word_tokenize(txt):
        if not token.isalpha():
            continue
        word = token.lower()
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # max() below would raise ValueError on an empty mapping.
        return "", input_word_count, 0

    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= max_frequency

    sentence_scores = {}
    for sentence in sent_tokenize(txt):
        # Long sentences are excluded from the summary, so skip them before
        # paying for tokenization.
        if len(sentence.split(' ')) >= 30:
            continue
        for token in word_tokenize(sentence):
            word = token.lower()
            if token.isalpha() and word in word_frequencies:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_frequencies[word]

    # int() guards against a float count coming from a UI widget.
    summary = heapq.nlargest(int(n), sentence_scores, key=sentence_scores.get)
    summary_text = " ".join(summary)

    return summary_text, input_word_count, len(summary_text.split())

def extractive_summarization_textrank(txt, n):
    """Build an extractive summary with sumy's TextRank summarizer.

    Args:
        txt: Text to summarize.
        n: Sentence count passed to the summarizer.

    Returns:
        A ``(summary, input_word_count, summary_word_count)`` tuple, where the
        summary is the selected sentences joined with single spaces.
    """
    document = PlaintextParser.from_string(txt, Tokenizer("english")).document
    rank_sentences = TextRankSummarizer()

    selected = rank_sentences(document, n)
    summary_text = " ".join(str(sentence) for sentence in selected)

    input_words = len(txt.split())
    summary_words = len(summary_text.split())
    return summary_text, input_words, summary_words

# Abstractive Summarization Functions with Transformers


def abstractive_summarization_bart_split(txt, num_words):
    """Summarize ``txt`` with facebook/bart-large-cnn, chunking long inputs.

    The text is tokenized without truncation to measure its real length. If
    it does not fit into one model input it is split into consecutive token
    chunks, each chunk is summarized separately, and the partial summaries
    are joined with spaces.

    Args:
        txt: Text to summarize.
        num_words: Passed to the pipeline as ``max_length``; half of it
            (rounded down) is used as ``min_length``. Note that the pipeline
            interprets these as token counts, not words.

    Returns:
        A ``(summary, input_word_count, summary_word_count)`` tuple. Empty or
        whitespace-only input yields ``("", 0, 0)`` without loading the model.
    """
    if not txt or not txt.strip():
        return "", 0, 0

    # Load the BART model and tokenizer
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

    max_input_tokens = 1024
    # Leave room for the start/end special tokens that are added again when
    # each decoded chunk is re-tokenized inside the pipeline.
    chunk_size = max_input_tokens - 2
    min_length = int(num_words * 0.5)

    def summarize(piece):
        # truncation=True is a safety net: decoding a chunk and re-encoding it
        # can yield a few more tokens than the chunk originally had.
        return bart_summarizer(
            piece,
            max_length=num_words,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']

    # Measure the untruncated length; truncating here would hide long inputs
    # and make the chunking branch unreachable.
    token_ids = tokenizer.encode(txt, add_special_tokens=False)

    if len(token_ids) > chunk_size:
        # Split the token ids into consecutive chunks that each fit the model
        chunks = [token_ids[i:i + chunk_size] for i in range(0, len(token_ids), chunk_size)]

        # Summarize each chunk and combine the partial summaries
        summaries = [
            summarize(tokenizer.decode(chunk, skip_special_tokens=True))
            for chunk in chunks
        ]
        final_summary = " ".join(summaries)
    else:
        # If the text fits within the token limit, summarize it as usual
        final_summary = summarize(txt)

    return final_summary, len(txt.split()), len(final_summary.split())


def abstractive_summarization_t5(txt, num_words):
    """Summarize ``txt`` with the t5-small summarization pipeline.

    Args:
        txt: Text to summarize.
        num_words: Passed to the pipeline as ``max_length``; half of it
            (rounded down) is used as ``min_length``.

    Returns:
        A ``(summary, input_word_count, summary_word_count)`` tuple. Empty or
        whitespace-only input yields ``("", 0, 0)`` without loading the model.
    """
    if not txt or not txt.strip():
        return "", 0, 0

    t5_summarizer = pipeline("summarization", model="t5-small")

    # The whole text is handed to the pipeline in a single call; unlike the
    # BART variant, no chunking of long inputs is done here.
    summary = t5_summarizer(txt, max_length=num_words, min_length=int(num_words * 0.5), do_sample=False)[0]['summary_text']

    return summary, len(txt.split()), len(summary.split())

# Gradio UI setup with gr.Blocks(title="Comprehensive Summarizer App")
with gr.Blocks(title="Comprehensive Summarizer App") as demo:
    gr.Markdown("<h1 style='text-align: center; color: #4A90E2;'>Comprehensive Summarizer App</h1>")

    with gr.Tabs():
        # Extractive Summarization Tab
        with gr.TabItem("Extractive Summarization"):
            input_text = gr.Textbox(label="Input Text", lines=10, placeholder="Paste your text here...")
            method = gr.Dropdown(choices=["FREQUENCY", "TEXT RANK"], label="Select Method", value="FREQUENCY")
            line_limit = gr.Slider(label="Summary Line Limit", minimum=1, maximum=10, step=1, value=2)
            output_text = gr.Textbox(label="Summary", lines=5, placeholder="Your summary will appear here...")
            word_count = gr.Textbox(label="Word Count (Input | Summary)", interactive=False)

            def extractive_summarize_text(text, method, n):
                """Run the chosen extractive method and format the word counts.

                Returns the summary and an "<input words> | <summary words>" string.
                """
                n = int(n)  # slider values may arrive as floats
                if method == "FREQUENCY":
                    summary, input_words, summary_words = extractive_summarization_frequency(text, n)
                elif method == "TEXT RANK":
                    summary, input_words, summary_words = extractive_summarization_textrank(text, n)
                else:
                    # Without this branch an unexpected value would surface as
                    # an UnboundLocalError on the return line.
                    raise gr.Error(f"Unknown extractive method: {method}")
                return summary, f"{input_words} | {summary_words}"

            gr.Interface(fn=extractive_summarize_text,
                         inputs=[input_text, method, line_limit],
                         outputs=[output_text, word_count])

        # Abstractive Summarization Tab
        with gr.TabItem("Abstractive Summarization"):
            input_text_abstractive = gr.Textbox(label="Input Text", lines=10,
                                                 placeholder="Paste your text here...")
            method_abstractive = gr.Dropdown(choices=["BART", "T5", "LLM"], label="Select Method", value="BART")
            word_limit_abstractive = gr.Slider(label="Summary Word Limit", minimum=20,
                                                maximum=100,
                                                step=10,
                                                value=50)
            output_text_abstractive = gr.Textbox(label="Summary", lines=5,
                                                  placeholder="Your summary will appear here...")
            word_count_abstractive = gr.Textbox(label="Word Count (Input | Summary)", interactive=False)

            def abstractive_summarize_text(text, method, num_words):
                """Run the chosen abstractive method and format the word counts.

                Returns the summary and an "<input words> | <summary words>" string.
                """
                num_words = int(num_words)  # slider values may arrive as floats
                if method == "BART":
                    # The BART summarizer is defined as
                    # abstractive_summarization_bart_split; calling the
                    # non-existent abstractive_summarization_bart raised
                    # NameError whenever BART was selected.
                    summary, input_words, summary_words = abstractive_summarization_bart_split(text,
                                                                                               num_words)
                elif method == "T5":
                    summary, input_words, summary_words = abstractive_summarization_t5(text,
                                                                                         num_words)
                elif method == "LLM":
                    summary, input_words, summary_words = abstractive_summarization_llm(text,
                                                                                         num_words)
                else:
                    raise gr.Error(f"Unknown abstractive method: {method}")
                return summary, f"{input_words} | {summary_words}"

            gr.Interface(fn=abstractive_summarize_text,
                         inputs=[input_text_abstractive,
                                 method_abstractive,
                                 word_limit_abstractive],
                         outputs=[output_text_abstractive,
                                  word_count_abstractive])

# share=True requests a temporary public URL in addition to the local server.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://093d64997edcefcefb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


