In [None]:
!pip install gradio
#Importing libraries
import re
import string
from collections import defaultdict

import gradio as gr
import networkx as nx
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration



In [None]:
# Download required NLTK resources
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the sentence tokenizer from 'punkt_tab'
# rather than 'punkt'; fetching both keeps sent_tokenize/word_tokenize usable
# regardless of which NLTK version the unpinned environment provides.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Abstractive summarization backend: the pretrained "t5-small" checkpoint.
# The tokenizer and the model are both fetched under the same checkpoint name.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Load BART model and tokenizer for Abstractive Summarization.
# "facebook/bart-large" is the pre-trained (not fine-tuned) checkpoint, so
# generate() on it does not produce real summaries. "facebook/bart-large-cnn"
# is the variant fine-tuned for summarization and uses the same classes.
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Frequency-based summarization
def frequency_based_summary(text, num_sentences=3):
    """Summarize ``text`` by picking its sentences with the most frequent words.

    Every token that is neither an English stop word nor punctuation is
    counted (case-insensitively) over the whole text. A sentence's score is
    the sum of the counts of its tokens.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences to return (default 3).

    Returns:
        The top-scoring sentences, highest score first, joined by single
        spaces. Sentences keep their original casing. Sentences that contain
        no counted word are never returned, and a repeated sentence is
        considered only once.
    """
    stop_words = set(stopwords.words('english'))
    # Split the ORIGINAL text: lower-casing first would both lower-case the
    # returned summary and remove the capitalisation the sentence splitter sees.
    sentences = sent_tokenize(text)

    word_frequencies = defaultdict(int)
    for word in word_tokenize(text.lower()):
        if word not in stop_words and word not in string.punctuation:
            word_frequencies[word] += 1

    # Keyed by sentence text; dicts keep insertion order, so ties below are
    # resolved in favour of the sentence that appears first.
    sentence_scores = {}
    for sentence in sentences:
        if sentence in sentence_scores:
            # A repeated sentence must not have its score added up twice.
            continue
        score = sum(
            word_frequencies.get(word, 0)
            for word in word_tokenize(sentence.lower())
        )
        if score > 0:
            sentence_scores[sentence] = score

    summarized_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]

    return ' '.join(summarized_sentences)

In [None]:
# TextRank summarization
def textrank_summary(text):
    """Summarize ``text`` by ranking its sentences with PageRank.

    Each sentence becomes a bag-of-words count vector. The pairwise cosine
    similarities of those vectors are used as edge weights of a graph whose
    nodes are the sentences, and PageRank scores the nodes.

    Args:
        text: The text to summarize.

    Returns:
        The ``max(2, n // 2)`` best-ranked of the ``n`` sentences, best first,
        joined by single spaces. Text with fewer than two sentences is
        returned as its (zero or one) sentences, without any ranking.
    """
    sentences = sent_tokenize(text)
    # With zero or one sentence there is nothing to rank, and the vectorizer
    # would otherwise be fitted on an empty list.
    if len(sentences) < 2:
        return " ".join(sentences)

    vectors = CountVectorizer().fit_transform(sentences)
    similarity_matrix = cosine_similarity(vectors)

    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary_sentences = [s for _, s in ranked_sentences[:max(2, len(sentences) // 2)]]
    return " ".join(summary_sentences)

In [None]:
# LexRank summarization
def lexrank_summary(text, threshold=0.1):
    """Summarize ``text`` by PageRank over a thresholded TF-IDF similarity graph.

    Sentences are turned into TF-IDF vectors. Pairwise cosine similarities
    below ``threshold`` are set to zero, the remaining values are used as
    edge weights of a sentence graph, and PageRank scores the sentences.

    Args:
        text: The text to summarize.
        threshold: Similarities strictly below this value are zeroed before
            the graph is built (default 0.1).

    Returns:
        The ``max(2, n // 2)`` best-ranked of the ``n`` sentences, best first,
        joined by single spaces. Text with fewer than two sentences is
        returned as its (zero or one) sentences, without any ranking.
    """
    sentences = sent_tokenize(text)
    # With zero or one sentence there is nothing to rank, and the vectorizer
    # would otherwise be fitted on an empty list.
    if len(sentences) < 2:
        return " ".join(sentences)

    tfidf = TfidfVectorizer().fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf)

    # Zero out weak links in one vectorized step instead of a nested loop.
    similarity_matrix[similarity_matrix < threshold] = 0

    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary_sentences = [s for _, s in ranked_sentences[:max(2, len(sentences) // 2)]]
    return " ".join(summary_sentences)

In [None]:
# LSA-based summarization
def lsa_summary(text):
    """Summarize ``text`` using a truncated SVD of its TF-IDF matrix.

    The sentence-by-term TF-IDF matrix is decomposed with ``svds`` into at
    most two components. Each sentence is scored by the length of its row in
    the left singular vectors, weighted by the singular values.

    Args:
        text: The text to summarize.

    Returns:
        The ``max(2, n // 3)`` highest-scoring of the ``n`` sentences, in
        ascending score order, joined by single spaces. When the matrix is
        too small to decompose, the first ``max(2, n // 3)`` sentences are
        returned instead.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    num_selected = max(2, len(sentences) // 3)
    sentence_term_matrix = TfidfVectorizer().fit_transform(sentences).toarray()

    # svds (default solver) requires 1 <= k < min(matrix.shape); a single
    # sentence or a tiny vocabulary would otherwise make the call fail.
    k = min(len(sentences) // 2, 2, min(sentence_term_matrix.shape) - 1)
    if k < 1:
        return " ".join(sentences[:num_selected])

    u, s, vt = svds(sentence_term_matrix, k=k)
    # The sign of each singular vector is arbitrary, so averaging the signed
    # components gives a ranking that can flip. Squaring removes the sign.
    scores = np.sqrt(np.sum((u * s) ** 2, axis=1))

    ranked_sentences = [sentences[i] for i in np.argsort(scores)[-num_selected:]]
    return " ".join(ranked_sentences)

In [None]:
def extractive_summary(text, method="Frequency-based"):
    """Run the extractive summarizer selected by ``method`` on ``text``.

    Args:
        text: The text to summarize.
        method: One of "Frequency-based", "TextRank", "LexRank" or "LSA".

    Returns:
        The summary produced by the selected summarizer, or ``None`` when
        ``method`` matches none of the names above.
    """
    # Each entry defers the call, so only the selected summarizer is touched.
    dispatch = (
        ("Frequency-based", lambda: frequency_based_summary(text)),
        ("TextRank", lambda: textrank_summary(text)),
        ("LexRank", lambda: lexrank_summary(text)),
        ("LSA", lambda: lsa_summary(text)),
    )
    for name, run in dispatch:
        if method == name:
            return run()
    return None

In [None]:
def abstractive_summary(text, model_type="T5"):
    """Generate an abstractive summary of ``text`` with T5 or BART.

    Args:
        text: The text to summarize.
        model_type: "T5" (input prefixed with "summarize: " and truncated to
            512 tokens) or "BART" (input truncated to 1024 tokens).

    Returns:
        The decoded summary with special tokens removed, or ``None`` when
        ``model_type`` is neither "T5" nor "BART".
    """
    if model_type == "T5":
        tokenizer, model = t5_tokenizer, t5_model
        source, input_limit = "summarize: " + text, 512
    elif model_type == "BART":
        tokenizer, model = bart_tokenizer, bart_model
        source, input_limit = text, 1024
    else:
        return None

    encoded = tokenizer(source, return_tensors="pt", max_length=input_limit, truncation=True)
    # Same beam-search settings for both models.
    generated = model.generate(
        encoded.input_ids,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
#LLM
import os

In [None]:
from getpass import getpass

# Ask for the Google API key through a hidden prompt and publish it as the
# GOOGLE_API_KEY environment variable.
os.environ.update(GOOGLE_API_KEY=getpass("Enter your Google API key: "))


Enter your Google API key: ··········


In [None]:
def llm_summary(text, max_words):
    """Summarize ``text`` with a single Gemini call.

    NOTE(review): ChatGoogleGenerativeAI and ChatPromptTemplate are not
    imported anywhere in the visible notebook - confirm where they come from.

    Args:
        text: The text to summarize.
        max_words: Target summary length in words; anything ``int()`` accepts.

    Returns:
        The model's reply text. Any failure, including an invalid
        ``max_words``, is returned as the string ``"Error: <message>"``
        instead of being raised.
    """
    try:
        max_words = int(max_words)
        if max_words <= 0:
            raise ValueError("max_words must be a positive integer")

        # max_output_tokens counts tokens, not words. Capping it at max_words
        # would cut the reply short, so leave headroom and let the prompt
        # carry the actual word limit.
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3, max_output_tokens=max_words * 2)
        # The user text is passed as the value of {text}, never as the
        # template itself, so braces inside it cannot break formatting.
        prompt = ChatPromptTemplate.from_messages([
            ("system", f"Summarize this text in {max_words} words:\n\n"),
            ("human", "{text}"),
        ])
        chain = prompt | llm
        result = chain.invoke({"text": text})
        return result.content
    except Exception as e:
        return f"Error: {str(e)}"

In [None]:
# Chunked LLM summarization that needs no pre-existing summary.
# NOTE(review): despite the name, the chain below is built with
# chain_type="map_reduce", not "refine" - confirm which one is intended.
def iterative_refinement_summary(text, max_words):
    """Summarize long ``text`` by summarizing chunks and combining the results.

    The text is split into overlapping chunks of up to 1000 characters. The
    same prompt is used both to summarize each chunk and to combine the
    chunk summaries.

    NOTE(review): ChatGoogleGenerativeAI, RecursiveCharacterTextSplitter,
    PromptTemplate and load_summarize_chain are not imported anywhere in the
    visible notebook - confirm where they come from.

    Args:
        text: The text to summarize.
        max_words: Target summary length in words; anything ``int()`` accepts.

    Returns:
        The combined summary. Any failure, including an invalid
        ``max_words``, is returned as the string
        ``"Error in Iterative Refinement Summary: <message>"``.
    """
    try:
        max_words = int(max_words)
        if max_words <= 0:
            raise ValueError("max_words must be a positive integer")

        # max_output_tokens counts tokens, not words, so leave headroom and
        # state the word limit in the prompt instead of relying on truncation.
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3, max_output_tokens=max_words * 2)

        # Split text into manageable chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        docs = text_splitter.create_documents([text])

        # {{text}} stays a template placeholder; only max_words is filled here.
        prompt = PromptTemplate(
            template=f"Write a concise summary of the following text in at most {max_words} words:\n\n{{text}}\n\nSUMMARY:",
            input_variables=["text"],
        )
        summary_chain = load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=prompt,
            combine_prompt=prompt
        )

        # Generate the summary
        summary = summary_chain.run(docs)
        return summary
    except Exception as e:
        return f"Error in Iterative Refinement Summary: {str(e)}"

In [None]:
# PDF Summarization
def pdf_summarizer(pdf_file, summarization_type, algorithm, max_words):
    """Extract the text of a PDF and summarize it with the selected method.

    NOTE(review): PdfReader is not imported anywhere in the visible notebook -
    confirm which PDF library it comes from.

    Args:
        pdf_file: The PDF to read, passed straight to ``PdfReader``.
        summarization_type: "Extractive", "Abstractive" or "LLM".
        algorithm: Method name within the chosen type. For "LLM",
            "LLM (Gemini)" selects ``llm_summary``; any other value selects
            ``iterative_refinement_summary``.
        max_words: Word limit; only the "LLM" methods use it.

    Returns:
        The summary, "Invalid summarization type." for an unknown type, or
        a string starting with "Error in PDF summarization:" on failure.
    """
    try:
        if not pdf_file:
            return "Error in PDF summarization: no PDF file was provided."

        pdf_reader = PdfReader(pdf_file)
        # `or ""` keeps the join working if a page yields no text; the newline
        # stops the last word of one page fusing with the first of the next.
        text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
        if not text.strip():
            return "Error in PDF summarization: no extractable text found in the PDF."

        if summarization_type == "Extractive":
            return extractive_summary(text, algorithm)
        elif summarization_type == "Abstractive":
            return abstractive_summary(text, algorithm)
        elif summarization_type == "LLM":
            if algorithm == "LLM (Gemini)":
                return llm_summary(text, max_words)
            return iterative_refinement_summary(text, max_words)
        else:
            return "Invalid summarization type."
    except Exception as e:
        return f"Error in PDF summarization: {str(e)}"



In [None]:
import gradio as gr

# Stylesheet assigned to `demo.css` below: Arial font and orange text on every
# element, a transparent text-selection highlight, no focus outlines, and
# orange radio buttons with orange labels (white label text when checked).
custom_css = """
    /* Set the font family to a normal font and text color to orange */
    * {
        font-family: 'Arial', sans-serif;
        color: orange;  /* Set text color to orange */
    }

    /* Remove blue highlighting when text is selected or focused */
    ::selection {
        background-color: transparent; /* Remove background highlight */
        color: orange;  /* Set selection text color to orange */
    }

    /* Remove focus outlines */
    :focus {
        outline: none;  /* Remove focus outline */
    }

    /* Change the radio button color to orange */
    .gradio-radio input[type="radio"]:checked {
        background-color: orange;
        border-color: orange;
    }

    .gradio-radio input[type="radio"]:checked + label {
        color: white;  /* Text color of checked radio button */
    }

    /* Change the unselected radio button border color to orange */
    .gradio-radio input[type="radio"]:not(:checked) {
        border-color: orange;
    }

    /* Styling for radio button labels */
    .gradio-radio label {
        color: orange;  /* Label color */
    }
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Text Summarization App")

    # Apply custom CSS
    demo.css = custom_css

    # Algorithms offered for each summarization type. Used both for the
    # dropdown's initial state and by update_algorithms, so the two agree.
    algorithm_choices = {
        "Extractive": ["Frequency-based", "LexRank", "TextRank", "LSA"],
        "Abstractive": ["T5", "BART"],
        "LLM": ["LLM (Gemini)", "Map Reduce", "Iterative Refinement"],
    }

    with gr.Tabs():
        with gr.Tab("Summarization"):
            gr.Markdown("## Select Summarization Type and Method")

            with gr.Row():
                with gr.Column():
                    input_type = gr.Radio(
                        ["Text Input", "PDF Upload"],
                        label="Input Type",
                        value="Text Input"
                    )

                    # Text Input
                    text_input = gr.Textbox(
                        label="Input Text",
                        lines=5,
                        placeholder="Enter text here",
                        visible=True
                    )

                    # PDF Upload
                    pdf_input = gr.File(
                        label="Upload PDF",
                        type="filepath",
                        file_types=[".pdf"],
                        visible=False
                    )

                with gr.Column():
                    summarization_type = gr.Radio(
                        ["Extractive", "Abstractive", "LLM"],
                        label="Summarization Type",
                        value="Extractive"
                    )

                    # update_algorithms is only wired to summarization_type.change,
                    # so the dropdown needs the choices that match the radio's
                    # initial value; otherwise it starts out empty.
                    method_dropdown = gr.Dropdown(
                        choices=algorithm_choices["Extractive"],
                        value=algorithm_choices["Extractive"][0],
                        label="Algorithm",
                        visible=True
                    )

                    words = gr.Textbox(
                        label="Maximum Words",
                        value="100"
                    )

            # Output
            output = gr.Textbox(
                label="Summary",
                lines=5
            )

            # Generate Button
            button = gr.Button("Generate Summary")

            # Input Type Toggle
            def toggle_inputs(choice):
                """Show the text box for "Text Input", otherwise the PDF upload."""
                show_text = choice == "Text Input"
                return {
                    text_input: gr.update(visible=show_text),
                    pdf_input: gr.update(visible=not show_text)
                }

            input_type.change(
                toggle_inputs,
                inputs=input_type,
                outputs=[text_input, pdf_input]
            )

            # Update Algorithms Dropdown
            def update_algorithms(summarization_type):
                """Return the dropdown update for the chosen summarization type."""
                choices = algorithm_choices.get(summarization_type)
                if choices is None:
                    return gr.update(visible=False)
                # Reset the value too: a selection left over from another type
                # (e.g. "LSA" under "Abstractive") would match no algorithm.
                return gr.update(choices=choices, value=choices[0], visible=True)

            summarization_type.change(
                update_algorithms,
                inputs=summarization_type,
                outputs=method_dropdown
            )

            # Summarization Logic
            def summarize(input_type, text, pdf, summarization_type, method, max_words):
                """Route the request to the summarizer chosen in the UI.

                PDF uploads are delegated to pdf_summarizer. Typed text is sent
                to extractive_summary, abstractive_summary, llm_summary or
                iterative_refinement_summary. max_words is only used by the LLM
                methods. Failures are returned as an "Error: ..." string.
                """
                if input_type != "Text Input":  # PDF Upload
                    return pdf_summarizer(pdf, summarization_type, method, max_words)

                if not text or not text.strip():
                    return "Please enter some text to summarize."

                try:
                    if summarization_type == "Extractive":
                        result = extractive_summary(text, method)
                    elif summarization_type == "Abstractive":
                        result = abstractive_summary(text, method)
                    elif summarization_type == "LLM":
                        if method == "LLM (Gemini)":
                            result = llm_summary(text, max_words)
                        else:
                            # "Map Reduce" and "Iterative Refinement" share the
                            # only chunked summarizer defined in this notebook.
                            result = iterative_refinement_summary(text, max_words)
                    else:
                        return "Invalid summarization type."
                except Exception as e:
                    return f"Error: {str(e)}"

                # The extractive/abstractive dispatchers return None when the
                # method name matches none of their algorithms.
                if result is None:
                    return "Invalid algorithm for the selected summarization type."
                return result

            button.click(
                summarize,
                inputs=[input_type, text_input, pdf_input, summarization_type, method_dropdown, words],
                outputs=output
            )

demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c1602caa552a1e8ae5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


