In [None]:
!pip install gradio
#Importing libraries
import re
import string
from collections import defaultdict

import gradio as gr
import networkx as nx
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration

Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [None]:
# Download the NLTK data the summarizers below rely on:
#   - "punkt" / "punkt_tab": sentence-tokenizer models used by sent_tokenize and
#     word_tokenize. Newer NLTK releases load the tokenizer from "punkt_tab",
#     older ones from "punkt", so both are fetched to work on either.
#   - "stopwords": the English stop-word list used by the frequency-based summarizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Abstractive summarization, option 1: the small T5 checkpoint.
# The tokenizer and the model are loaded from the same checkpoint name so they
# always stay in sync.
T5_CHECKPOINT = "t5-small"
t5_tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
t5_model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Load BART model and tokenizer for Abstractive Summarization.
# "facebook/bart-large" is only the pre-trained denoising model and is not
# trained to summarize; "facebook/bart-large-cnn" is the same architecture
# fine-tuned on CNN/DailyMail for summarization, so it is the checkpoint that
# matches how abstractive_summary uses this model.
BART_CHECKPOINT = "facebook/bart-large-cnn"
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
#Frequency based
def frequency_based_summary(text, num_sentences=3):
    """Summarize ``text`` by keeping its highest-scoring sentences.

    Each sentence is scored by summing the document-wide frequency of its
    content words (lower-cased tokens that are neither English stop words nor
    pure punctuation). The ``num_sentences`` best sentences are returned in
    their original document order and with their original casing.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when the
        input is empty/blank or ``num_sentences`` is not positive.
    """
    if not text or not text.strip() or num_sentences <= 0:
        return ""

    stop_words = set(stopwords.words('english'))
    # Split the original text (not a lower-cased copy) so the summary keeps
    # the author's capitalization.
    sentences = sent_tokenize(text)

    def content_words(sentence):
        # Stripping punctuation from both ends leaves an empty string for
        # tokens made only of punctuation (".", "``", "...") and drops them.
        return [
            word
            for word in word_tokenize(sentence.lower())
            if word not in stop_words and word.strip(string.punctuation)
        ]

    words_per_sentence = [content_words(sentence) for sentence in sentences]

    word_frequencies = defaultdict(int)
    for words in words_per_sentence:
        for word in words:
            word_frequencies[word] += 1

    # Scores are kept by position, so identical sentences stay distinct.
    sentence_scores = [
        sum(word_frequencies[word] for word in words)
        for words in words_per_sentence
    ]

    # The sort is stable, so ties go to the earlier sentence.
    ranked = sorted(range(len(sentences)), key=sentence_scores.__getitem__, reverse=True)
    selected = sorted(ranked[:num_sentences])  # back to document order

    return ' '.join(sentences[i] for i in selected)

In [None]:
# TextRank-based summarization
def textrank_summary(text):
    """Summarize ``text`` with TextRank over bag-of-words sentence vectors.

    Sentences are vectorized with ``CountVectorizer``, linked by their pairwise
    cosine similarity, and ranked with PageRank. The top
    ``max(2, len(sentences) // 2)`` sentences are returned in document order.

    Args:
        text: The document to summarize.

    Returns:
        The selected sentences joined by single spaces. Inputs with no more
        sentences than would be kept (including empty input) are returned as
        their sentences joined by spaces, without ranking.
    """
    sentences = sent_tokenize(text)
    num_to_keep = max(2, len(sentences) // 2)

    # Nothing to rank when every sentence would be kept anyway; this also
    # avoids fitting the vectorizer on an empty document.
    if len(sentences) <= num_to_keep:
        return " ".join(sentences)

    # cosine_similarity accepts the sparse count matrix directly.
    similarity_matrix = cosine_similarity(CountVectorizer().fit_transform(sentences))

    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    ranked = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    selected = sorted(ranked[:num_to_keep])  # restore document order
    return " ".join(sentences[i] for i in selected)

In [None]:
# LexRank-based summarization
def lexrank_summary(text, threshold=0.1):
    """Summarize ``text`` with LexRank over TF-IDF sentence vectors.

    Sentences are vectorized with ``TfidfVectorizer`` and linked by cosine
    similarity; edges weaker than ``threshold`` are removed before PageRank is
    run. The top ``max(2, len(sentences) // 2)`` sentences are returned in
    document order.

    Args:
        text: The document to summarize.
        threshold: Minimum cosine similarity for two sentences to stay
            connected in the graph.

    Returns:
        The selected sentences joined by single spaces. Inputs with no more
        sentences than would be kept (including empty input) are returned as
        their sentences joined by spaces, without ranking.
    """
    sentences = sent_tokenize(text)
    num_to_keep = max(2, len(sentences) // 2)

    # Nothing to rank when every sentence would be kept anyway; this also
    # avoids fitting the vectorizer on an empty document.
    if len(sentences) <= num_to_keep:
        return " ".join(sentences)

    similarity_matrix = cosine_similarity(TfidfVectorizer().fit_transform(sentences))
    # Drop weak edges in one vectorized step instead of a nested Python loop.
    similarity_matrix[similarity_matrix < threshold] = 0

    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    ranked = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    selected = sorted(ranked[:num_to_keep])  # restore document order
    return " ".join(sentences[i] for i in selected)

In [None]:
# LSA-based summarization
def lsa_summary(text):
    """Summarize ``text`` with latent semantic analysis (truncated SVD).

    The TF-IDF sentence-term matrix is decomposed with ``svds`` into at most
    two latent topics. Each sentence is scored by the length of its
    singular-value-weighted topic vector, and the top
    ``max(2, len(sentences) // 3)`` sentences are returned in document order.

    Args:
        text: The document to summarize.

    Returns:
        The selected sentences joined by single spaces. Inputs with no more
        sentences than would be kept (including empty input) are returned as
        their sentences joined by spaces, without ranking.
    """
    sentences = sent_tokenize(text)
    num_to_keep = max(2, len(sentences) // 3)

    # Nothing to rank when every sentence would be kept anyway; this also
    # avoids asking svds for k=0 components on a single sentence.
    if len(sentences) <= num_to_keep:
        return " ".join(sentences)

    sentence_term_matrix = TfidfVectorizer().fit_transform(sentences)

    # svds requires 1 <= k <= min(rows, cols) - 1.
    num_topics = min(2, len(sentences) // 2, min(sentence_term_matrix.shape) - 1)
    if num_topics < 1:
        # Vocabulary too small to decompose: fall back to the leading sentences.
        return " ".join(sentences[:num_to_keep])

    u, s, _ = svds(sentence_term_matrix, k=num_topics)

    # Singular vectors are only defined up to sign, so averaging their raw
    # components gives arbitrary scores. The norm of the singular-value-weighted
    # row is sign-invariant and favors the dominant topics.
    scores = np.sqrt(((u * s) ** 2).sum(axis=1))

    selected = sorted(np.argsort(scores)[-num_to_keep:])  # restore document order
    return " ".join(sentences[i] for i in selected)

In [None]:
def extractive_summary(text, method="Frequency-based", word_limit=None):
    """Run the selected extractive summarizer on ``text``.

    Args:
        text: The document to summarize.
        method: One of ``"Frequency-based"``, ``"TextRank"``, ``"LexRank"`` or
            ``"LSA"``. ``None`` or an empty value selects ``"Frequency-based"``.
        word_limit: Optional maximum number of words in the result. When it is
            ``None`` or not positive the summary is returned unchanged.

    Returns:
        The summary string produced by the chosen summarizer, cut to at most
        ``word_limit`` whitespace-separated words when a limit is given.

    Raises:
        ValueError: If ``method`` names an unknown summarizer.
    """
    # A UI control with nothing selected passes None explicitly, which
    # bypasses the parameter default.
    method = method or "Frequency-based"

    summarizers = {
        "Frequency-based": frequency_based_summary,
        "TextRank": textrank_summary,
        "LexRank": lexrank_summary,
        "LSA": lsa_summary,
    }
    if method not in summarizers:
        raise ValueError(
            f"Unknown extractive method {method!r}; expected one of {list(summarizers)}"
        )

    summary = summarizers[method](text)

    if word_limit is not None and word_limit > 0:
        # int() because a slider may deliver the limit as a float.
        summary = " ".join(summary.split()[:int(word_limit)])
    return summary

In [None]:
def abstractive_summary(text, model_type="T5", word_limit=None):
    """Generate an abstractive summary of ``text`` with T5 or BART.

    Args:
        text: The document to summarize.
        model_type: ``"T5"`` or ``"BART"``. ``None`` or an empty value selects
            ``"T5"``.
        word_limit: Optional maximum number of words in the result. When it is
            ``None`` or not positive the summary is returned unchanged.

    Returns:
        The decoded summary, cut to at most ``word_limit`` whitespace-separated
        words when a limit is given, or ``""`` for empty/blank input.

    Raises:
        ValueError: If ``model_type`` names an unknown model.
    """
    if not text or not text.strip():
        return ""

    # A UI control with nothing selected passes None explicitly, which
    # bypasses the parameter default.
    model_type = model_type or "T5"

    if model_type == "T5":
        tokenizer, model = t5_tokenizer, t5_model
        # T5 selects its task from a text prefix.
        prompt, max_input_tokens = "summarize: " + text, 512
    elif model_type == "BART":
        tokenizer, model = bart_tokenizer, bart_model
        prompt, max_input_tokens = text, 1024
    else:
        raise ValueError(
            f"Unknown abstractive model {model_type!r}; expected 'T5' or 'BART'"
        )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_tokens, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so generation does not have to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if word_limit is not None and word_limit > 0:
        # int() because a slider may deliver the limit as a float.
        summary = " ".join(summary.split()[:int(word_limit)])
    return summary

In [None]:
# Gradio interface
# Custom CSS: center and enlarge the page heading.
css = """
h1 {
    margin-top: 2rem;
    font-size: 2rem;
    text-align: center;
}
"""

# Initial position of the "Word Limit" sliders (the slider range is 10-100).
DEFAULT_WORD_LIMIT = 50

# Inputs for the "Extractive" tab. The dropdowns get an explicit initial value
# so the summarizer never receives an empty selection.
input_text = gr.Text(label="Input Text", lines=10)
extractive_method = gr.Dropdown(["Frequency-based","TextRank","LexRank", "LSA"], value="Frequency-based", label="Extractive Method")
word_limit = gr.Slider(10, 100, value=DEFAULT_WORD_LIMIT, step=5, label="Word Limit")

# Inputs for the "Abstractive" tab. Each tab gets its own text box and slider
# instances rather than sharing one component object between two Interfaces,
# so the two tabs keep independent state.
abstractive_input_text = gr.Text(label="Input Text", lines=10)
abstractive_model = gr.Dropdown(["T5", "BART"], value="T5", label="Abstractive Model")
abstractive_word_limit = gr.Slider(10, 100, value=DEFAULT_WORD_LIMIT, step=5, label="Word Limit")

with gr.Blocks(title="Text Summarizer App", css=css) as demo:
    gr.Markdown("# Summarizer App")

    with gr.Tabs():
        # Each Interface passes its three inputs positionally to fn:
        # (text, selected method/model, word limit).
        with gr.TabItem("Extractive"):
            gr.Interface(fn=extractive_summary,
                         inputs=[input_text, extractive_method, word_limit],
                         outputs=['text'],
                         flagging_mode='never',
                         submit_btn='Generate')
        with gr.TabItem("Abstractive"):
            gr.Interface(fn=abstractive_summary,
                         inputs=[abstractive_input_text, abstractive_model, abstractive_word_limit],
                         outputs=['text'],
                         flagging_mode='never',
                         submit_btn='Generate')

# Launch the Gradio app
demo.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
