<a href="https://colab.research.google.com/github/shruthimohan03/video-summarizer/blob/main/Extractive_summarization_BART_%26_LexRank_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **BART**

In [1]:
from transformers import pipeline, AutoTokenizer
import re

In [2]:
# Load the summarization pipeline and the tokenizer for the same checkpoint
model_name = "facebook/bart-large-cnn" # BART checkpoint used for summarization
summarizer = pipeline("summarization", model=model_name, device=-1) # device=-1 runs the pipeline on CPU
tokenizer = AutoTokenizer.from_pretrained(model_name) # Used by chunk_text to count tokens per sentence

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [3]:
def preprocess_text(text):
    """
    Normalize raw text before it is summarized.

    Every character that is not a word character, whitespace, or one of
    . , ! ? is dropped; each run of whitespace is then collapsed into a
    single space and the result is trimmed at both ends.
    """
    without_symbols = re.sub(r'[^\w\s.,!?]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

In [4]:
def _split_oversized_sentence(sentence, max_tokens, tokenize):
    """Split one over-long sentence into word-aligned pieces of about max_tokens tokens each."""
    pieces = []
    words = []
    length = 0
    for word in sentence.split():
        # Per-word counts are an estimate: a tokenizer may count a word
        # differently once it is embedded in running text.
        word_length = len(tokenize(word))
        # Close the current piece once the next word would push it past the limit.
        if words and length + word_length > max_tokens:
            pieces.append(" ".join(words))
            words = []
            length = 0
        words.append(word)
        length += word_length
    if words:
        pieces.append(" ".join(words))
    return pieces


def chunk_text(text, max_tokens=1024, tokenize=None):
    """
    Split the input text into chunks respecting the model's token limit.

    Sentences (delimited by ". ") are packed greedily into chunks. The
    running count includes the ". " separators that are re-inserted when
    sentences are joined, so a chunk's joined text stays within the limit.
    A sentence that is longer than the limit on its own is broken on word
    boundaries instead of being emitted whole.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk. Special tokens that
            a model adds around its input are NOT counted here; callers
            should leave headroom for them.
        tokenize: Optional callable mapping a string to a sequence of
            tokens. Defaults to the notebook's global ``tokenizer.tokenize``.

    Returns:
        A list of non-empty chunk strings (empty list for blank input).
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    separator = ". "
    # Cost of each separator that the join puts back between two sentences
    separator_cost = len(tokenize(separator))

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in text.split(separator):
        sentence_length = len(tokenize(sentence))

        if sentence_length > max_tokens:
            # The sentence cannot fit in any chunk: close the open chunk and
            # break the sentence into word-aligned pieces.
            if current_chunk:
                chunks.append(separator.join(current_chunk))
            pieces = _split_oversized_sentence(sentence, max_tokens, tokenize) or [sentence]
            # All but the last piece are (nearly) full and become chunks of
            # their own; the last piece ends the sentence, so later sentences
            # may still be appended to it.
            chunks.extend(pieces[:-1])
            current_chunk = [pieces[-1]]
            current_length = len(tokenize(pieces[-1]))
            continue

        joined_length = current_length + sentence_length
        if current_chunk:
            joined_length += separator_cost

        if current_chunk and joined_length > max_tokens:
            # Limit reached: start a new chunk with this sentence
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length = joined_length

    if current_chunk:  # Add remaining sentences to the final chunk
        chunks.append(separator.join(current_chunk))

    # Never hand blank chunks to the caller (e.g. for empty input)
    return [chunk for chunk in chunks if chunk.strip()]

In [5]:
def summarize_text(text, max_length=150, min_length=30, do_sample=False):
    """
    Summarize the input text using pre-trained BART model.

    The text is cleaned with preprocess_text, split into chunks with
    chunk_text, and each chunk is summarized independently; the per-chunk
    summaries are joined with single spaces.

    Args:
        text: Raw text to summarize.
        max_length: Passed to the summarization pipeline for each chunk.
        min_length: Passed to the summarization pipeline for each chunk.
        do_sample: Passed to the summarization pipeline for each chunk.

    Returns:
        The concatenated chunk summaries.

    Raises:
        ValueError: If the text is empty after preprocessing.
        RuntimeError: If no chunk could be summarized.
    """
    text = preprocess_text(text)

    if not text.strip():
        raise ValueError("Input text is empty or invalid.")

    # Leave room for the special tokens the tokenizer adds around each input,
    # so a "full" chunk still fits within the 1024-token budget.
    model_token_limit = 1024
    token_budget = model_token_limit - tokenizer.num_special_tokens_to_add()

    # Split text into tokenization-aware chunks, skipping any blank ones
    text_chunks = [
        chunk for chunk in chunk_text(text, max_tokens=token_budget) if chunk.strip()
    ]
    total_chunks = len(text_chunks)
    summaries = []

    for position, chunk in enumerate(text_chunks, start=1):
        print(f"Processing chunk {position}/{total_chunks}...")
        try:
            result = summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                # Safety net: token counts in chunk_text are estimates, so let
                # the pipeline trim an over-long chunk instead of failing.
                truncation=True
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Best effort: report the failed chunk and keep going with the rest
            print(f"Error processing chunk {position}: {e}")

    if not summaries:
        # Do not let callers mistake a total failure for an empty summary
        raise RuntimeError("Summarization failed for every chunk.")

    # Combine summaries into a single text
    return " ".join(summaries)

In [13]:
# Example usage
if __name__ == "__main__":
    # Load the transcribed text from a file.
    # UTF-8 is stated explicitly so the result does not depend on the
    # platform's default encoding.
    with open("computer_lecture.txt", "r", encoding="utf-8") as file:
        transcribed_text = file.read()

    try:
        # Summarize the text
        summarized_text = summarize_text(transcribed_text)

        # Save the summarized text to a file
        with open("extractive_summarization_bart.txt", "w", encoding="utf-8") as file:
            file.write(summarized_text)

        print("Summarization completed.")
    except ValueError as e:
        # Raised by summarize_text when the input is empty after cleaning
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

Processing chunk 1/1...
Summarization completed.


## **LexRankSummarizer**

In [8]:
# Pinned to the version this notebook was run with; %pip installs into the running kernel's environment
%pip install sumy==0.11.0

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [9]:
import nltk
# Download the NLTK 'punkt_tab' tokenizer data.
# Presumably required by sumy's Tokenizer("english") used further down — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

In [11]:
def extractive_summarization(file_path, method="lexrank", sentence_count=5):
    """
    Perform extractive summarization using Sumy.

    Args:
        file_path: Path of the UTF-8 text file to summarize.
        method: "lexrank" or "lsa" (case-insensitive).
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with single spaces. On any failure
        (invalid method, unreadable file, summarizer error) no exception is
        raised; a string starting with "Error during summarization: " is
        returned instead.
    """
    try:
        # Validate the method first so a bad argument is reported before any file I/O
        method_key = method.lower()
        if method_key not in ("lexrank", "lsa"):
            raise ValueError(f"Invalid method: {method}")

        # Read the input text; explicit UTF-8 avoids platform-dependent decoding
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Initialize Sumy parser and tokenizer
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        stemmer = Stemmer("english")

        # Select summarizer. The local name deliberately differs from the
        # notebook's global BART `summarizer` to avoid confusion.
        if method_key == "lexrank":
            sumy_summarizer = LexRankSummarizer(stemmer) # LexRank algorithm
        else:
            sumy_summarizer = LsaSummarizer(stemmer) # Latent Semantic Analysis (LSA) algorithm

        # Set stop words to improve summarization quality
        sumy_summarizer.stop_words = get_stop_words("english")

        # Generate the summary with the specified sentence count
        summary = sumy_summarizer(parser.document, sentence_count)

        # Combine sentences into a single string
        return " ".join(str(sentence) for sentence in summary)

    except Exception as e:
        # Existing contract: failures are reported through the return value
        return f"Error during summarization: {e}"

In [14]:
if __name__ == "__main__":
    input_file = "computer_lecture.txt"  # Input file
    output_file = "extractive_summarization_lexrank.txt"  # Output file
    summary = extractive_summarization(input_file, method="lexrank", sentence_count=5)  # Adjust method and count

    # extractive_summarization reports failures through its return value;
    # do not save an error message as if it were a summary.
    if summary.startswith("Error during summarization:"):
        print(summary)
    else:
        print("Extractive Summary:\n")
        print(summary)

        # Save the summary to a file (explicit UTF-8, independent of platform default)
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(summary)
        print(f"\nSummary saved to {output_file}")


Extractive Summary:

Over the decades, computers have evolved into compact, affordable, and versatile tools that are integral to our daily lives. One of the significant milestones in computer history was the invention of the internet. The internet transformed computers from standalone devices into interconnected tools of communication and information exchange. Looking to the future, the potential of computers seems boundless. Innovations such as quantum computing and advanced artificial intelligence promise to redefine our understanding of computation and problem-solving.

Summary saved to extractive_summarization_lexrank.txt
