<h1 align="center">RAG Pipeline - Summarization</h1>
<h4 align="center">Shaji Joseph</h4>
<h4 align="center"> CSCI 685: Computational Lingustics</h4>
<h4 align="center"> CWID: 50394653</h4>

<h4 align="center">Dependencies</h4>

In [3]:
                                                            # ##############################
                                                            #  Environment Configuration   #
                                                            ################################
import os
# Configure CUDA environment variable for debugging GPU operations
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

                                                            ################################
                                                            #       Web Search & Scrape    #
                                                            ################################
from duckduckgo_search import DDGS  # For web searching
import requests  # For making HTTP requests to websites
from bs4 import BeautifulSoup  # For web scraping and HTML parsing
import re  # For regular expression operations

                                                            ################################
                                                            #       Text Processing        #
                                                            ################################
from langchain.text_splitter import RecursiveCharacterTextSplitter  # For text chunking
import nltk  # Natural Language Toolkit for text processing
nltk.download('punkt')  # Download NLTK tokenizer model
from nltk.tokenize import sent_tokenize  # For sentence tokenization

                                                            ################################
                                                            #       Embeddings & NLP       #
                                                            ################################
from sentence_transformers import SentenceTransformer  # For generating embeddings
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification  # For NLP tasks
import spacy  # For advanced NLP tasks (POS tagging,NER, etc)

                                                            ################################
                                                            #       Vector Search          #
                                                            ################################
import faiss  # For similarity search and indexing

                                                            ################################
                                                            #       Utilities              #
                                                            ################################
import numpy as np  # For numerical operations
import tkinter as tk  # For GUI creation
import warnings  # For warning control

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shaji.joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


<h4 align="center">RAG Pipeline Class</h4>

In [5]:
# Define the RAG (Retrieval-Augmented Generation) pipeline class
class RAG_pipeline():
    # Initializes the RAG pipeline with the user's query.
    def __init__(self, query):
        # Store the user's query as an instance variable 
        self.query = query
############################################################################################        
# Search the web using DuckDuckGo API
    def web_search(self, query):
        warnings.filterwarnings("ignore")
        with DDGS() as ddgs:
            # Get top 3 text results for the query
            results = list(ddgs.text(query, max_results=3))
            # Extract and return URLs from results
            return [result['href'] for result in results]
############################################################################################            
# Fetch webpage content and clean the text by Downloading HTML content, Extracting paragraph text, and Removing noise (parentheses, brackets, extra whitespace)
    def scrape_clean(self, url):
        # Fetch webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        print("Article content extracted.")
        # Extract all paragraph elements
        paragraphs = soup.find_all('p')
        # Join paragraphs into single text
        txt = ' '.join([p.get_text() for p in paragraphs])
        # Clean text content
        txt = re.sub(r'\([^)]*\)', '', txt)
        cln = re.sub(r'\[.*?\]', '', txt)
        cln = re.sub(r'\s+', ' ', cln).strip()
        return cln      
############################################################################################        
# Split text into chunks of 256 characters with 20 characters overlap
    def chunking(self, text):
        print('Chunking...')
        # Initialize the text splitter with Ideal chunk length in characters (256) and Context-preserving overlap (20 chars)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=20)
        # Perform the actual text splitting
        return text_splitter.split_text(text)
############################################################################################        
    # Generates vector embeddings for text chunks using a pre-trained sentence transformer model.
    def generate_embeddings(self, chunks):
        print('Generating embeddings...')
        # Initialize the pre-trained sentence transformer model 'all-MiniLM-L6-v2' Lightweight model
        model = SentenceTransformer('all-MiniLM-L6-v2')
        # Generate embeddings for all chunks. Convert each chunk to lowercase for case-insensitive processing
        return model.encode([c.lower() for c in chunks], normalize_embeddings=True)
############################################################################################
 # Calculate similarity scores between query and document chunks, ranking and returning the most relevant text portions.
    def similarity_metric(self, embeddings, query_embedding, chunks):
        print('Calculating similarity scores...')
        # Get embedding dimension from the shape of embeddings matrix
        dimension = embeddings.shape[1]
        # Initialize FAISS index using Inner Product
        index = faiss.IndexFlatIP(dimension)
        # Add document embeddings to the search index
        index.add(np.array(embeddings))
        # Perform similarity search
        # query embedding - The vector representation of the user's query.Retrieve top 16 most similar chunks
        # D - Array of distances (not used here)
        # I - Array of indices for the most similar chunks
        D, I = index.search(np.array(query_embedding), k=16)
        # Formulate the final text by joining the top ranked chunks in order of relevance
        return ' '.join([chunks[i] for i in I[0]])
############################################################################################
 # Generate summary from long text using BART model
    def summarize(self, longtext):
        print('Generating summary...')
        # Initialize the tokenizer to handle truncation
        tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
        # Tokenize the input text with truncation
        inputs = tokenizer(longtext, return_tensors="pt", truncation=True, max_length=1024, padding="max_length")
        # Decode the tokenized input back to text after truncation
        truncatedtext = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
        # Initialize the summarizer pipeline
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # Generate the summary using the model
        summary = summarizer(truncatedtext, max_length=900, min_length=150, do_sample=False)
        # Return the summary text
        return summary[0]['summary_text']
##############################################################################################
# Paraphrase text using a pre-trained T5-based model for paraphrasing

    def paraphrase(self, text):
        print('Generating paraphrases...')
        # Initialize the paraphrasing pipeline
        paraphraser = pipeline("text2text-generation", model="humarin/chatgpt_paraphraser_on_T5_base")
        
        # Split the text into sentences for better paraphrasing
        sentences = sent_tokenize(text)
        paraphrased_sentences = []
        # Loop over each sentence and generate a paraphrased version
        for sentence in sentences:    
            prompt = "paraphrase: " + sentence + " </s>"
            # Generate a single paraphrased output
            paraphrases = paraphraser(prompt, num_beams=15, num_return_sequences=3, max_length=256, early_stopping=True,temperature=0.9, top_k=50, repetition_penalty=3.0)
            paraphrased_sentences.append(paraphrases[0]['generated_text'])
        # Join all paraphrased sentences into a single text block and return
        return ' '.join(paraphrased_sentences)
################################################################################################
 # Execute the full RAG pipeline: search, scrape, chunk, embed, and summarize
    def generate_summary(self):
        #  Web Search - Get relevant URLs using the query
        urls = self.web_search(self.query)
        # Web Scraping - Extract and clean text from the first URL
        clean_webtext = self.scrape_clean(urls[0])
        # Chunking - Split text into manageable pieces
        chunks = self.chunking(clean_webtext)
        # Generate embeddings for document chunks
        doc_embeddings = self.generate_embeddings(chunks)
        # Generate embedding for the original query
        query_embedding = self.generate_embeddings([self.query])
        # Retrieve most relevant chunks using similarity search
        long_text = self.similarity_metric(doc_embeddings, query_embedding, chunks)
        
        # Generate initial summary from retrieved text
        summary = self.summarize(long_text)
        
        # Augment with paraphrasing - Refine the summary for better readability
        paraphrased_summary = self.paraphrase(summary)
        
        return paraphrased_summary

<h4 align="center">Formating Summary</h4>

In [7]:
                            ###############################################
                            # Proper Noun Capitalizer using NER and Spacy #
                            ###############################################

# spaCy for part-of-speech tagging
# HuggingFace pipeline for Named Entity Recognition (NER)
nlp, ner = spacy.load("en_core_web_sm"), pipeline("ner", model="Davlan/distilbert-base-multilingual-cased-ner-hrl")

def capitalize(text):
    #  Extract all named entities (PER, ORG, LOC) from text
    ents = {w.lower() for e in ner(text) if (t:=e.get('entity_group',e.get('entity',''))) in ['PER','ORG','LOC'] 
            # regex to extract individual words
            for w in re.findall(r'\b\w+\b', e['word'].lower())}
            # Identify all prepositions (ADP) in text using spaCy
    preps = {t.text.lower() for t in nlp(text) if t.pos_ == "ADP"}
    # Process each word in the text
    words, fixed, sentstart = text.split(), [], True
    for i, w in enumerate(words):
        # Skip and preserve pure punctuation/non-word tokens
        if not (m := re.match(r'^(\W*)(\w+)(\W*)$', w)):
            fixed.append(w)
            sentstart = any(p in w for p in ['.','!','?'])
            continue
        # Split words into prefix (punctuation), core word, and suffix (punctuation)    
        p, c, s = m.groups()
        # # Lowercase for case-insensitive checks
        lc = c.lower()
        # sentence starters
        c = (c.capitalize() if lc in ents or (sentstart and lc not in preps) 
             # Force lowercase prepositions
             else lc if lc in preps else c)
        # words ending with sentence-ending punctuations
        sentstart = any(p in s for p in ['.','!','?']) or (i>0 and words[i-1][-1] in ['.','!','?'])
        # Reconstruct word with original punctuation
        fixed.append(f"{p}{c}{s}")
    
    return ' '.join(fixed)



<h4 align="center">Final Summary</h4>

In [9]:
# Handle the user input and executes the full summarization pipeline
def show_input(event=None):
    # Disable Submit button and get user input
    submitbutton.config(state=tk.DISABLED)
    # Get and clean input string
    userinput = entry.get().strip() 
    
    # Show processing summary warning
    resultlabel.config(text=f"Please Wait.... Processing Summary for: {userinput}...", fg="black")
    # Force GUI update
    window.update_idletasks()  
    if not userinput.strip():
        resultlabel.config(text="Please enter a valid query.", fg="red")
        return
    # Initialize RAG pipeline with user query
    pipeline_obj = RAG_pipeline(userinput)
    # Generate raw summary through the full RAG process
    raw_summary = pipeline_obj.generate_summary()
    # Split summary into sentences
    sentences = sent_tokenize(raw_summary)
    formatted_sentences = []
    # Format each sentence (capitalize first letter)
    for s in sentences:
        s = s.strip()
        if s:
            s = s[0].upper() + s[1:] if len(s) > 1 else s.upper()
            formatted_sentences.append(s)
            
    # Group sentences into paragraphs (3 sentences per paragraph)
    paragraphs = [
        ' '.join(formatted_sentences[i:i+3])
        for i in range(0, len(formatted_sentences), 3)
    ]
    formatted_output = '\n\n'.join(paragraphs)
    # Apply proper noun capitalization to the formatted output
    finaloutput = capitalize(formatted_output)
    # Formulate and show the final output in the result label
    resultlabel.config(text=finaloutput, anchor='w', justify='left', fg="black")

<h4 align="center">GUI Layout</h4>

In [11]:
# Set up the main application window
window = tk.Tk()
window.title("Summarizer")
window.geometry("600x400")

# Add GUI elements: labels, entry fields, and buttons
prompt = tk.Label(window, text="Enter a Query String", font=('Helvetica', 12))
prompt.pack(pady=5)

# Create and place the text entry field
entry = tk.Entry(window, width=40, font=('Helvetica', 11))
entry.pack(pady=5)

# Set the cursor focus and re-enable the submit button
entry.focus_force()
entry.bind("<Return>", lambda event: show_input())
entry.bind("<Key>", lambda e: submitbutton.config(state=tk.NORMAL))

# Create a frame to hold action buttons
buttonframe = tk.Frame(window)
buttonframe.pack(pady=10)

# Submit button configuration
submitbutton = tk.Button(buttonframe, text="Submit", command=show_input, bg="#4CAF50", fg="white", padx=10)
submitbutton.pack(side=tk.LEFT, padx=10)

# Close button configuration
close_button = tk.Button(buttonframe, text="Close", command=window.destroy, bg="#f44336", fg="white", padx=10)
close_button.pack(side=tk.LEFT, padx=10)

# Create and configure the results label
resultlabel = tk.Label(window, text="", wraplength=560, justify='left', anchor='w', bg="#f8f9fa", font=('Helvetica', 11), relief=tk.SUNKEN, padx=10, pady=10)
resultlabel.pack(pady=10, fill=tk.BOTH, expand=True)

# Start the main event loop - keeps the window responsive
window.mainloop()

Article content extracted.
Chunking...
Generating embeddings...
Generating embeddings...
Calculating similarity scores...
Generating summary...


Your max_length is set to 900, but your input_length is only 562. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=281)


Generating paraphrases...
Article content extracted.
Chunking...
Generating embeddings...
Generating embeddings...
Calculating similarity scores...
Generating summary...


Your max_length is set to 900, but your input_length is only 632. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=316)


Generating paraphrases...
