# Research Paper Summarizer for Different Personas

## 1.Import Libraries

In [1]:
# Necessary Libraries
import os
import re
import anthropic
import torch
import time
import logging

# For Document Loading
from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, ArxivLoader, PyPDFLoader
from langchain.schema import Document

# For Text Splitting and Processing
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# For Embedding and Vector Store
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# For LLM and Prompts
from langchain_anthropic import ChatAnthropic
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

# For setting up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# To read .env file to get Anthropic API key
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv('env')) # read local .env file
anthropic_api_key=os.environ['ANTHROPIC_API_KEY']

# Initialize Anthropic client
anthropic_client = anthropic.Anthropic(api_key=os.environ['ANTHROPIC_API_KEY'])

## 2. Define & Initialize LLM Models & Embeddings

### Define LLM Model

In [3]:
# LLM Model for Summarization
llm_model = "claude-3-7-sonnet-20250219"
default_llm = ChatAnthropic(model=llm_model, temperature=0.0, api_key=anthropic_api_key)

# LLM Model for Evaluation
llm_eval_model = "claude-sonnet-4-20250514"
default_llm_eval = ChatAnthropic(model=llm_eval_model, temperature=0.5, api_key=anthropic_api_key)

### Initialize embeddings

In [4]:
# Initialize HuggingFace Embeddings
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


## 3. Create Important Classes & Functions

### 1. Create SentenceWindow and SentenceWindowRetriever Classes

In [5]:
class SentenceWindow:
    """ A sentence with its surrounding context window. """
    def __init__(self, sentence, window_context, sentence_index, metadata):
        self.sentence = sentence
        self.window_context = window_context
        self.sentence_index = sentence_index
        self.metadata = metadata

class SentenceWindowRetriever:
    """ To retrieve sentences with their context windows. """
    def __init__(self, window_size: int =2):
        self.window_size = window_size
        self.sentence_windows = []
        self.vector_store = None

    def create_sentence_windows(self, documents):
        """ Create sentence windows from the documents"""
        logger.info("Creating sentence windows....")
        all_windows = []

        for doc in documents:
            # Split into sentences using NLTK
            sentences = nltk.sent_tokenize(doc.page_content)

            # Creare windows around each sentence
            for i, sentence in enumerate(sentences):
                # Define window boundaries
                start_index = max(0, i - self.window_size)
                end_index = min(len(sentences), i + self.window_size + 1)

                # Create context window
                window_sentences = sentences[start_index:end_index]
                window_context = " ".join(window_sentences)

                # Create metadata
                metadata = {
                    **doc.metadata,
                    "sentence_index":i,
                    "total_sentences": len(sentences),
                    "window_start": start_index,
                    "window_end": end_index
                }

                window = SentenceWindow(
                    sentence=sentence,
                    window_context=window_context,
                    sentence_index=i,
                    metadata=metadata
                )
                all_windows.append(window)

        logger.info(f"Created {len(all_windows)} sentence windows.")
        self.sentence_windows = all_windows
        return all_windows
    
    def build_vectorstore(self, embeddings):
        """ Build a vector store from the sentence windows. """
        if not self.sentence_windows:
            raise ValueError("No sentence windows created. Call create_sentence_windows() first.")
        
        logger.info("Building vector store from sentence windows...")

        # Convert windows to documents for vector store
        window_docs = []
        for window in self.sentence_windows:
            doc = Document(
                page_content=window.window_context,
                metadata={
                    **window.metadata,
                    "core_sentence": window.sentence
                }
            )
            window_docs.append(doc)

        # Build FAISS vector store (more efficient than in-memory)
        self.vectorstore = FAISS.from_documents(window_docs, embeddings)
        logger.info("Vector store built successfully.")
        return self.vectorstore
    
    def retrieve(self, query, k=5):
        """ Retrieve relevant sentence windows """
        if not self.vectorstore:
            raise ValueError("Vector store not built. Call build_vectorstore() first.")
        
        return self.vectorstore.similarity_search(query, k=k)
    
print("SentenceWindow and SentenceWindowRetriever classes defined successfully.")

SentenceWindow and SentenceWindowRetriever classes defined successfully.


###

### 2. Create DocumentProcessor Class

In [6]:
class DocumentProcessor:
    """ Handles document loading and preprocessing"""

    @staticmethod
    def load_documents(source):
        """ Load document from  various sources with better error handling"""
        logger.info(f"Loading document from {source}")

        try:
            # Determine the source type(whether it's a PDF file or a ARXIV link) and use appropriate loader
            if source.startswith("http") and "arxiv.org" in source:
                # Extract arXiv ID from URL
                arxiv_id = DocumentProcessor._extract_arxiv_id(source)
                if arxiv_id:
                    loader = ArxivLoader(query=arxiv_id, load_max_docs=1)
                else:
                    loader = WebBaseLoader(source)
            elif source.startswith("http"):
                loader = WebBaseLoader(source)
            elif source.startswith("arxiv:"):
                # Handle direct arXiv IDs 
                arxiv_id = source.replace("arxiv:", "")
                loader = ArxivLoader(query=arxiv_id, load_max_docs=1)
            elif os.path.isfile(source):
                file_extension = os.path.splitext(source)[1].lower()
                if file_extension == ".pdf":
                    # Use PyPDFLoader for PDF files 
                    logging.info(f"Detected PDF file, using PyPDFLoader for {source}")
                    loader = PyPDFLoader(source)
            else:
                # Raise an error if the source type is not recognized or file doesn't exist
                raise ValueError(f"Unsupported document source or file not found: {source}")

            documents = loader.load()

            if not documents:
                raise ValueError("No documents loaded from the specified source.")
            
            logger.info(f"Successfully loaded {len(documents)} document(s) from {source}")
            return documents
        
        except FileNotFoundError as e:
            logger.error(f"File not found error while loading document: {e}")
            raise
        except Exception as e:
            logger.error(f"An unexpected error occurred during document loading: {e}")
            raise

    @staticmethod
    def _extract_arxiv_id(url):
        """ Extract arXiv ID from an arXiv URL"""
        patterns = [
            r'arxiv\.org/abs/(\d+\.\d+)',  # e.g., https://arxiv.org/abs/1234.5678
            r'arxiv\.org/pdf/(\d+\.\d+)'   # e.g., https://arxiv.org/pdf/1234.5678
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)

        return None
    
    @staticmethod
    def preprocess_documents(documents):
        """ Cleans and preprocesses a list of LangChain Document object"""

        logger.info("Starting document preprocessing...")

        processed_docs = []
        for i, doc in enumerate(documents):
            content = doc.page_content

            # Remove excessive whitespace 
            content = re.sub(r'\\s+',' ', content).strip()

            # Remove common artifacts like isolated page numbers 
            # Use regex to look for a newline, optional whitespace, digits, optional whitespace, and another newline
            content = re.sub(r'\\n\\s*\\d+\\s*\\n', '\\n', content)

        # Filter out documents with very short content after cleaning
            if len(content) < 100:
                logger.warning(f"Skipping document {i} due to very short content after preprocessing (length: {len(content)}).")
                continue

            # Create a new Document object with cleaned content
            processed_doc = Document(page_content=content, metadata=doc.metadata)
            processed_docs.append(processed_doc)

        logger.info(f"Finished preprocessing. Original documents: {len(documents)}, Preprocessed documents: {len(processed_docs)}")
        return processed_docs

print("DocumentProcessor class defined.")

DocumentProcessor class defined.


### 3. Create PersonaPrompts Class

In [7]:
class PersonaPrompts:
    """ Manages persona-specific prompts"""

    @staticmethod
    def get_personas():
        """ Define different persona descriptions"""
        return {
            "Data Scientist": ("a data scientist with strong knowledge in machine learning and deep learning who is currently learning generative AI. "
                "Focus on practical applications, data requirements, model performance metrics, implementation considerations, "
                "and how this relates to traditional ML/DL approaches you already know. Use technical language but explain generative AI concepts clearly."),
            "AI Engineer": (
                "a senior AI engineer responsible for implementing and deploying ML systems. "
                "Focus on practical implementation details, computational requirements, scalability considerations, "
                "integration challenges, and performance metrics. Use engineering-focused language."
            ),
            "Graduate Student": (
                "a graduate student studying machine learning who needs to understand this paper for research. "
                "Explain the core problem, methodology, key findings, and significance. "
                "Use clear technical language but explain complex concepts."
            ),
            "Business Executive": (
                "a business executive with limited technical background who needs to understand the business impact. "
                "Focus on the problem being solved, potential applications, market implications, "
                "competitive advantages, and ROI considerations. Avoid technical jargon."
            ),
            "General Audience": (
                "explaining to an educated general audience with no AI background. "
                "Use simple language, analogies, and focus on the big picture: what problem is solved, "
                "how it works in simple terms, and why it matters."
            )
        }
        
    @staticmethod
    def create_prompt_templates():
        """Create persona-specific prompt templates."""
        personas = PersonaPrompts.get_personas()
        templates = {}
        
        base_template = """You are {persona_description}

                            Based on the following research paper content, provide a comprehensive summary that addresses:
                            1. The main problem or research question
                            2. The proposed approach/solution
                            3. Key findings and results 
                            4. Significance and implications
                            5. Limitations or future work (if mentioned)

                            Keep your summary focused on aspects most relevant to your perspective and audience.

                            Research Paper Content:
                            {context}

                            Summary:"""
        
        for name, description in personas.items():
            templates[name] =  ChatPromptTemplate.from_template(base_template).partial(persona_description=description)
        
        return templates

print("PersonaPrompts class defined.")

PersonaPrompts class defined.


### 4. Create SummaryEvaluator Class

In [8]:
class SummaryEvaluator:
    """Evaluates summary quality using LLM-as-a-judge."""

    def __init__(self, llm):
        self.llm = llm
        self.eval_prompt = self._create_evaluation_prompt()

    def _create_evaluation_prompt(self):
        """Create evaluation prompt template."""
        template = """You are an expert evaluator assessing the quality for research paper summaries.

                        Evaluate the following summary based on these criteria:
                        1. Accuracy: Does it correctly represent the source material?
                        2. Completeness: Does it cover the key points appropriately?
                        3. Clarity: Is it well-written and understandable for the target audience?
                        4. Relevance: Does it focus on aspects relevant to the specified persona?

                        Rate the summary on a scale of 1-5 where:
                        1 = Poor (major inaccuracies, missing key points, unclear)
                        2 = Fair (some issues with accuracy or completeness)
                        3 = Good (mostly accurate and complete, minor issues)
                        4 = Very Good (accurate, complete, well-written)
                        5 = Excellent (outstanding in all criteria)

                        Provide your rating and detailed justification.

                        Source Material:
                        {context}

                        Summary to Evaluate:
                        {summary}

                        Persona: {persona}

                        Evaluation:"""
        return ChatPromptTemplate.from_template(template)
    
    def evaluate(self, summary, context, persona):
        """ Evaluate a single summary."""
        try:
            chain = self.eval_prompt | self.llm | StrOutputParser()
            evaluation = chain.invoke({
                "summary": summary,
                "context": context,
                "persona": persona
            })
            return evaluation
        except Exception as e:
            logger.error(f"Error evaluating summary: {e}")
            return f"Evaluation failed: {str(e)}"

print(f"SummaryEvaluator class defined with model {default_llm_eval}.")


SummaryEvaluator class defined with model model='claude-sonnet-4-20250514' temperature=0.5 anthropic_api_url='https://api.anthropic.com' anthropic_api_key=SecretStr('**********') model_kwargs={}.


### 5. Create PaperSummarizer Class

In [9]:
class PaperSummarizer:
    """ Main class responsible for the summarization process."""

    def __init__(self, llm, window_size=2):
        self.llm = default_llm
        self.embeddings = embeddings_model
        self.retriever = SentenceWindowRetriever(window_size=window_size)
        self.persona_prompts = PersonaPrompts.create_prompt_templates()
        self.evaluator = SummaryEvaluator(default_llm_eval)

        logger.info(f"Initialized PaperSummarizer with model {llm_model}")


    def process_document(self, source):
        """ Load and process document, return processed documents and context."""
        # Load document
        documents = DocumentProcessor.load_documents(source)

        # Preprocess
        processed_docs = DocumentProcessor.preprocess_documents(documents)

        if not processed_docs:
            raise ValueError("No valid documents after preprocessing.")
        
        # Create sentence windows
        windows = self.retriever.create_sentence_windows(processed_docs)

        # Build vectorstore
        self.retriever.build_vectorstore(self.embeddings)

        return processed_docs, f"Processed {len(windows)} sentence windows"
    

    def generate_summaries(self, query="Summarize this research paper"):
        """Generate summaries for all personas."""
        # Retrieve relevant context
        retrieved_docs = self.retriever.retrieve(query, k=150) # Get more context
        context = "/n/n".join([doc.page_content for doc in retrieved_docs])

        if not context.strip():
            raise ValueError("No relevant context retrieved")
        
        logger.info(f"Retrieved context length: {len(context)} characters")

        # Generate summaries for each persona
        summaries = {}
        for persona, template in self.persona_prompts.items():
            try:
                logger.info(f"Generating summary for: {persona}")

                chain = template | self.llm | StrOutputParser()
                summary = chain.invoke({"context": context})
                summaries[persona] = summary

                # Add a small delay to avoid hitting API rate limits
                time.sleep(30)

            except Exception as e:
                logger.error(f"Error generating summary for {persona}: {e}")
                summaries[persona] = f"Error generating summary: {str(e)}"
        
        return summaries, context
    

    def evaluate_summaries(self, summaries, context):
        """ Evaluate all generated summaries."""
        evaluations = {}

        for persona_name, summary in summaries.items():
            logger.info(f"Evaluating summary for {persona_name}")
            evaluation = self.evaluator.evaluate(summary, context, persona_name)
            evaluations[persona_name] = evaluation
            
            # Add a small delay to avoid hitting API rate limits
            time.sleep(30)
        
        return evaluations
    

    def summarize_paper(self, source, query = "Summarize this research paper"):
        """Complete process: load, preprocess, summarize, and evaluate."""
        try:
            logger.info(f"Starting paper summarizarion for {source}")

            # Process document
            processed_docs, processing_info = self.process_document(source)

            # Generate summaries
            summaries, context = self.generate_summaries(query)

            # Evaluate summaries
            evaluations = self.evaluate_summaries(summaries, context)

            # Compile results
            results = {
                "source": source,
                "query": query,
                "processing_info": processing_info,
                "context_length": len(context),
                "summaries": summaries,
                "evaluations": evaluations,
                "personas": list(summaries.keys())
            }

            logger.info("Paper summarization completed successfully.")
            return results
        
        except Exception as e:
            logger.error(f"Error in summarization process: {e}")
            raise

print("PaperSummarizer class defined")

PaperSummarizer class defined


### 6. Create get_paper_summary Function

In [10]:
def get_paper_summary(source:str, query:str = "Summarize this research paper") -> dict:
    """ Summarizes a research paper from a given source."""
    try:
        summarizer = PaperSummarizer(default_llm)
        results = summarizer.summarize_paper(source, query)
        return results
    except ValueError as ve:
        logger.error(f"Configuration error: {ve}")
        return {"error": str(ve)}
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        return {"error": str(e)}

print("get_paper_summary function defined.")

get_paper_summary function defined.


## 3. Generating Summaries for Personas

In [11]:
# Local PDF file
source = "Attention is All You Need.pdf"

# ArXiv paper by ID
# source = "arxiv:1706.03762" # ArXiv ID for Attention is All You Need Paper

# ArXiv URL
# source = "https://arxiv.org/abs/1706.03762" # URL for Attention is All You Need Paper

# Custom query for focused summarization
query = "Summarize this research paper"

# Specify the LLM model
llm_model = llm_model

print(f"Attempting to summarize {source} with model {llm_model} and query '{query}'")

try:
    # Run summarization
    results = get_paper_summary(source, query)
    
    if "error" in results:
        print(f"\n--- ERROR ---")
        print(f"An error occurred during summarization: {results['error']}")
    else:
        # Print results
        print("\n" + "="*80)
        print("PAPER SUMMARIZATION RESULTS")
        print("="*80)
        
        print(f"\nSource: {results['source']}")
        print(f"Query: {results['query']}")
        print(f"LLM Model Used: {llm_model}")
        print(f"Processing: {results['processing_info']}")
        print(f"Context Length: {results['context_length']:,} characters")
        
        print(f"\n{'-'*60}")
        print("PERSONA SUMMARIES")
        print("-"*60)
        
        for persona in results['personas']:
            print(f"\n--- {persona.upper()} ---")
            print(results['summaries'][persona])
        
        print(f"\n{'-'*60}")
        print("SUMMARY EVALUATIONS")
        print("-"*60)
        
        for persona in results['personas']:
            print(f"\n--- {persona.upper()} EVALUATION ---")
            print(results['evaluations'][persona])
            
except Exception as e:
    print(f"\n--- CRITICAL ERROR ---")
    print(f"An unexpected critical error occurred: {e}")

INFO:__main__:Initialized PaperSummarizer with model claude-3-7-sonnet-20250219
INFO:__main__:Starting paper summarizarion for Attention is All You Need.pdf
INFO:__main__:Loading document from Attention is All You Need.pdf
INFO:root:Detected PDF file, using PyPDFLoader for Attention is All You Need.pdf


Attempting to summarize Attention is All You Need.pdf with model claude-3-7-sonnet-20250219 and query 'Summarize this research paper'


INFO:__main__:Successfully loaded 15 document(s) from Attention is All You Need.pdf
INFO:__main__:Starting document preprocessing...
INFO:__main__:Finished preprocessing. Original documents: 15, Preprocessed documents: 15
INFO:__main__:Creating sentence windows....
INFO:__main__:Created 373 sentence windows.
INFO:__main__:Building vector store from sentence windows...
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.
INFO:faiss:Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.
INFO:__main__:Vector store built successfully.
INFO:__main__:Retrieved context length: 61788 characters
INFO:__main__:Generating summary for: Data Scientist
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:__main__:Generating summary for: AI Engineer
INFO:httpx:HTTP Request: POST https://api


PAPER SUMMARIZATION RESULTS

Source: Attention is All You Need.pdf
Query: Summarize this research paper
LLM Model Used: claude-3-7-sonnet-20250219
Processing: Processed 373 sentence windows
Context Length: 61,788 characters

------------------------------------------------------------
PERSONA SUMMARIES
------------------------------------------------------------

--- DATA SCIENTIST ---
# Comprehensive Summary of "Attention Is All You Need"

## 1. Main Problem/Research Question
The paper addresses limitations in sequence transduction models that rely on complex recurrent or convolutional neural networks with encoder-decoder architectures. Traditional models face challenges in learning long-distance dependencies and have limited parallelization capabilities during training. The authors investigate whether attention mechanisms alone can create effective sequence models without using recurrence or convolution.

## 2. Proposed Approach/Solution
The authors introduce the Transformer, a nove