# Arxiv PDF Summarization Bot

## Setup

In [1]:
import arxiv
import io
import anthropic
import os
from dotenv import load_dotenv
import base64
import requests
from tqdm import tqdm
import PyPDF2
import re

import weave
from arxiv_models import convert_raw_arxiv_to_pydantic
import filetype
from PIL import Image
import io
from pdf2image import convert_from_bytes
import PyPDF2
from datetime import datetime, timezone
from arxiv_models import ArxivPaper, Author, Link


In [2]:
# Load environment variables
load_dotenv()

True

In [3]:
weave.init("arxiv-papers-anthropic-testv2-4")

weave version 0.50.9 is available!  To upgrade, please run:
 $ pip install weave --upgrade
Logged in as Weights & Biases user: a-sh0ts.
View Weave data at https://wandb.ai/a-sh0ts/arxiv-papers-anthropic-testv2-4/weave


<weave.weave_client.WeaveClient at 0x167997f10>

In [4]:
# Initialize Anthropic anthropic_client
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

## (Optional) Fetch Arxiv Papers

In [5]:
@weave.op()
def generate_arxiv_query_args(instruction, model="claude-3-sonnet-20240229"):
    tools = [{
        "name": "prepare_arxiv_search",
        "description": "Prepare arguments for ArXiv paper search. This tool generates an optimal query string utilizing Boolean operators, field-specific syntax, and precise search terms. It also determines an efficient maximum number of results to fetch, balancing comprehensive coverage with processing efficiency. The output is tailored to the given research instruction, aiming to provide relevant and focused search results.",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The ArXiv search query string. Supports Boolean operators (AND, OR, NOT), field-specific syntax (e.g., 'ti:' for title, 'au:' for author), quotation marks for exact phrases, and wildcards. Can include multiple search terms to refine results based on title, abstract, authors, comments, journal reference, subject category, or report number."
                },
                "max_results": {
                    "type": "integer",
                    "description": "The maximum number of paper results to return from the ArXiv search. Aims to minimize the number of results while ensuring sufficient coverage of the topic. Defaults to 5 if not specified. Increasing this value broadens the search but may increase processing time and resource usage. Aim to be below 10 articles."
                }
            },
            "required": ["query", "max_results"]
        }
    }]

    system_prompt = """You are an expert at generating ArXiv queries. Use the prepare_arxiv_search tool to create an optimal query and determine the appropriate maximum number of results for the given research question. The query should utilize advanced search techniques including Boolean operators, field-specific syntax, and precise terms to ensure comprehensive yet focused results."""

    messages = [
        {
            "role": "user",
            "content": f"Use the prepare_arxiv_search tool to generate an optimal ArXiv query and determine the maximum number of results for the following research instruction: {instruction}"
        }
    ]

    response = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=messages,
        system=system_prompt,
        tools=tools
    )

    # Extract the query and max_results from the response
    for content in response.content:
        if content.type == 'tool_use' and content.name == 'prepare_arxiv_search':
            args = content.input
            return args.get('query'), args.get('max_results')

    # If no tool use was found, return a default query and the provided max_results
    return f"{instruction}", 5

In [6]:
# instruction = "Answer the following question: What are the latest advancements in audio music information retrieval?"
# arxiv_query, max_results = generate_arxiv_query_args(instruction)
# print(f"ArXiv query: {arxiv_query}")
# print(f"Max results: {max_results}")

In [7]:
@weave.op()
def fetch_arxiv_papers(query, max_results=5):
    # Initialize the arxiv Client
    arxiv_client = arxiv.Client()
    
    # Create the search object
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending
    )
    
    # Fetch the results using client.results() and convert them to ArxivPaper objects
    papers = []
    for result in arxiv_client.results(search):
        paper = convert_raw_arxiv_to_pydantic(result)
        papers.append(paper)
    
    return papers

In [8]:
# arxiv_papers = fetch_arxiv_papers(arxiv_query)

## Create a sample Arxiv paper object

In [9]:
arxiv_paper = ArxivPaper(
    entry_id="http://arxiv.org/abs/2406.04744v1",
    updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
    published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
    title="CRAG -- Comprehensive RAG Benchmark",
    authors=[
        Author(full_name="Xiao Yang"),
        Author(full_name="Kai Sun"),
        Author(full_name="Hao Xin"),
        Author(full_name="Yushi Sun"),
        Author(full_name="Nikita Bhalla"),
        Author(full_name="Xiangsen Chen"),
        Author(full_name="Sajal Choudhary"),
        Author(full_name="Rongze Daniel Gui"),
        Author(full_name="Ziran Will Jiang"),
        Author(full_name="Ziyu Jiang"),
        Author(full_name="Lingkun Kong"),
        Author(full_name="Brian Moran"),
        Author(full_name="Jiaqi Wang"),
        Author(full_name="Yifan Ethan Xu"),
        Author(full_name="An Yan"),
        Author(full_name="Chenyu Yang"),
        Author(full_name="Eting Yuan"),
        Author(full_name="Hanwen Zha"),
        Author(full_name="Nan Tang"),
        Author(full_name="Lei Chen"),
        Author(full_name="Nicolas Scheffer"),
        Author(full_name="Yue Liu"),
        Author(full_name="Nirav Shah"),
        Author(full_name="Rakesh Wanga"),
        Author(full_name="Anuj Kumar"),
        Author(full_name="Wen-tau Yih"),
        Author(full_name="Xin Luna Dong")
    ],
    summary="Retrieval-Augmented Generation (RAG) has recently emerged as a promising solution to alleviate Large Language Model (LLM)'s deficiency in lack of knowledge. Existing RAG datasets, however, do not adequately represent the diverse and dynamic nature of real-world Question Answering (QA) tasks. To bridge this gap, we introduce the Comprehensive RAG Benchmark (CRAG), a factual question answering benchmark of 4,409 question-answer pairs and mock APIs to simulate web and Knowledge Graph (KG) search. CRAG is designed to encapsulate a diverse array of questions across five domains and eight question categories, reflecting varied entity popularity from popular to long-tail, and temporal dynamisms ranging from years to seconds. Our evaluation on this benchmark highlights the gap to fully trustworthy QA. Whereas most advanced LLMs achieve <=34% accuracy on CRAG, adding RAG in a straightforward manner improves the accuracy only to 44%. State-of-the-art industry RAG solutions only answer 63% questions without any hallucination. CRAG also reveals much lower accuracy in answering questions regarding facts with higher dynamism, lower popularity, or higher complexity, suggesting future research directions. The CRAG benchmark laid the groundwork for a KDD Cup 2024 challenge, attracting thousands of participants and submissions within the first 50 days of the competition. We commit to maintaining CRAG to serve research communities in advancing RAG solutions and general QA solutions.",
    comment="",
    journal_ref=None,
    doi="10.48550/arXiv.2406.04744",
    primary_category="cs.CL",
    categories=["cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2406.04744", title="Abstract", rel="alternate", content_type=None),
        Link(href="https://arxiv.org/pdf/2406.04744", title="pdf", rel="related", content_type=None)
    ],
    pdf_url="https://arxiv.org/pdf/2406.04744"
)

In [10]:
# arxiv_paper.pdf_url

In [11]:
def load_pdf(arxiv_result):
    pdf_url = arxiv_result["pdf_url"]
    response = requests.get(pdf_url)
    pdf_file = io.BytesIO(response.content)
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    return pdf_reader

## Convert Images to Text using Sonnet's vision capabilities

Note: If we can't directly extract the image (in the case of SVGs or other vector graphics), we need to convert the page to an image first.
Then we just ask the LLM to explain only the images on the page and to ignore the text

In [12]:
def convert_vector_graphic_page_to_image(pdf_page, scale_factor=0.5):
    def get_object(obj):
        if isinstance(obj, PyPDF2.generic.IndirectObject):
            return obj.get_object()
        return obj

    resources = get_object(pdf_page.get('/Resources', {}))
    xobject = get_object(resources.get('/XObject', {}))
    
    # Check if there's a figure that's not an image
    if xobject:
        for obj in xobject.values():
            obj = get_object(obj)
            if isinstance(obj, dict) and obj.get('/Subtype') == '/Form':  # This indicates a vector graphic
                # Convert the page to a PIL Image
                pdf_bytes = io.BytesIO()
                pdf_writer = PyPDF2.PdfWriter()
                pdf_writer.add_page(pdf_page)
                pdf_writer.write(pdf_bytes)
                pdf_bytes.seek(0)
                
                # Convert PDF to image
                images = convert_from_bytes(pdf_bytes.getvalue(), fmt='png')
                
                if images:
                    image = images[0]
                    # Resize the image
                    new_size = (int(image.width * scale_factor), int(image.height * scale_factor))
                    image = image.resize(new_size, Image.LANCZOS)
                    img_byte_arr = io.BytesIO()
                    image.save(img_byte_arr, format='PNG')
                    img_byte_arr = img_byte_arr.getvalue()
                    img_str = base64.b64encode(img_byte_arr).decode("utf-8")
                    data_url = f"data:image/png;base64,{img_str}"
                    return data_url
    
    return None  # Return None if no conversion was needed

In [13]:
# # Usage example:
# pdf_reader = load_pdf(arxiv_paper)
# page = pdf_reader.pages[3]
# image = convert_vector_graphic_page_to_image(page)
# if image:
#     # Process the image as needed
#     print("Image converted successfully")
# else:
#     print("No vector graphics found or conversion failed")

In [14]:
@weave.op()
def process_figure_image(data_url, model="claude-3-5-sonnet-20240620"):
    """Process image data and return a detailed technical description."""
    img_str = data_url.split(",")[1]

    response = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": img_str,
                        },
                    },
                    {
                        "type": "text",
                        "text": """Analyze this image as if it's a figure from a scientific research paper. Provide a detailed technical description addressing the following:

1. Type of figure (e.g., graph, diagram, flowchart, experimental setup)
2. Key components or variables represented
3. Relationships or trends depicted
4. Quantitative information (if present)
5. Methodology or process illustrated (if applicable)
6. Potential implications or conclusions that can be drawn
7. Any limitations or assumptions evident in the figure

Focus on technical accuracy and relevance to scientific research. Avoid general descriptions and concentrate on the specific scientific content presented.""",
                    },
                ],
            }
        ],
    )
    return response.content[0].text

In [15]:
@weave.op()
def process_vector_image_pdf(data_url, model="claude-3-5-sonnet-20240620"):
    img_str = data_url.split(",")[1]

    response = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": img_str,
                        },
                    },
                    {
                        "type": "text",
                        "text": """This image is a full page from a scientific paper PDF, converted to PNG format. It may contain one or more vector graphic figures or charts. Your task is to:

1. Identify and focus solely on the vector graphic figures or charts within the page.
2. For each identified figure or chart, provide a detailed technical analysis addressing:

   a. Type of figure (e.g., graph, diagram, flowchart)
   b. Key components or variables represented
   c. Relationships or trends depicted
   d. Quantitative information (if present)
   e. Methodology or process illustrated (if applicable)
   f. Potential implications or conclusions that can be drawn

3. Ignore any text or other elements on the page that are not part of the vector graphic figures.
4. If multiple figures are present, analyze each separately and clearly indicate which figure you are describing.

Focus on providing accurate, technical descriptions of the vector graphic content only.""",
                    },
                ],
            }
        ],
    )
    return response.content[0].text

In [16]:
@weave.op()
def extract_images(paper, model="claude-3-5-sonnet-20240620"):
    """Extract text and images from PDF content."""

    pdf_reader = load_pdf(paper)
    all_images = []

    for page in pdf_reader.pages:
        images = []

        for image in page.images:
            img_data = image.data
            kind = filetype.guess(img_data)
            if kind is None:
                print(f"Cannot guess file type!")
                continue
            
            img_str = base64.b64encode(img_data).decode("utf-8")
            data_url = f"data:{kind.mime};base64,{img_str}"
            try:
                images.append(
                    {"image": data_url, "description": process_figure_image(data_url, model=model)}
                )
            except Exception as e:
                print(f"Error processing image: {e}")
                images.append({"image": data_url, "description": ""})
        
        vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)
        if vector_graphics_image_data_url:
            images.append({"image": vector_graphics_image_data_url, "description": process_vector_image_pdf(vector_graphics_image_data_url, model=model)})
        all_images.append(images)

    return all_images

In [17]:
# extracted_images = extract_images(arxiv_paper)
# extracted_images

In [18]:
@weave.op()
def replace_images_with_descriptions(paper, images):
    pdf_reader = load_pdf(paper)
    text = ""
    for page_num, page in enumerate(pdf_reader.pages):
        text += page.extract_text() + "\n\n"
        if images[page_num] and len(images[page_num]) > 0:
            text += f"\n\n[Image Descriptions for page {page_num+1}]\n"
            for image_num, image in enumerate(images[page_num]):
                text += f"\n[Image {image_num+1}]: {image['description']}\n"
            text += "[END OF IMAGE DESCRIPTIONS]\n"

    return text

In [19]:
# cleaned_text = replace_images_with_descriptions(arxiv_paper, extracted_images)
# cleaned_text[:500]

## Augmented Chain of Density Summarization
1. Chunk and iteratively summarize the text
2. Iteratively refine the final chunk-based summary
3. Do one final pass of summarization to refine the density of the final summary

In [20]:
#TODO: Incorporate the question in the summary creation process instead of just using it to create the final summary
@weave.op()
def chain_of_density_summarization(instruction, text, model="claude-3-5-sonnet-20240620", chunk_size=4000, chunk_iterations=2, density_iterations=2):
    """Apply Chain of Density summarization to the text with embedded image descriptions."""
    
    @weave.op()
    def chunk_text(text, chunk_size=4000):
        chunks = []
        current_chunk = ""
        lines = text.split('\n')
        
        i = 0
        while i < len(lines):
            line = lines[i]
            if len(current_chunk) + len(line) > chunk_size:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
            
            current_chunk += line + "\n"
            
            # Check if this line starts an image description section
            if line.startswith("[Image Descriptions for page"):
                # If we have content before this, add it as a chunk
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
                
                # Collect all image descriptions for this page
                image_descriptions = line + "\n"
                i += 1
                while i < len(lines) and not lines[i].startswith("[END OF IMAGE DESCRIPTIONS]"):
                    image_descriptions += lines[i] + "\n"
                    i += 1
                if i < len(lines):
                    image_descriptions += lines[i] + "\n"
                
                # Add image descriptions as a separate chunk
                chunks.append(image_descriptions.strip())
                current_chunk = ""
            else:
                i += 1
        
        if current_chunk:
            chunks.append(current_chunk.strip())
    
        # Combine chunks until they reach the defined chunk_size
        combined_chunks = []
        current_combined_chunk = ""
        for chunk in chunks:
            if len(current_combined_chunk) + len(chunk) <= chunk_size:
                current_combined_chunk += chunk + "\n\n"
            else:
                if current_combined_chunk:
                    combined_chunks.append(current_combined_chunk.strip())
                current_combined_chunk = chunk + "\n\n"
        
        if current_combined_chunk:
            combined_chunks.append(current_combined_chunk.strip())

        return combined_chunks
    
    # Split the document into chunks
    chunks = chunk_text(text, chunk_size)
    print(f"Number of chunks: {len(chunks)}")
    print(f"Chunk sizes: {[len(chunk) for chunk in chunks]}")
            
    @weave.op()
    def summarize_chunk(chunk, instruction, current_summary="", iteration=1):
        prompt = f"""Current summary:
        {current_summary}

        New information:
        {chunk}

        Instruction to focus on: {instruction}

        Iteration: {iteration}

        Create an extremely dense, highly technical summary that specifically addresses the given instruction. Follow these steps:

        1. Identify 3-5 key technical points from the new information that are directly relevant to the instruction, prioritizing:
        - Novel methodologies or algorithms related to the instruction
        - Specific quantitative results or metrics that address the instruction
        - Detailed experimental setups or parameters pertinent to the instruction
        - Precise definitions of domain-specific concepts mentioned in the instruction
        - Critical limitations or assumptions in the research that affect the instruction

        2. Integrate these points with the current summary, ensuring:
        - Direct relevance to the instruction at hand
        - No redundancy or oversimplification
        - Preservation of technical nuances and complexities specific to the instruction
        - Inclusion of relevant equations, formulas, or mathematical notations that help address the instruction
        - Accurate representation of statistical significance and error margins for instruction-related data

        3. Rephrase the combined information to maximize information density while maintaining focus on the instruction:
        - Use domain-specific terminology and jargon without simplification, as relevant to the instruction
        - Maintain the level of detail expected in a PhD-level discourse on the specific topic of the instruction
        - Incorporate precise citations or references where applicable to support the response
        - Preserve any conflicting viewpoints or ongoing debates in the field that relate to the instruction

        4. With each iteration, aim to increase information density by 30-40% without sacrificing technical accuracy or critical details that address the instruction.

        5. Ensure the summary includes instruction-specific:
        - Methodological details (e.g., exact algorithms, parameter settings) that are crucial to addressing the instruction
        - Precise quantitative results with appropriate units and error bounds that directly relate to the instruction
        - Detailed descriptions of novel techniques or approaches that are key to addressing the instruction
        - Critical analysis of strengths and limitations in the research as they pertain to the instruction

        Produce a summary that is significantly more information-dense and technically precise than the previous one, while remaining laser-focused on addressing the given instruction. Use language appropriate for a highly specialized audience in the field."""

        response = anthropic_client.messages.create(
            model=model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
            )
        return response.content[0].text
    
    @weave.op()
    def summarize_current_summary(instruction, current_summary="", iteration=1):
        prompt = f"""Current summary:
        {current_summary}

        Instruction to focus on: {instruction}

        Iteration: {iteration}

        Generate an increasingly concise, entity-dense, and highly technical summary of the above text that specifically addresses the given instruction.

        Follow these steps:
        1. Identify 1-3 informative technical Entities from the original text which are missing from the current summary and are relevant to the instruction. These entities should be:
        - Highly relevant to addressing the specific instruction
        - Specific and technical (preferably 5 words or fewer)
        - Novel (not in the current summary)
        - Faithful (present in the original text)
        - May include methodologies, algorithms, metrics, or key findings that directly relate to the instruction

        2. Write a new, denser summary of identical length which covers every entity and technical detail from the current summary plus the newly identified Missing Entities, while maintaining focus on addressing the instruction.

        Guidelines:
        - Prioritize technical accuracy and specificity over general readability, always in the context of the given instruction.
        - Make every word count: rewrite the current summary to improve information density and make space for additional technical entities that are relevant to the instruction.
        - Use domain-specific terminology, precise quantitative information, and technical jargon where appropriate and relevant to addressing the instruction.
        - Employ fusion, compression, and removal of less informative phrases to increase density, while ensuring all information pertains to the instruction.
        - Ensure the summary remains highly dense and technical, yet self-contained and focused on the instruction.
        - Never drop entities or technical details from the current summary that are relevant to the instruction. If space is limited, add fewer new entities.
        - Maintain the exact same word count as the current summary.

        Produce a summary that is more information-dense and technically precise than the previous one, suitable for an expert audience in the field, while remaining laser-focused on addressing the given instruction."""

        response = anthropic_client.messages.create(
            model=model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text



    @weave.op()
    def summarize_chunk_summaries(instruction, current_summary, chunk_summaries):
        # Final densification step
        return anthropic_client.messages.create(
            model="claude-3-opus-20240229", #Ensure it has a long context window
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": f"""Given this current summary:

        {current_summary}

        And these chunk summaries:

        {' '.join(chunk_summaries)}

        And this instruction to focus on:

        {instruction}

        Create an extremely dense, final summary that refines the current summary by incorporating key information from the chunk summaries, while specifically addressing the given instruction. Follow these guidelines:

        1. Integrate the most relevant and important information from the chunk summaries into the current summary.
        2. Ensure all key technical content from both the current summary and chunk summaries that relates to the instruction is retained.
        3. Aim to reduce overall length by 30-40% while increasing information density.
        4. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
        5. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
        6. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
        7. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
        8. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints from across all summaries.
        9. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.

        The final summary should be a highly concentrated, technical distillation of all provided summaries that specifically addresses the given instruction, suitable for specialists in the field.""",
                    }
                ],
        ).content[0].text


    @weave.op()
    def summarize_chunk_iteration(chunks, instruction, current_summary, iteration):
        chunk_summaries = []
        for i, chunk in enumerate(chunks, 1):
            current_summary = summarize_chunk(chunk, instruction, current_summary, iteration)
            chunk_summaries.append(current_summary)
            print(f"Iteration {iteration}, Chunk {i}:\n{current_summary}\n")
        current_summary = summarize_chunk_summaries(instruction, current_summary, chunk_summaries)
        print(f"Iteration {iteration}, Final Summary:\n{current_summary}\n")
        return current_summary, chunk_summaries

    @weave.op()
    def iterative_chunk_summarization(chunks, instruction, current_summary, chunk_iterations):
        chunk_iteration_summaries = []
        chunk_summaries = []
        for iteration in range(1, chunk_iterations + 1):
            current_summary, iteration_chunk_summaries = summarize_chunk_iteration(chunks, instruction, current_summary, iteration)
            chunk_iteration_summaries.append(current_summary)
            chunk_summaries.append(iteration_chunk_summaries)
        return current_summary, chunk_iteration_summaries, chunk_summaries

    current_summary, chunk_iteration_summaries, chunk_summaries = iterative_chunk_summarization(chunks, instruction, "", chunk_iterations)

    @weave.op()
    def iterative_density_summarization(instruction, current_summary, density_iterations):
        iteration_summaries = []
        for iteration in range(1, density_iterations + 1):
            current_summary = summarize_current_summary(instruction, current_summary, iteration)
            iteration_summaries.append(current_summary)
            print(f"Iteration {iteration}:\n{current_summary}\n")
        return current_summary, iteration_summaries

    current_summary, iteration_summaries = iterative_density_summarization(instruction, current_summary, density_iterations)

    @weave.op()
    def final_summary(instruction, current_summary):
        # Final densification step
        return anthropic_client.messages.create(
            model=model,
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": f"""Given this summary:

    {current_summary}

    And this instruction to focus on:

    {instruction}

    Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction. Follow these guidelines:

    1. Aim to reduce length by 30-40% while retaining all critical technical content relevant to the instruction.
    2. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
    3. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
    4. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
    5. Ensure that all key entities and concepts from the original summary that relate to the instruction are represented.
    6. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
    7. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints.
    8. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.

    The final summary should be a highly concentrated, technical distillation of the research that specifically addresses the given instruction, suitable for specialists in the field.""",
                }
            ],
        ).content[0].text

    final_summary = final_summary(instruction, current_summary)
    print(f"Final Summary:\n{final_summary}\n")

    return {
        "final_summary": final_summary,
        "accumulated_summary": current_summary,
        "iteration_summaries": iteration_summaries,
        "chunk_iteration_summaries": chunk_iteration_summaries,
        "chunk_summaries": chunk_summaries 
    }

In [21]:
question = "Explain the unique evaluation value props this RAG benchmark provides to AI Engineers."

In [22]:
# summaries = chain_of_density_summarization(question, cleaned_text)
# print(summaries["final_summary"])

## Create a Weave Model Object to better serialize the model for experimentation

In [23]:
class ArxivChainOfDensityPipeline(weave.Model):

    model: str = "claude-3-5-sonnet-20240620"
    chunk_size: int = 20000
    chunk_iterations: int = 1
    density_iterations: int = 3

    def __init__(self, model: str = "claude-3-5-sonnet-20240620", chunk_size: int = 4000, chunk_iterations: int = 1, density_iterations: int = 3):
        super().__init__()
        self.model = model
        self.chunk_size = chunk_size
        self.chunk_iterations = chunk_iterations
        self.density_iterations = density_iterations

    @weave.op()
    def predict(self, paper: ArxivPaper, instruction: str) -> dict:
        extracted_images = extract_images(paper)
        cleaned_text = replace_images_with_descriptions(paper, extracted_images)
        return chain_of_density_summarization(instruction, cleaned_text, model=self.model, chunk_size=self.chunk_size, chunk_iterations=self.chunk_iterations, density_iterations=self.density_iterations)

In [24]:
arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline()
# arxiv_chain_of_density_pipeline.predict(arxiv_paper, "Determine how I would best incorporate these benchmarks for my customer support RAG system. What evaluations would work best specifically for me?")

## Run and evaluate the experiments!

In [25]:
arxiv_paper1 = ArxivPaper(
    entry_id="http://arxiv.org/abs/2405.05904",
    updated=datetime(2024, 5, 13, 7, 29, 58, tzinfo=timezone.utc),
    published=datetime(2024, 5, 9, 17, 0, 22, tzinfo=timezone.utc),
    title="Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?",
    authors=[
        Author(full_name="Zorik Gekhman"),
        Author(full_name="Gal Yona"),
        Author(full_name="Roee Aharoni"),
        Author(full_name="Matan Eyal"),
        Author(full_name="Amir Feder"),
        Author(full_name="Roi Reichart"),
        Author(full_name="Jonathan Herzig")
    ],
    summary=("When large language models are aligned via supervised fine-tuning, they may encounter new factual information "
             "that was not acquired through pre-training. It is often conjectured that this can teach the model the behavior "
             "of hallucinating factually incorrect responses, as the model is trained to generate facts that are not grounded "
             "in its pre-existing knowledge. In this work, we study the impact of such exposure to new knowledge on the capability "
             "of the fine-tuned model to utilize its pre-existing knowledge. To this end, we design a controlled setup, focused on "
             "closed-book QA, where we vary the proportion of the fine-tuning examples that introduce new knowledge. We demonstrate "
             "that large language models struggle to acquire new factual knowledge through fine-tuning, as fine-tuning examples that "
             "introduce new knowledge are learned significantly slower than those consistent with the model's knowledge. However, we "
             "also find that as the examples with new knowledge are eventually learned, they linearly increase the model's tendency "
             "to hallucinate. Taken together, our results highlight the risk in introducing new factual knowledge through fine-tuning, "
             "and support the view that large language models mostly acquire factual knowledge through pre-training, whereas fine-tuning "
             "teaches them to use it more efficiently."),
    comment=None,
    journal_ref=None,
    doi="10.48550/arXiv.2405.05904",
    primary_category="cs.CL",
    categories=["cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2405.05904", title="Abstract", rel="alternate"),
        Link(href="https://arxiv.org/pdf/2405.05904", title="pdf", rel="related")
    ],
    pdf_url="https://arxiv.org/pdf/2405.05904"
)

In [26]:
arxiv_paper2 = ArxivPaper(
    entry_id="http://arxiv.org/abs/2404.11018",
    updated=datetime(2024, 5, 22, 17, 6, 10, tzinfo=timezone.utc),
    published=datetime(2024, 4, 17, 2, 49, 26, tzinfo=timezone.utc),
    title="Many-Shot In-Context Learning",
    authors=[
        Author(full_name="Rishabh Agarwal"),
        Author(full_name="Avi Singh"),
        Author(full_name="Lei M. Zhang"),
        Author(full_name="Bernd Bohnet"),
        Author(full_name="Luis Rosias"),
        Author(full_name="Stephanie Chan"),
        Author(full_name="Biao Zhang"),
        Author(full_name="Ankesh Anand"),
        Author(full_name="Zaheer Abbas"),
        Author(full_name="Azade Nova"),
        Author(full_name="John D. Co-Reyes"),
        Author(full_name="Eric Chu"),
        Author(full_name="Feryal Behbahani"),
        Author(full_name="Aleksandra Faust"),
        Author(full_name="Hugo Larochelle")
    ],
    summary=("Large language models (LLMs) excel at few-shot in-context learning (ICL) -- learning from a few examples provided in context at inference, "
             "without any weight updates. Newly expanded context windows allow us to investigate ICL with hundreds or thousands of examples -- the many-shot regime. "
             "Going from few-shot to many-shot, we observe significant performance gains across a wide variety of generative and discriminative tasks. While promising, "
             "many-shot ICL can be bottlenecked by the available amount of human-generated examples. To mitigate this limitation, we explore two new settings: Reinforced "
             "and Unsupervised ICL. Reinforced ICL uses model-generated chain-of-thought rationales in place of human examples. Unsupervised ICL removes rationales from the "
             "prompt altogether, and prompts the model only with domain-specific questions. We find that both Reinforced and Unsupervised ICL can be quite effective in the "
             "many-shot regime, particularly on complex reasoning tasks. Finally, we demonstrate that, unlike few-shot learning, many-shot learning is effective at overriding "
             "pretraining biases, can learn high-dimensional functions with numerical inputs, and performs comparably to fine-tuning. Our analysis also reveals the limitations "
             "of next-token prediction loss as an indicator of downstream ICL performance."),
    comment=None,
    journal_ref=None,
    doi="10.48550/arXiv.2404.11018",
    primary_category="cs.LG",
    categories=["cs.LG", "cs.AI", "cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2404.11018", title="Abstract", rel="alternate"),
        Link(href="https://arxiv.org/pdf/2404.11018", title="pdf", rel="related")
    ],
    pdf_url="https://arxiv.org/pdf/2404.11018"
)

In [27]:
arxiv_paper3 = ArxivPaper(
    entry_id="http://arxiv.org/abs/2406.18403",
    updated=datetime(2024, 6, 26, 14, 56, 13, tzinfo=timezone.utc),
    published=datetime(2024, 6, 26, 14, 56, 13, tzinfo=timezone.utc),
    title="LLMs instead of Human Judges? A Large Scale Empirical Study across 20 NLP Evaluation Tasks",
    authors=[
        Author(full_name="Anna Bavaresco"),
        Author(full_name="Raffaella Bernardi"),
        Author(full_name="Leonardo Bertolazzi"),
        Author(full_name="Desmond Elliott"),
        Author(full_name="Raquel Fernández"),
        Author(full_name="Albert Gatt"),
        Author(full_name="Esam Ghaleb"),
        Author(full_name="Mario Giulianelli"),
        Author(full_name="Michael Hanna"),
        Author(full_name="Alexander Koller"),
        Author(full_name="André F. T. Martins"),
        Author(full_name="Philipp Mondorf"),
        Author(full_name="Vera Neplenbroek"),
        Author(full_name="Sandro Pezzelle"),
        Author(full_name="Barbara Plank"),
        Author(full_name="David Schlangen"),
        Author(full_name="Alessandro Suglia"),
        Author(full_name="Aditya K Surikuchi"),
        Author(full_name="Ece Takmaz"),
        Author(full_name="Alberto Testoni")
    ],
    summary=("There is an increasing trend towards evaluating NLP models with LLM-generated judgments instead of human judgments. "
             "In the absence of a comparison against human data, this raises concerns about the validity of these evaluations; in case they are conducted with proprietary models, "
             "this also raises concerns over reproducibility. We provide JUDGE-BENCH, a collection of 20 NLP datasets with human annotations, and comprehensively evaluate 11 current LLMs, "
             "covering both open-weight and proprietary models, for their ability to replicate the annotations. Our evaluations show that each LLM exhibits a large variance across datasets in its correlation to human judgments. "
             "We conclude that LLMs are not yet ready to systematically replace human judges in NLP."),
    comment=None,
    journal_ref=None,
    doi="10.48550/arXiv.2406.18403",
    primary_category="cs.CL",
    categories=["cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2406.18403", title="Abstract", rel="alternate"),
        Link(href="https://arxiv.org/pdf/2406.18403", title="pdf", rel="related")
    ],
    pdf_url="https://arxiv.org/pdf/2406.18403"
)

In [28]:
arxiv_paper1.pdf_url

'https://arxiv.org/pdf/2405.05904'

In [29]:
eval_papers = [
    arxiv_paper1,
    # arxiv_paper2,
    # arxiv_paper3
]

In [30]:
eval_instructions = [
    "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.",
    # "Analyze the experimental setup, results, and limitations of this study, highlighting any statistical significance and error margins.",
    # "Compare this paper's approach to existing methods in the field, explaining how it addresses current challenges or limitations."
]

In [31]:
from itertools import product

In [32]:
eval_data = list(product(eval_papers, eval_instructions))
print(len(eval_data))

1


In [33]:
dataset = weave.Dataset(name="we-paper-reading-eval-data", rows=[{"paper": arxiv_paper, "instruction": instruction, "summary": arxiv_paper.summary} for arxiv_paper, instruction in eval_data])

In [34]:
weave.publish(dataset)

📦 Published to https://wandb.ai/a-sh0ts/arxiv-papers-anthropic-testv2-4/weave/objects/we-paper-reading-eval-data/versions/zL0RIFyIYA6l2OnpDrqNd18VextVr8UxtsNXhlJyYj0


ObjectRef(entity='a-sh0ts', project='arxiv-papers-anthropic-testv2-4', name='we-paper-reading-eval-data', digest='zL0RIFyIYA6l2OnpDrqNd18VextVr8UxtsNXhlJyYj0', extra=[])

In [35]:
import json

In [36]:
from openai import OpenAI

In [37]:
@weave.op()
def quality_scorer(instruction, model_output, model="gpt-4o"):
    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    
    def score_summary(summary, summary_type):
        prompt = f"""Evaluate the quality of the following {summary_type} based on how well it addresses the given instruction. Use the scoring rules below to calculate a numerical score between 0 and 10.

Instruction: {instruction}

{summary_type}:
{summary}

Scoring Rules:
1. Start with a base score of 5 points.
2. Relevance to instruction: Add up to 2 points for high relevance, subtract up to 2 points for low relevance.
3. Technical accuracy and depth: Add up to 1 point for high accuracy/depth, subtract up to 1 point for low accuracy/depth.
4. Conciseness and information density: Add up to 1 point for high density, subtract up to 1 point for verbosity.
5. Use of domain-specific terminology: Add up to 0.5 points for appropriate use, subtract up to 0.5 points for lack of use.
6. Inclusion of key methodologies, metrics, or findings: Add up to 0.5 points for comprehensive inclusion, subtract up to 0.5 points for missing key elements.
7. Preservation of critical nuances and complexities: Add up to 0.5 points for preserving nuances, subtract up to 0.5 points for oversimplification.

Sample Exemplars:

1. High-quality summary (Instruction: "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field."):
{{
    "base_score": 5,
    "relevance_adjustment": 1.8,
    "technical_adjustment": 0.9,
    "conciseness_adjustment": 0.8,
    "terminology_adjustment": 0.4,
    "key_elements_adjustment": 0.5,
    "nuance_adjustment": 0.4,
    "final_score": 9.8,
    "reasoning": "Highly relevant (+1.8) with comprehensive coverage of key methodologies and novel contributions. Excellent technical depth (+0.9) in explaining the Chain of Density (CoD) approach. Very concise with high information density (+0.8). Appropriate use of domain-specific terms like 'entity-dense summaries' and 'iterative refinement' (+0.4). Covers all key elements including experimental setup and results (+0.5). Preserves critical nuances such as the comparison with baseline methods and limitations (+0.4)."
}}

2. Average-quality summary (Instruction: "Analyze the experimental setup, results, and limitations of this study, highlighting any statistical significance and error margins."):
{{
    "base_score": 5,
    "relevance_adjustment": 0.5,
    "technical_adjustment": 0.3,
    "conciseness_adjustment": -0.2,
    "terminology_adjustment": 0.1,
    "key_elements_adjustment": 0.2,
    "nuance_adjustment": -0.1,
    "final_score": 5.8,
    "reasoning": "Moderately relevant (+0.5) but lacks focus on statistical significance and error margins. Some technical depth in describing the experimental setup (+0.3). Slightly verbose (-0.2). Limited use of domain-specific terms (+0.1). Covers some key elements like dataset description and evaluation metrics (+0.2) but oversimplifies some aspects of the results (-0.1)."
}}

3. Low-quality summary (Instruction: "Compare this paper's approach to existing methods in the field, explaining how it addresses current challenges or limitations."):
{{
    "base_score": 5,
    "relevance_adjustment": -1.5,
    "technical_adjustment": -0.7,
    "conciseness_adjustment": -0.6,
    "terminology_adjustment": -0.3,
    "key_elements_adjustment": -0.4,
    "nuance_adjustment": -0.3,
    "final_score": 1.2,
    "reasoning": "Largely irrelevant (-1.5), focusing on general NLP concepts instead of comparing the Chain of Density approach to existing methods. Poor technical depth (-0.7) with no specific details about the paper's methodology. Verbose and repetitive (-0.6). Lacks domain-specific terms related to summarization techniques (-0.3). Misses key elements of the paper's contributions and comparative analysis (-0.4). Fails to capture nuances of how this approach addresses current challenges (-0.3)."
}}

Provide your evaluation in the following JSON format:
{{
    "base_score": 5,
    "relevance_adjustment": <float>,
    "technical_adjustment": <float>,
    "conciseness_adjustment": <float>,
    "terminology_adjustment": <float>,
    "key_elements_adjustment": <float>,
    "nuance_adjustment": <float>,
    "final_score": <float>,
    "reasoning": "<brief explanation for each adjustment>"
}}

Ensure your response is ONLY valid JSON. Do not include any other text outside the JSON object."""

        response = openai_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    scores = {
        "chunk_summaries": [],
        "chunk_iteration_summaries": [],
        "iteration_summaries": [],
        "accumulated_summary": {},
        "final_summary": {}
    }

    # Score chunk summaries
    for i, chunk_summary_list in enumerate(model_output["chunk_summaries"]):
        chunk_scores = []
        for j, chunk_summary in enumerate(chunk_summary_list):
            score = score_summary(chunk_summary, f"Chunk Summary {i+1}.{j+1}")
            chunk_scores.append(score)
        scores["chunk_summaries"].append(chunk_scores)

    # Score chunk iteration summaries
    for i, summary in enumerate(model_output["chunk_iteration_summaries"]):
        score = score_summary(summary, f"Chunk Iteration Summary {i+1}")
        scores["chunk_iteration_summaries"].append(score)

    # Score iteration summaries
    for i, summary in enumerate(model_output["iteration_summaries"]):
        score = score_summary(summary, f"Iteration Summary {i+1}")
        scores["iteration_summaries"].append(score)

    # Score accumulated summary
    scores["accumulated_summary"] = score_summary(model_output["accumulated_summary"], "Accumulated Summary")

    # Score final summary
    scores["final_summary"] = score_summary(model_output["final_summary"], "Final Summary")

    return scores

In [38]:
evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
await evaluation.evaluate(arxiv_chain_of_density_pipeline)

Number of chunks: 24
Chunk sizes: [3512, 2214, 3958, 3967, 3993, 1410, 4051, 3978, 2216, 3995, 3990, 3047, 3957, 3960, 3973, 3956, 3964, 3992, 3997, 3046, 2035, 2907, 4250, 2973]
Iteration 1, Chunk 1:
Key methodologies and novel contributions:

1. Controlled experimental setup: Fine-tuning LLMs on closed-book QA tasks with varying proportions of examples introducing new knowledge vs. pre-existing knowledge.

2. Quantitative analysis of learning dynamics: Demonstrated slower acquisition of new factual knowledge (Unknown examples) compared to pre-existing knowledge (Known examples) during fine-tuning. Precise learning rates and performance metrics to be determined from full paper.

3. Hallucination tendency assessment: Established linear relationship between learning of Unknown examples and increased propensity for model hallucination. Quantitative measure of hallucination rate increase per learned Unknown example to be extracted from detailed results.

4. Optimal fine-tuning strategy id

🍩 https://wandb.ai/a-sh0ts/arxiv-papers-anthropic-testv2-4/r/call/190ba385-4841-4a5d-b589-88485982ad7f


{'quality_scorer': {'accumulated_summary': {'base_score': {'mean': 5.0},
   'relevance_adjustment': {'mean': -2.0},
   'technical_adjustment': {'mean': -1.0},
   'conciseness_adjustment': {'mean': -1.0},
   'terminology_adjustment': {'mean': -0.5},
   'key_elements_adjustment': {'mean': -0.5},
   'nuance_adjustment': {'mean': -0.5},
   'final_score': {'mean': -0.5}},
  'final_summary': {'base_score': {'mean': 5.0},
   'relevance_adjustment': {'mean': -2.0},
   'technical_adjustment': {'mean': -1.0},
   'conciseness_adjustment': {'mean': -1.0},
   'terminology_adjustment': {'mean': -0.5},
   'key_elements_adjustment': {'mean': -0.5},
   'nuance_adjustment': {'mean': -0.5},
   'final_score': {'mean': -0.5}}},
 'model_latency': {'mean': 553.9931058883667}}