In [1]:
import PyPDF2
import torch
from threading import Thread
from transformers import TextIteratorStreamer, MllamaForCausalLM, AutoTokenizer
import warnings


# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter -- it also hides deprecation and
# dtype/device warnings emitted while loading the model; consider narrowing it.
warnings.filterwarnings("ignore")

## Set up LLM
# Hugging Face Hub id of the checkpoint; the same id is used for the weights
# and for the tokenizer below.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the weights in bfloat16 and let `device_map="auto"` decide where they
# are placed.
# NOTE(review): MllamaForCausalLM appears to be the text-only head of the
# Mllama architecture, so the vision tower of this checkpoint is presumably
# not used here -- confirm against the transformers documentation.
model = MllamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Report the device the model reports for itself; later cells move the
# tokenized prompt to this same `model.device`.
print(f"You are running the model on: {model.device}")

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

You are running the model on: cuda:0


In [2]:
def extract_pdf_text(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Each page is extracted with ``page.extract_text()``; page texts are
    separated by a blank line and the combined string is stripped of leading
    and trailing whitespace.

    Args:
        file_path: Path of the PDF file, opened in binary mode.

    Returns:
        The concatenated page text as a single string.
    """
    with open(file_path, 'rb') as pdf_stream:
        # Extraction has to finish while the file handle is still open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_stream).pages]
    return "\n\n".join(page_texts).strip()

In [3]:
def generate_text_stream(prompt, max_new_tokens=1024):
    """Generate a completion for ``prompt`` and return it as one string.

    The prompt is tokenized with the notebook-level ``tokenizer``, moved to
    ``model.device`` and passed to ``model.generate`` on a background thread.
    Text is collected from a ``TextIteratorStreamer`` configured to skip the
    prompt and special tokens. Sampling is enabled (``top_k=50``,
    ``top_p=0.7``, ``temperature=0.2``) and no seed is set here, so repeated
    calls may return different text.

    Args:
        prompt: The full prompt string sent to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text, or ``None`` if generation failed. On failure the
        error message is printed rather than raised.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    try:
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        # Deliberately no `eos_token_id` override: forcing it to the last
        # sub-token of an unrelated marker such as "</explanation>" replaces
        # the model's real end-of-sequence ids, so generation either stops on
        # an arbitrary token or runs on until `max_new_tokens`. Leaving it
        # unset lets `generate` fall back to the model's generation config.
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=50,
            top_p=0.7,
            temperature=0.2,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )

        worker_errors = []

        def _run_generation():
            # Runs on the worker thread. An exception raised here would never
            # reach the caller's `except`, and the streamer would never get
            # its stop signal, so the consumer below would block forever.
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()  # unblock the consuming loop

        thread = Thread(target=_run_generation)
        thread.start()

        # Iterating the streamer blocks until the worker signals the end.
        generated_text = "".join(streamer)
        thread.join()

        if worker_errors:
            # Surface the worker failure through the normal error path below.
            raise worker_errors[0]

        return generated_text
    except Exception as e:
        print(f"\nAn error occurred during text generation: {str(e)}")
        return None
    finally:
        # Always drop the input tensors and release cached CUDA memory,
        # whether generation succeeded or not.
        del inputs
        torch.cuda.empty_cache()

In [4]:
def summarize_text(text, chunk_size=8192, max_chunks=10):
    """Summarize a long document by summarizing chunks, then the summaries.

    The text is tokenized with the notebook-level ``tokenizer`` and split into
    consecutive chunks of ``chunk_size`` tokens. Each chunk is summarized with
    ``generate_text_stream`` (200 new tokens), and the chunk summaries are
    combined into one final prompt (300 new tokens).

    Only the first ``chunk_size * max_chunks`` tokens are considered; anything
    beyond that is ignored without notice.

    Args:
        text: The document text.
        chunk_size: Number of tokens per chunk; must be at least 1.
        max_chunks: Maximum number of chunks to summarize.

    Returns:
        The final summary string. Returns ``""`` when there is nothing to
        summarize (empty text or a zero token budget). Returns ``None`` when
        no chunk summary could be produced or the final generation failed.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return ""

    # Tokenize the full text
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    token_budget = min(len(tokens), chunk_size * max_chunks)
    if token_budget <= 0:
        return ""

    # Process the text in chunks
    chunk_summaries = []
    for part_number, start in enumerate(range(0, token_budget, chunk_size), start=1):
        chunk_text = tokenizer.decode(tokens[start:start + chunk_size])

        # Number parts by position in the document so a failed chunk does not
        # cause the next one to reuse its part number.
        prompt = f"Summarize the following excerpt from a scientific paper (part {part_number}):\n\n{chunk_text}\n\nBrief summary:"
        chunk_summary = generate_text_stream(prompt, max_new_tokens=200)
        if chunk_summary:
            chunk_summaries.append(chunk_summary)

    if not chunk_summaries:
        # Every chunk failed; a final prompt with no content would only invite
        # the model to invent a summary.
        return None

    # Generate final concise summary
    final_summary_prompt = "Based on the following summaries of different parts of a scientific paper, provide a concise summary of the entire paper in no more than 10 sentences. Focus on the main points, methodology, and key findings:\n\n" + "\n\n".join(chunk_summaries) + "\n\nConcise 10-sentence summary:"
    return generate_text_stream(final_summary_prompt, max_new_tokens=300)

In [5]:
# Usage
# Path is relative to the kernel's working directory, so the PDF must sit
# next to the notebook (or the kernel must be started from its folder).
pdf_path = '2410.02740v1.pdf'
document_text = extract_pdf_text(pdf_path)
# Last expression of the cell: shows the extracted length in characters
# (not tokens) as a quick sanity check that extraction produced text.
len(document_text)

66769

In [6]:
# Run the chunked summarization over the whole extracted document.
# `summary` can be None: summarize_text returns whatever the final
# generate_text_stream call returns, and that function returns None on error.
summary = summarize_text(document_text)
print("Summary of the paper:")
print(summary)

Summary of the paper:
 
The paper explores the role of synthetic captions in pre-training multimodal foundation models, including CLIP, multimodal LLMs, and diffusion models. The authors propose a novel captioning pipeline to generate diverse caption formats tailored to different models. A comprehensive study reveals that a hybrid approach combining synthetic captions and original AltText can outperform the use of synthetic captions alone, improving both alignment and performance. Each model demonstrates preferences for particular caption formats, and the optimal captioning techniques vary across models. The authors also investigate the interaction between synthetic captions and original AltText, analyzing whether a hybrid approach can balance the need for diverse data with the benefits of enhanced image-text alignment. The study provides valuable insights into optimizing captioning strategies for pre-training multimodal foundation models. The authors develop a controllable and human-a

In [8]:
import textwrap
def format_summary(summary, width=80):
    """Format the summary text to fit within a specified width.

    Leading and trailing whitespace is removed before wrapping, because
    ``textwrap`` keeps whitespace at the very start of the text and it would
    otherwise end up indenting the first output line. Internal runs of
    whitespace, including newlines, are collapsed by the wrapping step.

    Args:
        summary: The text to wrap. ``None`` or an empty string is accepted.
        width: Maximum line length in characters.

    Returns:
        The wrapped text with lines joined by ``"\\n"``, or ``""`` when
        ``summary`` is ``None``, empty or only whitespace.
    """
    if not summary:
        return ""
    return textwrap.fill(summary.strip(), width=width)

In [10]:
# Render the wrapped summary as Markdown so the heading shows in bold;
# display() uses the notebook's rich output instead of plain print().
# NOTE(review): this import sits mid-notebook; the cell fails on a fresh
# kernel unless the cells defining `summary` and `format_summary` ran first.
from IPython.display import display, Markdown
formatted_summary = format_summary(summary)
display(Markdown(f"**Concise summary of the paper (max 10 sentences):**\n\n{formatted_summary}"))

**Concise summary of the paper (max 10 sentences):**

  The paper explores the role of synthetic captions in pre-training multimodal
foundation models, including CLIP, multimodal LLMs, and diffusion models. The
authors propose a novel captioning pipeline to generate diverse caption formats
tailored to different models. A comprehensive study reveals that a hybrid
approach combining synthetic captions and original AltText can outperform the
use of synthetic captions alone, improving both alignment and performance. Each
model demonstrates preferences for particular caption formats, and the optimal
captioning techniques vary across models. The authors also investigate the
interaction between synthetic captions and original AltText, analyzing whether a
hybrid approach can balance the need for diverse data with the benefits of
enhanced image-text alignment. The study provides valuable insights into
optimizing captioning strategies for pre-training multimodal foundation models.
The authors develop a controllable and human-aligned captioning pipeline to
generate various types of captions and conduct extensive pre-training
experiments to derive key insights. The results show that both AltText and
synthetic captions play crucial roles, with AltText contributing to more diverse
information and synthetic captions offering improved image-text alignment. The
authors propose two new metrics, CHAIR and CapScore, to evaluate the quality of
captions generated by a model. Overall, the study aims to improve the quality of
captions generated by models and provide a more comprehensive evaluation
framework for multimodal tasks. The findings highlight the importance of
synthetic captions in multimodal foundation models and suggest future directions
for research.  (Note: I have rewritten the summary to make it more concise and
clear, while