In [1]:
!pip install transformers torch PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_c

In [2]:
import fitz  # PyMuPDF
from transformers import pipeline

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string containing the result of ``page.get_text()`` for each
        page, appended in page order with no separator added between pages.
        An empty string is returned when the document yields no pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; the document is
        still closed in that case.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file handle is not leaked on error.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string with += on every page.
        return ''.join(page.get_text() for page in document)

# Function to chunk text into manageable parts
def chunk_text(text, chunk_size=1024):
    """Split text into word-aligned chunks of roughly ``chunk_size`` characters.

    The text is split on whitespace and words are accumulated until the
    running size (each word's length plus one for a separating space)
    reaches ``chunk_size``. The word that crosses the threshold is kept in
    the chunk, so a chunk can be longer than ``chunk_size``; words are never
    split.

    Args:
        text: The text to split.
        chunk_size: Approximate chunk size in characters.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text gives an empty list.
    """
    chunks = []
    current_chunk = []
    # Running total of len(word) + 1 for the words in current_chunk; tracked
    # incrementally so the chunk is not re-summed for every word.
    current_length = 0

    for word in text.split():
        current_chunk.append(word)
        current_length += len(word) + 1  # +1 for space
        if current_length >= chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0

    # Add the last, partially filled chunk if any words remain
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

from transformers import pipeline

# Function to summarize text chunks using a BART model (the "bert" in the name is historical)
def analyze_text_with_bert(text_chunks, model_name="facebook/bart-large-cnn"):
    """Summarize each text chunk and join the summaries into one string.

    Despite the function name, the default model is BART
    (``facebook/bart-large-cnn``) loaded through the ``transformers``
    summarization pipeline, not BERT.

    Args:
        text_chunks: Iterable of text strings to summarize one at a time.
        model_name: Model identifier handed to ``pipeline``.

    Returns:
        The summaries of all successfully processed chunks joined by single
        spaces. Chunks whose summarization raises are reported with
        ``print`` and skipped; whitespace-only chunks are skipped silently.
    """
    # Load the summarization pipeline once and reuse it for every chunk
    summarizer = pipeline("summarization", model=model_name)
    summaries = []

    for chunk in text_chunks:
        word_count = len(chunk.split())
        if word_count == 0:
            # Nothing to summarize in an empty / whitespace-only chunk
            continue

        # The chunk's word count, capped at 88, is the upper length bound.
        # NOTE(review): the pipeline presumably interprets these bounds in
        # model tokens rather than words - confirm against the docs.
        max_length = min(word_count, 88)
        # Lower bound is half the upper bound with a floor of 10, but always
        # strictly below max_length; without the cap, chunks shorter than
        # about 11 words would request min_length >= max_length.
        min_length = min(max(10, max_length // 2), max_length - 1)

        try:
            summary = summarizer(chunk, max_length=max_length, min_length=min_length)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk
            print(f"Error summarizing chunk: {e}")
            continue

    # Combine all summaries into one string
    return ' '.join(summaries)


In [5]:
# Main execution block
if __name__ == "__main__":
    # Report to process
    pdf_path = 'jon_rep_1.pdf'

    # Step 1: pull the raw text out of the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Step 2: break the text into pieces of roughly 800 characters
    # (tune chunk_size to the document at hand)
    text_chunks = chunk_text(extracted_text, chunk_size=800)

    # Step 3: summarize every piece and merge the per-chunk summaries
    insights = analyze_text_with_bert(text_chunks)

    # Step 4: show the heading and the combined summary on separate lines
    print("Generated Insights:", insights, sep="\n")


KeyboardInterrupt: 