In [2]:
pip install pymupdf transformers

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting filelock (from transformers)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.0->transformers)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading tran

In [None]:
import fitz  # PyMuPDF
import openai

# Function to set the OpenAI API key
def set_openai_api_key(api_key):
    """Store ``api_key`` on the ``openai`` module as its ``api_key`` attribute.

    Args:
        api_key: The key string to assign.

    Returns:
        None.
    """
    setattr(openai, "api_key", api_key)

# Function to read the playbook PDF document
def read_playbook_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        A single string containing each page's text followed by a newline,
        in page order.
    """
    # The context manager closes the document even if extraction raises,
    # so the file handle is not leaked.
    with fitz.open(file_path) as document:
        # A trailing newline per page keeps pages separated in the result.
        return "".join(page.get_text("text") + "\n" for page in document)

# Function to chunk the document
def chunk_document(text, max_chunk_size=2000):
    """Group the lines of ``text`` into chunks of at most ``max_chunk_size`` characters.

    The text is split on newlines and lines are packed greedily; every line
    is stored with a trailing newline. A single line that is longer than
    ``max_chunk_size`` is not split and becomes its own (oversized) chunk.
    Chunks that would contain only whitespace are never emitted, so callers
    do not spend a model call on blank input.

    Args:
        text: The document text to split.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings; empty when ``text`` has no non-whitespace
        content.
    """
    chunks = []
    current_chunk = ""

    for paragraph in text.split('\n'):
        # +1 accounts for the newline appended after the paragraph.
        if len(current_chunk) + len(paragraph) + 1 > max_chunk_size:
            # Only flush real content: the buffer is empty when the very first
            # line is already over the limit.
            if current_chunk.strip():
                chunks.append(current_chunk)
            current_chunk = paragraph + "\n"
        else:
            current_chunk += paragraph + "\n"

    if current_chunk.strip():
        chunks.append(current_chunk)

    return chunks

# Function to simplify text using the GPT-4 model
def simplify_text(text, prompt):
    """Ask the GPT-4 chat model to rewrite ``text`` according to ``prompt``.

    Works with both generations of the ``openai`` package: releases from 1.0
    on expose ``openai.chat.completions.create`` and return message objects
    with a ``content`` attribute, while earlier releases expose
    ``openai.ChatCompletion.create`` and return dict-like messages.

    Args:
        text: The passage to simplify; sent as the user message.
        prompt: The instructions; sent as the system message.

    Returns:
        The content of the first returned choice with surrounding whitespace
        removed, or an empty string if the message carries no content.
    """
    request = dict(
        model="gpt-4",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text}
        ],
        max_tokens=2000,
        temperature=0.5,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: the legacy ChatCompletion entry point was removed.
        response = openai.chat.completions.create(**request)
    else:
        response = openai.ChatCompletion.create(**request)

    message = response.choices[0].message
    # Legacy responses are dict-like; 1.x responses are attribute-only objects.
    content = message["content"] if isinstance(message, dict) else message.content
    return (content or "").strip()

# Function to simplify the playbook and save it as a PDF
def simplify_playbook(file_path, output_path, api_key, prompt):
    """Simplify a playbook PDF with GPT-4 and write the result to a new PDF.

    Steps: set the OpenAI key, read the source PDF's text, split it into
    chunks, simplify each chunk, join the results with newlines and lay the
    text out on pages of a new PDF.

    Args:
        file_path: Path of the playbook PDF to read.
        output_path: Path the simplified PDF is saved to.
        api_key: OpenAI API key passed to ``set_openai_api_key``.
        prompt: System prompt passed to ``simplify_text`` for every chunk.

    Returns:
        None. Prints a confirmation line once the PDF has been saved.
    """
    # Set the OpenAI API key
    set_openai_api_key(api_key)

    # Read the playbook PDF document
    playbook_text = read_playbook_pdf(file_path)

    # Chunk the document
    chunks = chunk_document(playbook_text)

    # Simplify each chunk
    simplified_chunks = [simplify_text(chunk, prompt) for chunk in chunks]

    # Combine the simplified chunks
    simplified_text = "\n".join(simplified_chunks)

    # Create and save a new PDF with the simplified text
    _write_text_pdf(simplified_text, output_path)
    print(f"Simplified playbook saved to {output_path}")


def _wrap_line(line, max_width, fontname, fontsize):
    """Split one line into pieces whose rendered width is at most ``max_width`` points."""
    if fitz.get_text_length(line, fontname=fontname, fontsize=fontsize) <= max_width:
        # Fits as-is (this includes blank lines): keep the original spacing.
        return [line]

    pieces = []
    current = ""
    for word in line.split():
        candidate = f"{current} {word}" if current else word
        fits = fitz.get_text_length(candidate, fontname=fontname, fontsize=fontsize) <= max_width
        # A single word wider than the page is kept whole on its own line.
        if not current or fits:
            current = candidate
        else:
            pieces.append(current)
            current = word
    pieces.append(current)
    return pieces


def _write_text_pdf(text, output_path, margin=72, bottom=800,
                    fontsize=12, line_height=14, fontname="helv"):
    """Lay ``text`` out line by line on pages of a new PDF and save it to ``output_path``."""
    pdf = fitz.open()
    try:
        page = pdf.new_page()
        # Usable width between the left margin and an equal right margin.
        max_width = page.rect.width - 2 * margin
        y = margin  # Baseline of the next line on the current page

        for raw_line in text.split('\n'):
            for line in _wrap_line(raw_line, max_width, fontname, fontsize):
                # Start a new page only when there is another line to place,
                # so no blank page is left at the end.
                if y > bottom:
                    page = pdf.new_page()
                    y = margin
                if line:
                    page.insert_text((margin, y), line, fontsize=fontsize, fontname=fontname)
                y += line_height  # Blank lines still take up vertical space

        pdf.save(output_path)
    finally:
        pdf.close()

# Example usage
import os

# Read the key from the environment instead of pasting it into the notebook,
# so it is never saved with the file. Falls back to "" when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "")
prompt = "You are a legal expert tasked with simplifying complex legal language. Please rewrite the following text in plain language while retaining all legal terms, obligations, and details. Ensure that it remains legally accurate and clear to someone without a legal background. Avoid legal jargon and make it concise. Preserve all critical information and context:\n\n"
simplify_playbook('playbook2.pdf', 'simplified_playbook3.pdf', api_key, prompt)


In [None]:
import fitz  # PyMuPDF
from transformers import pipeline

# Function to read the playbook PDF document
def read_playbook_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        A single string containing each page's text followed by a newline,
        in page order.
    """
    # The context manager closes the document even if extraction raises,
    # so the file handle is not leaked.
    with fitz.open(file_path) as document:
        # A trailing newline per page keeps pages separated in the result.
        return "".join(page.get_text("text") + "\n" for page in document)

# Function to chunk the document
def chunk_document(text, max_chunk_size=512):
    """Group the lines of ``text`` into chunks of at most ``max_chunk_size`` characters.

    The text is split on newlines and lines are packed greedily; every line
    is stored with a trailing newline. A single line that is longer than
    ``max_chunk_size`` is not split and becomes its own (oversized) chunk.
    Chunks that would contain only whitespace are never emitted, so callers
    do not spend a model call on blank input.

    Args:
        text: The document text to split.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings; empty when ``text`` has no non-whitespace
        content.
    """
    chunks = []
    current_chunk = ""

    for paragraph in text.split('\n'):
        # +1 accounts for the newline appended after the paragraph.
        if len(current_chunk) + len(paragraph) + 1 > max_chunk_size:
            # Only flush real content: the buffer is empty when the very first
            # line is already over the limit.
            if current_chunk.strip():
                chunks.append(current_chunk)
            current_chunk = paragraph + "\n"
        else:
            current_chunk += paragraph + "\n"

    if current_chunk.strip():
        chunks.append(current_chunk)

    return chunks

# Function to simplify text using a Hugging Face model
def simplify_text(text, simplifier):
    """Run ``text`` through ``simplifier`` and return its first generated string.

    Args:
        text: The passage to simplify.
        simplifier: A callable that takes the text and returns an indexable
            result whose first item has a ``'generated_text'`` entry.

    Returns:
        The ``'generated_text'`` value of the first result.
    """
    outputs = simplifier(text)
    first_result = outputs[0]
    return first_result['generated_text']

# Function to simplify the playbook and save it as a PDF
def simplify_playbook(file_path, output_path, simplifier):
    """Simplify a playbook PDF with a text-generation callable and write a new PDF.

    Steps: read the source PDF's text, split it into chunks, simplify each
    chunk with ``simplifier``, join the results with newlines and lay the
    text out on pages of a new PDF.

    Args:
        file_path: Path of the playbook PDF to read.
        output_path: Path the simplified PDF is saved to.
        simplifier: Callable handed to ``simplify_text`` for every chunk.

    Returns:
        None. Prints a confirmation line once the PDF has been saved.
    """
    # Read the playbook PDF document
    playbook_text = read_playbook_pdf(file_path)

    # Chunk the document
    chunks = chunk_document(playbook_text)

    # Simplify each chunk
    simplified_chunks = [simplify_text(chunk, simplifier) for chunk in chunks]

    # Combine the simplified chunks
    simplified_text = "\n".join(simplified_chunks)

    # Create and save a new PDF with the simplified text
    _write_text_pdf(simplified_text, output_path)
    print(f"Simplified playbook saved to {output_path}")


def _wrap_line(line, max_width, fontname, fontsize):
    """Split one line into pieces whose rendered width is at most ``max_width`` points."""
    if fitz.get_text_length(line, fontname=fontname, fontsize=fontsize) <= max_width:
        # Fits as-is (this includes blank lines): keep the original spacing.
        return [line]

    pieces = []
    current = ""
    for word in line.split():
        candidate = f"{current} {word}" if current else word
        fits = fitz.get_text_length(candidate, fontname=fontname, fontsize=fontsize) <= max_width
        # A single word wider than the page is kept whole on its own line.
        if not current or fits:
            current = candidate
        else:
            pieces.append(current)
            current = word
    pieces.append(current)
    return pieces


def _write_text_pdf(text, output_path, margin=72, bottom=800,
                    fontsize=12, line_height=14, fontname="helv"):
    """Lay ``text`` out line by line on pages of a new PDF and save it to ``output_path``."""
    pdf = fitz.open()
    try:
        page = pdf.new_page()
        # Usable width between the left margin and an equal right margin.
        max_width = page.rect.width - 2 * margin
        y = margin  # Baseline of the next line on the current page

        for raw_line in text.split('\n'):
            for line in _wrap_line(raw_line, max_width, fontname, fontsize):
                # Start a new page only when there is another line to place,
                # so no blank page is left at the end.
                if y > bottom:
                    page = pdf.new_page()
                    y = margin
                if line:
                    page.insert_text((margin, y), line, fontsize=fontsize, fontname=fontname)
                y += line_height  # Blank lines still take up vertical space

        pdf.save(output_path)
    finally:
        pdf.close()

# Example usage
# Name of the Hugging Face checkpoint loaded by the pipeline below; change it
# here to try a different text2text model.
model_name = "muheng/finetuned-contract-legal"
simplifier = pipeline("text2text-generation", model=model_name)


simplify_playbook('/content/playbook2.pdf', 'simplified_playbook.pdf', simplifier)
