In [6]:
from loguru import logger
import pymupdf

In [3]:
def extract_pdf_text(file_bytes: bytes) -> str:
    """Extract text from PDF bytes.

    The PDF is opened from memory with PyMuPDF. Every page whose text is not
    purely whitespace contributes a section of the form
    ``--- Page N ---`` followed by the page text; sections are joined with a
    blank line. Pages are numbered from 1.

    :param file_bytes: Raw bytes of the PDF file
    :return: Concatenated page text, or an empty string when PyMuPDF is not
             installed or extraction fails (the error is logged, not raised)
    """
    # Keep the import guard separate so that only a genuinely missing package
    # is reported as "not installed".
    try:
        import pymupdf  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not installed")
        return ""

    try:
        logger.info("Extracting text from PDF")
        # The context manager closes the document even when a page fails to
        # extract; previously close() was skipped on any exception.
        with pymupdf.open(stream=file_bytes, filetype="pdf") as doc:
            text_parts = []
            logger.info(f"PDF has {doc.page_count} pages")
            for page_num, page in enumerate(doc, 1):
                logger.info(f"Extracting text from page {page_num}")
                text = page.get_text()
                logger.debug(page)
                logger.debug(f"Page {page_num} text length: {len(text)}")
                # Skip pages that contain only whitespace.
                if text.strip():
                    text_parts.append(f"--- Page {page_num} ---\n{text}")
        return "\n\n".join(text_parts)
    except Exception as e:
        # Best-effort contract: callers get "" rather than an exception.
        logger.error(f"PDF extraction error: {e}")
        return ""

In [4]:
# Read the whole PDF file into memory as raw bytes.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
with open("/Users/redam94/Documents/stat-p.pdf", "rb") as f:
    pdf_bytes = f.read()

In [8]:
# Open the in-memory PDF bytes (pdf_bytes) as a PyMuPDF document.
doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")

In [19]:
# Ask PyMuPDF for the images on the page at index 1 (the second page).
doc[1].get_images()

[]

In [29]:
# Open a second PDF directly from disk instead of from bytes.
# NOTE(review): absolute, machine-specific path — not reproducible on another machine.
doc_direct = pymupdf.open("/Users/redam94/Downloads/A+Personalized+Causal+Inference+Framework+for+Media+Effectiveness+Using+Hierarchical+Bayesian+Market+Mix+Models.pdf")

In [48]:
# Take the text of every page of doc_direct, trim it, remove the space that
# precedes a line break, and join the pages with single newlines.
text = '\n'.join(
    page.get_text().strip().replace(' \n', '\n')
    for page in doc_direct
)

In [53]:
from sentence_transformers import SentenceTransformer, util

def semantic_chunking(text, model_name='all-MiniLM-L6-v2', chunk_size=500, overlap=50,
                      similarity_threshold=0.5):
    """
    Splits text into semantically meaningful chunks using sentence embeddings.

    Sentences are accumulated into the current chunk while it stays within
    ``chunk_size`` characters. Once the next sentence would exceed that size,
    the sentence is compared with the current chunk by cosine similarity of
    their embeddings: above ``similarity_threshold`` it is still appended (so a
    chunk may grow past ``chunk_size``), otherwise the chunk is closed and the
    sentence starts a new one. Each sentence is followed by a single space, so
    chunks end with a trailing space.

    Args:
        text (str): Full input text.
        model_name (str): Name of the embedding model to use.
        chunk_size (int): Approximate target chunk size in characters.
        overlap (int): Overlapping characters between chunks for context preservation.
        similarity_threshold (float): Cosine similarity above which an
            over-size sentence is still merged into the current chunk.

    Returns:
        List[str]: List of semantically grouped chunks. Empty when ``text`` is
        empty or whitespace-only.
    """
    # Nothing to split: skip the tokenizer download and the model load entirely.
    if not text or not text.strip():
        return []

    import nltk
    nltk.download('punkt')
    nltk.download('punkt_tab')
    from nltk.tokenize import sent_tokenize

    model = SentenceTransformer(model_name)

    # Step 1: Split text into sentences
    sentences = sent_tokenize(text)

    # Step 2: Greedily pack sentences, consulting embeddings only at the size limit
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # An empty chunk always takes the sentence, even an over-long one.
        # Previously an over-long first sentence caused an empty string to be
        # embedded and an empty chunk to be emitted.
        if not current_chunk or len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
            continue

        # Get embeddings for current chunk and sentence to check semantic continuity
        chunk_embedding = model.encode(current_chunk.strip(), convert_to_tensor=True)
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        similarity = util.cos_sim(chunk_embedding, sentence_embedding).item()

        if similarity > similarity_threshold:
            # Semantically similar: keep the sentence with the current chunk
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk)
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk)

    # Step 3: Prefix each chunk (except the first) with the tail of its predecessor
    if overlap > 0 and len(chunks) > 1:
        return [
            (chunks[i - 1][-overlap:] if i > 0 else "") + chunk
            for i, chunk in enumerate(chunks)
        ]

    return chunks

In [54]:
# Chunk the extracted text with a ~1000-character target and 100 characters of overlap.
semantic_chunks = semantic_chunking(text, chunk_size=1000, overlap=100)

[nltk_data] Downloading package punkt to /Users/redam94/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/redam94/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [55]:
from rich.console import Console
from rich.markdown import Markdown

In [57]:
# Render one chunk as Markdown through a rich console.
# NOTE(review): the index 30 is hard-coded; this raises IndexError when fewer than 31 chunks exist.
console = Console()
console.print(Markdown(semantic_chunks[30]))

In [3]:
# helper_functions.py
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from pptx.enum.shapes import MSO_SHAPE
import os

class PowerPointHelper:
    """
    A helper class to provide modular functions for PowerPoint manipulation

    Wraps a single python-pptx ``Presentation`` (available as
    ``self.presentation``) and exposes small helpers to add slides, place
    images and text boxes, restyle text and save the deck. All positions and
    sizes are given in inches.
    """

    def __init__(self, presentation_path=None):
        """
        Initialize the PowerPoint presentation
        :param presentation_path: Path to existing presentation or None to create new.
                                  A path that does not exist on disk also yields a
                                  new, empty presentation (no error is raised).
        """
        if presentation_path and os.path.exists(presentation_path):
            self.presentation = Presentation(presentation_path)
        else:
            self.presentation = Presentation()

    @staticmethod
    def _resolve_alignment(align):
        """
        Translate an alignment name into the matching PP_ALIGN member
        :param align: 'left', 'center', 'right' or 'justify' (case-insensitive)
        :return: PP_ALIGN member; unknown names fall back to PP_ALIGN.LEFT
        """
        alignment_map = {
            'left': PP_ALIGN.LEFT,
            'center': PP_ALIGN.CENTER,
            'right': PP_ALIGN.RIGHT,
            'justify': PP_ALIGN.JUSTIFY
        }
        return alignment_map.get(align.lower(), PP_ALIGN.LEFT)

    def add_slides(self, slide_count=1, layout_index=1):
        """
        Add new slides to the presentation
        :param slide_count: Number of slides to add
        :param layout_index: Index of slide layout to use (default: 1 for Title and Content)
        :return: List of added slide objects
        """
        layout = self.presentation.slide_layouts[layout_index]
        return [self.presentation.slides.add_slide(layout) for _ in range(slide_count)]

    def insert_image(self, slide, image_path, left, top, width=None, height=None):
        """
        Insert an image into a slide
        :param slide: Slide object to insert image into
        :param image_path: Path to the image file
        :param left: Left position (inches)
        :param top: Top position (inches)
        :param width: Width of image (inches) - optional
        :param height: Height of image (inches) - optional
        :return: Image object
        """
        # A falsy width/height (None or 0) is treated as "not specified" and
        # passed as None, leaving that dimension for python-pptx to determine.
        return slide.shapes.add_picture(
            image_path,
            Inches(left),
            Inches(top),
            width=Inches(width) if width else None,
            height=Inches(height) if height else None,
        )

    def insert_text(self, slide, text, left, top, width=None, height=None, 
                   font_size=18, font_name='Calibri', bold=False, italic=False, 
                   color=(0, 0, 0), align='left'):
        """
        Insert text into a slide
        :param slide: Slide object to insert text into
        :param text: Text to insert
        :param left: Left position (inches)
        :param top: Top position (inches)
        :param width: Width of text box (inches) - optional, defaults to 6
        :param height: Height of text box (inches) - optional, defaults to 1
        :param font_size: Font size in points
        :param font_name: Font name
        :param bold: Bold text
        :param italic: Italic text
        :param color: RGB color tuple (r, g, b)
        :param align: Text alignment ('left', 'center', 'right', 'justify')
        :return: Text frame object
        """
        # Create textbox; a falsy width/height falls back to 6 x 1 inches
        textbox = slide.shapes.add_textbox(
            Inches(left),
            Inches(top),
            Inches(width if width else 6),
            Inches(height if height else 1),
        )

        # Add text to textbox. The text is assigned once, on the frame:
        # re-assigning the full string to the first run is redundant and, for
        # multi-line text, would repeat lines that already sit in their own
        # paragraphs.
        text_frame = textbox.text_frame
        text_frame.text = text

        # Format every run and align every paragraph, so multi-line and empty
        # text are handled the same way as a single line.
        self.update_text_formatting(
            text_frame,
            font_size=font_size,
            font_name=font_name,
            bold=bold,
            italic=italic,
            color=color,
            align=align,
        )

        return text_frame

    def update_text_formatting(self, text_frame, font_size=None, font_name=None, 
                              bold=None, italic=None, color=None, align=None):
        """
        Update formatting of existing text

        Settings left as None are not touched. The update is applied to every
        run of every paragraph in the frame; a frame without runs (for example
        an empty text box) is left unchanged instead of raising IndexError.
        :param text_frame: Text frame object to update
        :param font_size: New font size in points
        :param font_name: New font name
        :param bold: New bold setting
        :param italic: New italic setting
        :param color: New RGB color tuple (r, g, b)
        :param align: New text alignment ('left', 'center', 'right', 'justify')
        """
        for paragraph in text_frame.paragraphs:
            if align is not None:
                paragraph.alignment = self._resolve_alignment(align)

            for run in paragraph.runs:
                font = run.font
                if font_size is not None:
                    font.size = Pt(font_size)
                if font_name is not None:
                    font.name = font_name
                if bold is not None:
                    font.bold = bold
                if italic is not None:
                    font.italic = italic
                if color is not None:
                    font.color.rgb = RGBColor(*color)

    def save_presentation(self, file_path):
        """
        Save the presentation to a file
        :param file_path: Path to save the presentation
        """
        self.presentation.save(file_path)

    def get_slide_count(self):
        """
        Get the number of slides in the presentation
        :return: Number of slides
        """
        return len(self.presentation.slides)

# Example usage:
if __name__ == "__main__":
    # Demo: build a two-slide deck with PowerPointHelper and save it.
    # The recorded cell output shows this branch executed in the notebook.

    # Create a new presentation (no path given, so an empty deck is created)
    ppt_helper = PowerPointHelper()

    # Add slides (two slides using the helper's default layout index)
    slides = ppt_helper.add_slides(2)

    # Insert text on the first slide; the returned text frame is restyled below
    text_frame = ppt_helper.insert_text(
        slide=slides[0],
        text="Welcome to the Presentation",
        left=1, top=1,
        font_size=24,
        font_name='Arial',
        bold=True,
        color=(0, 0, 0)
    )

    # Insert image at (2in, 2in), sized 4in x 3in
    # NOTE(review): expects "image.png" in the working directory — presumably fails if it is missing
    ppt_helper.insert_image(slides[0], "image.png", 2, 2, width=4, height=3)

    # Update text formatting of the title text inserted above
    ppt_helper.update_text_formatting(
        text_frame=text_frame,
        font_size=36,
        font_name='Times New Roman',
        bold=False,
        italic=True,
        color=(255, 0, 0),
        align='center'
    )
    # Insert closing text on the second slide
    ppt_helper.insert_text(
        slide=slides[1],
        text="Thank you for attending!",
        left=1, top=1,
        font_size=20,
        font_name='Verdana',
        bold=False,
        color=(0, 0, 255)
    )
    # Save presentation to the working directory and report the slide count
    ppt_helper.save_presentation("example_presentation.pptx")
    print(f"Presentation saved with {ppt_helper.get_slide_count()} slides")

Presentation saved with 2 slides
