In [1]:
# Load the libraries that are needed
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import os
import random

In [2]:
fpath = '/domino/datasets/local/Gartner_Article_Chat'
files = [os.path.join(fpath, f) for f in os.listdir(fpath) if os.path.isfile(os.path.join(fpath, f))]

loader = PyPDFLoader(files[0])

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 5000,  # size of each chunk created
    chunk_overlap  = 2500,  # size of  overlap between chunks in order to maintain the context
)
documents = text_splitter.split_documents(loader.load())
print(f"There are {len(documents)} pages in the document")

invalid pdf header: b'PK\x03\x04\x14'


EOF marker not found


PdfStreamError: Stream has ended unexpectedly

In [15]:
# Pick a sample page
print(documents[4].page_content)

Gartner, Inc. | G00812459 Page 5 of 8
Overview
Top Business Priorities for Financial Services Leaders in Banking and
Investment Services
Modernizing business-critical systems and capabilities is a top priority for 55% of
ﬁnancial services leaders in banking and investment services, according to the 2024
Gartner Financial Services Business Priority Tracker Survey, 3Q. Other top priorities for the
quarter include improving operational efﬁciency (42%), enhancing data and analytics
capabilities (32%), minimizing operational risk (32%) and investing in emerging
technological capabilities (30%). Conﬁdence in executing against these priorities is
generally high, ranging from 55% to 59%.
This research note is restricted to the personal use of wesley.palmer@wwt.com.


## parser.py

In [1]:
import os
import fitz  # PyMuPDF
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
from dataclasses import dataclass

@dataclass
class DocumentChunk:
    content: str
    metadata: Dict[str, str]

class PDFIngestor:
    def __init__(self, data_dir: str, chunk_size: int = 1200, chunk_overlap: int = 300):
        self.data_dir = data_dir
        self.chunker = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ".", " "]
        )

    def _load_pdf(self, path: str) -> List[str]:
        """Extract raw text from each page of the PDF."""
        doc = fitz.open(path)
        pages = [page.get_text("text") for page in doc]
        return pages

    def _chunk_text(self, text: str, metadata: Dict[str, str]) -> List[DocumentChunk]:
        """Split long text into overlapping semantic chunks."""
        chunks = self.chunker.split_text(text)
        return [DocumentChunk(content=c, metadata=metadata) for c in chunks]

    def ingest(self) -> List[DocumentChunk]:
        """Main ingestion loop for all PDFs in the data directory."""
        all_chunks = []
        pdf_files = [f for f in os.listdir(self.data_dir) if f.lower().endswith(".pdf")]

        for pdf in tqdm(pdf_files, desc="Processing PDFs"):
            pdf_path = os.path.join(self.data_dir, pdf)
            
            if check_file_in_pinecone(pdf):
                print(f"Skipping '{pdf}' — already in Pinecone.")
                continue

            try:
                pages = self._load_pdf(pdf_path)
                full_text = "\n\n".join(pages)
                metadata = {
                    "source": pdf,
                    "page_count": str(len(pages))
                }
                chunks = self._chunk_text(full_text, metadata)
                all_chunks.extend(chunks)
            except Exception as e:
                print(f"Failed to process {pdf}: {e}")

        return all_chunks


In [4]:
fpath = '/domino/datasets/local/Gartner_Article_Chat'
ingestor = PDFIngestor(data_dir=fpath)
chunks = ingestor.ingest()

# Example: print first 2 chunks
for c in chunks[2:6]:
    print(f"--- Chunk ---\n{c.content}...\nMetadata: {c.metadata}\n")

Processing PDFs: 100%|██████████| 2/2 [00:00<00:00, 22.23it/s]

--- Chunk ---
Gartner, Inc. | G00812459
Page 6 of 8
Looking ahead 12 months (through 3Q25), we see some shifts across all ﬁve priority and
conﬁdence levels. Investment in emerging technological capabilities is projected to
become the No. 1 priority, increasing 18 points from 30% to 48%. For several quarters
since early 2023, executives have consistently indicated that investing in emerging
technology will be their top priority in 12 months. However, data collected from the
business priority tracker survey shows that investment in emerging technology has yet to
become the No. 1 priority. Organizations seeking to transform their technology must make
a concerted effort to align their future priorities and their actions. For more details on how
to manage this shift see Ignition Guide to Technology Implementation Change
Management. Three out of the ﬁve priorities are expected to decrease in prioritization,
showing less differentiation in the signiﬁcance of the priorities. For all ﬁve initia




In [5]:
len(chunks)

92

## vector_store.py

In [1]:
import os
from pinecone import Pinecone
from pinecone import ServerlessSpec
from openai import OpenAI
from dotenv import load_dotenv
from typing import List
from uuid import uuid4


# Load environment variables
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
#pinecone_env = os.getenv("PINECONE_ENVIRONMENT")
pinecone_index_name = os.getenv("PINECONE_INDEX_NAME")
embedding_dimension = 512 

# Initialize Pinecone
pc = Pinecone(api_key = pinecone_api_key)

client = OpenAI(api_key = openai_key)

# Create index if needed
if not pc.has_index(pinecone_index_name):
    print(f"Creating Pinecone index '{pinecone_index_name}' with dimension {embedding_dimension}...")
    pc.create_index(
        name=pinecone_index_name,
        dimension=embedding_dimension,
        metric = "cosine",
        spec = ServerlessSpec(cloud='aws', region='us-east-1')
    )

# Connect to the index
index = pc.Index(pinecone_index_name)

def get_embedding(text: str) -> List[float]:
    response = client.embeddings.create(
        input=[text],
        model="text-embedding-3-small",
        dimensions=512
    )
    return response.data[0].embedding

def upsert_chunks_to_pinecone(chunks: List[DocumentChunk]):
    """Convert chunks to embeddings and upsert into pc"""
    vectors = []
    for chunk in chunks:
        try:
            emb = get_embedding(chunk.content)
            vector = {
                "id": str(uuid4()),
                "values": emb,
                "metadata": {
                    "text": chunk.content,  # Optional truncation
                    **chunk.metadata
                }
            }
            vectors.append(vector)
        except Exception as e:
            print(f"Failed to embed chunk: {e}")
            continue

    # Upsert in batches
    batch_size = 100
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)
        print(f"Upserted {i + len(batch)} / {len(vectors)} vectors")

def check_file_in_pinecone(source_filename: str) -> bool:
    dummy_vector = [0.0] * 512  # Match embedding dim
    try:
        response = index.query(
            vector=dummy_vector,
            top_k=1,
            include_metadata=True,
            filter={"source": source_filename}
        )
        matches = response.get("matches", [])
        return len(matches) > 0
    except Exception as e:
        print(f"Failed to check for existing file: {e}")
        return False



  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'DocumentChunk' is not defined

In [11]:
upsert_chunks_to_pinecone(chunks)

Upserted 92 / 92 vectors
