In [1]:
# Folder scanned for job-description PDFs.
# NOTE(review): absolute, machine-specific path — consider a repo-relative or env-configured path.
JD_FOLDER_PATH = "/home/ruchirich/Documents/repositories/career-ai/data/sample_jd"

# Chunking parameters handed to RecursiveCharacterTextSplitter.
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 200

# Ollama model name handed to OllamaEmbeddings.
MODEL = "llama3.2"

In [2]:
import dotenv
# Load environment variables via python-dotenv before any client is created;
# the `True` shown as this cell's output is load_dotenv()'s return value.
dotenv.load_dotenv()

True

In [3]:
from typing import List
import os

In [5]:
from langchain_core.prompts import ChatPromptTemplate

# 1. load document, prep model-readable text
from langchain_core.documents import Document
from langchain_community.document_loaders.pdf import PyPDFLoader

# 2. text splitter, break docs into smaller chunks (better for indexing and model context size)
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

# 3.1 embedding, converts text to vectors
from langchain_ollama import OllamaEmbeddings
# 3.2 vectorstore, stores the vectors (TODO: try a persistent option such as MongoDB, Pinecone or Chroma)
from langchain_core.vectorstores import (
    InMemoryVectorStore,
    VectorStore # for type hinting
    )
# 4.1 RAG: retrieve, use VectorStore.as_retriever() function
# 4.2 RAG: generate
from langchain import hub
from langchain_ollama.chat_models import ChatOllama

Reference: [Rapid Q&A on multiple PDFs using LangChain and ChromaDB as local disk vector store](https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df)

In [6]:
def load_split_store() -> VectorStore:
    """
    Load every PDF in ``JD_FOLDER_PATH`` into an in-memory vector store.

    Pipeline:
      1. load each PDF with ``PyPDFLoader``
      2. split the loaded documents into overlapping chunks with
         ``RecursiveCharacterTextSplitter`` (``CHUNK_SIZE`` / ``CHUNK_OVERLAP``)
      3. embed the chunks with ``OllamaEmbeddings`` (``MODEL``) and index them
         in an ``InMemoryVectorStore``

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    loaded; sub-directories and other files are skipped rather than being
    handed to the PDF loader. Files are processed in sorted name order so the
    store is built in the same order on every run.

    Returns:
        VectorStore: the populated in-memory store, one entry per chunk.

    Raises:
        FileNotFoundError: if ``JD_FOLDER_PATH`` does not exist
            (raised by ``os.listdir``).
    """

    documents: List[Document] = []

    # load -- sorted() because os.listdir returns entries in arbitrary order
    for file_name in sorted(os.listdir(JD_FOLDER_PATH)):
        file_path = os.path.join(JD_FOLDER_PATH, file_name)

        # skip directories and non-PDF files (e.g. .DS_Store, checkpoints)
        if not (os.path.isfile(file_path) and file_name.lower().endswith(".pdf")):
            continue

        loader = PyPDFLoader(
            file_path=file_path
        )

        documents.extend(loader.load())

    # split / chunk docs
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True # metadata for the start character of chunk within the pdf being chunked
    )

    chunked_docs = text_splitter.split_documents(
        documents=documents
    )

    # store in vectorstore
    embedding_model = OllamaEmbeddings(
        model=MODEL
    )

    vectorstore = InMemoryVectorStore.from_documents(
        documents=chunked_docs,
        embedding=embedding_model
    )

    return vectorstore

In [7]:
# Build the vector store once and expose it as a similarity retriever returning the top 6 chunks.
retriever = load_split_store().as_retriever(search_type="similarity", search_kwargs={"k": 6})

## Sample retrievals

In [None]:
# Show only the first 3 retrieved chunks to keep the cell output short.
retriever.invoke("What are some data scientist applications I submitted in the past")[:3]

In [None]:
# Show only the first 3 retrieved chunks to keep the cell output short.
retriever.invoke("What job posting required an experience in AI and LLM")[:3]

In [None]:
# Show only the first 3 retrieved chunks to keep the cell output short.
retriever.invoke("What applications did I make at a banking or financial services company")[:3]

## Prompt

## Generate