In [None]:
import os

# Folder containing the job-description PDFs to index.
# Can be overridden through the JD_FOLDER_PATH environment variable (read from
# the process environment when this cell runs) so the notebook is not tied to
# one machine; the fallback is the original local path.
JD_FOLDER_PATH = (
    os.environ.get("JD_FOLDER_PATH")
    or "/home/ruchirich/Documents/repositories/career-ai/data/sample_jd"
)

# RecursiveCharacterTextSplitter
CHUNK_SIZE = 1000  # passed as chunk_size
CHUNK_OVERLAP = 200  # passed as chunk_overlap

# Ollama model name, used for both OllamaEmbeddings and ChatOllama
MODEL = "llama3.2"

In [None]:
from typing import List
import os
import getpass

import dotenv
dotenv.load_dotenv()

# TODO: Fix langsmith tracing
# Resolve the LangSmith API key once: reuse a key that is already in the
# environment (for example one loaded by dotenv.load_dotenv()) and only prompt
# when none is set. Previously the user was prompted twice with an unlabeled
# prompt, and any key already in the environment was overwritten.
_langsmith_api_key = (
    os.environ.get("LANGSMITH_API_KEY")
    or os.environ.get("LANGCHAIN_API_KEY")
    or getpass.getpass("LangSmith API key: ")
)

# Legacy LANGCHAIN_* variable names
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = _langsmith_api_key

# LANGSMITH_* variable names
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"] = _langsmith_api_key
os.environ["LANGSMITH_PROJECT"] = "career-ai"


In [None]:
from langchain_core.prompts import ChatPromptTemplate

# 1. load document, prep model-readable text
from langchain_core.documents import Document
from langchain_community.document_loaders.pdf import PyPDFLoader

# 2. text splitter, break docs into smaller chunks (better for indexing and model context size)
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

# 3.1 embedding, converts text to vectors
from langchain_ollama.embeddings import OllamaEmbeddings
# 3.2 vectorstore, stores the vectors (TODO: try out a better option: MongoDB, Pinecone, Chroma)
from langchain_core.vectorstores import (
    InMemoryVectorStore,
    VectorStore # for type hinting
    )
# 4.1 RAG: retrieve, use VectorStore.as_retriever() function
# 4.2 RAG: generate
from langchain import hub # prompt templates available on langchain hub
from langchain_ollama.chat_models import ChatOllama

# 5. Parse output
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.runnables.passthrough import RunnablePassthrough

Reference: [Rapid Q&A on multiple PDFs using LangChain and ChromaDB as local disk vector store](https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df)

In [None]:
def load_split_store() -> VectorStore:
    """
    Load the PDFs in JD_FOLDER_PATH, chunk them, and index the chunks.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    loaded; sub-directories and other files in the folder are skipped. Files
    are processed in sorted name order so the result does not depend on the
    order in which the OS lists the directory.

    Returns:
        An InMemoryVectorStore built from the chunked documents, embedded
        with OllamaEmbeddings using MODEL.

    Raises:
        OSError: if JD_FOLDER_PATH cannot be listed (e.g. it does not exist).
    """

    documents: List[Document] = []

    # load
    for file in sorted(os.listdir(JD_FOLDER_PATH)):
        file_path = os.path.join(JD_FOLDER_PATH, file)

        # Only hand actual PDF files to the PDF loader; anything else in the
        # folder (sub-directories, notes, hidden files) is ignored.
        if not (os.path.isfile(file_path) and file.lower().endswith(".pdf")):
            continue

        loader = PyPDFLoader(
            file_path=file_path
        )

        documents.extend(loader.load())

    # split / chunk docs
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True # metadata for the start character of chunk within the pdf being chunked
    )

    chunked_docs = text_splitter.split_documents(
        documents=documents
    )

    # store in vectorstore
    embedding_model = OllamaEmbeddings(
        model=MODEL
    )

    vectorstore = InMemoryVectorStore.from_documents(
        documents=chunked_docs,
        embedding=embedding_model
    )

    return vectorstore

In [None]:
# Build the vector store and expose it as a retriever using similarity search;
# search_kwargs sets k=6 for each query.
retriever = load_split_store().as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6}
    )

## Sample retrievals

In [None]:
# Sanity check: show the first three documents returned for a sample query.
retriever.invoke("What are some data scientist applications I submitted in the past")[:3]

In [None]:
# Sanity check: show the first three documents returned for a sample query.
retriever.invoke("What job posting required an experience in AI and LLM")[:3]

In [None]:
# Sanity check: show the first three documents returned for a sample query.
retriever.invoke("What applications did I make at a banking or financial services company")[:3]

## Prompt

In [None]:
# Pull the "rlm/rag-prompt" prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

The prompt template requires two inputs:

1. `context`
2. `question`


In [None]:
# Fill the template with placeholder values to inspect the text of the first
# message it produces.
example_message = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()
print(example_message[0].content)

## Generate

In [None]:
# Chat model for the generation step; uses the same MODEL name as the embeddings.
llm = ChatOllama(
    model=MODEL
)

## Chain

Chain will:

1. take a question
2. retrieve relevant documents
3. construct a prompt
4. pass prompt to the model
5. parse output

Time to put the "chain" in LangChain.

What we want: an easy syntax for creating a pipeline of functions in a RAG application.

How LangChain does it: the LangChain Expression Language (LCEL).

What else it helps with: observability of the chain on LangSmith.

In [None]:
def format_docs(docs):
    """
    Concatenate the text of several documents into one string.

    Args:
        docs: iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line ("\\n\\n"), in
        iteration order; an empty string when ``docs`` is empty.
    """
    page_texts = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [None]:
# LCEL pipeline: the value passed to rag_chain.invoke() goes to both entries of
# the dict, whose results then flow through prompt -> llm -> output parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()} # prepares the two inputs for the prompt template: retrieved docs joined by format_docs, and the question passed through
    | prompt # fills the template with the context and question
    | llm # inference
    | StrOutputParser() # just plucks the string content out of the LLM's output message
)

In [None]:
def get_model_output(
    question: str = "What is the major diffference between data analyst and data scientist?",
) -> str:
    """
    Run the RAG chain on a question and return its answer.

    Args:
        question: the question passed to ``rag_chain.invoke``. Defaults to the
            original sample question (text kept verbatim), so existing
            no-argument calls behave exactly as before.

    Returns:
        The result of ``rag_chain.invoke(question)``.
    """
    answer = rag_chain.invoke(question)
    return answer

In [None]:
import streamlit as st

# Minimal Streamlit front end for the RAG chain: a text box and a button.
st.set_page_config(
    page_title="career-ai",
    page_icon=":computer:"
)

st.header("What's on your mind?")
form_input = st.text_input("Enter query")
submit = st.button("Generate")

if submit:
    # Answer the query the user actually typed. Previously form_input was
    # ignored and the hard-coded sample question was always answered.
    query = (form_input or "").strip()
    if query:
        st.write(rag_chain.invoke(query))
    else:
        # Nothing typed: keep the earlier behavior and answer the sample question.
        st.write(get_model_output())

In [None]:
# Print an ASCII rendering of the chain's graph.
rag_chain.get_graph().print_ascii()