In [33]:
# Question to answer; used as the `question` field of the chain state.
query = 'Which project is Antonino Sirchia working at the moment?'

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.chat_models import ChatOllama

# Connection settings for the Ollama server; the same model name is used for
# both chat completion and embeddings.
OLLAMA_BASEURL = 'http://192.168.1.99:11434/'
OLLAMA_MODEL = 'llama3.2:latest'

# Chat model used by every LLM step of the chain.
llm = ChatOllama(model=OLLAMA_MODEL, temperature=0, base_url=OLLAMA_BASEURL)

# Embedding model backing the vector store.
embeddings = OllamaEmbeddings(model=OLLAMA_MODEL, base_url=OLLAMA_BASEURL)

# In-memory vector store that receives the document chunks.
vector_store = InMemoryVectorStore(embeddings)

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter applied to the loaded documents before they are added to the vector store.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

def load_to_knowledge(file_path):
    """Load a PDF, split it into chunks and add them to ``vector_store``.

    Progress is reported on stdout. Nothing is returned.

    Args:
        file_path: Location of the PDF file, handed to ``PyPDFLoader``.
    """
    print(f"\nProcessing {file_path}")
    pages = PyPDFLoader(file_path).load()
    print(f'\t{len(pages)} pages found')

    chunks = text_splitter.split_documents(pages)
    print(f'\t{len(chunks)} splits generated')

    print('Saving splits to Vectore Store...')
    vector_store.add_documents(documents=chunks)
    print('\tSaved.\n')

In [4]:
# Files that make up the knowledge base; each one is loaded into the vector store.
knowledge_base_items = ['docs/SIRA_CV_EN.pdf']

for item in knowledge_base_items:
    load_to_knowledge(item)

print('Knowledge base ready')


Processing docs/SIRA_CV_EN.pdf
	7 pages found
	15 splits generated
Saving splits to Vectore Store...
	Saved.

Knowledge base ready


## Define a State to be passed through the chain steps


In [79]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict

class State(TypedDict):
    """Shared state passed between the steps of the chain.

    A ``TypedDict`` is a plain ``dict`` at runtime: calling ``State(...)``
    simply builds a dict from the keyword arguments, so an ``__init__``
    defined here would never be executed (and methods are not permitted in
    a ``TypedDict`` body). Values are read and written with key access,
    e.g. ``state['question']``, not attribute access.
    """

    # The user's question; used as the similarity-search query.
    question: str
    # Documents returned by the vector-store similarity search.
    raw_context: List[Document]
    # Anonymized text of each retrieved document, fed to the final prompt.
    context: List[str]

# Populate every key declared by State up front. State(...) builds a plain
# dict from these keyword arguments, so all defaults must be passed here.
state = State(question=query, raw_context=[], context=[])

## Define custom Runnables

We create two runnables:
- A runnable for similarity search in the vector store (a custom one is necessary because the search now has to read the question from, and store its results in, the state variable)
- A runnable which invokes an LLM to anonymize the context information retrieved from the vector store

In [None]:
from langchain_core.runnables import RunnableLambda

def retrieve_context(state: State) -> State:
    """Fetch the documents most similar to the question from the vector store.

    Runs a similarity search (``k=2``) for ``state['question']`` and stores
    the results under ``state['raw_context']``. The state is updated in place
    and also returned so the step can be chained.
    """
    question = state['question']
    matches = vector_store.similarity_search(question, k=2)
    state['raw_context'] = matches
    return state


# Runnable wrapper so the retrieval step can be composed with ``|``.
retrieve = RunnableLambda(retrieve_context)

In [90]:
def anonymize_context_with_llm(state: State) -> State:
    """Anonymize every retrieved document with the LLM.

    Each document in ``state['raw_context']`` is sent to the LLM with a
    prompt asking it to replace confidential information with ``#######``.
    The resulting texts are stored under ``state['context']``; the state is
    updated in place and also returned so the step can be chained.

    The context list is rebuilt on every call instead of being appended to,
    so invoking the chain again on the same state does not accumulate
    duplicate entries, and the ``context`` key need not exist beforehand.
    """
    p = ChatPromptTemplate.from_template("""
            Your goal is to remove any confidential information that can be reconducted to an individual person. 
            In the following text replace any confidential information with the following placeholder: #######

            Here is the text:
            {content}

            
            Answer only with the anonymized text.
            """)

    anonymization_chain = p | llm

    # Replace (not extend) the context so repeated runs stay idempotent.
    state['context'] = [
        anonymization_chain.invoke({'content': doc.page_content}).content
        for doc in state['raw_context']
    ]
    return state


# Runnable wrapper so the anonymization step can be composed with ``|``.
anonymize = RunnableLambda(anonymize_context_with_llm)

## Final chain creation

In [91]:
from langchain_core.output_parsers import StrOutputParser

# Prompt for the final answer, filled from the state's `context` and `question`.
prompt = ChatPromptTemplate.from_template("""
    Answer the user's question trying to be as synthetic as possible. Use at max 10 words:
    Context: {context}
    Question: {question}
    """)

# Full pipeline: retrieve -> anonymize -> build prompt -> ask the LLM -> plain string.
chain = (
    retrieve
    | anonymize
    | prompt
    | llm
    | StrOutputParser()
)

answer = chain.invoke(state)
print(answer)

According to the anonymized text, Antonino Sirchia is currently working on the "EasyGov" project.
