# RAG with Memory and rewriter llm



## Main system process

In [None]:
#invoke the db vectorebase to do similarity search again
from langchain_postgres.vectorstores import PGVector
from langchain_community.embeddings import HuggingFaceEmbeddings


connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Later, second time (re-use existing store, no reinsertion)
db = PGVector.from_existing_index(embedding=embedding_model, connection=connection)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [17]:
# langgraph technique + adding memory + 
from typing import TypedDict, Annotated
from langgraph.graph.message import add_messages
from langgraph.graph import StateGraph, START, END
from langchain_ollama import ChatOllama
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage



class State(TypedDict):
    messages: Annotated[list, add_messages]
    
builder = StateGraph(State)

# Add the chat node
llm_model = ChatOllama(model="llama3.1:latest")

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant for questions about our colleagues.
First, use the information in CONTEXT as your primary source.
Try to asnwer in a short way.

CONTEXT:
{context}

Question: {question}
Answer:"""
)

retriever = db.as_retriever(search_kwargs={"k": 15})

def chat_bot(state: State):
    last_msg = state["messages"][-1]
    question = last_msg.content if isinstance(last_msg, HumanMessage) else str(last_msg)

    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)

    formatted_prompt = prompt.format(context=context, question=question)
    # formatted_prompt is a string; for ChatOllama you can pass it directly
    answer_msg = llm_model.invoke(formatted_prompt)

    return {"messages": [answer_msg]}

builder.add_node("chatbot", chat_bot)


# def chat_bot(state: State):
#     answer = llm_model.invoke(state["messages"])
#     return {"messages": [answer]}
# builder.add_node("chatbot", chat_bot)

# Add the edges
builder.add_edge(START, "chatbot")
builder.add_edge("chatbot", END)

graph = builder.compile(checkpointer=MemorySaver())

# #plot the graph
# from IPython.display import Image, display
# png_bytes = graph.get_graph().draw_mermaid_png()
# display(Image(png_bytes))
# Run the graph
# create the thread and invoke the graph with the thread
thread_1 = {"configurable": {"thread_id": "thread_1"}}

# input = {"messages": ["Hi. my name is Porya."]}
# result_1 = graph.invoke(input, thread_1)

# input = {"messages": ["what was my name?"]}
# result_2 = graph.invoke(input, thread_1)
# graph.get_state(thread_1)

In [21]:
input = {"messages": ["what is the job title of Sebastian?"]}
result = graph.invoke(input, thread_1)
result

{'messages': [HumanMessage(content='Who is Pouriya Amini?', additional_kwargs={}, response_metadata={}, id='e8bc5b45-950d-44b5-863b-cf6451b2c5c2'),
  AIMessage(content='Pouriya Amini Digehsara, a Data scientist | AI/ML engineer based in Lingen (Ems), Germany.', additional_kwargs={}, response_metadata={'model': 'llama3.1:latest', 'created_at': '2026-02-26T13:32:09.408857049Z', 'done': True, 'done_reason': 'stop', 'total_duration': 860173014, 'load_duration': 69149490, 'prompt_eval_count': 1904, 'prompt_eval_duration': 356239740, 'eval_count': 29, 'eval_duration': 413108494, 'model_name': 'llama3.1:latest'}, id='lc_run--019c9a26-51e3-7f60-a0ab-985fce339eec-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 1904, 'output_tokens': 29, 'total_tokens': 1933}),
  HumanMessage(content='what is Mobile of  Pouriya Amini?', additional_kwargs={}, response_metadata={}, id='47c7cd09-4f5a-470a-9004-1b60cca4dccf'),
  AIMessage(content='(+49)17661053413 (Mobile)', additional_kwar

## A single LLM just to test the info it has

In [22]:
# TEMPORARY: Just a single llm to check if it knows the answer
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

llm_model = ChatOllama(model="llama3.1:latest")
template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant.Your name is Rosenxt_bot.answer the question."),
    ('human', "question: {Question}")
])

chain = template | llm_model
chain.invoke({"Question": "what is the job title of Sebastian?"})

AIMessage(content="I don't have enough information to determine the job title of Sebastian. Could you provide more context or details about which Sebastian you are referring to? This will help me give a more accurate answer.", additional_kwargs={}, response_metadata={'model': 'llama3.1:latest', 'created_at': '2026-02-26T13:35:25.941959146Z', 'done': True, 'done_reason': 'stop', 'total_duration': 710867316, 'load_duration': 68173324, 'prompt_eval_count': 41, 'prompt_eval_duration': 35648574, 'eval_count': 40, 'eval_duration': 581952938, 'model_name': 'llama3.1:latest'}, id='lc_run--019c9a29-522e-70f3-b6f2-1319c6eb8c3a-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 41, 'output_tokens': 40, 'total_tokens': 81})

## Vector data base and embedding prepration

In [89]:
# Now prepare the pdfs of the profile of our collegous for RAG

#1-load pdfs
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

DATA_PATH = Path("./data/")  # adjust to teh data folder

all_docs = []

for file in os.listdir(DATA_PATH):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(str(DATA_PATH / file))
        docs = loader.load()
        all_docs.extend(docs)

#2- split docs to chunks
# use langchain text splitter to split the documents into smaller chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pprint import pprint


text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Split the documents into smaller chunks
splitted_texts = text_splitter.split_documents(all_docs)

# outputs
pprint(f"The number of original documents: {len(docs)} and after splitting there are {len(splitted_texts)} of chunks.")
pprint("Here is an example of a splitted text chunk metadata:")
pprint(splitted_texts[10].metadata)

('The number of original documents: 1 and after splitting there are 68 of '
 'chunks.')
'Here is an example of a splitted text chunk metadata:'
{'author': 'LinkedIn',
 'creationdate': '2026-02-16T09:50:52+00:00',
 'creator': 'PyPDF',
 'page': 0,
 'page_label': '1',
 'producer': 'Apache FOP Version 2.3',
 'source': 'data/Andrey.pdf',
 'subject': 'Resume generated from profile',
 'title': 'Resume',
 'total_pages': 2}


In [None]:
# 3- embdeeing chunks and save in vector database

# firtst lets create the database via docker (docker.yml file):
# services:
#   pgvector:
#     image: pgvector/pgvector:pg16
#     container_name: pgvector-container-chp4
#     environment:
#       POSTGRES_USER: langchain
#       POSTGRES_PASSWORD: langchain
#       POSTGRES_DB: langchain
#       POSTGRES_HOST_AUTH_METHOD: md5
#     ports:
#       - "6024:5432"
#     volumes:
#       - pgvector_RAG_chp4:/var/lib/postgresql/data

# volumes:
#   pgvector_RAG_chp4:

# =========================Naive embeddings================================
# NOW we can use pgvector to save our chunks(first time:)
# #Option1: using llama3.1 embedding model
# from langchain_postgres.vectorstores import PGVector
# # using ollama as embedding model
# from langchain_ollama import OllamaEmbeddings

# embedding_model = OllamaEmbeddings(model='llama3.1:latest')

# connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
# db = PGVector.from_documents(splitted_texts, embedding_model, connection=connection)

# # ==========================Best embeddings=========================
# #Option2: using sentence-transformers model from Huggingface
# from langchain_postgres.vectorstores import PGVector
# from langchain_community.embeddings import HuggingFaceEmbeddings

# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
# db = PGVector.from_documents(splitted_texts, embedding_model, connection=connection)

In [None]:
# call vectore database again to re-use the existing store, no reinsertion
from langchain_postgres.vectorstores import PGVector
# from langchain_ollama import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
# embedding_model = OllamaEmbeddings(model='llama3.1:latest')
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Later, second time (re-use existing store, no reinsertion)
db = PGVector.from_existing_index(embedding=embedding_model, connection=connection)