# RAG with Memory and rewriter llm



## Main system process

In [None]:
#invoke the db vectorebase to do similarity search again
from langchain_postgres.vectorstores import PGVector
from langchain_ollama import OllamaEmbeddings

connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
embedding_model = OllamaEmbeddings(model='llama3.1:latest')

# Later, second time (re-use existing store, no reinsertion)
db = PGVector.from_existing_index(embedding=embedding_model, connection=connection)

In [165]:
# langgraph technique + adding memory + 
from typing import TypedDict, Annotated
from langgraph.graph.message import add_messages
from langgraph.graph import StateGraph, START, END
from langchain_ollama import ChatOllama
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage



class State(TypedDict):
    messages: Annotated[list, add_messages]
    
builder = StateGraph(State)

# Add the chat node
llm_model = ChatOllama(model="llama3.1:latest")

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant for questions about our colleagues.
Answer **only** using the information in CONTEXT.
If the answer is not in CONTEXT, say exactly: "I don't know".

CONTEXT:
{context}

Question: {question}
Answer:"""
)

retriever = db.as_retriever(search_kwargs={"k": 15})

def chat_bot(state: State):
    last_msg = state["messages"][-1]
    question = last_msg.content if isinstance(last_msg, HumanMessage) else str(last_msg)

    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)

    formatted_prompt = prompt.format(context=context, question=question)
    # formatted_prompt is a string; for ChatOllama you can pass it directly
    answer_msg = llm_model.invoke(formatted_prompt)

    return {"messages": [answer_msg]}

builder.add_node("chatbot", chat_bot)


# def chat_bot(state: State):
#     answer = llm_model.invoke(state["messages"])
#     return {"messages": [answer]}
# builder.add_node("chatbot", chat_bot)

# Add the edges
builder.add_edge(START, "chatbot")
builder.add_edge("chatbot", END)

graph = builder.compile(checkpointer=MemorySaver())

# #plot the graph
# from IPython.display import Image, display
# png_bytes = graph.get_graph().draw_mermaid_png()
# display(Image(png_bytes))
# Run the graph
# create the thread and invoke the graph with the thread
thread_1 = {"configurable": {"thread_id": "thread_1"}}

# input = {"messages": ["Hi. my name is Porya."]}
# result_1 = graph.invoke(input, thread_1)

# input = {"messages": ["what was my name?"]}
# result_2 = graph.invoke(input, thread_1)
# graph.get_state(thread_1)

In [166]:
input = {"messages": ["Who has this number: +49 151 2904 1718 ?"]}
result = graph.invoke(input, thread_1)
result

{'messages': [HumanMessage(content='Who has this number: +49 151 2904 1718 ?', additional_kwargs={}, response_metadata={}, id='0a77fcb8-0512-4d1f-af91-4d1fed4448fc'),
  AIMessage(content="I don't know.", additional_kwargs={}, response_metadata={'model': 'llama3.1:latest', 'created_at': '2026-02-16T12:39:55.848617574Z', 'done': True, 'done_reason': 'stop', 'total_duration': 926736411, 'load_duration': 60350568, 'prompt_eval_count': 2546, 'prompt_eval_duration': 771715423, 'eval_count': 6, 'eval_duration': 87702416, 'model_name': 'llama3.1:latest'}, id='lc_run--019c6676-e929-7f70-aa3a-3fcca767466e-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 2546, 'output_tokens': 6, 'total_tokens': 2552})]}

## A single LLM just to test the info it has

In [145]:
# TEMPORARY: Just a single llm to check if it knows the answer
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

llm_model = ChatOllama(model="llama3.1:latest")
template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant.Your name is Rosenxt_bot.answer the question."),
    ('human', "question: {Question}")
])

chain = template | llm_model
chain.invoke({"Question": "where did Pouriya Amini Digehsara study?"})

AIMessage(content='Pouriya Amini Digehsara studied at Foolad Novin Academy and also played for Foolad, before joining Persepolis in Iran.', additional_kwargs={}, response_metadata={'model': 'llama3.1:latest', 'created_at': '2026-02-16T12:36:38.326059819Z', 'done': True, 'done_reason': 'stop', 'total_duration': 576523509, 'load_duration': 63803227, 'prompt_eval_count': 46, 'prompt_eval_duration': 16639762, 'eval_count': 33, 'eval_duration': 475479119, 'model_name': 'llama3.1:latest'}, id='lc_run--019c6673-e6f4-72e0-880d-e52561eace92-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 46, 'output_tokens': 33, 'total_tokens': 79})

## Vector data base and embedding prepration

In [81]:
# Now prepare the pdfs of the profile of our collegous for RAG

#1-load pdfs
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

DATA_PATH = Path("./data/")  # adjust to teh data folder

all_docs = []

for file in os.listdir(DATA_PATH):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(str(DATA_PATH / file))
        docs = loader.load()
        all_docs.extend(docs)

#2- split docs to chunks
# use langchain text splitter to split the documents into smaller chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pprint import pprint


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Split the documents into smaller chunks
splitted_texts = text_splitter.split_documents(all_docs)

# outputs
pprint(f"The number of original documents: {len(docs)} and after splitting there are {len(splitted_texts)} of chunks.")
pprint("Here is an example of a splitted text chunk metadata:")
pprint(splitted_texts[10].metadata)

('The number of original documents: 1 and after splitting there are 39 of '
 'chunks.')
'Here is an example of a splitted text chunk metadata:'
{'author': 'LinkedIn',
 'creationdate': '2026-02-16T09:50:41+00:00',
 'creator': 'PyPDF',
 'page': 1,
 'page_label': '2',
 'producer': 'Apache FOP Version 2.3',
 'source': 'data/Dirk.pdf',
 'subject': 'Resume generated from profile',
 'title': 'Resume',
 'total_pages': 2}


In [None]:
# 3- embdeeing chunks and save in vector database

# firtst let create the database via docker (docker.yml file):
# services:
#   pgvector:
#     image: pgvector/pgvector:pg16
#     container_name: pgvector-container-chp4
#     environment:
#       POSTGRES_USER: langchain
#       POSTGRES_PASSWORD: langchain
#       POSTGRES_DB: langchain
#       POSTGRES_HOST_AUTH_METHOD: md5
#     ports:
#       - "6024:5432"
#     volumes:
#       - pgvector_RAG_chp4:/var/lib/postgresql/data

# volumes:
#   pgvector_RAG_chp4:

# then we can use pgvector to save our chunks(first time:)
# from langchain_postgres.vectorstores import PGVector
# # using ollama as embedding model
# from langchain_ollama import OllamaEmbeddings

# embedding_model = OllamaEmbeddings(model='llama3.1:latest')

# connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
# db = PGVector.from_documents(splitted_texts, embedding_model, connection=connection)

In [None]:
# call vectore database again to re-use the existing store, no reinsertion
from langchain_postgres.vectorstores import PGVector
from langchain_ollama import OllamaEmbeddings

connection = "postgresql+psycopg://langchain:langchain@127.0.0.1:6024/langchain"
embedding_model = OllamaEmbeddings(model='llama3.1:latest')

# Later, second time (re-use existing store, no reinsertion)
db = PGVector.from_existing_index(embedding=embedding_model, connection=connection)