# API calls are REST based and stateless. 
Everytime call is made to Model - it is brand new request for LLM. So, you can provide some memory to LLM with various technique to keep track of chat conversation which has already happened

In [1]:
from dotenv import load_dotenv 
load_dotenv(override=True)

True

In [2]:
import os
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

In [None]:
# set langsmit tracing off 
os.environ['LANGSMITH_TRACING'] = "false"

In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

# create llms 
google_llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-3-flash-preview")
openai_llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")
ollama_llm = ChatOllama(temperature=0, model="gemma3:12b")

### Chat memory 
Conversation memory can be maintained using any of 4 techniques:
- ConversationBufferedMemory: <br>
Keep history of all interactions in a list. When making a call to LLM use this history list as context for LLM to keep track of what was discussed before. 

- ConversationBufferedWindowMemory: <br>
Keep track of all interactions in a list, but when making a call to LLM use only latest N conversations as context to LLM 

- ConversationSummaryMemory: <br> 
Instead of all conversations, just summarize and use this as input to LLM 

- ConversationSummaryBufferedMemory: <br>
Use Summary + Latest N interactions as context to LLM


### 1. ConversationBufferedMemory using RunnableWithMessageHistory 

In [6]:
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder, ChatPromptTemplate

system_prompt = """You are a helpful assistant and your name is Gamma. 
You provide answers to questions to best of your knowledge. 
If you dont know just say "I don't know" """

prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    MessagesPlaceholder(variable_name="history"),
    HumanMessagePromptTemplate.from_template("{query}")
])

In [7]:
prompt_template.input_variables

['history', 'query']

In [8]:
prompt_template

ChatPromptTemplate(input_variables=['history', 'query'], input_types={'history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag='ChatMessageChunk')], typing.Annotated[langchain_core.messages.system.SystemMessageChunk, Tag(tag='SystemMessageChunk')], typing.Annotated[langchain_c

In [9]:
# lets create the pipeline 
pipeline = prompt_template | ollama_llm 

```
To have this running we want to have RunnableWithMessageHistory wrapped around our pipeline. The object RunnableWithMessageHistory requires input parameters. One of those is get_session_history which returns ChatMessageHistory object based on a session id. 

In [10]:
from langchain_core.chat_history import InMemoryChatMessageHistory
chat_map = {} 

def get_chat_history(session_id) -> InMemoryChatMessageHistory:
    if session_id not in chat_map:
        chat_map[session_id] = InMemoryChatMessageHistory()
    return chat_map[session_id]

In [11]:
from langchain_core.runnables.history import RunnableWithMessageHistory
pipeline_with_history = RunnableWithMessageHistory(
    pipeline, 
    get_session_history=get_chat_history, 
    input_messages_key="query",
    history_messages_key="history"
)

In [12]:
pipeline_with_history.invoke({"query": "Hi my name is Rohit"}, 
                             config={"session_id": "id_123"})

AIMessage(content="Hi Rohit! It's nice to meet you. I'm Gamma, your helpful assistant. What can I do for you today?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:39:10.784067Z', 'done': True, 'done_reason': 'stop', 'total_duration': 24662831250, 'load_duration': 5656928667, 'prompt_eval_count': 59, 'prompt_eval_duration': 16923610084, 'eval_count': 29, 'eval_duration': 2012652712, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb2-7966-7652-9baa-59c72c1ed9b9-0', usage_metadata={'input_tokens': 59, 'output_tokens': 29, 'total_tokens': 88})

In [13]:
pipeline_with_history.invoke({"query": "I am working on research paper for Autoencoders. Can you help me in 100 words what is it and how to use it?"}, 
                             config={"session_id": "id_123"})

AIMessage(content="Hello Rohit! Absolutely, I can help with that.\n\nAutoencoders are a type of neural network used for unsupervised learning. They learn to compress and then reconstruct input data. Think of it as a sophisticated compression algorithm! \n\nEssentially, an autoencoder has an encoder (compresses) and a decoder (reconstructs). You train it by feeding it data and minimizing the difference between the original and reconstructed data. They're used for dimensionality reduction, feature extraction, anomaly detection, and even generative tasks.\n\n\n\nHope this helps with your research!", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:39:29.379343Z', 'done': True, 'done_reason': 'stop', 'total_duration': 11933545583, 'load_duration': 302912041, 'prompt_eval_count': 126, 'prompt_eval_duration': 3638545542, 'eval_count': 112, 'eval_duration': 7926863742, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc

In [14]:
pipeline_with_history.invoke({"query": "What is my name again?"}, 
                             config={"session_id": "id_123"})

AIMessage(content='Your name is Rohit! ðŸ˜Š', additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:39:30.313201Z', 'done': True, 'done_reason': 'stop', 'total_duration': 916135417, 'load_duration': 191199292, 'prompt_eval_count': 253, 'prompt_eval_duration': 283362875, 'eval_count': 7, 'eval_duration': 430488667, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb3-2273-7bf3-bd7c-39738fb2e525-0', usage_metadata={'input_tokens': 253, 'output_tokens': 7, 'total_tokens': 260})

In [15]:
# lets change the session id and see if any of these work 
pipeline_with_history.invoke({"query": "What is my name?"}, 
                             config={"session_id": "id_567"})

AIMessage(content="I don't know your name. I am Gamma, a helpful assistant, and I don't have access to personal information like that. ðŸ˜Š \n\n\n\nDo you want to tell me your name?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:39:33.708557Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3380349375, 'load_duration': 167024667, 'prompt_eval_count': 59, 'prompt_eval_duration': 272984375, 'eval_count': 42, 'eval_duration': 2917967586, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb3-2617-78f2-83cb-52db2ea04f2b-0', usage_metadata={'input_tokens': 59, 'output_tokens': 42, 'total_tokens': 101})

### 2. ConversationBufferWindowMemory with RunnableWithMessageHistory

In [16]:
from pydantic import BaseModel, Field 
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage

class BufferedWindowMessageHistory(BaseChatMessageHistory, BaseModel):
    messages: list[BaseMessage] = Field(default_factory=list)
    k:int = Field(default_factory=int)

    def __init__(self, k:int):
        super().__init__(k=k)
        print(f"Instantiating BufferWindowMessageHistory with k = {k}")
    
    def add_messages(self, messages: list[BaseMessage]):
        self.messages.extend(messages)
        self.messages = self.messages[-self.k:]

    def clear(self):
        self.messages = []

In [17]:
chat_map = {} 

def get_chat_history(session_id:str, k:int=4) -> BufferedWindowMessageHistory:
    print(f"get_chat_history called with session={session_id} and k={k}")
    if session_id not in chat_map:
        chat_map[session_id] = BufferedWindowMessageHistory(k=k)
    return chat_map[session_id]


In [18]:
# lets wrap the buffered window message history inside runnablewithmessagehistory object 
from langchain_core.runnables import ConfigurableFieldSpec 
pipeline_with_trimmed_history = RunnableWithMessageHistory(
    pipeline, 
    get_session_history=get_chat_history, 
    input_messages_key="query", 
    history_messages_key="history",
    history_factory_config=[
        ConfigurableFieldSpec(
            id="session_id", 
            annotation=str, 
            name="Session ID", 
            description="Session id to use for Chat History", 
            default="id_default"
        ),
        ConfigurableFieldSpec(
            id="k", 
            annotation=int, 
            name="k",
            description="Number of messages to keep in history", 
            default=4
        )
    ]
)

In [19]:
pipeline_with_trimmed_history.invoke(
    {"query": "Hi my name is Rohit"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4
Instantiating BufferWindowMessageHistory with k = 4


AIMessage(content="Hi Rohit! It's nice to meet you. I'm Gamma, your helpful assistant. What can I do for you today?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:39:45.179455Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2701295833, 'load_duration': 208145875, 'prompt_eval_count': 59, 'prompt_eval_duration': 489047875, 'eval_count': 29, 'eval_duration': 1988183705, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb3-558c-7312-b0b3-038bfda1e377-0', usage_metadata={'input_tokens': 59, 'output_tokens': 29, 'total_tokens': 88})

In [20]:
pipeline_with_trimmed_history.invoke(
    {"query": "I am working on autoencoders. Can you tell me briefly about autoencoders in less than 100 words?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="Absolutely, Rohit! Here's a brief explanation of autoencoders:\n\nAutoencoders are a type of neural network used for unsupervised learning. They aim to learn a compressed, efficient representation (encoding) of input data. They do this by training a network to reconstruct its input. Essentially, they have an encoder that reduces dimensionality and a decoder that expands it back. They're used for dimensionality reduction, feature extraction, and anomaly detection.\n\n\n\nHope that helps!", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:39:54.355927Z', 'done': True, 'done_reason': 'stop', 'total_duration': 7635918291, 'load_duration': 205265041, 'prompt_eval_count': 121, 'prompt_eval_duration': 703866250, 'eval_count': 94, 'eval_duration': 6667691883, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb3-661e-7d52-8b64-d93545378393-0', usage_metadata={'input_tokens': 121, 'output_to

In [21]:
pipeline_with_trimmed_history.invoke(
    {"query": "OK.. But I have a large dataset with over 1000 columns. All these are categorical data and I want to understand the relationship between these columns without doing much of data pre-processing. What is the best way to do this?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content='Okay, Rohit, that\'s a challenging but interesting situation! Dealing with 1000+ categorical columns with minimal preprocessing requires a strategic approach. Here\'s a breakdown of potential methods, considering their pros and cons, to understand relationships without heavy preprocessing:\n\n**1. Autoencoders (Your Existing Interest - Potentially Powerful)**\n\n*   **How it works:** You can adapt autoencoders to handle categorical data directly. Instead of one-hot encoding (which would explode the dimensionality), you could use embedding layers for each categorical feature. The autoencoder learns to reconstruct the input, and the "latent space" (the compressed representation) reveals relationships.\n*   **Pros:** Can capture complex, non-linear relationships. Doesn\'t require explicit feature engineering.\n*   **Cons:** Can be computationally expensive to train. Requires careful tuning of architecture and hyperparameters. Interpretation of the latent space can be tr

In [22]:
pipeline_with_trimmed_history.invoke(
    {"query": "Thats very detailed approach. But is it possible to use Decision Tree in this sceanrio? Again I dont want to preprocess categorical data"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="You've raised a great point, Rohit! Using Decision Trees (or their ensemble variants like Random Forests and Gradient Boosting) is indeed a viable option, and it's particularly attractive given your desire to avoid extensive preprocessing of categorical features. Here's a breakdown of how you can leverage Decision Trees in this scenario:\n\n**How Decision Trees Handle Categorical Data**\n\nDecision Trees natively handle categorical features without requiring one-hot encoding or other transformations. The tree-building algorithm directly considers the categories as possible split points. This is a significant advantage.\n\n**Methods & Considerations**\n\n1.  **Single Decision Tree:** You could train a single decision tree to predict a target variable (even if it's just a dummy variable). The tree's structure will reveal which features are most important for splitting the data, thus highlighting relationships.\n2.  **Random Forest:** A Random Forest is an ensemble of d

In [23]:
pipeline_with_trimmed_history.invoke(
    {"query": "Do you know what is weather today?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content='I don\'t have access to real-time information, including the current weather. I\'m a language model, and my knowledge is based on the data I was trained on, which isn\'t updated live.\n\nTo find out the weather today, I recommend checking these resources:\n\n*   **A weather app on your phone:** Most smartphones have built-in weather apps.\n*   **A weather website:** Popular options include Google Weather, AccuWeather, The Weather Channel, and more.\n*   **A quick Google search:** Just type "weather [your location]" into Google.\n\n\n\nHope you have a pleasant day, whatever the weather!', additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:42:01.929671Z', 'done': True, 'done_reason': 'stop', 'total_duration': 21199899042, 'load_duration': 204671708, 'prompt_eval_count': 1505, 'prompt_eval_duration': 10997241750, 'eval_count': 136, 'eval_duration': 9821680044, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'olla

In [24]:
pipeline_with_trimmed_history.invoke(
    {"query": "Do you know how much quantized is gemma3 model running on M1 mac?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content='Okay, this is a great and increasingly important question! Getting precise numbers for quantized Gemma 3 running on an M1 Mac is tricky because performance varies *a lot* based on the specific quantization method, the M1 Mac\'s configuration (memory, etc.), and the task being performed. However, I can give you a good overview and some realistic ranges based on what\'s been observed by the community.\n\nHere\'s a breakdown of what I know, combining information from various sources and community discussions:\n\n**Understanding the Variables**\n\n*   **Quantization Level:** This is the biggest factor. Common levels are:\n    *   **Q4_K_M:**  A popular choice, offering a good balance of size and performance.\n    *   **Q5_K_M:** Slightly better performance than Q4, but larger.\n    *   **Q8_0:**  Closest to full precision (FP16), offering the best performance but the largest size.\n*   **M1 Mac Configuration:**  M1, M1 Pro, M1 Max, and M1 Ultra all have different memory 

In [25]:
pipeline_with_trimmed_history.invoke(
    {"query": "What was I working on?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="As a large language model, I have no memory of past conversations. Each interaction we have is fresh. I don't retain information from previous exchanges. \n\nSo, I don't know what you were working on before this conversation. You'll need to refer to your own records or notes to recall that.", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:43:32.533647Z', 'done': True, 'done_reason': 'stop', 'total_duration': 13616170667, 'load_duration': 222820292, 'prompt_eval_count': 1181, 'prompt_eval_duration': 8484402459, 'eval_count': 66, 'eval_duration': 4843749377, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb6-a2fe-7200-aa34-30e235346536-0', usage_metadata={'input_tokens': 1181, 'output_tokens': 66, 'total_tokens': 1247})

In [26]:
pipeline_with_trimmed_history.invoke(
    {"query": "What my name again?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="You haven't told me your name! I'm Gamma, your helpful assistant. ðŸ˜Š What's your name?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:43:42.63235Z', 'done': True, 'done_reason': 'stop', 'total_duration': 10059769666, 'load_duration': 240380083, 'prompt_eval_count': 1108, 'prompt_eval_duration': 7956846417, 'eval_count': 26, 'eval_duration': 1810500460, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb6-d85a-7990-aafb-99f76dbca3fc-0', usage_metadata={'input_tokens': 1108, 'output_tokens': 26, 'total_tokens': 1134})

```
this is one of the drawback of using window buffered memory technique. some of the inforamtion gets lost. But keeping all these inforamtion is going to cause latency and increase the cost. 

In [27]:
chat_map

{'id_123': BufferedWindowMessageHistory(messages=[HumanMessage(content='What was I working on?', additional_kwargs={}, response_metadata={}), AIMessage(content="As a large language model, I have no memory of past conversations. Each interaction we have is fresh. I don't retain information from previous exchanges. \n\nSo, I don't know what you were working on before this conversation. You'll need to refer to your own records or notes to recall that.", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:43:32.533647Z', 'done': True, 'done_reason': 'stop', 'total_duration': 13616170667, 'load_duration': 222820292, 'prompt_eval_count': 1181, 'prompt_eval_duration': 8484402459, 'eval_count': 66, 'eval_duration': 4843749377, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fb6-a2fe-7200-aa34-30e235346536-0', usage_metadata={'input_tokens': 1181, 'output_tokens': 66, 'total_tokens': 1247}), HumanMessage(conten

### 3. ConversationSummaryMemory with RunnableWithMessageHistory
store the summary of conversation in message lookup. Here summary is also generated from the LLM 
side effect to this approach is - there are far more tokens used compared to Window Buffered technique

In [28]:
from langchain_core.messages import SystemMessage

class ConversationSummaryMessageHistory(BaseChatMessageHistory, BaseModel):
    messages : list[BaseMessage] = Field(default_factory=list)
    llm : ChatOllama = Field(default_factory=ChatOllama)

    def __init__(self, llm:ChatOpenAI):
        super().__init__(llm=llm)

    def add_messages(self, message:list[BaseMessage]) -> None:
        """ create a summary """
        self.messages.extend(message)
        summary_prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(
                """Given the existing conversation summary and the new message, 
                generate a new summary of the conversation.
                Ensure to maintain as much relevant information as possible."""
            ),
            HumanMessagePromptTemplate.from_template(
                """Existing Conversation Summary:
                {existing_summary}

                New Messages: 
                {messages}"""
            )
        ])

        new_summary = self.llm.invoke(summary_prompt.format_messages(existing_summary=self.messages, 
                                                                     messages=message))
        self.messages = [SystemMessage(content=new_summary.content)]

    def clear(self) -> None:
        """ clear the history """ 
        self.messages = [] 

In [29]:
chat_map = {}
def get_chat_history(session_id:str, llm:ChatOllama) -> ConversationSummaryMessageHistory:
    if session_id not in chat_map:
        chat_map[session_id] = ConversationSummaryMessageHistory(llm=llm)
    
    return chat_map[session_id]

In [30]:
pipeline_with_history = RunnableWithMessageHistory(
    pipeline, 
    get_session_history=get_chat_history, 
    input_messages_key="query",
    history_messages_key="history", 
    history_factory_config=[
        ConfigurableFieldSpec(
            id="session_id", annotation="str", name="Session ID", 
            description="The session id to use for chat history", 
            default="id_default"
        ), 

        ConfigurableFieldSpec(
            id="llm", annotation=ChatOllama, name="LLM",
            description="LLM to use for the conversation history",
            default=ollama_llm,
        )
    ]
)

In [31]:
pipeline_with_history.invoke(
    {"query": "Hi my name is Rohit"}, 
    config={"configurable" : {"session_id": "id_123", "llm": ollama_llm}}
)

AIMessage(content="Hi Rohit! It's nice to meet you. I'm Gamma, your helpful assistant. What can I do for you today?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:50:54.695632Z', 'done': True, 'done_reason': 'stop', 'total_duration': 12229472333, 'load_duration': 4737116875, 'prompt_eval_count': 59, 'prompt_eval_duration': 5334354333, 'eval_count': 29, 'eval_duration': 1989334336, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fbd-6798-74a0-a43e-be21ed0e8075-0', usage_metadata={'input_tokens': 59, 'output_tokens': 29, 'total_tokens': 88})

In [32]:
pipeline_with_history.invoke(
    {"query": "I am working on autoencoders. Can you tell me briefly about autoencoders in less than 100 words?"}, 
    config={"configurable" : {"session_id": "id_123", "llm": ollama_llm}}
)

AIMessage(content='Hello Rohit! Happy to help.\n\nAutoencoders are a type of neural network used for unsupervised learning. They learn to compress and then reconstruct input data. Essentially, they have an encoder that reduces the data to a lower-dimensional "latent space" representation, and a decoder that reconstructs the original data from this compressed form. They\'re often used for dimensionality reduction, feature extraction, and anomaly detection.\n\n\n\nLet me know if you\'d like to explore a specific aspect of autoencoders!', additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T14:52:42.946281Z', 'done': True, 'done_reason': 'stop', 'total_duration': 13111111250, 'load_duration': 216455833, 'prompt_eval_count': 101, 'prompt_eval_duration': 5646838083, 'eval_count': 101, 'eval_duration': 7171665544, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fbf-0afe-7863-8935-60d37101c986-0', usage_metadata=

In [33]:
chat_map["id_123"]

ConversationSummaryMessageHistory(messages=[SystemMessage(content='Rohit introduced himself and Gamma, a helpful assistant, acknowledged him and offered assistance. Rohit then inquired about autoencoders, and Gamma briefly explained that they are a type of neural network used for unsupervised learning, compressing and reconstructing data through an encoder and decoder. They are commonly used for dimensionality reduction, feature extraction, and anomaly detection, and Gamma offered to explore specific aspects further.', additional_kwargs={}, response_metadata={})], llm=ChatOllama(model='gemma3:12b', temperature=0.0))

### 4. ConversationSummaryBufferHistory using RunnableWithMessageHistory
Store the latest information and summary details opposed to everything in summary. This will have detailed inforamtion of latest information and also will reduce the context window size 

In [48]:
class ConversationSummaryBufferedMessageHistory(BaseChatMessageHistory, BaseModel):
    messages : list[BaseMessage] = Field(default_factory=list)
    llm: ChatOllama = Field(default_factory=ChatOllama)
    k: int = Field(default_factory=int)

    def __init__(self, llm:ChatOllama, k:int):
        super().__init__(llm=llm, k=k)
    
    def add_messages(self, messages:list[BaseMessage]) -> None:
        """ Add messages to history, removing any messages beyond the last `k` messages and summarizing the messages we dropped """
        existing_summary:SystemMessage|None = None 
        old_messages=None 

        if len(self.messages) > 0 and isinstance(self.messages[0], SystemMessage):
            print(">> Found existing summary")
            existing_summary = self.messages.pop(0)

        self.messages.extend(messages)

        if len(self.messages) > self.k:
            print(f" >> Found {len(self.messages)} messages, Dropping {len(self.messages) - self.k} messages.")
            old_messages = self.messages[:self.k]
            self.messages = self.messages[-self.k:] 
        
        if old_messages is None:
            print (" >> No old message to update summary with")
            return 
        
        summary_prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(
                """Given the existing conversation summary and the new message, 
                generate a new summary of the conversation.
                Ensure to maintain as much relevant information as possible."""                
            ),
            HumanMessagePromptTemplate.from_template(
                """Existing Conversation Summary:
                {existing_summary}

                New Messages: 
                {old_messages}"""                
            )
        ])

        new_summary = self.llm.invoke(
            summary_prompt.format_messages(existing_summary=existing_summary, old_messages=old_messages)
        )

        print(f" >> New Summary: {new_summary.content}")

        self.messages = [SystemMessage(content=new_summary.content)] + self.messages

    def clear(self) -> None: 
        """ clear the history """
        self.messages = [] 

    

In [49]:
chat_map = {}
def get_chat_history(session_id:str, llm:ChatOllama, k:int) -> ConversationSummaryBufferedMessageHistory:
    if session_id not in chat_map:
        chat_map[session_id] = ConversationSummaryBufferedMessageHistory(llm=llm, k=k)
    
    return chat_map[session_id]

In [50]:
pipeline_with_history = RunnableWithMessageHistory(
    pipeline, 
    get_session_history=get_chat_history, 
    input_messages_key="query",
    history_messages_key="history", 
    history_factory_config=[
        ConfigurableFieldSpec(
            id="session_id", annotation="str", name="Session ID", 
            description="The session id to use for chat history", 
            default="id_default"
        ), 

        ConfigurableFieldSpec(
            id="llm", annotation=ChatOllama, name="LLM",
            description="LLM to use for the conversation history",
            default=ollama_llm,
        ),

        ConfigurableFieldSpec(
            id="k", annotation=int, name="k",
            description="Number of messages to keep in history", 
            default=4
        )
        
    ]
)

In [51]:
pipeline_with_history.invoke(
    {"query": "Hi my name is Rohit"}, 
    config={"configurable" : {"session_id": "id_123", "llm": ollama_llm, "k": 4}}
)

 >> No old message to update summary with


AIMessage(content="Hi Rohit! It's nice to meet you. I'm Gamma, your helpful assistant. What can I do for you today?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T15:39:26.855384Z', 'done': True, 'done_reason': 'stop', 'total_duration': 7047462667, 'load_duration': 329409083, 'prompt_eval_count': 59, 'prompt_eval_duration': 4706900000, 'eval_count': 29, 'eval_duration': 1987341793, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fe9-eb76-73d3-8ff4-89906ff1991e-0', usage_metadata={'input_tokens': 59, 'output_tokens': 29, 'total_tokens': 88})

In [52]:
pipeline_with_history.invoke(
    {"query": "What is encoder and how is this different from autoencoders? Please explain in less than 100 words to me"}, 
    config={"configurable" : {"session_id": "id_123", "llm": ollama_llm, "k": 4}}
)

 >> No old message to update summary with


AIMessage(content="Hi Rohit!\n\nAn **encoder** is a neural network component that transforms data into a compressed representation (like squeezing information). Think of it as summarizing. \n\nAn **autoencoder** *uses* an encoder! It's a full neural network built with an encoder *and* a decoder. The encoder compresses, and the decoder tries to reconstruct the original data from that compressed form. Autoencoders learn efficient data representations.\n\n\n\nHope that helps!", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T15:39:34.068364Z', 'done': True, 'done_reason': 'stop', 'total_duration': 7197659375, 'load_duration': 206439417, 'prompt_eval_count': 121, 'prompt_eval_duration': 506304000, 'eval_count': 92, 'eval_duration': 6434379335, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fea-0715-7650-bcac-299fca7e4912-0', usage_metadata={'input_tokens': 121, 'output_tokens': 92, 'total_tokens': 213})

In [53]:
pipeline_with_history.invoke(
    {"query": "Why is sky blue"}, 
    config={"configurable" : {"session_id": "id_123", "llm": ollama_llm, "k": 4}}
)

 >> Found 6 messages, Dropping 2 messages.
 >> New Summary: Rohit introduced himself, and Gamma, an AI assistant, responded and offered help. Rohit then asked for a concise explanation of encoders and autoencoders. Gamma explained that an encoder compresses data, while an autoencoder uses an encoder and decoder to reconstruct data, learning efficient representations.


AIMessage(content="Hi Rohit!\n\nThe sky is blue because of a phenomenon called **Rayleigh scattering**. Sunlight is made up of all colors, but blue and violet light have shorter wavelengths. When sunlight hits the Earth's atmosphere, these shorter wavelengths are scattered more by the tiny air molecules than other colors. That scattered blue light is what we see!\n\n\n\nI hope that explanation makes sense!", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-30T15:39:40.032784Z', 'done': True, 'done_reason': 'stop', 'total_duration': 5949041916, 'load_duration': 188351250, 'prompt_eval_count': 226, 'prompt_eval_duration': 286952583, 'eval_count': 77, 'eval_duration': 5424217167, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6fea-2342-7153-bd96-cfaefeb6a2f9-0', usage_metadata={'input_tokens': 226, 'output_tokens': 77, 'total_tokens': 303})

In [54]:
pipeline_with_history.invoke(
    {"query": "Which autoencoder I should use?"}, 
    config={"configurable" : {"session_id": "id_123", "llm": ollama_llm, "k": 4}}
)

>> Found existing summary
 >> Found 6 messages, Dropping 2 messages.
 >> New Summary: Rohit initiated a conversation with Gamma, an AI assistant, and requested explanations of encoders and autoencoders. Gamma initially explained that an encoder compresses data and an autoencoder uses an encoder and decoder to reconstruct data and learn efficient representations. Following this, Gamma elaborated further on the difference between encoders and autoencoders, explaining that an autoencoder utilizes an encoder to compress data and a decoder to reconstruct it. Rohit then shifted the conversation to a new topic, asking why the sky is blue, to which Gamma responded with an explanation involving Rayleigh scattering.


AIMessage(content="Hi Rohit!\n\nChoosing the right autoencoder depends on your specific task and data. Here's a breakdown of common types and when you might use them:\n\n*   **Vanilla Autoencoder:** A good starting point for basic dimensionality reduction and feature learning. Simple to understand and implement.\n*   **Sparse Autoencoder:** Encourages sparsity in the hidden layer activations, leading to more interpretable features. Useful when you want to identify the most important features.\n*   **Denoising Autoencoder:** Trained to reconstruct clean data from noisy input. Great for robust feature learning and handling noisy datasets.\n*   **Variational Autoencoder (VAE):** Generates new data similar to the training data. Useful for generative tasks like image generation.\n*   **Contractive Autoencoder:** Encourages the learned representation to be insensitive to small changes in the input. Good for learning robust features.\n\nTo help me narrow it down, could you tell me:\n\n*   Wha

In [55]:
chat_map["id_123"]

ConversationSummaryBufferedMessageHistory(messages=[SystemMessage(content='Rohit initiated a conversation with Gamma, an AI assistant, and requested explanations of encoders and autoencoders. Gamma initially explained that an encoder compresses data and an autoencoder uses an encoder and decoder to reconstruct data and learn efficient representations. Following this, Gamma elaborated further on the difference between encoders and autoencoders, explaining that an autoencoder utilizes an encoder to compress data and a decoder to reconstruct it. Rohit then shifted the conversation to a new topic, asking why the sky is blue, to which Gamma responded with an explanation involving Rayleigh scattering.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Why is sky blue', additional_kwargs={}, response_metadata={}), AIMessage(content="Hi Rohit!\n\nThe sky is blue because of a phenomenon called **Rayleigh scattering**. Sunlight is made up of all colors, but blue and violet ligh