# API calls are REST based and stateless. 
Every time a call is made to the model, it is a brand-new request to the LLM: nothing from earlier calls is remembered. To let the LLM keep track of the conversation that has already happened, you have to supply that memory yourself, and there are several techniques for doing so.

In [1]:
from dotenv import load_dotenv 
# Load variables from the .env file; override=True lets .env values replace
# variables that are already set in the process environment.
load_dotenv(override=True)

True

In [2]:
import os
# load_dotenv() above has already placed the .env values in os.environ, so the
# keys do not need to be copied again. What matters is failing early with a
# readable message: `os.environ[name] = os.getenv(name)` raises an opaque
# "TypeError: str expected, not NoneType" when a key is missing.
missing_api_keys = [
    key_name
    for key_name in ("GOOGLE_API_KEY", "OPENAI_API_KEY")
    if not os.getenv(key_name)
]
if missing_api_keys:
    raise EnvironmentError(
        "Missing API key(s): " + ", ".join(missing_api_keys)
        + ". Add them to your .env file or export them before running this notebook."
    )

In [4]:
# Turn LangSmith tracing off (environment values must be strings, not booleans)
# os.environ['LANGSMITH_TRACING'] = 'false'

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

# Instantiate one chat model per provider, each with temperature=0.
google_llm = ChatGoogleGenerativeAI(
    model="gemini-3-flash-preview",
    temperature=0,
)
openai_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)
ollama_llm = ChatOllama(
    model="gemma3:12b",
    temperature=0,
)

### Chat memory 
Conversation memory can be maintained using any of 4 techniques:
- ConversationBufferMemory: <br>
Keep the history of all interactions in a list. When making a call to the LLM, pass this history list as context so the LLM can keep track of what was discussed before.

- ConversationBufferWindowMemory: <br>
Keep track of the interactions in a list, but when making a call to the LLM pass only the latest N interactions as context.

- ConversationSummaryMemory: <br>
Instead of passing every interaction, summarize the conversation and pass that summary as context to the LLM.

- ConversationSummaryBufferMemory: <br>
Use a summary of the older interactions plus the latest N interactions as context to the LLM.


### 1. ConversationBufferMemory using RunnableWithMessageHistory

In [12]:
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder, ChatPromptTemplate

# System instructions, written piece by piece so the trailing spaces and
# newlines of the prompt text stay visible.
system_prompt = (
    "You are a helpful assistant and your name is Gamma. \n"
    "You provide answers to questions to best of your knowledge. \n"
    'If you dont know just say "I don\'t know" '
)

# Message order: system instructions, then the earlier turns ("history"),
# then the new user input ("query").
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{query}"),
    ]
)

In [13]:
# Variables the template must be given: the message history and the new user query.
prompt_template.input_variables

['history', 'query']

In [20]:
# Display the template's full repr, including its input variables and input types.
prompt_template

ChatPromptTemplate(input_variables=['history', 'query'], input_types={'history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag='ChatMessageChunk')], typing.Annotated[langchain_core.messages.system.SystemMessageChunk, Tag(tag='SystemMessageChunk')], typing.Annotated[langchain_c

In [21]:
# Build the pipeline: the prompt template is piped into the Ollama chat model
# (gemma3:12b) using the `|` operator.
pipeline = prompt_template | ollama_llm 

```
To get this running with memory, we wrap our pipeline in a RunnableWithMessageHistory. This object requires a few input parameters. One of them is get_session_history: a function that takes a session id and returns the chat message history object for that session.

In [22]:
from langchain_core.chat_history import InMemoryChatMessageHistory
# One in-memory history per session id, created lazily on first use.
chat_map = {}


def get_chat_history(session_id) -> InMemoryChatMessageHistory:
    """Return the history stored for ``session_id``, creating an empty one on first use."""
    history = chat_map.get(session_id)
    if history is None:
        history = InMemoryChatMessageHistory()
        chat_map[session_id] = history
    return history

In [28]:
from langchain_core.runnables.history import RunnableWithMessageHistory
# Wrap the pipeline so each call is paired with the history returned by
# get_chat_history. The two keys mirror the prompt template's variables:
# "query" is the user input and "history" is the MessagesPlaceholder slot.
pipeline_with_history = RunnableWithMessageHistory(
    pipeline, 
    get_session_history=get_chat_history, 
    input_messages_key="query",
    history_messages_key="history"
)

In [30]:
# Pass the session id under "configurable", the documented place for
# RunnableWithMessageHistory session parameters (and the form used by the
# windowed-history calls later in this notebook).
pipeline_with_history.invoke(
    {"query": "Hi my name is Rohit"},
    config={"configurable": {"session_id": "id_123"}},
)

AIMessage(content="Hi Rohit! It's nice to meet you. I'm Gamma, your helpful assistant. What can I do for you today?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T14:44:39.53471Z', 'done': True, 'done_reason': 'stop', 'total_duration': 12632427792, 'load_duration': 8370140959, 'prompt_eval_count': 59, 'prompt_eval_duration': 2185515458, 'eval_count': 29, 'eval_duration': 1990096502, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b656a-f48c-79e1-8f68-c5cbf316970f-0', usage_metadata={'input_tokens': 59, 'output_tokens': 29, 'total_tokens': 88})

In [31]:
# Same session id as the introduction; it goes under "configurable", the
# documented place for RunnableWithMessageHistory session parameters.
pipeline_with_history.invoke(
    {"query": "I am working on research paper for Autoencoders. Can you help me in 100 words what is it and how to use it?"},
    config={"configurable": {"session_id": "id_123"}},
)

AIMessage(content="Hello Rohit! Absolutely, I can help with that.\n\nAutoencoders are a type of neural network used for unsupervised learning. They learn to compress and then reconstruct input data. Think of it as a sophisticated compression algorithm! \n\nEssentially, an autoencoder has an encoder (compresses) and a decoder (reconstructs). You train it by feeding it data and minimizing the difference between the original and reconstructed data. They're used for dimensionality reduction, feature extraction, anomaly detection, and even generative tasks.\n\n\n\nHope this helps with your research!", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T14:46:33.236868Z', 'done': True, 'done_reason': 'stop', 'total_duration': 15119470208, 'load_duration': 306301416, 'prompt_eval_count': 126, 'prompt_eval_duration': 6649570708, 'eval_count': 112, 'eval_duration': 7991285702, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc

In [32]:
# Check that the full buffered history still carries the name given earlier.
# The session id goes under "configurable", the documented place for
# RunnableWithMessageHistory session parameters.
pipeline_with_history.invoke(
    {"query": "What is my name again?"},
    config={"configurable": {"session_id": "id_123"}},
)

AIMessage(content='Your name is Rohit! ðŸ˜Š', additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T14:47:26.580474Z', 'done': True, 'done_reason': 'stop', 'total_duration': 6371676500, 'load_duration': 450692250, 'prompt_eval_count': 253, 'prompt_eval_duration': 5373336417, 'eval_count': 7, 'eval_duration': 439308291, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b656d-993f-7960-bb27-f96fe61ff384-0', usage_metadata={'input_tokens': 253, 'output_tokens': 7, 'total_tokens': 260})

In [37]:
# Switch to a different session id: it gets its own, empty history, so nothing
# said under "id_123" is visible here. The id goes under "configurable", the
# documented place for RunnableWithMessageHistory session parameters.
pipeline_with_history.invoke(
    {"query": "What is my name?"},
    config={"configurable": {"session_id": "id_567"}},
)

AIMessage(content="I don't know your name. I am Gamma, a helpful assistant, and I don't have access to personal information like that. ðŸ˜Š \n\n\n\nDo you want to tell me your name?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T14:59:53.29981Z', 'done': True, 'done_reason': 'stop', 'total_duration': 25742004292, 'load_duration': 6050446875, 'prompt_eval_count': 59, 'prompt_eval_duration': 16572694083, 'eval_count': 42, 'eval_duration': 2978987169, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6578-b2b6-7561-945d-69c29fb4a9a6-0', usage_metadata={'input_tokens': 59, 'output_tokens': 42, 'total_tokens': 101})

### 2. ConversationBufferWindowMemory with RunnableWithMessageHistory

In [47]:
from pydantic import BaseModel, Field 
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage

class BufferedWindowMessageHistory(BaseChatMessageHistory, BaseModel):
    """Chat history that retains only the ``k`` most recent messages.

    ``k`` counts individual messages, not question/answer exchanges: if a
    human message and its AI reply are stored together, ``k=4`` covers two
    exchanges.
    """

    # Retained messages, oldest first.
    messages: list[BaseMessage] = Field(default_factory=list)
    # Maximum number of messages to retain; always supplied through __init__.
    k: int = Field(default_factory=int)

    def __init__(self, k: int):
        """Create an empty history with a window of ``k`` messages."""
        super().__init__(k=k)
        print(f"Instantiating BufferWindowMessageHistory with k = {k}")

    def add_messages(self, messages: list[BaseMessage]) -> None:
        """Append ``messages`` and keep only the last ``k`` of the result.

        A non-positive ``k`` retains nothing. The plain slice ``[-k:]`` would
        keep the *entire* list for ``k == 0`` (because ``-0 == 0``) and would
        drop messages from the front for a negative ``k``.
        """
        combined = [*self.messages, *messages]
        self.messages = combined[-self.k:] if self.k > 0 else []

    def clear(self) -> None:
        """Remove every stored message."""
        self.messages = []

In [48]:
# Fresh session store for the windowed histories.
# NOTE: this rebinds the names `chat_map` and `get_chat_history` that the
# first (unbounded) memory example in this notebook also uses.
chat_map = {}


def get_chat_history(session_id: str, k: int = 4) -> BufferedWindowMessageHistory:
    """Return the windowed history for ``session_id``, creating it on first use.

    ``k`` is only applied when the session's history is created; for an
    existing session the stored history is returned unchanged.
    """
    print(f"get_chat_history called with session={session_id} and k={k}")
    history = chat_map.get(session_id)
    if history is None:
        history = BufferedWindowMessageHistory(k=k)
        chat_map[session_id] = history
    return history


In [49]:
# lets wrap the buffered window message history inside runnablewithmessagehistory object 
from langchain_core.runnables import ConfigurableFieldSpec 
# Same pipeline as before, now backed by the windowed history.
# history_factory_config declares the two configurable fields; their ids
# mirror get_chat_history's parameter names (session_id and k).
pipeline_with_trimmed_history = RunnableWithMessageHistory(
    pipeline, 
    get_session_history=get_chat_history, 
    input_messages_key="query", 
    history_messages_key="history",
    history_factory_config=[
        # Which conversation the history belongs to.
        ConfigurableFieldSpec(
            id="session_id", 
            annotation=str, 
            name="Session ID", 
            description="Session id to use for Chat History", 
            default="id_default"
        ),
        # Window size; the default matches get_chat_history's own default of 4.
        ConfigurableFieldSpec(
            id="k", 
            annotation=int, 
            name="k",
            description="Number of messages to keep in history", 
            default=4
        )
    ]
)

In [50]:
# First turn of session "id_123": no history exists yet, so get_chat_history
# creates a windowed history with k=4.
pipeline_with_trimmed_history.invoke(
    {"query": "Hi my name is Rohit"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4
Instantiating BufferWindowMessageHistory with k = 4


AIMessage(content="Hi Rohit! It's nice to meet you. I'm Gamma, your helpful assistant. What can I do for you today?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T15:33:26.478798Z', 'done': True, 'done_reason': 'stop', 'total_duration': 25064628333, 'load_duration': 8126047583, 'prompt_eval_count': 59, 'prompt_eval_duration': 14697478541, 'eval_count': 29, 'eval_duration': 2063198293, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6597-6d5a-7a13-8494-0ee5b65f6e79-0', usage_metadata={'input_tokens': 59, 'output_tokens': 29, 'total_tokens': 88})

In [51]:
# Second turn: the introduction is still inside the window, so the reply can
# address the user by name.
pipeline_with_trimmed_history.invoke(
    {"query": "I am working on autoencoders. Can you tell me briefly about autoencoders in less than 100 words?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="Absolutely, Rohit! Here's a brief explanation of autoencoders:\n\nAutoencoders are a type of neural network used for unsupervised learning. They aim to learn a compressed, efficient representation (encoding) of input data. They do this by training a network to reconstruct its input. Essentially, they have an encoder that reduces dimensionality and a decoder that expands it back. They're used for dimensionality reduction, feature extraction, and anomaly detection.\n\n\n\nHope that helps!", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T15:34:28.795842Z', 'done': True, 'done_reason': 'stop', 'total_duration': 12412065167, 'load_duration': 304555917, 'prompt_eval_count': 121, 'prompt_eval_duration': 5343281250, 'eval_count': 94, 'eval_duration': 6610821079, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b6598-922c-7b61-b2de-afcf56d49d14-0', usage_metadata={'input_tokens': 121, 'output_

In [52]:
# Third turn: a longer follow-up. The history keeps at most k=4 messages, so
# older messages start to be dropped as the conversation grows.
pipeline_with_trimmed_history.invoke(
    {"query": "OK.. But I have a large dataset with over 1000 columns. All these are categorical data and I want to understand the relationship between these columns without doing much of data pre-processing. What is the best way to do this?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content='Okay, Rohit, that\'s a challenging but interesting situation! Dealing with 1000+ categorical columns with minimal preprocessing requires a strategic approach. Here\'s a breakdown of potential methods, considering their pros and cons, to understand relationships without heavy preprocessing:\n\n**1. Autoencoders (Your Existing Interest - Potentially Powerful)**\n\n*   **How it works:** You can adapt autoencoders to handle categorical data directly. Instead of one-hot encoding (which would explode the dimensionality), you could use embedding layers for each categorical feature. The autoencoder learns to reconstruct the input, and the "latent space" (the compressed representation) reveals relationships.\n*   **Pros:** Can capture complex, non-linear relationships. Doesn\'t require explicit feature engineering.\n*   **Cons:** Can be computationally expensive to train. Requires careful tuning of architecture and hyperparameters. Interpretation of the latent space can be tr

In [53]:
# Fourth turn: presumably the opening "my name is Rohit" message is already
# outside the 4-message window here (assuming each turn stores a human/AI pair).
pipeline_with_trimmed_history.invoke(
    {"query": "Thats very detailed approach. But is it possible to use Decision Tree in this sceanrio? Again I dont want to preprocess categorical data"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="You've raised a great point, Rohit! Using Decision Trees (or their ensemble variants like Random Forests and Gradient Boosting) is indeed a viable option, and it's particularly attractive given your desire to avoid extensive preprocessing of categorical features. Here's a breakdown of how you can leverage Decision Trees in this scenario:\n\n**How Decision Trees Handle Categorical Data**\n\nDecision Trees natively handle categorical features without requiring one-hot encoding or other transformations. The tree-building algorithm directly considers the categories as possible split points. This is a significant advantage.\n\n**Methods & Considerations**\n\n1.  **Single Decision Tree:** You could train a single decision tree to predict a target variable (even if it's just a dummy variable). The tree's structure will reveal which features are most important for splitting the data, thus highlighting relationships.\n2.  **Random Forest:** A Random Forest is an ensemble of d

In [54]:
# Unrelated question: moves the conversation away from the earlier topics so
# that they get pushed out of the k=4 window.
pipeline_with_trimmed_history.invoke(
    {"query": "Do you know what is weather today?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content='I don\'t have access to real-time information, including the current weather. I\'m a language model, and my knowledge is based on the data I was trained on, which isn\'t updated live.\n\nTo find out the weather today, I recommend checking these resources:\n\n*   **A weather app on your phone:** Most smartphones have built-in weather apps.\n*   **A weather website:** Popular options include Google Weather, AccuWeather, The Weather Channel, and more.\n*   **A quick Google search:** Just type "weather [your location]" into Google.\n\n\n\nHope you have a pleasant day, whatever the weather!', additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T15:45:07.171983Z', 'done': True, 'done_reason': 'stop', 'total_duration': 26882538750, 'load_duration': 304395000, 'prompt_eval_count': 1505, 'prompt_eval_duration': 16752892208, 'eval_count': 136, 'eval_duration': 9652199576, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'olla

In [55]:
# Another unrelated question, pushing the autoencoder discussion further out
# of the k=4 window.
pipeline_with_trimmed_history.invoke(
    {"query": "Do you know how much quantized is gemma3 model running on M1 mac?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content='Okay, this is a great and increasingly important question! Getting precise numbers for quantized Gemma 3 running on an M1 Mac is tricky because performance varies *a lot* based on the specific quantization method, the M1 Mac\'s configuration (memory, etc.), and the task being performed. However, I can give you a good overview and some realistic ranges based on what\'s been observed by the community.\n\nHere\'s a breakdown of what I know, combining information from various sources and community discussions:\n\n**Understanding the Variables**\n\n*   **Quantization Level:** This is the biggest factor. Common levels are:\n    *   **Q4_K_M:**  A popular choice, offering a good balance of size and performance.\n    *   **Q5_K_M:** Slightly better performance than Q4, but larger.\n    *   **Q8_0:**  Closest to full precision (FP16), offering the best performance but the largest size.\n*   **M1 Mac Configuration:**  M1, M1 Pro, M1 Max, and M1 Ultra all have different memory 

In [56]:
# Probe the window: the autoencoder discussion is no longer among the retained
# messages, so the model cannot recall it (see the reply below).
pipeline_with_trimmed_history.invoke(
    {"query": "What was I working on?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="As a large language model, I have no memory of past conversations. Each interaction we have is fresh. I don't retain information from previous exchanges. \n\nSo, I don't know what you were working on before this conversation. You'll need to refer to your own records or notes to recall that.", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T15:48:01.20936Z', 'done': True, 'done_reason': 'stop', 'total_duration': 20268449542, 'load_duration': 259644292, 'prompt_eval_count': 1181, 'prompt_eval_duration': 15022874917, 'eval_count': 66, 'eval_duration': 4760634876, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b65a4-d8e0-7ee3-8765-1fa49747343f-0', usage_metadata={'input_tokens': 1181, 'output_tokens': 66, 'total_tokens': 1247})

In [57]:
# Probe again: the introduction that contained the name has also been trimmed
# from the window, so the model no longer knows it (see the reply below).
pipeline_with_trimmed_history.invoke(
    {"query": "What my name again?"}, 
    config={"configurable" : {"session_id": "id_123", "k": 4}}
)

get_chat_history called with session=id_123 and k=4


AIMessage(content="You haven't told me your name! I'm Gamma, your helpful assistant. ðŸ˜Š What's your name?", additional_kwargs={}, response_metadata={'model': 'gemma3:12b', 'created_at': '2025-12-28T15:48:40.61057Z', 'done': True, 'done_reason': 'stop', 'total_duration': 13788257291, 'load_duration': 391920125, 'prompt_eval_count': 1108, 'prompt_eval_duration': 11587172542, 'eval_count': 26, 'eval_duration': 1763805959, 'logprobs': None, 'model_name': 'gemma3:12b', 'model_provider': 'ollama'}, id='lc_run--019b65a5-8c33-7c13-8c0f-ea61dff2fed5-0', usage_metadata={'input_tokens': 1108, 'output_tokens': 26, 'total_tokens': 1134})

```
This is one of the drawbacks of the buffered-window memory technique: some of the information gets lost. But keeping all of the information would increase both latency and cost.