# Langchain Conversation - Vanilla without Chat history

In [3]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain

In [4]:
template = """Question: {question}"""

prompt = PromptTemplate.from_template(template)

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="../models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf",
    temperature=0,
    max_tokens=2000,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

# Wrap the LLM model with the LLMChain class and include the memory
llm_chain = LLMChain(llm=llm, prompt=prompt, memory=memory)

llama_model_loader: loaded meta data with 26 key-value pairs and 995 tensors from ../models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mixtral-8x7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:    

In [7]:
question = """
Question: Awesome Alexa, can you please tell me my name ?
"""
llm_chain.invoke(question)

Llama.generate: prefix-match hit


Answer: Of course! Your name is Alexa.


llama_print_timings:        load time =     631.89 ms
llama_print_timings:      sample time =       2.04 ms /    13 runs   (    0.16 ms per token,  6372.55 tokens per second)
llama_print_timings: prompt eval time =    1113.08 ms /    14 tokens (   79.51 ms per token,    12.58 tokens per second)
llama_print_timings:        eval time =    1450.00 ms /    12 runs   (  120.83 ms per token,     8.28 tokens per second)
llama_print_timings:       total time =    2601.39 ms /    26 tokens


{'question': '\nQuestion: Awesome Alexa, can you please tell me my name ?\n',
 'chat_history': [HumanMessage(content='\nQuestion: Hey\n'),
  AIMessage(content='Answer: Hello! How can I help you today?'),
  HumanMessage(content='\nQuestion: Well, my name is Amit. What can I call you ?\n'),
  AIMessage(content='Answer: Hello Amit, you can call me Alexa. How can I assist you today?'),
  HumanMessage(content='\nQuestion: Awesome Alexa, can you please tell me my name ?\n'),
  AIMessage(content='Answer: Of course! Your name is Alexa.')],
 'text': 'Answer: Of course! Your name is Alexa.'}

# Langchain Conversation - with Memory Buffers

In [55]:
from langchain.chains import LLMChain
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.memory import ConversationBufferWindowMemory
from langchain_experimental.chat_models import Llama2Chat
from os.path import expanduser
from langchain.llms import OpenAI

from langchain_community.llms import LlamaCpp

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)

from langchain_core.messages import SystemMessage


model_path = expanduser("~/work/amitbhatti/models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf")

llm = LlamaCpp(
    model_path=model_path,
    streaming=True,
)

model = Llama2Chat(llm=llm)

template_messages = [
    SystemMessage(content="You are a helpful assistant."),
    MessagesPlaceholder(variable_name="chat_history"),
    HumanMessagePromptTemplate.from_template("{text}"),
]

prompt_template = ChatPromptTemplate.from_messages(template_messages)

##### Conversation Summarization Memory #######

# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# api_key = "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"
# api_key="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"
# memory_llm = OpenAI(temperature=0, api_key=api_key)

# memory= ConversationSummaryBufferMemory(
#     llm=memory_llm, 
#     max_token_limit=200, 
#     memory_key="chat_history",
#     return_messages=True
#     )

#chain = LLMChain(llm=model, prompt=prompt_template, memory=memory)

chain = ConversationChain(
    llm=model,
    # We set a low k=2, to only keep the last 2 interactions in memory
    memory=ConversationBufferWindowMemory(k=6)
)
chain.predict(input="I see, this is great. Seems, you have been having some almonds lately")

llama_model_loader: loaded meta data with 26 key-value pairs and 995 tensors from /home/jioaidev/work/amitbhatti/models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mixtral-8x7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama

Llama.generate: prefix-match hit

llama_print_timings:        load time =     603.91 ms
llama_print_timings:      sample time =      10.50 ms /    58 runs   (    0.18 ms per token,  5524.34 tokens per second)
llama_print_timings: prompt eval time =    6457.94 ms /    82 tokens (   78.76 ms per token,    12.70 tokens per second)
llama_print_timings:        eval time =    6965.07 ms /    57 runs   (  122.19 ms per token,     8.18 tokens per second)
llama_print_timings:       total time =   13582.53 ms /   139 tokens


' I apologize for any confusion, but I do not consume food or have a physical body to eat almonds or any other food. I am an artificial intelligence designed to assist with information and tasks. If you have any questions or need assistance with something, feel free to ask!'

### Issue with no systemessageGeneration or incorporation with ConversationSummaryMemory
issue : https://github.com/langchain-ai/langchain/issues/7327

In [42]:
from langchain.memory import ConversationSummaryMemory
from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.chat_history import BaseChatMessageHistory
from typing import Any
class ConversationSummaryMemoryWithSystemMessage(ConversationSummaryMemory):
    @classmethod
    def from_messages(
        cls,
        llm: BaseLanguageModel,
        chat_memory: BaseChatMessageHistory,
        *,
        summarize_step: int = 2,
        **kwargs: Any,
    ) -> ConversationSummaryMemory:
        obj = cls(llm=llm, chat_memory=chat_memory, **kwargs)
        for i in range(0, len(obj.chat_memory.messages), summarize_step):
            # Include system messages in the summarization process
            messages = obj.chat_memory.messages[i : i + summarize_step]
            messages = [message for message in messages if not isinstance(message, SystemMessage)]
            obj.buffer = obj.predict_new_summary(messages, obj.buffer)
        return obj

## Different Memory Buffers to use 

### ConversationBufferWindowMemory

ConversationBufferWindowMemory focuses on the last K messages, acting as a window that keeps only the most recent interactions in memory. This method is useful for controlling the amount of context included in the request while still maintaining a recent history of the conversation

https://www.lotuslabs.ai/post/conversational-memory

In [1]:
from langchain.memory import ConversationBufferWindowMemory

# Initialize memory with a window size of 1
memory = ConversationBufferWindowMemory(k=1)

# Save context
memory.save_context({"input": "hi"}, {"output": "whats up"})
memory.save_context({"input": "not much you"}, {"output": "not much"})

# Load memory variables
result = memory.load_memory_variables({})
print(result) # Output: {'history': 'Human: not much you\nAI: not much'}


{'history': 'Human: not much you\nAI: not much'}


### ConversationTokenBufferMemory

ConversationTokenBufferMemory manages the buffer based on the total number of tokens, rather than the number of messages. This approach ensures that the memory usage is controlled by the token count, which is particularly useful for managing costs and processing times, especially in applications that interact with language models with token-based pricing



In [None]:
from langchain.memory import ConversationTokenBufferMemory
from langchain.llms import OpenAI

llm = OpenAI()

# Initialize memory with a maximum token limit
memory = ConversationTokenBufferMemory(llm=llm, max_token_limit=10)

# Save context
memory.save_context({"input": "hi"}, {"output": "what's up"})
memory.save_context({"input": "not much you"}, {"output": "not much"})

# Load memory variables
result = memory.load_memory_variables({})
print(result) # Output: {'history': 'Human: not much you\nAI: not much'}


###  ConversationSummaryBufferMemory 

ConversationSummaryBufferMemory  which strikes a balance between detailed context and memory efficiency. It allows for the summarization of previous interactions while keeping the most recent messages intact. This method is useful when precise context from the latest few messages is needed, while also having a summarized version of older interactions in the conversation 5.

In [12]:
from langchain.chains import ConversationChain

conversation_with_summary = ConversationChain(
    llm=llm,
    # We set a very low max_token_limit for the purposes of testing.
    memory=ConversationSummaryBufferMemory(llm=OpenAI(api_key="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"), 
                                           max_token_limit=40),
    verbose=True,
)
conversation_with_summary.predict(input="Hi, what's up?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: Hi, what's up?
AI:[0m



llama_print_timings:        load time =     619.13 ms
llama_print_timings:      sample time =      43.42 ms /   256 runs   (    0.17 ms per token,  5896.17 tokens per second)
llama_print_timings: prompt eval time =    5468.81 ms /    70 tokens (   78.13 ms per token,    12.80 tokens per second)
llama_print_timings:        eval time =   31105.04 ms /   255 runs   (  121.98 ms per token,     8.20 tokens per second)
llama_print_timings:       total time =   37207.13 ms /   325 tokens


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: gsk_e7dG********************************************ASha. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# GPU memory hierarchy -> Check 
# Vector query with prefill -> Check -> multiple user query -> using the same prefill.