# Conversation LLMs: LLMs with Memory

In [None]:
!python --version

Python 3.10.12


In [None]:
!pip install langchain transformers bitsandbytes accelerate xformers colorama

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import LlamaTokenizerFast, LlamaForCausalLM, pipeline, BitsAndBytesConfig
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationChain
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory, ConversationSummaryBufferMemory

import torch

import warnings
warnings.filterwarnings("ignore")

In [None]:
from colorama import init, Fore, Back, Style

# Initialize Colorama
init()

print(Fore.RED + "This is red text")
print(Fore.GREEN + "This is green text")
print(Fore.BLUE + "This is blue text")

# Reset the text color
print(Style.RESET_ALL + "Back to normal text")

In [None]:
import torch

class ClearCache:
    def __enter__(self):
        torch.cuda.empty_cache()

    def __exit__(self, exc_type, exc_val, exc_tb):
        torch.cuda.empty_cache()

# Use the context manager
with ClearCache():
    # Define and train the PyTorch model
    ...

In [None]:
BASE_MODEL = "NousResearch/Llama-2-7b-chat-hf"

In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [None]:
tokenizer = LlamaTokenizerFast.from_pretrained(BASE_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
model = LlamaForCausalLM.from_pretrained(
  BASE_MODEL,
  quantization_config=quantization_config,
)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [None]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    num_beams=10,
    early_stopping=True,
    no_repeat_ngram_size=2,
)

In [None]:
# pipe("Your prompt here", do_sample=True)

In [None]:
llm = HuggingFacePipeline(pipeline=pipe)

# Stateless Chatbot

In [None]:
def generate_prompt(prompt):
  system_prompt = f"""
  <s>[INST] <<SYS>>
  Below is an instruction that describes a task. Write a response that appropriately completes the request.

  {prompt}

  [/INST]
  """
  return prompt

In [None]:
template = """
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.

{inst}

[/INST]
"""

In [None]:
prompt = PromptTemplate(
  template=template,
  input_variables=["inst"],
).format(inst="What is your name?")

In [None]:
print(llm(prompt))

  warn_deprecated(



<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.

What is your name?

[/INST]
My name is Samantha.


# Regular Memory

In [None]:
template = """
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:
{chat_history}

Human: {question}

AI:

"""

In [None]:
prompt = PromptTemplate.from_template(
  template=template,
)

memory = ConversationBufferMemory(
  memory_key="chat_history",
)

chain = LLMChain(
  llm=llm,
  prompt=prompt,
  verbose=True,
  memory=memory,
)

In [None]:
# Asking LLM
conv_res = chain(
  {"question": "Do you have a name?"}
)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:


Human: Do you have a name?

AI:

[0m

[1m> Finished chain.[0m


In [None]:
conv_res

{'question': 'Do you have a name?',
 'chat_history': '',
 'text': "\n<s>[INST] <<SYS>>\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.\n<</SYS>>\n\nPrevious Conversation:\n\n\nHuman: Do you have a name?\n\nAI:\n\nI'm just an AI, I don't have personal experiences or emotions like humans do. My purpose is to assist and provide helpful responses to the best of my abilities based on the information and instructions I have been trained on. Is there anything else I can help you with?"}

In [None]:
# Running Chain Of Conversations
print(f"Answer: {conv_res['text']}")
conv_res = chain(
  {"question": "From Now on you wil be called Raahim!"}
)


print(f"Answer: {conv_res['text']}")
conv_res = chain(
  {"question": "What is you name now?"}
)

Answer: 
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:


Human: Do you have a name?

AI:

I'm just an AI, I don't have personal experiences or emotions like humans do. My purpose is to assist and provide helpful responses to the best of my abilities based on the information and instructions I have been trained on. Is there anything else I can help you with?


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:
Human: Do you have a name?
AI: 
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:


Human: Do you have a name?

AI:

I'm just an AI, I don't have personal experiences or emotions like




[1m> Finished chain.[0m
Answer: 
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:
Human: Do you have a name?
AI: 
<s>[INST] <<SYS>>
Below is an instruction that describes a task. Write a response that appropriately completes the request.
<</SYS>>

Previous Conversation:


Human: Do you have a name?

AI:

I'm just an AI, I don't have personal experiences or emotions like humans do. My purpose is to assist and provide helpful responses to the best of my abilities based on the information and instructions I have been trained on. Is there anything else I can help you with?

Human: From Now on you wil be called Raahim!

AI:

Thank you for informing me of your decision. I will do my best to respond to your requests and refer to you by your chosen name moving forward. How may I assist you today?


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
<s>[INS




[1m> Finished chain.[0m


# Window Memory

In [None]:
import gc
del conv_res
gc.collect()

224

In [None]:
torch.cuda.empty_cache()

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [None]:
conversation = ConversationChain(
    llm=llm,
    verbose=True,
    memory=ConversationBufferWindowMemory(k=2)
)

In [None]:
# The Model Can Look Back At ONly 2 Conversations
conv_res = conversation({
    "input": "From now on your name is Jhon!"
})
print(f"Answer: {conv_res['response']}")

print("=" * 30) #---------------------------------------------------------------------------

conv_res = conversation({
    "input": "What is 2+2?"
})
print(f"Answer: {conv_res['response']}")

print("=" * 30) #---------------------------------------------------------------------------

conv_res = conversation({
    "input": "Where is Spain?"
})
print(f"Answer: {conv_res['response']}")

print("=" * 30) #---------------------------------------------------------------------------

conv_res = conversation({
    "input": "What is your Name?"
})
print(f"Answer: {conv_res['response']}")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: From now on your name is Jhon!
AI:[0m





[1m> Finished chain.[0m
Answer: The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: From now on your name is Jhon!
AI: Oh, wow! *laughs* I'm glad you're excited about our conversation, but I don't think I can adopt a new name. My purpose is to assist and provide information to the best of my abilities, and I will continue to do so under my current name, which is LLaMA. *smiles* How can I help you today?


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: From now on your name is Jhon




[1m> Finished chain.[0m
Answer: The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: From now on your name is Jhon!
AI: The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: From now on your name is Jhon!
AI: Oh, wow! *laughs* I'm glad you're excited about our conversation, but I don't think I can adopt a new name. My purpose is to assist and provide information to the best of my abilities, and I will continue to do so under my current name, which is LLaMA. *smiles* How can I help you today?
Human: What is 2+2?
AI: *thinks for a moment* Ah, a classic question! The answer is.



OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 23.06 MiB is free. Process 3751 has 14.72 GiB memory in use. Of the allocated memory 11.70 GiB is allocated by PyTorch, and 2.89 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Summary Memory: Uses an LLM to summarize entire conversation and provides an overview of the conversation.

In [None]:
# LLM can be swapped for a smaller LLM for summarization
memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=20)

In [None]:
conversation = ConversationChain(
  llm=llm,
  verbose=True,
  memory=memory
)

ValidationError: 1 validation error for ConversationChain
__root__
  Got unexpected prompt input variables. The prompt expects ['history', 'input'], but got ['chat_history'] as inputs from memory, and input as the normal input key. (type=value_error)

In [None]:
long_text = "There are customers waiting for your answer and the logistic..."

# Saves chat in history (Simulating a conversation)
memory.save_context({"input": "Hello"}, {"output": "What's Up"})

memory.save_context({"input": "Fine?"}, {"output": "What about you?"})

memory.save_context({"input": "What are your tasks for today?"}, {"output": f"{long_text}"})

# Using memory to generate response
response = conversation({"input": "Did you have any meetings today?"})["text"]