In [15]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

store = {}
def get_session_history(session_id:str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]

llm = ChatGroq(model="gemma2-9b-it",groq_api_key=groq_api_key,)
prompt = ChatPromptTemplate.from_messages([
    ("system","You are a helpful assistant please answer the question."),
    ("human","{input}")
])

chain = prompt | llm 

with_message_history = RunnableWithMessageHistory(
    chain,
    get_session_history,
)

config = {"configurable":{"session_id":"1"}}

response = with_message_history.invoke(
    {
        "input": "who is narendra mode ?",
    },
    config=config,
)



In [148]:
response = with_message_history.invoke(
    {
        "input": "what question i asked you previously ?",
    },
    config=config,
)

In [150]:
response = with_message_history.invoke(
    {
        "input": "summerize the conversaction ",
    },
    config=config,
)

In [151]:
store

{'1': InMemoryChatMessageHistory(messages=[HumanMessage(content='who is narendra mode ?'), AIMessage(content="Narendra Modi is the current Prime Minister of India. \n\nHere are some key facts about him:\n\n* **Full Name:** Narendra Damodardas Modi\n* **Born:** September 17, 1950, in Vadnagar, Gujarat, India\n* **Political Party:** Bharatiya Janata Party (BJP)\n* **Prime Minister of India:** Since May 26, 2014\n* **Background:** Modi rose through the ranks of the BJP, serving as Chief Minister of Gujarat from 2001 to 2014. He is known for his strong leadership, economic reforms, and focus on national security.\n\nIf you'd like to know more about a specific aspect of his life or career, just ask!  \n\n", response_metadata={'token_usage': {'completion_tokens': 166, 'prompt_tokens': 32, 'total_tokens': 198, 'completion_time': 0.301818182, 'prompt_time': 0.000333969, 'queue_time': 0.01300888, 'total_time': 0.302152151}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'f

In [132]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.messages import AIMessage,  HumanMessage

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

In [43]:
store = {}
def get_session_history(session_id:str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]

In [44]:
llm = ChatGroq(model="gemma2-9b-it",groq_api_key=groq_api_key,)
prompt = ChatPromptTemplate.from_messages([
    ("system","You are a helpful assistant please answer the question."),
    ("human","{input}")
])

chain = prompt | llm 
# chain.invoke({"messages":[
#     HumanMessage(content="Hii, My name is spandan"),
# ],"input":"who am i ?"})

In [45]:
with_message_history = RunnableWithMessageHistory(
    chain,
    get_session_history,
)

config = {"configurable":{"session_id":"1"}}

In [46]:
response = with_message_history.invoke(
    {
        "input": "who is narendra mode ?",
    },
    config=config,
)


In [47]:
response = with_message_history.invoke(
    {
        "input": "what question i asked you previously ?",
    },
    config=config,
)

In [48]:
print(response)
print(store)

content='You asked: "who is narendra mode ?" \n' response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 438, 'total_tokens': 452, 'completion_time': 0.025454545, 'prompt_time': 0.013814287, 'queue_time': 0.002980021000000001, 'total_time': 0.039268832}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None} id='run-c038e24f-b6b9-4322-8199-0adbde137520-0' usage_metadata={'input_tokens': 438, 'output_tokens': 14, 'total_tokens': 452}
{'1': InMemoryChatMessageHistory(messages=[HumanMessage(content='who is narendra mode ?'), AIMessage(content="Narendra Modi is the current Prime Minister of India. \n\nHere are some key facts about him:\n\n* **Full Name:** Narendra Damodardas Modi\n* **Born:** September 17, 1950, in Vadnagar, Gujarat, India\n* **Political Party:** Bharatiya Janata Party (BJP)\n* **Prime Minister of India:** Since May 26, 2014\n\nNarendra Modi is a prominent figure in Indian politics known for his 

### Store conversational history in vector db

In [49]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = Chroma(
    collection_name="conv_history",
    embedding_function=embedding
    )



In [165]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain.schema import Document


# Modify get_session_history to use Chroma instead of store
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    # Perform similarity search using session ID as the query
    results = vector_store.similarity_search_with_score(session_id)

    # If no relevant result is found (empty history), return an empty ChatMessageHistory
    if not results:  # No match or low similarity score
        return ChatMessageHistory()  # Return empty history if no match
    
    # Deserialize the retrieved stored history
    history_data = []
    for result in results[-3:]:
        history_data.append(eval(result[0].page_content)["messages"])

    # Create a new ChatMessageHistory instance from the saved history data
    history = ChatMessageHistory()
    history.messages = history_data  # Assuming messages are stored in a dict format
    return history  # Return the populated history

def save_session_history(session_id: str, history: ChatMessageHistory):
    # Manually construct the history as a dictionary
    history_data = {
        "messages":  history.messages
    }

    # Create a Document object to store in the vector store
    document = Document(
        page_content=str(history_data),  # Store as stringified dictionary
        metadata={"session_id": session_id}  # Attach session ID as metadata
    )

    # Add the document to Chroma (embed and store)
    vector_store.add_documents([document])



In [90]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(model="gemma2-9b-it",groq_api_key=groq_api_key,)
prompt = ChatPromptTemplate.from_messages([
    ("system","You are a helpful assistant please answer the question."),
    ("human","{input}")
])

chain = prompt | llm 

with_message_history = RunnableWithMessageHistory(
    chain,
    get_session_history,
)

config = {"configurable":{"session_id":"1"}}

response = with_message_history.invoke(
    {
        "input": "who is narendra mode ?",
    },
    config=config,
)

# Get session history after first interaction and save it
history = get_session_history("1")
history.add_user_message("who is narendra mode?")  # Add the user message
history.add_ai_message(response)               # Add the assistant's response
save_session_history("1", history)                   # Save history


In [92]:
response = with_message_history.invoke(
    {
        "input": "what is narendra modi mothers name ?",
    },
    config=config,
)

# Get session history after first interaction and save it
history = get_session_history("1")
history.add_user_message("what is narendra modi mothers name ?")  # Add the user message
history.add_ai_message(response)               # Add the assistant's response
save_session_history("1", history)                   # Save history



In [166]:
get_session_history("1")

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


InMemoryChatMessageHistory(messages=[[HumanMessage(content='what is narendra modi mothers name ?'), AIMessage(content="Narendra Modi's mother's name was **Heeraben Modi**. \n", response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 34, 'total_tokens': 54, 'completion_time': 0.036363636, 'prompt_time': 0.000402909, 'queue_time': 0.01350043, 'total_time': 0.036766545}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None}, id='run-cffccb7e-3bb5-497b-9994-321a9fd8b03c-0', usage_metadata={'input_tokens': 34, 'output_tokens': 20, 'total_tokens': 54})], [HumanMessage(content='who is narendra mode?'), AIMessage(content="Narendra Modi is the current Prime Minister of India.  \n\nHere are some key facts about him:\n\n* **Full Name:** Narendra Damodardas Modi\n* **Born:** September 17, 1950, in Vadnagar, Gujarat, India\n* **Political Party:** Bharatiya Janata Party (BJP)\n* **Prime Minister of India:** Since May 26, 

In [157]:
res = vector_store.similarity_search_with_score("1")
for response in res:
    print(eval(response[0].page_content)["messages"])

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


[HumanMessage(content='what is narendra modi mothers name ?'), AIMessage(content="Narendra Modi's mother's name was **Heeraben Modi**. \n", response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 34, 'total_tokens': 54, 'completion_time': 0.036363636, 'prompt_time': 0.000402909, 'queue_time': 0.01350043, 'total_time': 0.036766545}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None}, id='run-cffccb7e-3bb5-497b-9994-321a9fd8b03c-0', usage_metadata={'input_tokens': 34, 'output_tokens': 20, 'total_tokens': 54})]
[HumanMessage(content='who is narendra mode?'), AIMessage(content="Narendra Modi is the current Prime Minister of India.  \n\nHere are some key facts about him:\n\n* **Full Name:** Narendra Damodardas Modi\n* **Born:** September 17, 1950, in Vadnagar, Gujarat, India\n* **Political Party:** Bharatiya Janata Party (BJP)\n* **Prime Minister of India:** Since May 26, 2014\n* **Background:** Modi rose thro

In [168]:
response = with_message_history.invoke(
    {
        "input": "how to optimize openai token using langchain?",
    },
    config=config,
)
history = get_session_history("1")
history.add_user_message("how to optimize openai token using langchain?")  # Add the user message
history.add_ai_message(response)               # Add the assistant's response
save_session_history("1", history) 

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [169]:
response

AIMessage(content='Let\'s talk about optimizing OpenAI token usage with LangChain! \n\n**Understanding the Challenge**\n\nOpenAI\'s models, like GPT-3, are powerful but have a limit on the number of tokens they can process in a single request. Tokens are like the building blocks of text, and exceeding this limit results in errors or truncated responses.\n\n**LangChain to the Rescue**\n\nLangChain is a fantastic framework for building applications with large language models (LLMs) like OpenAI\'s. It offers several features that directly address token optimization:\n\n1. **Chunking:**\n\n   - **The Problem:**  Long pieces of text are difficult for LLMs to handle efficiently. \n   - **The Solution:** LangChain lets you break down large inputs into smaller, manageable chunks. You send each chunk to the LLM separately, and then piece the responses back together.\n\n2. **Memory:**\n\n   - **The Problem:** LLMs have limited short-term memory. They "forget" previous parts of a conversation qui

In [170]:
with_message_history.invoke(
    {
        "input": "is there any other way ?",
    },
    config=config,
)

AIMessage(content='You\'re right to ask! There are indeed several other ways to optimize OpenAI token usage with LangChain beyond the ones I mentioned. \n\nHere are a few more strategies:\n\n* **Parameter Tuning:** Experiment with different OpenAI API parameters like `max_tokens` and `temperature`.  \n    * `max_tokens`: Controls the maximum number of tokens OpenAI will generate in a response.  Adjust this to fit your needs.\n    * `temperature`:  Influences the "creativity" of the LLM\'s output. A lower temperature produces more deterministic, focused responses, which can be more token-efficient.\n\n* **Quantization:** Use quantized models when available. Quantization reduces the precision of the model\'s weights, making it smaller and faster, often with minimal impact on performance. LangChain might have built-in support for this or you might need to explore OpenAI\'s API options.\n\n* **Fine-Tuning:** If you have a specific use case, consider fine-tuning a smaller OpenAI model on yo

In [185]:
!pip install PyMySQL



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting PyMySQL
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
Installing collected packages: PyMySQL
Successfully installed PyMySQL-1.1.1


In [12]:
# !pip install langchain_postgres
# !pip install psycopg-binary


Collecting psycopg-binary
  Downloading psycopg_binary-3.2.3-cp310-cp310-macosx_14_0_arm64.whl.metadata (2.8 kB)
Downloading psycopg_binary-3.2.3-cp310-cp310-macosx_14_0_arm64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: psycopg-binary
Successfully installed psycopg-binary-3.2.3


In [13]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_huggingface import HuggingFaceEmbeddings

In [18]:
DB_URL = "postgresql+psycopg2://lumin:root@localhost:5432/langchain"

In [25]:
llm = ChatGroq(model="gemma2-9b-it",groq_api_key=groq_api_key)
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



In [23]:
# See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql://langchain:langchain@localhost:6024/langchain"  # Uses psycopg3!
collection_name = "my_docs"


vector_store = PGVector(
    embeddings=embedding,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

Exception: Failed to create vector extension: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 6024 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 6024 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [42]:
import getpass
import os
import time

import pinecone
from pinecone import Pinecone

pinecone_api_key = ""

# pinecone.init(api_key=pinecone_api_key)


In [62]:
pc = Pinecone(api_key=pinecone_api_key) 
pc.delete_index("lumin")

In [60]:
from langchain_pinecone import PineconeVectorStore

# vector_store = PineconeVectorStore(index="attention", embedding=embedding)
# vector_store = Pinecone(index="attention", embedding=embedding, namespace="langchain")

In [44]:
from langchain.text_splitter import CharacterTextSplitter
from pypdf import PdfReader

pdf_reader = PdfReader('../1_dataIngestion/attention.pdf')

raw_text = ""
for i, page in enumerate(pdf_reader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

# We need to split the text using Character Text Split such that it sshould not increse token size
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 750,
    chunk_overlap  = 50,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)



In [45]:
documents = [
    
]
for text in texts:
    documents.append(Document(
    page_content=text,
    metadata={"source": "Attention is all you need"},
    ))

In [65]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name="lumin",
    embedding=embedding,
    namespace="attention"
)

In [68]:
vectorstore_from_docs.similarity_search("what is normalization",filter={"source": "Attention is all you need"},)

[Document(id='c12a1594-4567-4dce-a26c-4d6989610a82', metadata={'source': 'Attention is all you need'}, page_content='to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs\nsuch as images, audio and video. Making generation less sequential is another research goals of ours.\nThe code we used to train and evaluate our models is available at https://github.com/\ntensorflow/tensor2tensor .\nAcknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful\ncomments, corrections and inspiration.\nReferences\n[1]Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint\narXiv:1607.06450 , 2016.\n[2]Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly\nlearning to align and translate. CoRR , abs/1409.0473, 2014.'),
 Document(id='4cf18215-2940-47c3-b064-b70e8dcbb50a', metadata={'source': 'Attention is all you need'}, page_content='query with the correspon