In [10]:
import os
import bs4
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

In [8]:
# Credentials are read from the process environment (populated from .env by
# the load_dotenv() call in the first cell).
groq_api_key = os.getenv("GROQ_API_KEY")

# Only re-export HF_TOKEN when it is actually present: os.environ values must
# be strings, so assigning the None returned by os.getenv() for a missing
# variable raises TypeError and aborts the cell.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

# Chat model served by Groq and the sentence-embedding model used for indexing.
llm = ChatGroq(groq_api_key=groq_api_key, model="Llama3-8b-8192")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")



In [4]:
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7ff51a98ca30>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7ff51a98ded0>, model_name='Llama3-8b-8192')

In [9]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [11]:
# 1. Load the blog post. Only elements whose CSS class is one of
#    "post-content", "post-title" or "post-header" are parsed; chunking and
#    indexing happen in the following cell.
loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2022-04-15-data-gen/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    },
)
documents = loader.load()
documents

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2022-04-15-data-gen/'}, page_content='\n\n      Learning with not Enough Data Part 3: Data Generation\n    \nDate: April 15, 2022  |  Estimated Reading Time: 28 min  |  Author: Lilian Weng\n\n\nHere comes the Part 3 on learning with not enough data (Previous: Part 1 and Part 2). Let’s consider two approaches for generating synthetic data for training.\n\nAugmented data. Given a set of existing training samples, we can apply a variety of augmentation, distortion and transformation to derive new data points without losing the key attributes. We have covered a bunch of augmentation methods on text and images in a previous post on contrastive learning. For the sake of post completeness, I duplicate the section on data augmentation here with some edits.\nNew data. Given few or even no data points, we can rely on powerful pretrained models to generate a number of new data points. This is especially true in recent years given t

In [12]:
# 2. Chunk the loaded documents (chunk_size=1000, chunk_overlap=200), embed the
#    chunks into a Chroma vector store and expose that store as a retriever.
text_splitter =RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
# NOTE(review): despite its name, `text` holds the result of split_documents()
# (the chunked documents), not a single string.
text = text_splitter.split_documents(documents)
vectorstore = Chroma.from_documents(documents=text,embedding=embeddings)
# NOTE(review): "retriver" is a misspelling of "retriever"; later cells refer to
# this exact name, so it must be renamed everywhere at once if it is corrected.
retriver = vectorstore.as_retriever()
retriver

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7ff5089dfdf0>)

In [18]:
# Prompt template for single-turn question answering.
# The system message carries the instructions followed by a blank line and the
# {context} placeholder; the human message carries the user's {input}.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),
        ("human","{input}"),
    ]
)


In [19]:
# Combine the LLM with `prompt` into a document-answering chain, then put the
# retriever in front of it. Invoking the resulting chain with {"input": ...}
# yields a dict with 'input', 'context' and 'answer' keys (see the outputs below).
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriver,question_answer_chain)

In [20]:
# Ask a question that the indexed post covers and display the full result dict.
response = rag_chain.invoke({"input":"what is Data Augmentation? "})
response

{'input': 'what is Data Augmentation? ',
 'context': [Document(metadata={'source': 'https://lilianweng.github.io/posts/2022-04-15-data-gen/'}, page_content='Data Augmentation#\nThe goal of data augmentation is to modify the input format (e.g. text wording, visual appearance) while the semantic meaning stays unchanged.\nImage Augmentation#\nBasic Image Processing Operations#\nThere are several ways to modify an image while retaining its semantic information. We can use any one of the following augmentation or a composition of multiple operations.\n\nRandom cropping and then resize back to the original size.\nRandom color distortions\nRandom Gaussian blur\nRandom color jittering\nRandom horizontal flip\nRandom grayscale conversion\nAnd many more. Check PIL.ImageOps for inspiration.\n\nTask-Specific Augmentation Strategies#\nIf the downstream task is known, it is possible to learn the optimal augmentation strategies (i.e. what processing operations to use and how to combine them in sequen

In [21]:
response['answer']

'Data augmentation is a technique that modifies the input format of data, such as text or images, while keeping the semantic meaning unchanged. The goal is to increase the size and diversity of the training dataset, which can improve the performance of machine learning models.'

In [24]:
# NOTE(review): the query text contains a typo ("makdown") and does not look
# related to the indexed post — presumably a probe of how the chain handles
# out-of-scope questions; confirm intent.
rag_chain.invoke({"input":"how many types with makdown use "})

{'input': 'how many types with makdown use ',
 'context': [Document(metadata={'source': 'https://lilianweng.github.io/posts/2022-04-15-data-gen/'}, page_content='Given a query $\\mathbf{q}$, MoCHi (“mixing of contrastive hard negatives”; Kalantidis et al. 2020) maintains a queue of $K$ negative features $Q={\\mathbf{n}_1, \\dots, \\mathbf{n}_K }$ and sorts these negative features by similarity to the query, $\\mathbf{q}^\\top \\mathbf{n}$, in descending order. The first $N$ items in the queue are considered as the hardest negatives, $Q^N$. Then synthetic hard examples can be generated by $\\mathbf{h} = \\tilde{\\mathbf{h}} / |\\tilde{\\mathbf{h}}|_2$ where $\\tilde{\\mathbf{h}} = \\alpha\\mathbf{n}_i + (1-\\alpha) \\mathbf{n}_j$ and $\\alpha \\in (0, 1)$. Even harder examples can be created by mixing with the query feature, $\\mathbf{h}’ = \\tilde{\\mathbf{h}’} / |\\tilde{\\mathbf{h}’}|_2$ where $\\tilde{\\mathbf{h}’} = \\beta\\mathbf{q} + (1-\\beta) \\mathbf{n}_j$ and $\\beta \\in (0,

## Adding Chat History

In [25]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

In [26]:
# Prompt used to rewrite a follow-up question into a standalone one.
# The message order is: system instructions, the prior conversation (injected
# through the "chat_history" placeholder), then the latest user {input}.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system",contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human","{input}"),
    ]
)
 

In [27]:
# Wrap the retriever so follow-up questions are contextualized before retrieval.
# Per the repr shown in the next cell: when no chat_history is supplied the raw
# 'input' goes straight to the vector-store retriever; otherwise the
# contextualize prompt is the default branch.
history_aware_retriver = create_history_aware_retriever(llm,retriver,contextualize_q_prompt)

In [28]:
history_aware_retriver

RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
| VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7ff5089dfdf0>))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it 

In [30]:
# Answering prompt for the conversational chain: the same `system_prompt` as the
# single-turn prompt, with the prior conversation inserted between the system
# message and the latest user {input}.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [31]:
# NOTE: this rebinds `question_answer_chain` and `rag_chain`, replacing the
# single-turn versions defined earlier. From here on `rag_chain` is built on the
# history-aware retriever and `qa_prompt`, which includes a "chat_history"
# placeholder.
question_answer_chain = create_stuff_documents_chain(llm,qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriver,question_answer_chain)

In [32]:
from langchain_core.messages import AIMessage,HumanMessage

# Manually managed conversation: the list starts empty and is extended with
# each completed turn so the next call can resolve references to earlier ones.
chat_history = []

# First turn.
question = "what is Data Augmentation?"
response1 = rag_chain.invoke({"input": question, "chat_history": chat_history})

# Record the first turn (user question + model answer) in the history.
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response1["answer"]),
    ]
)

# Second turn: a follow-up whose "it" only makes sense given the history.
# Fix: pass `question2` here — the original re-sent `question`, so the
# follow-up was never actually asked.
question2 = "Tell me about it?"
response2 = rag_chain.invoke({"input": question2, "chat_history": chat_history})
print(response2["answer"])

Data augmentation is a method that modifies the input format of data, such as text or images, while keeping the semantic meaning unchanged.


In [33]:
chat_history

[HumanMessage(content='what is Data Augmentation?'),
 AIMessage(content='Data augmentation is a technique used to modify the input format of data, such as text or images, while preserving its semantic meaning. This is done to increase the size and diversity of the training dataset, which can improve the performance of machine learning models.')]

In [34]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id. Re-running this
# cell resets it to an empty dict.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories are kept in the module-level ``store`` dict; a new, empty
    ``ChatMessageHistory`` is registered the first time a session id is seen,
    and the same object is returned on every later call for that id.
    """
    history = store.get(session_id)
    if history is None:
        history = ChatMessageHistory()
        store[session_id] = history
    return history


# Wrap `rag_chain` so the chat history is managed per session instead of being
# passed by hand: `get_session_history` supplies the history object for the
# session id given in the call's config, and the three *_key arguments name the
# fields used for the incoming message ("input"), the injected history
# ("chat_history") and the output to record ("answer").
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [36]:
# First turn of session "abc123"; only the 'answer' field of the result is shown.
conversational_rag_chain.invoke(
    {"input": "What is Audio augmentation ?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

'Audio augmentation is a technique used to modify audio data to improve the generalization of machine learning models. It involves applying various operations to the audio signals to create new, augmented versions of the data, which can help the model learn more robust features and improve its performance on unseen data. Examples of audio augmentation methods include time masking, frequency masking, frequency shifting, and audio mixup.'

In [37]:
# Follow-up in the same session: "it" is resolved from the history stored under
# "abc123" (the answer below is about audio augmentation).
conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'Common ways of audio augmentation include:\n\n* Audio mixup: mixing two audio clips together to create a new one\n* Time masking: masking a small chunk of the audio signal\n* Frequency masking: dropping off a small amount of frequency components on the spectrogram\n* Frequency shift: shifting the spectrogram by an integer between [-F, F]'

In [38]:
store

{'abc123': InMemoryChatMessageHistory(messages=[HumanMessage(content='What is Task Decomposition?'), AIMessage(content="I don't know."), HumanMessage(content='What is Audio augmentation ?'), AIMessage(content='Audio augmentation is a technique used to modify audio data to improve the generalization of machine learning models. It involves applying various operations to the audio signals to create new, augmented versions of the data, which can help the model learn more robust features and improve its performance on unseen data. Examples of audio augmentation methods include time masking, frequency masking, frequency shifting, and audio mixup.'), HumanMessage(content='What are common ways of doing it?'), AIMessage(content='Common ways of audio augmentation include:\n\n* Audio mixup: mixing two audio clips together to create a new one\n* Time masking: masking a small chunk of the audio signal\n* Frequency masking: dropping off a small amount of frequency components on the spectrogram\n* Fr