## Prerequisits:
- Install [ollama](https://ollama.com/)
- Run ollama from terminal: ```ollama run llama3.1```



### Read environment variables from .env

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

AZURE_OPENAI_EMBEDDING_MODEL: str = os.getenv('AZURE_OPENAI_EMBEDDING_MODEL')
AZURE_OPENAI_ENDPOINT: str = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_API_KEY: str = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_API_VERSION: str = os.getenv('AZURE_OPENAI_API_VERSION')

AZURE_SEARCH_API_KEY: str = os.getenv('AZURE_SEARCH_API_KEY')
AZURE_SEARCH_ENDPOINT: str = os.getenv('AZURE_SEARCH_ENDPOINT')

### Create LangChain object to facilitate creation of embeddings using Azure OpenAI embeddings model

In [2]:
from langchain_openai import AzureOpenAIEmbeddings

embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_OPENAI_EMBEDDING_MODEL,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
)

### Create LangChain object to facilitate creation of Azure AI Search index

In [3]:
from langchain_community.vectorstores.azuresearch import AzureSearch

index_name: str = "langchain-vector-demo"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
    azure_search_key=AZURE_SEARCH_API_KEY,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    additional_search_client_options={"retry_total": 4}
)

### Get sample data

In [4]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

docs = [WebBaseLoader(url).load() for url in urls]

docs_list = [item for sublist in docs for item in sublist]

USER_AGENT environment variable not set, consider setting it to identify your requests.


### Split data into chunks

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0
)

doc_splits = text_splitter.split_documents(docs_list)

### Add document chunks to Azure AI Search index (content and embeddings)

In [6]:
vector_store.add_documents(documents=doc_splits)

['MTFhZmY1MTUtNTJkYi00ZTJkLWI2MmEtNWY1OWY4OGVmMTU0',
 'MDk1ZjQzZTItZDc4MC00ZGE3LTliMzYtZTY0ZDhjZWU1Nzg1',
 'ZWYxN2MzZDMtMzYyMy00YjA5LWI3ZTUtZWQ2NTFmYWQ5Zjkx',
 'ZGFiM2IzODUtNGIwYi00MDBjLWIxYjctYTM1NTRlYzk2NmQw',
 'NDBiZTYwNWUtNjMzMC00YzdiLTliMTgtNTk1MTgxMTkyNTgw',
 'ZDM3MDI4ODMtMTZkOS00NGM5LWEzOTgtMzBiMTkyOGE0ZjRk',
 'ZDAyZmZmN2UtZTY1Ni00ZmI1LWJjMzktMzA3YzcyYjY2ZTI1',
 'OGZiZTJiYzEtZjk2MS00YTk3LTg0MmQtMWYwM2I2OGVmYWZk',
 'NzRjZGVjMzQtNjZiNS00ZDkwLWI5MGMtMmQ3YjNkM2Y2NWEw',
 'YzY5OWE0NDktZDFhNC00YmRkLWFlYmUtNzE3Y2NhNjczNzg2',
 'YzE2ODdlMTYtYTI3ZC00ZGQxLTk0ZjMtZDMwYWQ1NWUyNmEy',
 'NzIzMDRkMmYtNjUwMy00ZmIyLWI2ZDgtMGYwNDZkZGY3MTAx',
 'YjU1MjQzZDUtY2U2NS00ZWFmLTk4MTQtYzQ4M2EyYzBlMGE3',
 'YjQ0Zjc0MzgtNzVjMC00NTdhLWFhNWQtNjUxZWQ3ZWJlNzZl',
 'MmUzZDNlOWYtOGYwMC00NDRhLWExMTAtMTMwYjBmYzQ5ODVi',
 'MjQyOWYzMDktM2UzYi00MTM2LWIwMDItNjc0ZTI2NjQzYjUw',
 'ZDE0MTdjYzUtZDkxZC00Yjg4LTk4ZDgtYzlmMWVkYzBmOTY3',
 'NmFjNGFmZjUtMDZhOC00Yzg4LTk3YjgtMDE5YTUzZjUwZDkw',
 'NTRkODhkZGQtY2E4Yi00YzhlLWE5ODEtYjAxZDBkNzk5

### Search example 1: Similarity Search

In [7]:
docs = vector_store.similarity_search(
    query="What is CoT?",
    k=3,
    search_type="similarity",
)

In [8]:
from IPython.display import display, Markdown

for indx, doc in enumerate(docs, start=1):
    md_content = f"""### Document {indx}

**Source:** {doc.metadata['source']}

**Title:** {doc.metadata['title']}

**Description:** {doc.metadata['description']}

**Language:** {doc.metadata['language']}

**Content:** {doc.page_content}
    """
    display(Markdown(md_content))

### Document 1

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Two main types of CoT prompting:
    

### Document 2

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Few-shot CoT. It is to prompt the model with a few demonstrations, each containing manually written (or model-generated) high-quality reasoning chains.
    

### Document 3

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Fig. 3. Comparing CoT and PoT. (Image source: Chen et al. 2022).
External APIs#
TALM (Tool Augmented Language Models; Parisi et al. 2022) is a language model augmented with text-to-text API calls. LM is guided to generate |tool-call and tool input text conditioned on task input text to construct API call requests. When |result shows up, the specified tool API is called and the returned result gets appended to the text sequence. The final output is generated following |output token.
    

### Search example 2: Similarity Search with Relevance Score

In [9]:
docs_and_scores = vector_store.similarity_search_with_relevance_scores(
    query="What is CoT?",
    k=4,
    score_threshold=0.80,
)

In [10]:
for indx, (doc, score) in enumerate(docs_and_scores, start=1):
    md_content = f"""### Document {indx}

**Source:** {doc.metadata['source']}

**Title:** {doc.metadata['title']}

**Description:** {doc.metadata['description']}

**Language:** {doc.metadata['language']}

**Content:** {doc.page_content}

**Score:** {score}
    """
    display(Markdown(md_content))

### Document 1

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Two main types of CoT prompting:

**Score:** 0.85343915
    

### Document 2

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Two main types of CoT prompting:

**Score:** 0.85343915
    

### Document 3

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Few-shot CoT. It is to prompt the model with a few demonstrations, each containing manually written (or model-generated) high-quality reasoning chains.

**Score:** 0.85109925
    

### Document 4

**Source:** https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/

**Title:** Prompt Engineering | Lil'Log

**Description:** Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models.

**Language:** en

**Content:** Few-shot CoT. It is to prompt the model with a few demonstrations, each containing manually written (or model-generated) high-quality reasoning chains.

**Score:** 0.85109925
    

### Create LangChain prompt template

In [11]:
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks.
    Use the following documents to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise:
    Question: {question}
    Documents: {documents}
    Answer:
    """,
    input_variables=["question", "documents"],
)

### Initialize the LLM with Llama 3.1 model

In [12]:
llm = ChatOllama(
    model="llama3.1",
    temperature=0,
)

### Create a chain combining the prompt template and LLM

In [13]:
rag_chain = prompt | llm | StrOutputParser()

### Define the RAG application class

In [14]:
class RAGApplication:
    def __init__(self, retriever, rag_chain):
        self.retriever = retriever
        self.rag_chain = rag_chain
    def run(self, question):
        # Retrieve relevant documents
        documents = self.retriever.similarity_search_with_relevance_scores(
            query=question,
            k=4,
            score_threshold=0.80,
        )
        # Extract content from retrieved documents
        doc_texts = "\\n".join([doc[0].page_content for doc in documents])
        # Get the answer from the language model
        answer = self.rag_chain.invoke({"question": question, "documents": doc_texts})
        return answer

### Initialize and run the RAG application

In [15]:
rag_application = RAGApplication(vector_store, rag_chain)

question = "What is CoT?"
answer = rag_application.run(question)
print("Question:", question)
print("Answer:", answer)

Question: What is CoT?
Answer: CoT stands for Chain-of-Thought, which refers to prompting models with a few demonstrations of high-quality reasoning chains. This allows the model to learn from and replicate the thought process behind the correct answers. It's used in few-shot learning scenarios where the model is given a limited number of examples to learn from.
