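# **RAG**

Retrieval-augmented generation (RAG) walkthrough: map the conversation to a search query, retrieve matching documents from Azure AI Search, and have a chat model answer using only those documents.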

**Imports**

In [1]:
import os
import dotenv
# Load settings from the local .env file into the process environment;
# later cells read them through os.environ.
dotenv.load_dotenv(".env")

from opentelemetry import trace
# Module-level tracer; the functions below are decorated with its
# start_as_current_span so each call is recorded as a span.
tracer = trace.get_tracer(__name__)

In [None]:
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import ConnectionType
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Create a project client from the connection string stored in the environment
project = AIProjectClient.from_connection_string(
    conn_str=os.environ["PROJECT_CONNECTION_STRING"], credential=DefaultAzureCredential()
)

# Create the two inference clients: `chat` for chat completions and
# `embeddings` for generating vector embeddings
chat = project.inference.get_chat_completions_client()
embeddings = project.inference.get_embeddings_client()

# Use the project client to get the default Azure AI Search connection
# (include_credentials=True so the connection's key can be read below)
search_connection = project.connections.get_default(
    connection_type=ConnectionType.AZURE_AI_SEARCH, include_credentials=True
)

# Create a search client bound to the index named by SEARCH_INDEX_NAME.
# It is used later in the notebook to run queries against that index.
search_client = SearchClient(
    index_name=os.environ["SEARCH_INDEX_NAME"],
    endpoint=search_connection.endpoint_url,
    credential=AzureKeyCredential(key=search_connection.key),
)

### **RETRIEVAL**

**INTENT SYSTEM MESSAGE**

In [None]:
from azure.ai.inference.models import AssistantMessage, SystemMessage, UserMessage

# Prompt template rendered with str.format: {conversation_history} is the only
# placeholder. The doubled braces around the JSON example are escapes that
# render as literal { } in the final prompt.
INTENT_SYSTEM_PROMPT = """
    # Intent Mapping System

    Your task is to understand the user's query and map it to a search intent.
    
    For example, if a user asks about "attention mechanisms in transformers", 
    create a search query like "attention mechanism transformer architecture neural networks".
    
    Avoid phrases like "I want" or "tell me about". Just provide keywords.
    
    The user's conversation history is:
    {conversation_history}
    
    Return only the search query, nothing else. Use the format: 
    {{"intent": "your search query here"}}
"""

def get_intent_system_message(conversation_history):
    """Build the system message that asks the model for a search intent.

    Args:
        conversation_history: Value substituted into INTENT_SYSTEM_PROMPT via
            str.format, so a non-string value is rendered with its str() form.

    Returns:
        A SystemMessage wrapping the rendered prompt.
    """
    rendered_prompt = INTENT_SYSTEM_PROMPT.format(
        conversation_history=conversation_history
    )
    return SystemMessage(rendered_prompt)

**RETRIEVE DOCUMENTS**

In [5]:
from azure.search.documents.models import VectorizedQuery
import json

def _extract_search_intent(raw_reply) -> str:
    """Pull the search query out of the intent model's reply.

    The prompt asks for ``{"intent": "..."}``, but models sometimes wrap the
    JSON in a Markdown code fence or answer with bare keywords. A non-empty
    string under "intent" is returned as-is; otherwise the reply text itself is
    used as the query.

    Raises:
        ValueError: If the reply is empty, so no query can be built.
    """
    text = (raw_reply or "").strip()

    # Peel off a surrounding ``` / ```json fence before parsing.
    if text.startswith("```"):
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:].strip()

    try:
        parsed = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None

    if isinstance(parsed, dict):
        intent = parsed.get("intent")
        if isinstance(intent, str) and intent.strip():
            return intent

    if not text:
        raise ValueError("intent model returned an empty reply; cannot build a search query")

    # Best effort: treat the raw reply as the keyword query.
    return text


@tracer.start_as_current_span(name="get_product_documents")
def get_product_documents(messages: list, top: int = 3) -> list:
    """Retrieve the documents most relevant to a conversation.

    Steps: ask the chat model to condense the conversation into a search
    query, embed that query, then run a combined keyword + vector search.

    Args:
        messages: Conversation history; it is substituted into the intent
            prompt by get_intent_system_message.
        top: Maximum number of search results to return.

    Returns:
        A list of dicts, one per search result, each with the keys
        "id", "content", "title" and "url".

    Raises:
        ValueError: If the intent model returns an empty reply.
    """
    intent_query_response = chat.complete(
        model=os.environ["chatModel"],
        messages=[get_intent_system_message(messages)],
    )
    enhanced_search_query = _extract_search_intent(
        intent_query_response.choices[0].message.content
    )

    # embed() takes a list of input strings and returns one embedding per item,
    # so wrap the single query in a list and read back item 0.
    embedding = embeddings.embed(model=os.environ["embeddingModel"], input=[enhanced_search_query])
    search_vector = embedding.data[0].embedding
    vector_query = VectorizedQuery(vector=search_vector, k_nearest_neighbors=50, fields="text_vector")

    # Single source of truth for the fields requested and returned.
    selected_fields = ["id", "content", "title", "url"]
    search_results = search_client.search(
        search_text=enhanced_search_query,
        vector_queries=[vector_query],
        select=selected_fields,
        top=top,
    )

    return [
        {field: result[field] for field in selected_fields}
        for result in search_results
    ]

### **Completion**

**RAG SYSTEM MESSAGE**

In [None]:
# Prompt template for the answering step, rendered with str.format;
# {retrieved_context} is its only placeholder.
SYSTEM_PROMPT = """You are a helpful AI assistant that provides accurate information based on the retrieved context.

### Retrieved Context:
{retrieved_context}

### Instructions:
1. Answer questions based on the retrieved context above
2. If the context doesn't contain the information needed, acknowledge the limitation
3. Do not make up information that is not supported by the context
4. Keep responses concise and focused on the user's question
5. Format your answers using Markdown when appropriate
6. When quoting directly from the context, use quotation marks

Remember: Only use information from the retrieved context to answer questions.
"""

def get_completion_system_message(retrieved_context):
    """Build the system message that grounds the answer in retrieved context.

    Args:
        retrieved_context: Value substituted into SYSTEM_PROMPT via str.format,
            so a non-string value is rendered with its str() form.

    Returns:
        A SystemMessage wrapping the rendered prompt.
    """
    grounded_prompt = SYSTEM_PROMPT.format(retrieved_context=retrieved_context)
    return SystemMessage(grounded_prompt)

In [None]:
@tracer.start_as_current_span(name="chat_with_attentionIsAllYouNeed")
def chat_with_products(messages: list):
    """Answer a conversation using documents retrieved for it.

    Args:
        messages: Chat history as dicts with a "content" key and an optional
            "role" key ("user", "assistant" or "system"). Entries with a
            missing or unrecognised role are sent as user messages.

    Returns:
        The message object of the first choice in the chat completion
        response; the notebook reads its ``content`` attribute.
    """
    documents = get_product_documents(messages)

    # The grounding system message always goes first.
    formatted_messages = [get_completion_system_message(documents)]

    # Keep each turn's role so assistant/system turns in a multi-turn history
    # are not presented to the model as if the user had said them.
    message_types = {"assistant": AssistantMessage, "system": SystemMessage}
    for message in messages:
        # Echo each incoming message to the cell output.
        print(f"message: {message}")
        message_type = message_types.get(message.get("role", "user"), UserMessage)
        formatted_messages.append(message_type(content=message["content"]))

    response = chat.complete(
        model=os.environ["chatModel"],
        messages=formatted_messages,
    )
    return response.choices[0].message

In [10]:
from azure.ai.inference.tracing import AIInferenceInstrumentor
from azure.monitor.opentelemetry import configure_azure_monitor
from azure.core.settings import settings

def enable_telemetry(project):
    """Instrument AI inference calls and configure Azure Monitor export.

    Args:
        project: Project client whose ``telemetry.get_connection_string()``
            supplies the connection string passed to configure_azure_monitor.
    """
    # Instrument the inference SDK and select OpenTelemetry as the Azure
    # core tracing implementation before configuring the exporter.
    AIInferenceInstrumentor().instrument()
    settings.tracing_implementation = "opentelemetry"

    configure_azure_monitor(
        connection_string=project.telemetry.get_connection_string()
    )

In [13]:
# Turn on tracing with the enable_telemetry helper defined in the previous cell
enable_telemetry(project)

# Ask a single-turn question; the reply is rendered in the next cell
user_message = "how does attention relate to feed forward networks?"
response = chat_with_products(messages=[{"role": "user", "content": user_message}])

message: {'role': 'user', 'content': 'how does attention relate to feed forward networks?'}


In [14]:
from IPython.display import display, Markdown
# Render the model's reply as Markdown rather than as raw text
display(Markdown(response.content))

In the Transformer model, attention mechanisms and feed forward networks are both integral parts of the architecture used in the encoder and decoder. The Transformer leverages self-attention to compute representations of input and output sequences without relying on sequential RNNs or convolution.

Based on the context, the relationship between attention and feed forward networks in the Transformer can be described as follows:

1. **Attention Mechanisms**:
   - Attention mechanisms are used to model dependencies between different positions of the input and output sequences. They do this without regard to the distance between the positions, enabling parallel computation.

2. **Feed Forward Networks**:
   - After applying attention, the output is passed through point-wise, fully connected feed forward layers. These layers are responsible for transforming the attention-weighted positions.

In the detailed architecture (Figure 1 from the context), self-attention and feed forward layers are stacked together:

- **Encoder and Decoder**:
  - Both the encoder and decoder use "stacked self-attention and point-wise, fully connected layers." This means that within each encoder and decoder layer, there is an attention computation followed by a feed forward network.

To summarize, attention mechanisms relate different positions within a sequence, creating a context-aware representation, while feed forward networks further process these representations to refine the output. Together, they work in tandem within the layers of the Transformer model.
