In [39]:
from openai import OpenAI
import os
import json
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
load_dotenv()

True

In [25]:
RUNPOD_ENDPOINT_ID = os.getenv("RUNPOD_CHATBOT_URL")


In [38]:
RUNPOD_EMBEDDING_KEY = os.getenv("RUNPOD_EMBEDDING_KEY")


# Connect to Endpoints

In [3]:
client = OpenAI(
    api_key=os.getenv("RUNPOD_TOKEN"),
    base_url = f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/openai/v1",
)

model_name = os.getenv("MODEL_NAME")

In [4]:
client


<openai.OpenAI at 0x280db177160>

In [5]:
response = client.chat.completions.create(
    model = model_name,
    messages = [
        {
            "role": "user",
            "content": "what is the capital of germany?"
        }
    ],
    temperature = 0.0,
    top_p = 0.8,
    max_tokens= 1000
)

In [6]:
response

ChatCompletion(id='chatcmpl-9a6ee438e98844e38b6c564a6ecdcd16', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The capital of Germany is Berlin.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[]), stop_reason=None)], created=1741052738, model='meta-llama/Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=42, total_tokens=50, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)

In [7]:
response.choices[0].message.content

'The capital of Germany is Berlin.'

# Function for chatbot response
- append messages based on role

In [8]:
def get_chatbot_response(client, model_name, messages, temperature = 0.0):
    """_summary_

    Args:
        client (str): openai client
        model_name (str): huggingface model name
        messages (str): input messages
        temperature (float, optional): creativity of the model. Defaults to 0.0.

    Returns:
        str: _description_
    """
    input_messages = []
    for message in messages:
        input_messages.append({
            "role": message["role"],
            "content": message["content"]
        })
    
    response = client.chat.completions.create(
        
        model = model_name,
        messages = input_messages,
        temperature = temperature,
        top_p = 0.8,
        max_tokens= 2000
    ).choices[0].message.content   
    
    return response

In [9]:
messages = [
    {
        "role": "system",
        "content": "what is the capital of USA"
    }
]

get_chatbot_response(client, model_name, messages, temperature = 0.0)

'The capital of the United States of America (USA) is Washington, D.C. (short for District of Columbia).'

# Prompt Engineering Techniques

### 1. Structured output

In [10]:
system_prompt = """You're a helpful assistant that answers questions about capitals of countries.
Your output should be in a structured JSON format exactly like the one below. 
You are not allowed to write anything other than the JSON object:
[
    {
        
        "country": the country that you will get the capital of,
        "capital": the capital of the country mentioned,
    }
]
"""

messages = [
    {
        "role": "system",
        "content": system_prompt
    },
    {
        "role": "user",
        "content": "what is the capital of India?"
    }
]

response = get_chatbot_response(client, model_name, messages, temperature = 0.0)
print(response)

[
    {
        "country": "India",
        "capital": "New Delhi"
    }
]


In [12]:
json_response = json.loads(response)
print(json_response)

[{'country': 'India', 'capital': 'New Delhi'}]


In [13]:
type(json_response)

list

In [14]:
type(json_response[0])

dict

### 2. Input Structuring

In [16]:
user_input = """
Get me the capitals of the following countries:
'''
1. India
2. USA
3. Italy
'''
"""
messages = [
    {
        "role": "system",
        "content": system_prompt
    }
]

messages.append({
    "role": "user",
    "content": user_input
})


response = get_chatbot_response(client, model_name, messages, temperature = 0.0)
print(response)

[
    {
        "country": "India",
        "capital": "New Delhi"
    },
    {
        "country": "USA",
        "capital": "Washington, D.C."
    },
    {
        "country": "Italy",
        "capital": "Rome"
    }
]


In [None]:
# json format
json_response = json.loads(response)
json_response


[{'country': 'India', 'capital': 'New Delhi'},
 {'country': 'USA', 'capital': 'Washington, D.C.'},
 {'country': 'Italy', 'capital': 'Rome'}]

### 3. Chain of Thought Prompting
- Give the model time to think step by step
- best method out of all
- helps in complex reasoning
- reference: https://arxiv.org/abs/2205.11916

In [20]:
user_prompt = """
Calculate the result of this equation: 1+ 5

your output should be in a structured JSON format exactly like the one below. 
You are not allowed to write anything other than the JSON object:
{
    "result": the final result of the equation
}
"""

messages = [
    {
        "role": "user",
        "content": user_prompt
    }
]

response = get_chatbot_response(client, model_name, messages, temperature = 0.0)
print(response)

{
    "result": 6
}


Giving a more complex task to the model and asking it to break it down into smaller steps

In [22]:
user_prompt = """
Calculate the result of this equation: 234/2*8765+90876*11-98654

your output should be in a structured JSON format exactly like the one below. 
You are not allowed to write anything other than the JSON object:
{
    "result": the final result of the equation
}
"""

messages = [
    {
        "role": "user",
        "content": user_prompt
    }
]

response = get_chatbot_response(client, model_name, messages, temperature = 0.0)
print(response)

{
  "result": -234
}


In [21]:
234/2*8765+90876*11-98654

1926487.0

As you can see the actual output is different that what the model generated. Hence use the chain of thought prompting to get the model to think step by step and then generate the final output.

In [23]:
user_prompt = """
Calculate the result of this equation: 234/2*8765+90876*11-98654

your output should be in a structured JSON format exactly like the one below. 
You are not allowed to write anything other than the JSON object:
{
    "steps": This is where you solve the equation step by step following the BODMAS order of operations.
    You need to show your work and calculate each step leading to the final result. Feel free to write in free text
    
    "result": the final result of the equation
}
"""

messages = [
    {
        "role": "user",
        "content": user_prompt
    }
]

response = get_chatbot_response(client, model_name, messages, temperature = 0.0)
print(response)

{
    "steps": "First, we need to follow the BODMAS order of operations. 
    1. Divide 234 by 2: 234 / 2 = 117
    2. Multiply 117 by 8765: 117 * 8765 = 1025125
    3. Multiply 90876 by 11: 90876 * 11 = 998556
    4. Add 1025125 and 998556: 1025125 + 998556 = 2023681
    5. Subtract 98654 from 2023681: 2023681 - 98654 = 1928127",
    "result": 1928127
}


The above output is way better than the previous one. The model is able to break down the problem into smaller steps and then generate the final output. This is a great example of how chain of thought prompting can be used to improve the performance of the model.

# RAG

In [30]:
embedding_client = OpenAI(
    api_key=os.getenv("RUNPOD_TOKEN"),
    base_url = f"https://api.runpod.ai/v2/{RUNPOD_EMBEDDING_KEY}/openai/v1",
)

In [34]:
def get_embeddings(embedding_client, text_input, model_name):
    """returns the embedding of the input text using the specified model.

    Args:
        embedding_client (str): openai client
        text_input (str): input text
        model_name (str): huggingface model name
    Returns:
        str: _description_
    """
    
    output = embedding_client.embeddings.create(
        input = text_input,
        model=model_name
    )
    
    embeddings_lst = []
    for embedding_object in output.data:
        embeddings_lst.append(embedding_object.embedding)
    
    return embeddings_lst



In [32]:
user_prompt = "get me the embeddings of this text: 'hello world"

In [37]:
model_name

'meta-llama/Llama-3.1-8B-Instruct'

In [35]:
output = get_embeddings(embedding_client, user_prompt, model_name)

output

[[-0.0274015124887228,
  -0.012045375071465969,
  0.01661483757197857,
  -0.04113278165459633,
  0.022015802562236786,
  -0.004413077607750893,
  -0.05349092558026314,
  0.020566390827298164,
  0.024182291701436043,
  -0.0031963344663381577,
  -0.036189526319503784,
  0.004493176471441984,
  0.02430434711277485,
  -0.014471232891082764,
  0.037623681128025055,
  0.02778293564915657,
  -0.005206439644098282,
  0.02171066403388977,
  -0.08043472468852997,
  0.0010489164851605892,
  0.1196146160364151,
  0.025494391098618507,
  -0.0031066997908055782,
  -0.03609798476099968,
  -0.026699692010879517,
  0.02659289352595806,
  -0.017270885407924652,
  0.006053201388567686,
  0.017987962812185287,
  -0.11857714504003525,
  -0.022976992651820183,
  -0.04546575993299484,
  0.05114135146141052,
  0.0003101455222349614,
  0.061333004385232925,
  -0.001341659459285438,
  -0.04421468824148178,
  0.0213139820843935,
  -0.03001045249402523,
  0.041041240096092224,
  0.03948502987623215,
  -0.03884423

In [36]:
len(output[0])

384

In [None]:



# embedding_client.embeddings.create(
#     input = user_prompt,
#     model="BAAI/bge-small-en-v1.5"
# )