In [92]:
import os

import numpy as np
import openai
import pandas as pd
import tiktoken
# Read the API key from the environment so the secret is never stored in the notebook.
# Raises KeyError immediately if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Tokenizer used to count the tokens of each row summary.
encoding = tiktoken.get_encoding("cl100k_base")



In [108]:
# Load the instance catalogue and drop every row that has a missing value in any column.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    "/Users/amrish_kushwaha/Desktop/workspace/hackathon-23/learning/instances.csv"
).dropna()

# Keep only the columns that go into the per-row summary.
df = df[["vendor", "instance_type", "vcpu", "memory_gib", "cost_in_dollars"]]

# One "key: value; ..." line per instance; this text is what gets embedded and placed in the prompt.
df["summarized"] = (
    "vendor: " + df["vendor"].str.strip()
    + "; instance_type: " + df["instance_type"].str.strip()
    + "; vcpu: " + df["vcpu"].map(str)
    + "; memory_gib: " + df["memory_gib"].map(str)
    + "; cost_in_dollars: " + df["cost_in_dollars"].map(str)
)

# Token count of each summary under the notebook-level `encoding`.
df["token"] = df["summarized"].map(lambda summary: len(encoding.encode(summary)))

In [109]:
# Preview the first five rows, including the derived `summarized` and `token` columns.
df.head(5)

Unnamed: 0,vendor,instance_type,vcpu,memory_gib,cost_in_dollars,summarized,token
0,AWS,t2.micro,1,1.0,0.0116,vendor: AWS; instance_type: t2.micro; vcpu: 1;...,40
1,AWS,t2.small,1,2.0,0.023,vendor: AWS; instance_type: t2.small; vcpu: 1;...,39
2,AWS,t2.medium,2,4.0,0.0464,vendor: AWS; instance_type: t2.medium; vcpu: 2...,40
3,AWS,t3.micro,2,1.0,0.0104,vendor: AWS; instance_type: t3.micro; vcpu: 2;...,40
4,AWS,t3.small,2,2.0,0.0208,vendor: AWS; instance_type: t3.small; vcpu: 2;...,40


In [110]:
def get_text_embedding(text, embeddding_mode="text-embedding-ada-002"):
    """Request an embedding for ``text`` and return the first embedding vector.

    Args:
        text: The input passed to the embedding endpoint.
        embeddding_mode: Name of the embedding model to use (the parameter
            name is kept as spelled because callers may pass it by keyword).

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, model=embeddding_mode)
    first_item = response["data"][0]
    return first_item["embedding"]

def get_df_embeddings(df: pd.DataFrame) -> dict[int, list[float]]:
    """Embed the ``summarized`` text of every row in ``df``.

    Args:
        df: Frame with a ``summarized`` column holding the text to embed.

    Returns:
        A dict mapping each row's index label to the embedding returned by
        ``get_text_embedding`` for that row. The labels are whatever ``df``'s
        index holds (integers for a default index).
    """
    # Iterate the single column that is needed instead of materialising a full
    # row Series per record with iterrows().
    return {
        idx: get_text_embedding(summary)
        for idx, summary in df["summarized"].items()
    }

# Embed every row summary up front; this makes one embedding request per DataFrame row.
document_embeddings = get_df_embeddings(df)

In [74]:
# Inspect the computed embeddings.
# NOTE(review): this renders every vector in full, which produces a very long output.
document_embeddings

{0: [-0.007934235967695713,
  -0.01622537523508072,
  0.012368074618279934,
  -0.008133278228342533,
  -0.02292417734861374,
  0.027152109891176224,
  -0.02871699444949627,
  -0.025271503254771233,
  -0.009025536477565765,
  -0.031023139134049416,
  0.008881403133273125,
  0.022745726630091667,
  0.02701483853161335,
  -0.044530559331178665,
  0.0006619013147428632,
  0.01890215091407299,
  0.012958337552845478,
  -0.01467421930283308,
  -0.00029684751643799245,
  -0.012223940342664719,
  -0.015703747048974037,
  0.021249476820230484,
  -0.014825216494500637,
  0.015250755473971367,
  0.0009703310206532478,
  -0.002896408084779978,
  0.011173821054399014,
  -0.019931679591536522,
  0.018009893596172333,
  -0.0005053271306678653,
  0.008895129896700382,
  -0.015058576129376888,
  -0.01969831995666027,
  -0.020467035472393036,
  -0.016486190259456635,
  0.017062725499272346,
  -0.006873821374028921,
  0.005123622249811888,
  0.02973279543220997,
  -0.011743493378162384,
  0.0071586580015

In [111]:
def calculate_vector_similarity(x: list[float], y: list[float]) -> float:
    """Return the dot product of the two vectors ``x`` and ``y``."""
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def get_docs_with_similarity(query: str, df_embedding: dict[int, list[float]]) -> list[tuple[float, int]]:
    """Rank the embedded documents by similarity to ``query``.

    Args:
        query: Text to embed and compare against every document embedding.
        df_embedding: Mapping of document index label to embedding vector.

    Returns:
        A list of ``(similarity, doc_index)`` tuples in descending order.
    """
    query_embedding = get_text_embedding(query)
    scored_docs = [
        (calculate_vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in df_embedding.items()
    ]
    # Tuples sort by similarity first; equal similarities fall back to
    # comparing the index label.
    scored_docs.sort(reverse=True)
    return scored_docs

# get_docs_with_similarity("Which instance_type is costliest", document_embeddings)[:3]



In [112]:
# Use a dedicated name for the GPT-2 tokenizer so the notebook-level `encoding`
# (the one used to compute df["token"]) is not silently rebound; otherwise
# re-running the token-count cell after this one would change its results.
separator_encoding = tiktoken.get_encoding("gpt2")
# Token cost of the "\n* " bullet prefix that create_prompt puts before each row.
separator_len = len(separator_encoding.encode("\n* "))

def create_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    max_context_tokens: int = 500,
) -> str:
    """Build the completion prompt: instruction header, context rows, question.

    Rows are taken in descending order of similarity to ``question`` and added
    as bullet lines until the running token count (row tokens plus the bullet
    separator) exceeds ``max_context_tokens``.

    Args:
        question: The user's question; also used to rank the rows.
        context_embeddings: Mapping of ``df`` index label to embedding vector.
        df: Frame providing the ``summarized`` text and ``token`` count per row.
        max_context_tokens: Token budget for the context bullets. Defaults to
            500, the limit that was previously hard-coded.

    Returns:
        The prompt text: header, selected bullets, then the question and an
        empty answer slot.
    """
    ranked_sections = get_docs_with_similarity(question, context_embeddings)

    chosen_sections = []
    used_tokens = 0
    for _, section_index in ranked_sections:
        document_section = df.loc[section_index]
        used_tokens += document_section.token + separator_len
        # Stop at the first row that overflows the budget; every lower-ranked
        # row is skipped as well.
        if used_tokens > max_context_tokens:
            break
        # Flatten embedded newlines so each row stays on a single bullet line.
        chosen_sections.append("\n* " + document_section.summarized.replace("\n", " "))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, check for multiple number of that instance_type and provide answer along with total instances required and total cost"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"


In [113]:
def get_answer(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[int, list[float]],
    model: str = "text-davinci-003",
    max_tokens: int = 500,
    temperature: float = 0,
) -> str:
    """Answer ``query`` with a completion model, using retrieved rows as context.

    Args:
        query: The user's question.
        df: Frame holding the row summaries and token counts used for the prompt.
        document_embeddings: Mapping of ``df`` index label to embedding vector.
        model: Completion model name. Defaults to the previously hard-coded
            "text-davinci-003".
        max_tokens: Maximum number of tokens to generate. Defaults to 500.
        temperature: Sampling temperature. Defaults to 0.

    Returns:
        The ``"text"`` of the first choice in the completion response, as
        returned by the API.
    """
    prompt = create_prompt(query, document_embeddings, df)

    response = openai.Completion.create(
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        model=model,
    )
    return response["choices"][0]["text"]

In [114]:
# Ask for the cheapest instance type meeting both a vCPU and a memory requirement.
# NOTE(review): the model only sees the rows create_prompt retrieved for this query,
# so "most cost-effective" is relative to that subset. The recorded answer (m5.xlarge)
# appears not to meet memory_gib 20 — confirm against the data.
query = "Please suggest the most cost-effective instance_type that satisfies the following both conditions: vcpu is 4, and memory_gib is 20. Also provide total cost and total number of instances"
response = get_answer(query, df, document_embeddings)
print(f"\nQ: {query}\nA:{response}")


Q: Please suggest the most cost-effective instance_type that satisfies the following both conditions: vcpu is 4, and memory_gib is 20. Also provide total cost and total number of instances
A: The most cost-effective instance_type that satisfies the conditions of vcpu is 4 and memory_gib is 20 is AWS m5.xlarge with a cost of 0.192 dollars per hour. The total cost for one instance is 0.192 dollars per hour and the total number of instances required is 1.


In [115]:
# Ask for the most expensive instance type.
# NOTE(review): the answer can only be the costliest among the rows that fit into the
# prompt's context budget, not necessarily across the whole DataFrame — verify.
query = "Please suggest the costliest instance type irrespective of vcpu and memory value"
response = get_answer(query, df, document_embeddings)
print(f"\nQ: {query}\nA:{response}")


Q: Please suggest the costliest instance type irrespective of vcpu and memory value
A: The costliest instance type is n1-highmem-16 from GCP with a cost of 0.9472 dollars.


In [116]:
# Ask for every instance type with at least 10 vCPUs.
# NOTE(review): the recorded answer includes n1-standard-8 (8 vCPU), which does not
# satisfy vcpu >= 10; similarity retrieval plus a completion is not a reliable filter.
query = "Give me all instance types that have vcpu >= 10"
response = get_answer(query, df, document_embeddings)
print(f"\nQ: {query}\nA:{response}")


Q: Give me all instance types that have vcpu >= 10
A: n1-highcpu-16 (16 vCPU, 14.4 GiB memory, cost_in_dollars: 0.5632) and n1-standard-8 (8 vCPU, 30.0 GiB memory, cost_in_dollars: 0.38). Total instances required: 2, Total cost: 0.9432.


In [91]:
# Ask for a single cost-effective instance type with at least 5 vCPUs.
# NOTE(review): the recorded answer proposes m5.xlarge, which it describes as having
# 4 vCPU, so it does not meet the "5 or more" requirement with one instance.
query = "Give me one cost-effective instance_type with 5 or more vcpu"
response = get_answer(query, df, document_embeddings)
print(f"\nQ: {query}\nA:{response}")


Q: Give me one cost-effective instance_type with 5 or more vcpu
A: The most cost-effective instance_type with 5 or more vcpu is m5.xlarge with 4 vcpu and 16 GiB memory at a cost of $0.192 per hour. If you need 5 vcpu, you can use two m5.xlarge instances for a total cost of $0.384 per hour.
