## consign similarity search with text embedding 
- note using data you provice to the openai model when sending a query to the text completion engine 

In [30]:
#!pip install tiktoken

In [63]:
import os
import ast
import openai
import tiktoken
import pandas as pd
import numpy as np

In [64]:
openai.api_key = os.environ["OPENAI_KEY"]

In [10]:
prompt = '''Only Answer the question below if you have 100% certainty of the facts.
Q: What does the start-up company Pentera do and who invested in it?
A: '''

In [70]:
def ask_open_ai(prompt, model="text-davinci-003"):
    res = openai.Completion.create(model = model,
                                   prompt = prompt,
                                   temperature = 0,
                                   max_tokens = 512)
    return res

def compnay_summary( name, url, city, country, type, investors ):
    investors_list = f"The investors in the company {name}"
    for investor in ast.literal_eval(investors):
        investors_list += f"{investor},"
    text = f"{name} has a headquartests in {city} in {country} and is in the field of {type}. {investors_list}. More info at {url}"
    return text


def token_number_count(val, encode_name):
    """ gpt2 """ 
    encoding = tiktoken.get_encoding(encode_name)
    num = len(encoding.encode(val))
    return num

def get_embedding( text, model="text-embedding-ada-002" ):
    result = openai.Embedding.create(model=model, input=text)
    return result['data'][0]['embedding']
    
def vector_similarity( v1, v2 ):
    return np.dot(np.array(v1), np.array(v2))

def get_prompt(context, question):
    prompt = f'''Only Answer the question below if you have 100% certainty of the facts.
    Context: {context}
    Q: {question}?
    A: '''
    return prompt

def ask_embedded_docs(df):
    qst = input("What question do you have about a Unicorn start-up?")
    prompt_embedding = get_embedding(qst)
    df['prompt_similarity'] = df['embedding'].apply( lambda vector:vector_similarity(vector, prompt_embedding) )    
    ctx = df.nlargest(1, 'prompt_similarity')
    prompt = get_prompt(ctx.iloc[0]['summary'], qst)
    print(prompt)
    resp = ask_open_ai(prompt)
    return resp
    

In [12]:
resp = ask_open_ai(prompt)

In [13]:
print(resp['choices'][0]['text'])

 I cannot answer this question with 100% certainty.


In [14]:
# Pentera is a start-up company that provides software solutions for the financial services industry. 
# The company's software helps financial advisors and institutions manage their client relationships, 
# automate processes, and improve compliance. Pentera has received investments from venture capital 
# firms such as Andreessen Horowitz, Lightspeed Venture Partners, and Formation 8. 

In [16]:
df = pd.read_csv("./data/unicorns.csv")

In [17]:
df.head()

Unnamed: 0,Updated at,Company,Crunchbase Url,Last Valuation (Billion $),Date Joined,Year Joined,City,Country,Industry,Investors,Company Website
0,"10/31/2022, 2:37:05 AM",Esusu,https://www.cbinsights.com/company/esusu,1.0,1/27/2022,2022,New York,United States,Fintech,"[""Next Play Ventures"",""Zeal Capital Partners"",...",
1,"10/31/2022, 2:37:05 AM",Fever Labs,https://www.cbinsights.com/company/fever-labs,1.0,1/26/2022,2022,New York,United States,Internet software & services,"[""Accel"",""14W"",""GS Growth""]",
2,"10/31/2022, 2:37:04 AM",Minio,https://www.cbinsights.com/company/minio,1.0,1/26/2022,2022,Palo Alto,United States,Data management & analytics,"[""General Catalyst"",""Nexus Venture Partners"",""...",
3,"10/31/2022, 2:37:04 AM",Darwinbox,https://www.cbinsights.com/company/darwinbox,1.0,1/25/2022,2022,Hyderabad,India,Internet software & services,"[""Lightspeed India Partners"",""Sequoia Capital ...",
4,"10/31/2022, 2:37:04 AM",Pentera,https://www.cbinsights.com/company/pcysys,1.0,1/11/2022,2022,Petah Tikva,Israel,Cybersecurity,"[""AWZ Ventures"",""Blackstone"",""Insight Partners""]",


In [28]:
df['summary'] = df.apply( lambda df: compnay_summary( df['Company'], df['Crunchbase Url'], df['City'], df['Country'], df['Industry'], df['Investors']  ), axis=1   )

In [29]:
df['summary'][0]

'Esusu has a headquartests in New York in United States and is in the field of Fintech. The investors in the company EsusuNext Play Ventures,Zeal Capital Partners,SoftBank Group,. More info at https://www.cbinsights.com/company/esusu'

In [34]:
token_count = token_number_count(df['summary'][0], 'cl100k_base')

In [35]:
print(token_count)

57


In [36]:
df['token_count'] = df['summary'].apply(lambda text: token_number_count(text, 'cl100k_base') )

In [37]:
token_count = df['token_count'].sum()

In [38]:
print(token_count)

70258


In [39]:
# $0.0004 per 1000 tokens (training) USD
price = token_count * 0.0004 / 1000


In [41]:
print(price) # around 3 cents

0.028103200000000002


In [44]:
vector = get_embedding(df['summary'][0])
len(vector)

1536

In [45]:
df['embedding'] = df['summary'].apply(get_embedding)

In [46]:
# df.to_csv('unicorns_with_embeddings.csv')

In [52]:
prompt_embedding = get_embedding("What does the start-up company Pentera do and who invested in it?")

In [53]:
df['prompt_similarity'] = df['embedding'].apply( lambda vector:vector_similarity(vector, prompt_embedding) )

In [54]:
df['prompt_similarity']

0       0.771109
1       0.749628
2       0.783438
3       0.756303
4       0.875022
          ...   
1194    0.752233
1195    0.769957
1196    0.752561
1197    0.771122
1198    0.761627
Name: prompt_similarity, Length: 1199, dtype: float64

In [55]:
row = df.nlargest(1, 'prompt_similarity')

In [57]:
row

Unnamed: 0,Updated at,Company,Crunchbase Url,Last Valuation (Billion $),Date Joined,Year Joined,City,Country,Industry,Investors,Company Website,summary,token_count,embedding,prompt_similarity
4,"10/31/2022, 2:37:04 AM",Pentera,https://www.cbinsights.com/company/pcysys,1.0,1/11/2022,2022,Petah Tikva,Israel,Cybersecurity,"[""AWZ Ventures"",""Blackstone"",""Insight Partners""]",,Pentera has a headquartests in Petah Tikva in ...,59,"[0.012362896464765072, -0.01194785162806511, 0...",0.875022


In [58]:
row.iloc[0]['summary']

'Pentera has a headquartests in Petah Tikva in Israel and is in the field of Cybersecurity . The investors in the company PenteraAWZ Ventures,Blackstone,Insight Partners,. More info at https://www.cbinsights.com/company/pcysys'

In [71]:
resp = ask_embedded_docs(df)

What question do you have about a Unicorn start-up? What does the start-up Minio do and who are the investors


Only Answer the question below if you have 100% certainty of the facts.
    Context: Minio has a headquartests in Palo Alto in United States and is in the field of Data management & analytics. The investors in the company MinioGeneral Catalyst,Nexus Venture Partners,Dell Technologies Capital,. More info at https://www.cbinsights.com/company/minio
    Q: What does the start-up Minio do and who are the investors?
    A: 


In [72]:
print(resp['choices'][0]['text'])

 Minio is a data management and analytics start-up with investors General Catalyst, Nexus Venture Partners, and Dell Technologies Capital.
