### Importing all the required libraries

In [10]:
import os
import pandas as pd
import ast
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '//Users//sayantrinisaha//Desktop//psyched-metrics-426921-h0-4a9007b858ad.json'
from google.cloud import aiplatform
from vertexai.language_models import TextGenerationModel

### Read the dataset on which we are going to do embedding

In [143]:
df_companies = pd.read_csv('companies.csv')
df_companies.head()

Unnamed: 0,Updated at,Company,Crunchbase Url,Last Valuation (Billion $),Date Joined,Year Joined,City,Country,Industry,Investors,Company Website
0,"10/31/2022, 2:37:05 AM",Esusu,https://www.cbinsights.com/company/esusu,1.0,1/27/2022,2022,New York,United States,Fintech,"[""Next Play Ventures"",""Zeal Capital Partners"",...",
1,"10/31/2022, 2:37:05 AM",Fever Labs,https://www.cbinsights.com/company/fever-labs,1.0,1/26/2022,2022,New York,United States,Internet software & services,"[""Accel"",""14W"",""GS Growth""]",
2,"10/31/2022, 2:37:04 AM",Minio,https://www.cbinsights.com/company/minio,1.0,1/26/2022,2022,Palo Alto,United States,Data management & analytics,"[""General Catalyst"",""Nexus Venture Partners"",""...",
3,"10/31/2022, 2:37:04 AM",Darwinbox,https://www.cbinsights.com/company/darwinbox,1.0,1/25/2022,2022,Hyderabad,India,Internet software & services,"[""Lightspeed India Partners"",""Sequoia Capital ...",
4,"10/31/2022, 2:37:04 AM",Pentera,https://www.cbinsights.com/company/pcysys,1.0,1/11/2022,2022,Petah Tikva,Israel,Cybersecurity,"[""AWZ Ventures"",""Blackstone"",""Insight Partners""]",


### Here I am using 'text-bison' to generate the result.

In [157]:
# I am first asking the model to give me back some information with any context context injection
llm = TextGenerationModel.from_pretrained('text-bison')
llm.predict(prompt='Tell me who are the venture capital firms for the company Minio?', max_output_tokens=1024, temperature=0)

 Minio, Inc. has raised a total of $105M in funding over 4 rounds. Their latest funding round was raised on May 11, 2022, from a Series B round. Minio, Inc. is funded by 10 investors. Their top investors include General Catalyst, Dell Technologies Capital, and A16Z.

In [155]:
# We fetch the investors from our csv file and it clearly shows that the llm is returning wrong investors.
ast.literal_eval(df_companies.iloc[2]['Investors'])

['General Catalyst', 'Nexus Venture Partners', 'Dell Technologies Capital']

#### We do find 'General Catalyst', 'Nexus Venture Partners' in the result but not 'Dell Technologies Capital'. Instead, the llm is giving us 'A16Z' as one of the venture capital firm, which is wrong. 

### I am doing here some context injection by creating an additional column 'summary', which will be able to inject some context.

In [161]:
def summary(company,crunchbase_url,city,country,industry,investor_list):
    investors = 'The investors in the company are'
     
    for investor in ast.literal_eval(investor_list):
        investors += f" {investor}, "

    text = f"{company} has headquarters in {city} in {country} and is in the field of {industry}. {investors}. You can find more information at {crunchbase_url}"

    return text
    
df_companies['summary'] = df_companies.apply(lambda df_companies: summary(df_companies['Company'],df_companies['Crunchbase Url'],df_companies['City'],df_companies['Country'],df_companies['Industry'],df_companies['Investors']),axis=1)
df_companies.head()

Unnamed: 0,Updated at,Company,Crunchbase Url,Last Valuation (Billion $),Date Joined,Year Joined,City,Country,Industry,Investors,Company Website,summary
0,"10/31/2022, 2:37:05 AM",Esusu,https://www.cbinsights.com/company/esusu,1.0,1/27/2022,2022,New York,United States,Fintech,"[""Next Play Ventures"",""Zeal Capital Partners"",...",,Esusu has headquarters in New York in United S...
1,"10/31/2022, 2:37:05 AM",Fever Labs,https://www.cbinsights.com/company/fever-labs,1.0,1/26/2022,2022,New York,United States,Internet software & services,"[""Accel"",""14W"",""GS Growth""]",,Fever Labs has headquarters in New York in Uni...
2,"10/31/2022, 2:37:04 AM",Minio,https://www.cbinsights.com/company/minio,1.0,1/26/2022,2022,Palo Alto,United States,Data management & analytics,"[""General Catalyst"",""Nexus Venture Partners"",""...",,Minio has headquarters in Palo Alto in United ...
3,"10/31/2022, 2:37:04 AM",Darwinbox,https://www.cbinsights.com/company/darwinbox,1.0,1/25/2022,2022,Hyderabad,India,Internet software & services,"[""Lightspeed India Partners"",""Sequoia Capital ...",,Darwinbox has headquarters in Hyderabad in Ind...
4,"10/31/2022, 2:37:04 AM",Pentera,https://www.cbinsights.com/company/pcysys,1.0,1/11/2022,2022,Petah Tikva,Israel,Cybersecurity,"[""AWZ Ventures"",""Blackstone"",""Insight Partners""]",,Pentera has headquarters in Petah Tikva in Isr...


In [165]:
context = df_companies.iloc[2]['summary']
prompt = f"Tell me who are the venture capital firms for the company Minio, here is some context:\n {context}"
llm.predict(prompt=prompt, max_output_tokens=1024, temperature=0)

 The venture capital firms for the company Minio are:

- General Catalyst
- Nexus Venture Partners
- Dell Technologies Capital

### We can automate this process using Text Embedding

#### Importing all the required libraries

In [170]:
from vertexai.language_models import TextEmbeddingModel
embedder = TextEmbeddingModel.from_pretrained('textembedding-gecko@001')
import numpy as np

#### Since creating the embedding(vectors) for each column wouls be costly, hence I am only taking the first 5 rows from our df_companies dataframe.

In [173]:
new_df_companies = df_companies.head(5)

In [175]:
# Creating the function which will return the embedding vector values for the column that we will pass 
def get_summary_embeddings(var_name):
    return embedder.get_embeddings([var_name])[0].values

#### Testing if my get_summary_embeddings is working or not

In [182]:
new_df_companies['embeddings'] = new_df_companies['summary'].apply(get_summary_embeddings)
new_df_companies.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_companies['embeddings'] = new_df_companies['summary'].apply(get_summary_embeddings)


Unnamed: 0,Updated at,Company,Crunchbase Url,Last Valuation (Billion $),Date Joined,Year Joined,City,Country,Industry,Investors,Company Website,summary,embeddings
0,"10/31/2022, 2:37:05 AM",Esusu,https://www.cbinsights.com/company/esusu,1.0,1/27/2022,2022,New York,United States,Fintech,"[""Next Play Ventures"",""Zeal Capital Partners"",...",,Esusu has headquarters in New York in United S...,"[-0.030785107985138893, 0.00467604398727417, -..."
1,"10/31/2022, 2:37:05 AM",Fever Labs,https://www.cbinsights.com/company/fever-labs,1.0,1/26/2022,2022,New York,United States,Internet software & services,"[""Accel"",""14W"",""GS Growth""]",,Fever Labs has headquarters in New York in Uni...,"[0.03269707411527634, -0.01659333147108555, -0..."
2,"10/31/2022, 2:37:04 AM",Minio,https://www.cbinsights.com/company/minio,1.0,1/26/2022,2022,Palo Alto,United States,Data management & analytics,"[""General Catalyst"",""Nexus Venture Partners"",""...",,Minio has headquarters in Palo Alto in United ...,"[0.014637074433267117, -0.036619797348976135, ..."
3,"10/31/2022, 2:37:04 AM",Darwinbox,https://www.cbinsights.com/company/darwinbox,1.0,1/25/2022,2022,Hyderabad,India,Internet software & services,"[""Lightspeed India Partners"",""Sequoia Capital ...",,Darwinbox has headquarters in Hyderabad in Ind...,"[0.016650568693876266, -0.02540149912238121, -..."
4,"10/31/2022, 2:37:04 AM",Pentera,https://www.cbinsights.com/company/pcysys,1.0,1/11/2022,2022,Petah Tikva,Israel,Cybersecurity,"[""AWZ Ventures"",""Blackstone"",""Insight Partners""]",,Pentera has headquarters in Petah Tikva in Isr...,"[-0.02143833413720131, -0.019953027367591858, ..."


#### We cannot take the dot product of a python list, but we can do dot product on a numpy array

In [125]:
def vector_similarity(A,B):
    vec1 = np.array(A)
    vec2 = np.array(B)

    return np.dot(np.array(vec1), np.array(vec2))

#### Creating the function here which will take the vector of the user question and the vector of the embedding column and will perform a dot product. Higher value of the dot product means it is more similar, while lesser value of dot product means not very similar. Finally, I am passing here the content of the 'summary' column for which it will give the higest dot product, as a context in the prompt.

In [None]:
# Query -> Vector -> Similar vector in the database -> Text -> Context -> Query+Context

In [190]:
def embed_prompt_loopup():
    question = input("What question do you have about a startup?")

    #API call -> Embedding
    prompt_embedding = get_summary_embeddings(question)

    #Similarity search - can be expensive it the dataframe is large.
    new_df_companies['prompt_similarity'] = new_df_companies['embeddings'].apply(lambda vector: vector_similarity(prompt_embedding, vector))

    #Most similar embedding/row
    context = new_df_companies.nlargest(1, 'prompt_similarity').iloc[0]['summary']                                                          
                                                                 
    # Prompt + context injection
    prompt = f"""Only answer the question below if you have 100% certainity of the facts. Use the context blow.
    Here is some context: \n
    {context}

    Question: {question}
    """

    results = llm.predict(prompt=prompt, temperature=0, max_output_tokens=2048)
    print(results.text)
embed_prompt_loopup()    

What question do you have about a startup? Tell me something about the company Minio


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_companies['prompt_similarity'] = new_df_companies['embeddings'].apply(lambda vector: vector_similarity(prompt_embedding, vector))


 Minio is a data management and analytics company headquartered in Palo Alto, United States. It has received investments from General Catalyst, Nexus Venture Partners, and Dell Technologies Capital.


### End of code