## Setup and Import Libraries

In [2]:
import vertexai
import pickle
import time
import pandas as pd
import numpy as np
from vertexai.generative_models import GenerativeModel, GenerationConfig
from vertexai.language_models import TextEmbeddingModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances_argmin as distances_argmin
from utils import authenticate, encode_text_to_embedding_batched
from IPython.display import Markdown, display

In [3]:
# Obtain the credentials object and project ID from the project-local helper.
(credentials, PROJECT_ID) = authenticate()

In [4]:
# Location handed to vertexai.init() below.
REGION = "us-central1"

In [5]:
# Initialise the Vertex AI SDK with the project, region and credentials set above.
vertexai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

In [7]:
# Model used to embed user queries at search time.
# NOTE(review): the precomputed vectors in question_embeddings_app.pkl are only
# comparable with query vectors if they were produced by this same model. The
# unrelated top match returned for the Python query further down suggests they
# may not have been -- confirm which model generated the pickle.
EMBEDDING_MODEL_NAME = "text-embedding-005"
embedding_model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [9]:
# Generative model used to turn the retrieved answer into a response.
GENERATIVE_MODEL_NAME = "gemini-2.0-flash-001"
model = GenerativeModel(GENERATIVE_MODEL_NAME)

## Load Stack Overflow Questions and Answers

In [10]:
# Stack Overflow question/answer table, read from a CSV next to the notebook.
SO_DATABASE_CSV = "so_database_app.csv"
df = pd.read_csv(SO_DATABASE_CSV)

In [11]:
# Report the frame's dimensions, then preview its first rows.
print(f"Shape: {df.shape}")
df.head()

Shape: (2000, 3)


Unnamed: 0,input_text,output_text,category
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python


## Load Question Embeddings

In [12]:
# Deserialize the precomputed question embeddings from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on a file from a trusted source.
with open('question_embeddings_app.pkl', 'rb') as file:
    question_embeddings = pickle.load(file)

# The file is closed by now; show what was loaded.
print(question_embeddings)

[[-0.03571156 -0.00240684  0.05860338 ... -0.03100227 -0.00855574
  -0.01997405]
 [-0.02024316 -0.0026255   0.01940405 ... -0.02158143 -0.05655403
  -0.01040497]
 [-0.05175979 -0.03712264  0.02699278 ... -0.07055898 -0.0402537
   0.00092099]
 ...
 [-0.00580394 -0.01621097  0.05829635 ... -0.03350992 -0.05343556
  -0.06016821]
 [-0.00436622 -0.02692963  0.03363771 ... -0.01686567 -0.03812337
  -0.02329491]
 [-0.04240424 -0.01633749  0.05516777 ... -0.02697376 -0.01751165
  -0.04558187]]


In [13]:
# Store each question's embedding (converted via .tolist()) in a new
# 'embeddings' column, then display the frame.
embedding_rows = question_embeddings.tolist()
df["embeddings"] = embedding_rows
df

Unnamed: 0,input_text,output_text,category,embeddings
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python,"[-0.03571155667304993, -0.0024068362545222044,..."
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python,"[-0.020243162289261818, -0.002625499852001667,..."
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python,"[-0.05175979062914848, -0.03712264448404312, 0..."
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python,"[0.02206624671816826, -0.028208276256918907, 0..."
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python,"[-0.05498068407177925, -0.0032414537854492664,..."
...,...,...,...,...
1995,Is it possible to made inline-block elements l...,<p>If this is only for the visual purpose then...,css,"[-0.009190441109240055, -0.01732615754008293, ..."
1996,Flip Clock code works on Codepen and doesn't w...,<p>You forgot to attach the CSS file for the f...,css,"[-0.009033069014549255, -0.0009270847076550126..."
1997,React Native How can I put one view in front o...,<p>You can do it using zIndex for example:</p>...,css,"[-0.005803938489407301, -0.016210969537496567,..."
1998,setting fixed width with 100% height of the pa...,<p>You can use <code>width: calc(100% - 100px)...,css,"[-0.004366223234683275, -0.02692963369190693, ..."


## Semantic Search

When a user asks a question, we can embed their query on the fly and search over all of the Stack Overflow question embeddings to find the most similar datapoint.

In [14]:
# Kept as a one-element list: it is passed straight to get_embeddings() below.
query = ["How to specify a variable name in Python"]

In [15]:
# Embed the query list and keep the vector of its first (only) entry.
query_vectors = embedding_model.get_embeddings(query)
query_embedding = query_vectors[0].values

In [16]:
# Score the query vector (wrapped as a one-row matrix) against every stored
# question embedding.
stored_embeddings = list(df["embeddings"].values)
cosine_similarity_array = cosine_similarity([query_embedding], stored_embeddings)

cosine_similarity_array.shape

(1, 2000)

Once we have a similarity value between our query embedding and each of the database embeddings, we can extract the index with the highest value. This embedding corresponds to the Stack Overflow post that is most similar to the question "How to specify a variable name in Python".

In [17]:
# Index of the highest similarity score.
index_doc_cosine = cosine_similarity_array.argmax()

In [18]:
# Alternative lookup: index of the stored embedding closest to the query by
# distance. No metric is passed, so the function's default applies (euclidean
# per the scikit-learn docs -- confirm for the installed version).
nearest_by_distance = distances_argmin(
    [query_embedding],
    list(df["embeddings"].values),
)
index_doc_distances = nearest_by_distance[0]

In [19]:
# Question text of the best cosine match.
df["input_text"][index_doc_cosine]

"react native : How to increase the font?<p>In my example I want to increase the font of the variables\nWhat is the way to increase the font of <code>elem.Sampling_Method_Code</code>\nin my case ?</p>\n<pre><code>var digumMethods = '';\n    execDetailData.forEach((elem) =&gt; {\n      digumMethods = digumMethods + '\\n' + elem.Sampling_Method_Code + '\\n' + elem.Parameters_String + '\\n' +__________' + '\\n';\n    });\n\n</code></pre>"

In [20]:
# Answer text of the best cosine match.
df["output_text"][index_doc_cosine]

"<p>This one can help</p>\n<pre><code>const bigTextElement = (text) =&gt; &lt;Text style={{fontSize:20}}&gt;{text}&lt;/Text&gt;;\n\nvar digumMethods = '';\n    execDetailData.forEach((elem) =&gt; {\n      digumMethods = bigTextElement(digumMethods) + '\\n' + elem.Sampling_Method_Code + '\\n' + elem.Parameters_String + '\\n' +__________' + '\\n';\n    });\n</code></pre>"

## Question answering with relevant context

Now that we have found the most similar Stack Overflow question, we can take the corresponding answer and use an LLM to produce a more conversational response.

In [21]:
# Assemble the retrieved question and its answer into one labelled context string.
matched_question = df.input_text[index_doc_cosine]
matched_answer = df.output_text[index_doc_cosine]
context = "".join(["Question: ", matched_question, "\n Answer: ", matched_answer])

In [25]:
# `query` is a one-element list (the form passed to get_embeddings above).
# Interpolating the list itself would put its repr -- "['...']" -- into the
# prompt, so pull out the plain question text first.
query_text = query[0] if isinstance(query, (list, tuple)) else query

# Grounded prompt: the retrieved context, the user's question, and an explicit
# fallback sentence for when the context is not relevant. The trailing
# backslashes join the instruction's lines so it reaches the model unbroken.
prompt = f"""Here is the context: {context}
             Using the relevant information from the context,
             provide an answer to the query: {query_text}.
             If the context doesn't provide \
             any relevant information, \
             answer with \
             [I couldn't find a good match in the \
             document database for your query]
             """

In [26]:
# Generation settings for this call: top_k of 20 with temperature 0.8.
config = GenerationConfig(temperature=0.8, top_k=20)

In [27]:
# Send the prompt to the generative model and render the reply as Markdown.
response = model.generate_content(
    contents=prompt,
    generation_config=config,
)
display(Markdown(response.text))

[I couldn't find a good match in the document database for your query]


## When the documents don't provide useful information

Our current workflow returns the most similar question from our embeddings database. But what do we do when that question isn't actually relevant when answering the user query? In other words, we don't have a good match in our database.

In addition to providing a more conversational response, LLMs can help us handle these cases where the most similar document isn't actually a reasonable answer to the user's query.

In [28]:
# An off-topic query, again as a one-element list for get_embeddings().
query = ["How to make the perfect lasagna"]

In [29]:
# Embed the new query and keep the vector of its first (only) entry.
query_vectors = embedding_model.get_embeddings(query)
query_embedding = query_vectors[0].values

In [30]:
# Score the new query vector against every stored question embedding.
stored_embeddings = list(df["embeddings"].values)
cosine_similarity_array = cosine_similarity([query_embedding], stored_embeddings)

cosine_similarity_array.shape

(1, 2000)

In [31]:
# Index of the highest similarity score for the new query.
index_doc = cosine_similarity_array.argmax()

In [32]:
# Build the context from the best-matching question and its answer.
# The "Question: " label matches the format used for the first query's
# context, so the model sees a labelled question followed by a labelled answer.
context = (
    "Question: " + df.input_text[index_doc]
    + "\n Answer: " + df.output_text[index_doc]
)

In [33]:
# `query` is a one-element list (the form passed to get_embeddings above).
# Interpolating the list itself would put its repr -- "['...']" -- into the
# prompt, so pull out the plain question text first.
query_text = query[0] if isinstance(query, (list, tuple)) else query

# Grounded prompt: the retrieved context, the user's question, and an explicit
# fallback sentence for when the context is not relevant. The trailing
# backslashes join the instruction's lines so it reaches the model unbroken.
prompt = f"""Here is the context: {context}
             Using the relevant information from the context,
             provide an answer to the query: {query_text}.
             If the context doesn't provide \
             any relevant information, \
             answer with \
             [I couldn't find a good match in the \
             document database for your query]
             """

In [34]:
# Generation settings for this call: temperature 0.2, no top_k given.
config = GenerationConfig(temperature=0.2)

In [35]:
# Send the prompt to the generative model and render the reply as Markdown.
response = model.generate_content(
    contents=prompt,
    generation_config=config,
)
display(Markdown(response.text))

[I couldn't find a good match in the document database for your query]
