In [None]:
# Import python packages
import streamlit as st
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import *

# Write directly to the app
st.title("Use Vector Embeddings to make data searchable and only send the relevant data to the LLM for quick answers")

# Get the active Snowpark session for this notebook
session = get_active_session()

# Reference the two source tables
letters = session.table('BUILD_UK.DATA.LETTERS_TO_MP')
SMEDIA = session.table('BUILD_UK.DATA.SOCIAL_MEDIA')

# Render each table explicitly: a bare expression in the middle of a cell
# is not guaranteed to be displayed, so the headings could appear without data.
st.markdown('#### Here are the letters')
st.write(letters)

st.markdown('#### Here is the social media')
st.write(SMEDIA)

# Stack the LETTER text and the social media column V (cast to a string)
# into a single column named OBJECT.
# NOTE(review): `object` shadows the Python builtin; the name is kept because
# later cells read it.
object = letters.select(F.col('LETTER').alias('OBJECT'))\
.union(SMEDIA.select(F.col('V').astype(StringType()).alias('OBJECT')))



#### Present both letters and social media comments as an object

In [None]:
# Text of each letter, exposed under the shared column name OBJECT.
letter_rows = letters.select(F.col('LETTER').alias('OBJECT'))

# Social media column V, cast to a string and exposed under the same column name.
social_rows = SMEDIA.select(F.col('V').astype(StringType()).alias('OBJECT'))

# Combine both sources into one single-column dataframe and show it.
object = letter_rows.union(social_rows)
st.write(object)

#### Use Vector Embeddings to put the unstructured data into an array using built-in text embeddings

![Vector similarity example](https://docs.snowflake.com/en/_images/vector-similarity-vectors-example.png "Vector similarity example")

In [None]:
# Embedding of the OBJECT text, produced by the Cortex EMBED_TEXT_768 function
# with the snowflake-arctic-embed-m model.
object_embedding = F.call_function(
    'SNOWFLAKE.CORTEX.EMBED_TEXT_768',
    F.lit('snowflake-arctic-embed-m'),
    F.col('OBJECT'),
)

# Add the embedding as the EMBED column and cache the result for the cells that follow.
embeds = object.with_column('EMBED', object_embedding).cache_result()

In [None]:
# Display the dataframe holding each OBJECT alongside its EMBED column.
embeds

#### Create a question you want to ask about the data

In [None]:
# The question to ask, attached to every row as a constant QUESTION column.
question_text = 'Tell me about all the things mentioned about food?'
poison_search = embeds.with_column('QUESTION', F.lit(question_text))



#### Embed the Question the same way as the data

In [None]:
# Embed QUESTION with the same Cortex function and model used for the EMBED
# column, so the two vectors can be compared directly.
question_embedding = F.call_function(
    'SNOWFLAKE.CORTEX.EMBED_TEXT_768',
    F.lit('snowflake-arctic-embed-m'),
    F.col('QUESTION'),
)

# Store the question embedding as EMBEDQ and cache the result.
poison_search = poison_search.with_column('EMBEDQ', question_embedding).cache_result()

#### Use the Vector Cosine Similarity function to rank each row based on how close the vectors are - the closer the data is to the question, the higher the ranking

In [None]:
# Cosine similarity between each row's EMBED vector and the question's EMBEDQ vector.
similarity_score = F.call_function(
    'VECTOR_COSINE_SIMILARITY',
    F.col('EMBED'),
    F.col('EMBEDQ'),
)
poison_similar = poison_search.with_column('search', similarity_score)

# Show the scored rows.
poison_similar

#### Sort by relevance and only show the top 3

In [None]:
# Highest similarity score first, then keep only the three best-matching rows.
highest_score_first = F.col('SEARCH').desc()
poison_similar = poison_similar.sort(highest_score_first).limit(3)

#### Tidy up the dataframe to view only the question and the relevant datasets

In [None]:
# Show only the matched text and the question it was compared against.
poison_similar.select(F.col('OBJECT'), F.col('QUESTION'))

### Use Cortex Complete in the same way as before to provide a readable answer

In [None]:
# Build the prompt for each row: the question, then the matched text, then
# instructions on how to answer.
prompt = F.concat(
    F.col('QUESTION'),
    F.lit(' Based on the following data: '),
    F.col('OBJECT'),
    # The leading space and full stops keep the instructions from running
    # straight into the end of the data text and into each other.
    F.lit(' Only provide the answer. '),
    F.lit('Do not provide additional commentary.'),
)

# Ask the Cortex COMPLETE function (mixtral-8x7b model) to answer the prompt
# and store the response in a new ANSWER column.
ANSWER = poison_similar.with_column(
    'ANSWER',
    F.call_function('SNOWFLAKE.CORTEX.COMPLETE', F.lit('mixtral-8x7b'), prompt),
)

In [None]:
# Display the dataframe, including the generated ANSWER column.
ANSWER

In [None]:
# Pull the ANSWER column into pandas and show the answer from the first row.
answers = ANSWER.select('ANSWER').to_pandas().ANSWER

if answers.empty:
    # With no rows, iloc[0] would raise IndexError; tell the reader instead.
    st.warning('No answer was generated for this question.')
else:
    st.write(answers.iloc[0])