In [None]:
from snowflake.snowpark.context import get_active_session

# Grab the currently active Snowpark session; later cells use it to register
# the UDF/UDTF and to run SQL.
session = get_active_session()

In [None]:
from io import BytesIO
from PyPDF2 import PdfFileReader
from snowflake.snowpark.files import SnowflakeFile

def readpdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location handed straight to ``SnowflakeFile.open``; the SQL
            cells in this notebook pass the result of
            ``build_scoped_file_url(...)``.

    Returns:
        The text of all pages concatenated in page order. A PDF without pages
        produces an empty string.
    """
    # PdfFileReader raises on construction in PyPDF2 3.x. PdfReader is the
    # supported name in every release that also provides `.pages` and
    # `extract_text()`, so importing it here keeps the function working
    # regardless of which PyPDF2 version gets resolved.
    from PyPDF2 import PdfReader

    with SnowflakeFile.open(file_path, 'rb') as file:
        # Buffer the whole file in memory before parsing it.
        pdf_reader = PdfReader(BytesIO(file.readall()))
        # `or ""` guards against a page for which no text comes back.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
from snowflake.snowpark.types import StringType

# Publish readpdf as the permanent function SNOWPARK_PDF (string in, string out),
# replacing any existing definition of the same name.
session.udf.register(
    func=readpdf,
    name='SNOWPARK_PDF',
    input_types=[StringType()],
    return_type=StringType(),
    packages=['snowflake-snowpark-python', 'pypdf2'],
    stage_location='RAW.PDF.UDF',
    is_permanent=True,
    replace=True,
)

In [None]:
-- Run snowpark_pdf over every file listed in @folder, except the Tesla guide,
-- and keep the extracted text next to the file's path and URL.
create or replace table raw_text as
select
    relative_path,
    file_url,
    snowpark_pdf(build_scoped_file_url(@folder, relative_path)) as raw_text
from directory(@folder)
where relative_path <> 'Tesla_Owners_Guide.pdf';

In [None]:
-- Inspect the extracted text stored for each file.
select * from raw_text;

In [None]:
-- Ask a Cortex LLM to summarise one extracted document.
-- The instruction ends with ': ' so it is not fused onto the first word of the
-- document text when the two strings are concatenated.
select snowflake.cortex.complete(
    'llama2-70b-chat'
    ,concat(
        'summarise the following text: '
        ,raw_text)) as summarization
from raw_text
limit 1;

In [None]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from snowflake.snowpark.types import StringType, StructField, StructType

class text_chunker:
    """UDTF handler that splits one text value into overlapping chunks.

    Emits one row of three strings per chunk, in the order (chunk, meta, raw).
    """

    def process(self, text):
        """Yield one ``(chunk, meta, raw)`` tuple of strings per chunk of ``text``.

        Args:
            text: The text to split. ``None`` or an empty string yields no rows.

        Yields:
            A 3-tuple of strings:
              * chunk - the chunk's text (``page_content``).
              * meta  - ``str()`` of the chunk's metadata dict; it carries the
                start index because ``add_start_index`` is enabled.
              * raw   - the document's ``type`` tag ("Document" when absent).
        """
        # A SQL NULL arrives as None; there is nothing to split in that case.
        if not text:
            return

        text_splitter = RecursiveCharacterTextSplitter(
            separators = ["\n"],
            chunk_size = 1000,
            chunk_overlap  = 50,
            length_function = len,
            add_start_index = True)

        # Read each field by attribute name rather than depending on how a
        # Document object unpacks when iterated: that varies with the langchain
        # version and produces (field, value) pairs instead of plain text.
        for doc in text_splitter.create_documents([text]):
            yield (
                doc.page_content,
                str(doc.metadata),
                str(getattr(doc, "type", "Document")),
            )

In [None]:
# Publish text_chunker as the permanent table function CHUNK_TEXT: one string
# in, rows of (chunk, meta, raw) strings out.
session.udtf.register(
    handler=text_chunker,
    name='CHUNK_TEXT',
    input_types=[StringType()],
    output_schema=StructType(
        [
            StructField("chunk", StringType()),
            StructField("meta", StringType()),
            StructField("raw", StringType()),
        ]
    ),
    packages=['pandas', 'langchain'],
    stage_location='RAW.PDF.UDF',
    is_permanent=True,
    replace=True,
)

In [None]:
-- Expand every document in raw_text into one row per chunk.
-- episode_name is the file's relative path with '.pdf' removed.
create or replace table chunk_text as
select
    replace(relative_path, '.pdf') as episode_name,
    func.*
from raw_text
    ,table(chunk_text(raw_text)) as func;

In [None]:
-- Inspect the chunk rows produced by the chunk_text table function.
select *
from chunk_text;

In [None]:
-- Store an e5-base-v2 embedding alongside every chunk.
create or replace table vector_text as
select
    episode_name,
    chunk,
    snowflake.cortex.embed_text_768('e5-base-v2', chunk) as chunk_embedding
from chunk_text;

In [None]:
-- Spot-check the first ten embedded chunks.
select * from vector_text limit 10;

In [None]:
-- Find the single chunk whose embedding is closest (L2 distance) to the question.
-- The question is embedded with embed_text_768, the same function used to build
-- chunk_embedding, so both vectors come from the same function and model.
select episode_name
    ,chunk
    ,vector_l2_distance(
        snowflake.cortex.embed_text_768(
        'e5-base-v2',
        'How is the performance of different AI platforms in translating from English to Hindi?')
        ,chunk_embedding) vector_distance
from vector_text
order by vector_distance asc
limit 1;

In [None]:
-- Retrieval-augmented answer for one fixed question.
--   p: the question and its embedding (embed_text_768, matching chunk_embedding).
--   v: the closest chunk by L2 distance.
-- The final select hands that chunk to the LLM as context for the question.
with p as(
    select 
    'How is the performance of different AI platforms in translating from English to Hindi?' as prompt
        ,snowflake.cortex.embed_text_768('e5-base-v2', prompt) prompt_embedding
)

,v as(
    select prompt
        ,chunk
        ,episode_name
        ,vector_l2_distance(prompt_embedding,chunk_embedding) vector_distance
    from vector_text, p
    order by vector_distance asc
    limit 1
)

-- Each label starts with a space so it is not fused onto the preceding text
-- when the pieces are concatenated.
select episode_name
    ,chunk
    ,snowflake.cortex.complete('llama2-70b-chat',concat( 
        'Answer the question based on the context. Be concise.'
        ,' Context: ', chunk
        ,' Question: ', prompt
        ,' Answer: ')) 
as response
from v;

In [None]:
import streamlit as st

# Streamlit front end: embed the user's question, find the closest chunk in
# vector_text, and show the LLM's answer with that chunk as context.
st.title("Ask Your Data Anything :snowflake:")

st.subheader("Sample Questions")
st.caption("""
    - How is the performance of different AI platforms in translating from English to Hindi?
    - For DASH 2023 User Guide: “ Summarize the dashboard functionalities”
    - How do I open the front trunk on a Tesla Model X?
""")

prompt = st.text_input("Enter prompt", placeholder="Type your question here", label_visibility="collapsed")

# The question is supplied as a bind variable (the single `?`) instead of being
# spliced into the SQL text, so user input cannot terminate the literal and
# inject SQL. The question is embedded with embed_text_768, the same function
# that produced chunk_embedding.
quest_q = '''
    with p as(
        select ?::varchar as prompt
            ,snowflake.cortex.embed_text_768('e5-base-v2', prompt) prompt_embedding
    )
    ,v as(
        select prompt
            ,chunk
            ,episode_name
            ,vector_l2_distance(prompt_embedding,chunk_embedding) vector_distance
        from vector_text, p
        -- where episode_name = 'DASH 2023 Web Dashboard User Guide.pdf'
        order by vector_distance asc
        limit 1
    )
    select episode_name
        ,snowflake.cortex.complete('llama2-70b-chat',concat( 
            'Answer the question based on the context. Be concise.'
            ,' Context: ', chunk
            ,' Question: ', prompt
            ,' Answer: ')) 
    as response
    from v;
'''

if prompt:
    df_query = session.sql(quest_q, params=[prompt]).to_pandas()
    if df_query.empty:
        # No rows means vector_text holds no chunks to search.
        st.warning("No matching content was found.")
    else:
        best_match = df_query.iloc[0]
        st.subheader('Information from the [' + best_match['EPISODE_NAME'] +'](https://app.snowflake.com/sfsenorthamerica/demo_dwilczak/#/data/databases/RAW/schemas/PDF/stage/FOLDER)')
        st.write(best_match['RESPONSE'])

In [None]:
-- Append the Tesla guide to vector_text: extract its text, chunk it, embed each chunk.
-- episode_name drops the '.pdf' suffix and the embedding uses embed_text_768,
-- so the new rows follow the same conventions as the rows already in the table.
-- NOTE: running this cell again appends the same rows a second time.
insert into vector_text
with raw as(
    select relative_path
        ,file_url
        ,snowpark_pdf(build_scoped_file_url(@folder,relative_path)) as raw_text
    from directory(@folder)
    where relative_path = 'Tesla_Owners_Guide.pdf'
)
select replace(relative_path,'.pdf') as episode_name
    ,func.chunk
    ,snowflake.cortex.embed_text_768('e5-base-v2',func.chunk) as chunk_embedding
from raw
    ,table(chunk_text(raw_text)) as func;