In [1]:
import numpy as np
import langchain
import os
import openai
from dotenv import load_dotenv

In [2]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.api_key = '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'
openai.api_key

'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'

In [3]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, DocArrayInMemorySearch

In [4]:
# loading the data

In [5]:
loader = UnstructuredHTMLLoader("test_html.html")
data  = loader.load()
data

[Document(page_content='Article 1:\n\nTitle: Velvet Throat: From Child Actress to Music Sensation\n\nDate: March 15, 2020\n\nAbstract: Velvet Throat, the former child actress, has taken the music industry by storm with her debut single Doctor License. The song reached No. 1 on the Billboard Hot 100 and became the first song to surpass one billion streams in 2020. Velvet Throats success continued with her acclaimed album Sweet. In addition to her music career, she has also ventured into the world of cosmetics with her own line, VIPglow.', metadata={'source': 'test_html.html'})]

# Quick Start

In [6]:
# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
texts[0].page_content

'Article 1:\n\nTitle: Velvet Throat: From Child Actress to Music Sensation\n\nDate: March 15, 2020\n\nAbstract: Velvet Throat, the former child actress, has taken the music industry by storm with her debut single Doctor License. The song reached No. 1 on the Billboard Hot 100 and became the first song to surpass one billion streams in 2020. Velvet Throats success continued with her acclaimed album Sweet. In addition to her music career, she has also ventured into the world of cosmetics with her own line, VIPglow.'

In [7]:
llm = OpenAI(openai_api_key=openai.api_key, temperature = 0)
os.environ["OPENAI_API_KEY"] = openai.api_key

In [8]:
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

In [9]:
# create the vectorestore to use as the index
db = Chroma.from_documents(texts, embeddings)
db

<langchain.vectorstores.chroma.Chroma at 0x27e32a82140>

In [10]:
db.as_retriever()

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x0000027E32A82140>)

In [11]:
# expose this index in a retriever interface
retriever = db.as_retriever(
    search_type="similarity", search_kwargs={"k": 1}
)
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x0000027E32A82140>, search_kwargs={'k': 1})

##### texts

In [12]:
# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="map_reduce",
    retriever=retriever,
    return_source_documents=False,
    verbose=False,
)

In [13]:
qa('What industry does Velvet Throat work at?')['result']

AuthenticationError: Incorrect API key provided: sk-Yze8X***************************************opaM. You can find your API key at https://platform.openai.com/account/api-keys.

In [None]:
qa('What is velvet throats cosmetic business name?')['result']

In [None]:
qa('What is velvet throats most famous song?')['result']

In [15]:
qa('list all industries, name and the industry type, that Velvet Throat has been involved with')['result']

' Velvet Throat - Music Industry, Cosmetics Industry.'

# Step by step

In [16]:
embed = embeddings.embed_query(' Hi my name is Sarvenaz')
len(embed)

1536

In [17]:
db2 = Chroma.from_documents(texts, embeddings)
db2

<langchain.vectorstores.chroma.Chroma at 0x216db8ab0d0>

In [18]:
query = 'Who the article is about?'
docs = db2.similarity_search(query)
len(docs)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


2

In [19]:
retriever = db2.as_retriever()

In [28]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature = 0.0) # language model

In [21]:
# if we have multiple documents, this is how we join them together:
qdocs = ''.join([docs[i].page_content for i in range(len(docs))])

In [22]:
# passing all docs as a single text to the model:
response = llm.call_as_llm(f'{qdocs} Question: list all industries, name and the industry type, that Velvet Throat has been involved with')

In [23]:
response

'Industries:\n\n1. Music Industry - Velvet Throat has become a music sensation with her successful debut single and album.\n2. Film Industry - Velvet Throat started her career as a child actress before transitioning into music.\n3. Cosmetics Industry - Velvet Throat has ventured into the world of cosmetics with her own line, VIPglow.'

In [24]:
# chain all the steps in one chain:
qa_stuff = RetrievalQA.from_chain_type(
    llm = llm # text generation model at the end
    , chain_type = 'stuff' # most simple chain that stuffs all the documents
    , retriever = retriever #interface for fetching documents
    , verbose = True
)

In [25]:
qa_stuff.run('list all industries, name and the industry type, that Velvet Throat has been involved with').result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


AttributeError: 'str' object has no attribute 'result'

### notes:
chain_type = 
- map_reduce: look at all documents that might contain the answer and returns the summary as the final answer. Can work with any number of documents. Treat each document independantly -> may lead to redundancy. makes many calls. can be batched and run parallel. Can also be used for summarization
- Refine: builds upon the answer of the previos document. slow.
- map_rerank: do a single call to each document, score each answer and chooses the highest score as the respond. SHould be refined by explaining to the model how to define the scoring and how to choose the best one.
- stuff_method: combines all documents into one and extract the answer from it.

# Loading all documents

In [17]:
def load_htmls():
    all_files = [f for f in os.listdir('data') if f.endswith('.html')] # every html file in the folder
    docs = []
    for file in all_files:
        doc = UnstructuredHTMLLoader('data/'+file).load() # loading each document
        parsed_doc_name = file.split('/')[0].split('_') 
        first_name = parsed_doc_name[0]
        last_name = parsed_doc_name[1]
        doc_type = parsed_doc_name[2].split('.')[0] 
        # print(f'name:{first_name}, family_name:{last_name}, doc_type = {doc_type}')
        # adding to each document metadata for later easier search
        doc[0].metadata['name'] = ' '.join([first_name, last_name])
        doc[0].metadata['doc_type'] = doc_type # -> may not use it, keeping it for now
        docs.extend(doc)
    return docs

test = load_htmls()
test[-1].metadata['name']

'Zeus Manly'

In [37]:
def get_names(docs = None):
    ''' get a list of all names'''
    if docs is None:
        docs = load_htmls()
    return list(set([doc.metadata['name'] for doc in docs]))

get_names()

['Julia Harpman',
 'Helen Troy',
 'Robert King',
 'Mariann Avocado',
 'Hanna Smith',
 'Zeus Manly',
 'Jared Livinglife',
 'Jerry Smith',
 'Aphrodite Greek',
 'Velvet Throat']

## Name based retriever

In [39]:
def name_based_retriever(docs = None, name = 'All', embeddings = OpenAIEmbeddings(), chunk_size = 1000, chunk_overlap = 0):
    '''
    Creats custom retriever based on the name chosen by the user
    '''
    if docs == None:
        docs = load_htmls()
    
    if name == 'All':
        documents = docs
    else:
        documents = []
        for doc in docs:
            if doc.metadata['name']== name:
                documents.append(doc)
    # split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    #vector database
    db = Chroma.from_documents(texts, embeddings)

    # expose this index in a retriever interface
    retriever = db.as_retriever(
        search_type="similarity", search_kwargs={"k": 1}
    )
    
    return retriever, texts

# r, t = name_based_retriever(docs = None, name = 'Hanna Smith')
# t[2].page_content

# Interface

In [40]:
import gradio as gr

In [82]:
def message_and_history(input, history, name, llm = llm, verbose = True): 
    '''
    Here we are getting the user chat history and storing it in a list and adding it to the previous state.
    input: question
    history: state
    name: client name from dropdown
    llm: llm model
    retriever: RAG
    '''
    history = history or [] 
    print(history) 
    s = list(sum(history, ())) 
    print(s) 
    s.append(input) 
    print('#########################################') 
    print(s) 
    inp = ' '.join(s) 
    print(inp)
    print(f' chosen name is {name}')
    retriever, texts = name_based_retriever(docs = None, name = name)
    output = api_calling(question = inp, llm = llm, retriever = retriever, verbose = verbose) 
    history.append((input, output)) 
    print('------------------') 
    print(history) 
    print("*********************") 
    return history, history

SyntaxError: invalid syntax (2479718670.py, line 28)

In [80]:
def api_calling(question, llm, retriever, chain_type = 'stuff', verbose = True): 
    qa = RetrievalQA.from_chain_type(
        llm = llm # text generation model at the end
        , chain_type = chain_type
        , retriever = retriever #interface for fetching documents
        , verbose = verbose
    )
    respond = qa.run(question)
    
    return respond

# api_calling(question = 'Who is the article about?', llm = llm, retriever = retriever, verbose = False)

In [81]:


prompt = 'How can I help you?'
theme = gr.themes.Monochrome() #gr.themes.Soft() 
block = gr.Blocks(theme = theme)
with block:
    gr.Markdown('''<h1><center> Let's Learn More About Our (Prospective) Client! </center></h1>''')
    entity_name = gr.Dropdown(choices = sorted(['All']+get_names()), multiselect = False, label = "Choose or type in the entity's name", value = 'All')
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder = prompt, label = 'Type here:')
    state = gr.State()
    submit = gr.Button('SEND')
    submit.click(fn = message_and_history,
                 inputs = [message, state, entity_name],
                 outputs = [chatbot, state])

block.launch(share = True)
# name                    

Running on local URL:  http://127.0.0.1:7869

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




[]
[]
#########################################
['who is this document for?']
who is this document for?
 chosen name is Robert King


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
------------------
[('who is this document for?', " I don't know.")]
*********************
[('who is this document for?', " I don't know.")]
['who is this document for?', " I don't know."]
#########################################
['who is this document for?', " I don't know.", 'Where does Robert King work?']
who is this document for?  I don't know. Where does Robert King work?
 chosen name is Robert King


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
------------------
[('who is this document for?', " I don't know."), ('Where does Robert King work?', " I don't know where Robert King works.")]
*********************
[('who is this document for?', " I don't know."), ('Where does Robert King work?', " I don't know where Robert King works.")]
['who is this docume

In [84]:
state

state

In [85]:
    input = message; history = state; name = entity_name 
    '''
    Here we are getting the user chat history and storing it in a list and adding it to the previous state.
    input: question
    history: state
    name: client name from dropdown
    llm: llm model
    retriever: RAG
    '''
    history = history or [] 
    print(history) 
    s = list(sum(history, ())) 
    print(s) 
    s.append(input) 
    print('#########################################') 
    print(s) 
    inp = ' '.join(s) 
    print(inp)
    print(f' chosen name is {name}')
    retriever, texts = name_based_retriever(docs = None, name = name)

state


TypeError: 'State' object is not iterable

In [None]:
O 