In [1]:
import numpy as np
import langchain
import os
import openai
from dotenv import load_dotenv

In [2]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, DocArrayInMemorySearch

##### texts

In [4]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature = 0.0) # language model

In [5]:
embeddings = OpenAIEmbeddings()

### notes:
chain_type = 
- map_reduce: look at all documents that might contain the answer and returns the summary as the final answer. Can work with any number of documents. Treat each document independantly -> may lead to redundancy. makes many calls. can be batched and run parallel. Can also be used for summarization
- Refine: builds upon the answer of the previos document. slow.
- map_rerank: do a single call to each document, score each answer and chooses the highest score as the respond. SHould be refined by explaining to the model how to define the scoring and how to choose the best one.
- stuff_method: combines all documents into one and extract the answer from it.

# Loading all documents

In [6]:
def load_htmls():
    all_files = [f for f in os.listdir('data') if f.endswith('.html')] # every html file in the folder
    docs = []
    for file in all_files:
        doc = UnstructuredHTMLLoader('data/'+file).load() # loading each document
        parsed_doc_name = file.split('/')[0].split('_') 
        first_name = parsed_doc_name[0]
        last_name = parsed_doc_name[1]
        doc_type = parsed_doc_name[2].split('.')[0] 
        # print(f'name:{first_name}, family_name:{last_name}, doc_type = {doc_type}')
        # adding to each document metadata for later easier search
        doc[0].metadata['name'] = ' '.join([first_name, last_name])
        doc[0].metadata['doc_type'] = doc_type # -> may not use it, keeping it for now
        docs.extend(doc)
    return docs

# test = load_htmls()
# test[-1].page_content

'Personal Email: N/A'

In [7]:
def get_names(docs = None):
    ''' get a list of all names'''
    if docs is None:
        docs = load_htmls()
    return list(set([doc.metadata['name'] for doc in docs]))

get_names()

['Mariann Avocado',
 'Hanna Smith',
 'Jared Livinglife',
 'Zeus Manly',
 'Velvet Throat',
 'Helen Troy',
 'Julia Harpman',
 'Robert King',
 'Aphrodite Greek',
 'Jerry Smith']

## Name based retriever

In [8]:
def name_based_retriever(docs = None, name = 'All', embeddings = OpenAIEmbeddings(), chunk_size = 1000, chunk_overlap = 0):
    '''
    Creats custom retriever based on the name chosen by the user
    '''
    if docs == None:
        docs = load_htmls()
    
    if name == 'All':
        print('all')
        documents = docs
    else:
        print(f'name is {name}')
        documents = [doc for doc in docs if doc.metadata['name'] == name] 
    # split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    #vector database
    db = Chroma.from_documents(texts, embeddings)

    # expose this index in a retriever interface
    client_filter = {'client_name': {'$eq': name}}
    retriever = db.as_retriever(
        search_type="similarity", search_kwargs={"k": 5}
    )
    
    return retriever, texts

# r, t = name_based_retriever(docs = None, name = 'Mariann Avocado')
# t[0]

In [23]:
name = 'Robert King'
r, t = name_based_retriever(docs = None, name = name)
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="map_reduce",
    retriever=r,
    return_source_documents=False,
    verbose=False,
)

name is Robert King


In [28]:
qa('What industry does Velvet Throat work at?')['result']

' Velvet Throat works in the hospitality industry.'

In [29]:
qa(f'What industry does {name} work at?')['result']

' Robert King works in the hedge fund industry.'

In [35]:
qa(f'Where has {name} donated money to?')['result']

' Robert King has donated $1 million to a local charity, the Red Cross, $500,000 to the Boys & Girls Club, $250,000 to Habitat for Humanity, $100,000 to the Democratic National Committee, and $50,000 to the California Democratic Party.'

# Interface

In [37]:
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
def message_and_history(input, history, name, llm = llm, verbose = True): 
    '''
    Here we are getting the user chat history and storing it in a list and adding it to the previous state.
    input: question
    history: state
    name: client name from dropdown
    llm: llm model
    retriever: RAG
    '''
    history = history or [] 
    print(history) 
    s = list(sum(history, ())) 
    print(s) 
    s.append(input) 
    print('#########################################') 
    print(s) 
    inp = ' '.join(s) 
    print(inp)
    print(f' chosen name is {name}')
    retriever, texts = name_based_retriever(docs = None, name = name)
    output = api_calling(question = inp, llm = llm, retriever = retriever, verbose = verbose) 
    history.append((input, output)) 
    print('------------------') 
    print(history) 
    print("*********************") 
    return history, history

In [41]:
def api_calling(question, llm, retriever, chain_type = 'map_reduce', verbose = True): 
    qa = RetrievalQA.from_chain_type(
        llm = llm # text generation model at the end
        , chain_type = chain_type
        , retriever = retriever #interface for fetching documents
        , verbose = verbose
    )
    respond = qa.run(question)
    
    return respond

# api_calling(question = 'Who is the article about?', llm = llm, retriever = retriever, verbose = False)

In [42]:


prompt = 'How can I help you?'
theme = gr.themes.Monochrome() #gr.themes.Soft() 
block = gr.Blocks(theme = theme)
with block:
    gr.Markdown('''<h1><center> Let's Learn More About Our (Prospective) Client! </center></h1>''')
    entity_name = gr.Dropdown(choices = sorted(['All']+get_names()), multiselect = False, label = "Choose or type in the entity's name", value = 'All')
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder = prompt, label = 'Type here:')
    state = gr.State()
    submit = gr.Button('SEND')
    submit.click(fn = message_and_history,
                 inputs = [message, state, entity_name],
                 outputs = [chatbot, state])

block.launch(share = True)
# name                    

Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [84]:
state

state