# FHL demo

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from chroma2 import Chroma2
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.callbacks import get_openai_callback
from langchain.embeddings import OpenAIEmbeddings
from pathlib import Path
from typing import List
from langchain.document_loaders import TextLoader


In [None]:
def load_dir(path, glob):
    """Load every regular file under *path* that matches the *glob* pattern.

    Each matching file is read with ``TextLoader`` and the documents it
    returns are appended, in glob iteration order, to a single flat list.
    Entries that match the pattern but are not regular files are skipped.
    """
    loaded = []
    for entry in Path(path).glob(glob):
        if not entry.is_file():
            continue
        loaded.extend(TextLoader(str(entry)).load())
    return loaded

def chunks(xs, n):
    """Return a generator over consecutive slices of *xs* of length *n*.

    A chunk size below 1 is treated as 1. The final slice may be shorter
    than *n* when ``len(xs)`` is not a multiple of it.
    """
    step = n if n > 1 else 1
    starts = range(0, len(xs), step)
    return (xs[start:start + step] for start in starts)

In [None]:
def load_handbook_all():
    """Load every markdown file found anywhere under ``data/handbook/``."""
    handbook_root = "data/handbook/"
    markdown_pattern = "**/*.md"
    return load_dir(handbook_root, glob=markdown_pattern)

def load_handbook():
    """Load markdown files from a fixed subset of handbook sections.

    Only the sections listed below are read; each is searched recursively
    for ``*.md`` files and the results are concatenated in listing order.
    """
    sections = (
        "travel",
        "paid-time-off",
        "people-policies",
        "hiring",
        "incentives",
        "legal",
    )
    collected = []
    for section in sections:
        collected.extend(load_dir(f'data/handbook/{section}/', glob="**/*.md"))
    return collected
def split_documents(documents):
    """Split *documents* into smaller documents and return them.

    Uses a ``SpacyTextSplitter`` built via ``from_tiktoken_encoder`` with a
    chunk size of 800 and no overlap between chunks.
    """
    # Alternative splitter tried earlier:
    # TokenTextSplitter(chunk_size=800, chunk_overlap=0)
    splitter = SpacyTextSplitter.from_tiktoken_encoder(
        chunk_size=800,
        chunk_overlap=0,
    )
    pieces = splitter.split_documents(documents)
    return pieces

    

### Initialize the vector store, also loading anything previously persisted under the "collection_name"

In [None]:
# Embedding function handed to the vector store below.
embeddings = OpenAIEmbeddings()

# Vector store bound to the "handbook" collection.
vectorstore = Chroma2(
    collection_name="handbook",
    embedding_function=embeddings,
)

## Load and process the employee handbook corpus

In [None]:
# Load the whole handbook, split it into chunks, and keep only chunks whose
# text is shorter than 8192 characters.
handbook_docs = load_handbook_all()
splitted = [
    piece
    for piece in split_documents(handbook_docs)
    if len(piece.page_content) < 8192
]

In [None]:
total = len(splitted)

if total >= 1000:
    # Large corpus: add documents in fixed-size batches and pace the batches
    # so that no more than BATCH_PER_MIN of them are sent per minute.
    # NOTE(review): MAX_PER_MIN presumably mirrors an upstream per-minute
    # rate limit; confirm against the embedding provider's quota.
    import time
    MAX_PER_MIN = 1500
    BATCH_PER_MIN = 3
    SPLIT_COUNT = MAX_PER_MIN / BATCH_PER_MIN
    MIN_TIME_PER_BATCH = 60 / BATCH_PER_MIN

    doc_chunks = chunks(splitted, int(SPLIT_COUNT))
    count = 0
    for docs in doc_chunks:
        print("chunk", len(docs))
        start = time.perf_counter()
        vectorstore.add_documents(documents=docs)
        elapsed = time.perf_counter() - start
        count += len(docs)
        # No pause after the final batch, nor when the batch itself already
        # took at least the minimum time slot.
        if count >= total or elapsed >= MIN_TIME_PER_BATCH:
            continue
        print("sleeping for", MIN_TIME_PER_BATCH - elapsed)
        time.sleep(MIN_TIME_PER_BATCH - elapsed)
else:
    # Small corpus: add everything in a single call.
    vectorstore.add_documents(splitted)

In [None]:
# Call persist() on the store's underlying DB object, reached through the
# private attributes _client and _db (presumably this writes the collection
# to disk so a later session can reload it -- confirm against Chroma2).
# NOTE: requires patching duckdb, as there is a parser error otherwise.
vectorstore._client._db.persist()
#vectorstore._client._db.get_save_folder()

## Retrieval

In [7]:

# LLM used to answer questions; temperature is pinned to 0 and verbose
# output is enabled.
ai = OpenAI(
    temperature=0,
    verbose=True,
)

In [8]:
import re
import markdown
from IPython.display import display, HTML

# Instruction header placed at the start of every prompt built by
# construct_prompt(). It names exactly three contexts (#0, #1, #2), which
# matches the k=3 similarity search in construct_prompt(); keep the two in
# sync. question() parses the model's reply by looking for "Context:" followed
# by "Answer:", so the output format described here must not drift from that.
# NOTE(review): the wording has typos ("Otherwise, Provide", "should of the
# form"); left untouched because this text is sent to the model verbatim.
template_hdr = """Given a question and the following contexts (labeled as #0, #1, and #2), follow these instructions.
First, see if the question can be answered solely by the information provided in the contexts. If it cannot, return this output:
Answer: Not found

Otherwise, Provide a verbose, comprehensive answer based solely on the information found in the provided contexts. Your output should of the form:
    Context: <comma separated array containing names of all contexts referenced in the answer>
    Answer: <answer to the question>
"""

def construct_prompt(query):
    """Build the full LLM prompt for *query*.

    Runs a similarity search for the three closest documents, appends each
    one to the instruction header as a numbered ``=== CONTEXT #i ===``
    section, and finishes with the question itself.

    Returns a ``(prompt, matches)`` tuple where ``matches`` is the search
    result list, in the same order as the numbered contexts.
    """
    matches = vectorstore.similarity_search(query, k=3)
    context_sections = [
        f"=== CONTEXT #{index} ===\n    {match.page_content}\n    ===\n    "
        for index, match in enumerate(matches)
    ]
    prompt = template_hdr + "".join(context_sections) + f"Question: {query}"
    return prompt, matches

def _parse_response(response):
    """Return ``(context, answer)`` parsed from *response*, or ``None``.

    The reply must contain a ``Context:`` section followed by an ``Answer:``
    section (matched case-insensitively); any other shape yields ``None``.
    """
    match = re.search(
        r'CONTEXT:\s*(.*?)\s*ANSWER:\s*(.*?)$',
        response,
        re.DOTALL | re.IGNORECASE,
    )
    if match is None:
        return None
    return match.group(1), match.group(2)


def _is_not_found(answer):
    """Return True when *answer* is just "not found", ignoring case,
    surrounding whitespace and a trailing period."""
    return answer.strip().rstrip(".").strip().lower() == "not found"


def _cited_sources(context, source_results):
    """Map the comma-separated context labels in *context* to documents.

    The first digit in each non-blank label is used as an index into
    *source_results*. Raises ``ValueError`` for a label without a digit and
    ``IndexError`` for an index beyond the available results.
    """
    cited = []
    for label in context.split(","):
        if not label.strip():
            continue
        digit = re.search(r'([0-9])', label)
        if digit is None:
            raise ValueError(f"context label without an index: {label!r}")
        cited.append(source_results[int(digit.group(1))])
    return cited


def _render_answer(answer, sources):
    """Build the HTML fragment showing *answer* followed by its *sources*."""
    import html  # local import so this cell does not depend on another cell

    # The answer and the source path are plain text (model output / file
    # path), so escape them; the source body is rendered from markdown.
    parts = [f"""
    <h2><div style="width:800px">{html.escape(answer)}</div></h2><br>
    <h4>
    Sources:
    """]
    for source in sources:
        source_name = html.escape(str(source.metadata['source']))
        parts.append(f"""<div><h3><b>{source_name}</b></h3></div>
      <i><div style="background-color:#333;margin-bottom:32px">...{markdown.markdown(source.page_content)}...</div></i>
      """)
    parts.append("</h4>")
    return "".join(parts)


def _report_failure(reason, prompt, response, context):
    """Show "No answer found" if the reply says so, else print debug details."""
    if re.search("not found", response, re.IGNORECASE):
        display(HTML("""<h2>No answer found</h2>"""))
        return
    print("EXCEPTION!", reason)
    print(prompt)
    print(response)
    print(context)
    print("\n\n\n", response)


def question(q):
    """Ask the LLM question *q* and display the answer with its sources.

    The prompt is built by ``construct_prompt`` and sent to ``ai``. The reply
    is parsed into a context list and an answer:

    * If the answer is "not found", all retrieved documents are shown as
      sources.
    * Otherwise only the documents cited in the context list are shown.

    If the reply cannot be parsed or rendered, "No answer found" is displayed
    when the reply mentions "not found"; otherwise the prompt and reply are
    printed for debugging. Nothing is returned.
    """
    prompt, source_results = construct_prompt(q)
    response = ai(prompt)

    parsed = _parse_response(response)
    if parsed is None:
        # Includes the bare "Answer: Not found" reply, which has no
        # "Context:" section.
        _report_failure(
            "response is not in the 'Context: ... Answer: ...' form",
            prompt,
            response,
            "",
        )
        return

    context, answer = parsed
    try:
        if _is_not_found(answer):
            sources = source_results
        else:
            sources = _cited_sources(context, source_results)
        display(HTML(_render_answer(answer, sources)))
    except Exception as e:
        # Notebook boundary: keep the demo best-effort and surface the
        # details instead of letting the cell crash.
        _report_failure(e, prompt, response, context)

In [10]:

# Leave policy
#question("How much PTO do I get?")
#question("How do I file time off?")

# Travel policy
#question("I'm very tall. What are my options for flights?")
#question("What's our policy re: Covid-19?")

# Unknown (not answered in DB)
#question("Are chainsaws ok?")
#question("When do paychecks arrive?")

# Hiring
#question("How do I submit someone for a referral?")
#question("Is there a referral bonus?")

# Legal
#question("How do I create a NDA?")
#question("Is there a bonus awarded for getting a patent?")

# Promo
# question("What dates are promotions finalized?")
# question("What are the levels in the career ladder?")

# Culture
# question("What are the cultural values at Gitlab?")

# Problematic questions
# question("Where should decisions made in a meeting be captured?") # includes extraneous information about board meetings.
# question("What happens when the NDA is rejected?") # Nonsense answer.
