In [0]:
%pip install --upgrade --quiet langchain langchain-openai chromadb beautifulsoup4

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%pip install --upgrade --quiet tiktoken pypdf sentence_transformers InstructorEmbedding chromadb

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%pip install --upgrade --quiet  docx2txt

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Restart the Python process so the packages installed by the %pip cells above are
# picked up (the %pip output notes that a restart may be needed to use updated packages).
dbutils.library.restartPython()

In [0]:
# Configures the OpenAI API key for accessing OpenAI's models.
import os
import openai
import sys

# Make OPENAI_API_KEY available without ever writing the key into the notebook.
# A key already present in the environment (e.g. injected by the cluster
# configuration) wins; otherwise it is read from a Databricks secret scope.
# NOTE(review): the scope and key names below are placeholders — point them at the
# secret that actually stores the OpenAI key in this workspace.
OPENAI_SECRET_SCOPE = "hr-chatbot"
OPENAI_SECRET_KEY = "openai-api-key"

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = dbutils.secrets.get(
        scope=OPENAI_SECRET_SCOPE, key=OPENAI_SECRET_KEY
    )

In [0]:
# Import the ChatOpenAI class, which enables creating chat functionalities using OpenAI's models.
from langchain_openai import ChatOpenAI

# Chat model shared by every chain built later in this notebook.
#   model       - the OpenAI chat model that answers the questions.
#   temperature - kept low (0.2) so answers stay predictable and consistent,
#                 which suits a chatbot that has to relay policy information reliably.
chat = ChatOpenAI(
    temperature=0.2,
    model="gpt-3.5-turbo-1106",
)


In [0]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import Docx2txtLoader

In [0]:
# Load the PDF HR documents from the DBFS folder; every file matching ./*.pdf is
# parsed with PyPDFLoader. (DOCX files are loaded separately, in the next cell.)
loader = DirectoryLoader(
    path="/dbfs/mnt/hr_documents/",
    loader_cls=PyPDFLoader,
    glob="./*.pdf",
)
data = loader.load()

In [0]:
# Load the Word HR documents from the same DBFS folder; every file matching
# ./*.docx is parsed with Docx2txtLoader.
doc_loader = DirectoryLoader(
    path="/dbfs/mnt/hr_documents/",
    loader_cls=Docx2txtLoader,
    glob="./*.docx",
)
doc_data = doc_loader.load()

In [0]:
# Merge the DOCX documents into the PDF documents so later cells work on one list.
# Only documents not already present in `data` are appended, which keeps this cell
# idempotent: a plain `data = data + doc_data` would append the DOCX documents
# again every time the cell is re-run.
data = data + [doc for doc in doc_data if doc not in data]

In [0]:
import re


def clean_text(text):
    """Return ``text`` with every run of whitespace collapsed to one space.

    Line breaks, tabs and repeated spaces all become a single space, and leading
    and trailing whitespace is removed from the result.
    """
    whitespace_run = re.compile(r"\s+", flags=re.MULTILINE)
    return whitespace_run.sub(" ", text).strip()

In [0]:
# Normalise whitespace in every loaded document. The cleaned text overwrites the
# document's page_content in place, so `data` itself is updated.
for doc in data:
    doc.page_content = clean_text(doc.page_content)

In [0]:
# Import RecursiveCharacterTextSplitter from langchain.text_splitter module.
# This class is designed for splitting text documents into smaller chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration:
#   chunk_size=500  - each chunk holds up to 500 characters.
#   chunk_overlap=0 - consecutive chunks share no text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)

# Break the loaded documents ('data') into these smaller chunks so they can be
# embedded and retrieved piece by piece.
all_splits = text_splitter.split_documents(data)


In [0]:
# Initializes a Chroma vector store with document chunks and embeddings, enabling efficient retrieval.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import hashlib

# Give every chunk a deterministic id derived from its source, page and text.
# Without explicit ids, re-running this cell inserts every chunk a second time,
# and the retriever then returns the same chunk more than once.
# NOTE(review): this relies on the Chroma wrapper upserting by id — confirm against
# the installed langchain-community / chromadb versions.
chunks_by_id = {}
for chunk in all_splits:
    fingerprint = "\x1f".join(
        [
            str(chunk.metadata.get("source", "")),
            str(chunk.metadata.get("page", "")),
            chunk.page_content,
        ]
    )
    chunk_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    # Ids have to be unique within one batch, so an exact duplicate chunk is kept once.
    chunks_by_id.setdefault(chunk_id, chunk)

# Build the Chroma vector store from the chunks:
#   documents - the text chunks produced by the splitter.
#   embedding - OpenAIEmbeddings() generates the vector for each chunk.
#   ids       - the deterministic ids computed above.
# The store is what later cells query for chunks semantically similar to a question.
vectorstore = Chroma.from_documents(
    documents=list(chunks_by_id.values()),
    embedding=OpenAIEmbeddings(),
    ids=list(chunks_by_id),
)

In [0]:
# Wrap the vector store in a retriever for semantic search.
# The number of results has to be passed through `search_kwargs`; a bare `k=4`
# keyword on as_retriever() is not a search setting, so it would not control how
# many chunks come back.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Try the retriever on a sample question about health plan coverages at the time
# of hire. invoke() embeds the query and searches the vector store for the most
# semantically similar chunks.
docs = retriever.invoke("what are health Plan Coverages at Time of Hire?")

# Display the retrieved chunks (at most 4) so the match quality can be inspected.
docs


[Document(page_content='CORE Benefits Plan Coverages at Time of Hire The company’s health insurance plan consists of the following core benefits: Medical Insurance (including prescription drug coverage) Dental Insurance Vision Insurance Basic Life Insurance Accidental Death and Dismemberment Insurance Employee Assistance Program Medical, dental and vision coverage is offered to all benefit eligible employees as a package, however, you can elect to opt -out of dental and/or vision and remain on the medical plan only', metadata={'page': 4, 'source': '/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf'}),
 Document(page_content='CORE Benefits Plan Coverages at Time of Hire The company’s health insurance plan consists of the following core benefits: Medical Insurance (including prescription drug coverage) Dental Insurance Vision Insurance Basic Life Insurance Accidental Death and Dismemberment Insurance Employee Assistance Program Medical, dental and vision coverage is offered to all benefit 

In [0]:
# Import the textwrap module for text formatting
import textwrap

# Wrap long text for display while keeping the line breaks already in it
def wrap_text_preserve_newlines(text, width=110):
    """
    Wrap ``text`` to at most ``width`` columns, one input line at a time.

    The text is split on newline characters, each piece is wrapped on its own with
    ``textwrap.fill``, and the pieces are joined with newlines again. Existing line
    breaks (including blank lines) therefore survive; only over-long lines are broken.
    """
    return "\n".join(
        textwrap.fill(segment, width=width) for segment in text.split("\n")
    )

# Pretty-print a QA chain response: the answer first, then the sources behind it
def process_llm_response(llm_response):
    """
    Print the answer and the source paths contained in ``llm_response``.

    ``llm_response`` is read like a mapping: ``"result"`` holds the answer text and
    ``"source_documents"`` holds the retrieved documents. The answer is wrapped with
    wrap_text_preserve_newlines, and one line is printed per source document, taken
    from that document's ``metadata["source"]``. Nothing is returned.
    """
    answer = llm_response["result"]
    print(wrap_text_preserve_newlines(answer))

    # One line per retrieved document, in retrieval order.
    print("\n\nSources:")
    retrieved = llm_response["source_documents"]
    for doc in retrieved:
        print(doc.metadata["source"])


In [0]:
# Import PromptTemplate from langchain.prompts to create structured prompts for language models.
from langchain.prompts import PromptTemplate

# Prompt for HR document questions. It casts the model as an HR assistant, tells it
# to answer only from the supplied context, to admit when it does not know, and to
# close with "thanks for asking!". {context} and {question} are filled in per query.
template = """As an HR assistant, your task is to address inquiries about the company's HR documents. 
When presented with a query from an employee, use the relevant information from the HR documents to formulate an answer that clarifies or further explores the employee's concern.
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.  Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
# Bind the template text to the two placeholders it contains.
QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# Import RetrievalQA from langchain.chains to integrate question answering capabilities with document retrieval.
from langchain.chains import RetrievalQA

# Assemble the retrieval QA chain from the pieces defined above: the chat model
# answers, the retriever supplies HR document chunks as context, and the custom
# prompt shapes the answer. return_source_documents=True keeps the retrieved
# documents in the response so the sources can be listed.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=retriever,
)


In [0]:
query = "give list of federal holidays?"
# Run the query through the QA chain, which retrieves matching HR document chunks
# and generates an answer from them. invoke() is used because calling the chain
# object directly (qa_chain(query)) is deprecated.
llm_response = qa_chain.invoke({"query": query})

# Print the answer, wrapped for readability, followed by the source documents
# it was drawn from.
process_llm_response(llm_response)

The list of federal holidays for 2023-2024 is as follows:
- New Year’s Day
- Birthday of Martin Luther King, Jr.
- Washington’s Birthday
- Memorial Day
- Juneteenth National Independence Day
- Independence Day
- Labor Day
- Columbus Day
- Veterans Day
- Thanksgiving Day
- Christmas Day

Thanks for asking!


Sources:
/dbfs/mnt/hr_documents/Federal Holidays 2023 - 2024.docx
/dbfs/mnt/hr_documents/Federal Holidays 2023 - 2024.docx
/dbfs/mnt/hr_documents/Federal Holidays 2023 - 2024.docx
/dbfs/mnt/hr_documents/Federal Holidays 2023 - 2024.docx


In [0]:
query = "What are the employee policies ?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

The employee policies at {ORGANIZATION NAME} are outlined in the Employee Handbook. This includes guidelines
on privileges and obligations of employment, the voluntary at-will employment policy, and the ability for
policies to be modified, amended, or revoked by the organization at any time without advance notice. Thanks
for asking!


Sources:
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf


In [0]:
query = "give details about the retirement plan ?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

The retirement plan at {ORGANIZATION NAME} is available for eligible full-time and part-time employees who are
21 years of age or older. The organization contributes to the employee’s retirement plan when the employee
becomes vested after one year of employment. More information about the retirement plan will be provided to
you at the time of employment. Thanks for asking!


Sources:
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf


In [0]:
query = "What do I do if I find a glitch?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

Thanks for asking! If you find a glitch, the first thing you should do is to document the details of the
glitch, including screenshots and steps to reproduce the issue. This will help the development team to
investigate and isolate the problem. Once you have documented the glitch, you can create a bug report
following the guidelines provided in the HR document "Creating Bug Reports (COE-KG-XXX)". Remember to
prioritize the glitch based on its impact on core functionalities, and provide as much information as possible
to help with the investigation.


Sources:
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx


In [0]:
query = "At what point do I get health benefits after joining?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

Coverage will be effective the 1st of the month following 30 days of employment. For example, if you are hired
on February 11th, your coverage will be effective on April 1st. Thanks for asking!


Sources:
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf


In [0]:
query = "How serious does a bug need to be to get top priority?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

Thanks for asking! According to our HR documents, bugs that affect the core functionalities of the product are
considered high priority. These are the bugs that significantly impact the performance or usability of the
product. Bugs that have workarounds but still degrade the core functionalities are considered medium priority,
and bugs that do not affect critical/core functions of the product are low priority. I hope this helps clarify
the seriousness of bug priorities.


Sources:
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx


In [0]:
query = "What's the remote work policy look like?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

The remote work policy allows employees to work from home or another location outside of the office. It
outlines the expectations and guidelines for remote work, including communication, work hours, and equipment
usage. Thanks for asking!


Sources:
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf


In [0]:
query = "What options do I have for dental care?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

Thanks for asking! You have the option to receive dental services from any dental provider with our open
access Dental PPO plan. This plan allows you to receive services from both in-network and out-of-network
providers without the need for a Primary Dental Provider or referrals to specialists. The plan also covers a
range of services, including preventative services such as exams, cleanings, and X-rays, as well as basic and
major services. If you need to find an in-network dentist, you can search for a participating provider through
our network.


Sources:
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf


In [0]:
query = "Need to update my address. Who do I talk to?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

You will need to notify your supervisor in writing of any changes in your personal mailing address. Thanks for
asking!


Sources:
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Benefit Guide.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf
/dbfs/mnt/hr_documents/Sample Employee Handbook.pdf


In [0]:
query = "Can you show me what a good bug report is ?"

# invoke() replaces the deprecated direct call on the chain object.
llm_response = qa_chain.invoke({"query": query})

process_llm_response(llm_response)

Thanks for asking! A good bug report should include proof that you observed the problem, a general location or
starting point to investigate the issue, and steps to reproduce the bug. It should also prioritize the defect
list into three main categories: high, medium, and low priority. Screenshots and error message capturing are
also important in a bug report. If you need more detailed information, you can refer to the document "Creating
Bug Reports (COE-KG-XXX)" for a guide on how to handle bugs and create bug reports in user stories.


Sources:
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx
/dbfs/mnt/hr_documents/Bug Reporting Directions.docx


In [0]:
# Import the ConversationBufferMemory class from langchain.memory
# This class is used to maintain a memory buffer for the conversation, allowing the chatbot to have
# a context-aware conversation based on previous interactions.
from langchain.memory import ConversationBufferMemory

# Conversation memory for the conversational chain.
#   memory_key="chat_history" - the key under which the conversation history is stored.
#   return_messages=True      - the history is kept and handed back as the exchanged
#                               messages, so earlier turns can be used as context.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)


In [0]:
from langchain.chains import ConversationalRetrievalChain

# Conversational variant of the QA chain: same chat model and retriever, plus the
# memory object so follow-up questions can build on earlier turns.
qa = ConversationalRetrievalChain.from_llm(
    llm=chat,
    memory=memory,
    retriever=retriever,
)

In [0]:
question = "Plan Coverages at Time of Hire?"
# invoke() replaces the deprecated direct call on the chain object.
result = qa.invoke({"question": question})

In [0]:
# Display the answer text from the chain's result as this cell's output.
result["answer"]

'The plan coverages at the time of hire include Medical Insurance (including prescription drug coverage), Dental Insurance, Vision Insurance, Basic Life Insurance, Accidental Death and Dismemberment Insurance, and Employee Assistance Program. Medical, dental, and vision coverage is offered as a package, but employees can opt-out of dental and/or vision and remain on the medical plan only.'

In [0]:
question = "what is covered in vision?"
# invoke() replaces the deprecated direct call on the chain object.
result = qa.invoke({"question": question})

In [0]:
# Display the answer text from the chain's result as this cell's output.
result["answer"]

'The vision insurance plan covers an annual exam with a $5 copay for in-network providers, and a reimbursement of up to $18 for out-of-network providers. It also includes options for contact lens fitting and follow-up, with up to $40 reimbursement for out-of-network providers and 10% off retail costs.'

In [0]:
question = "what are the lens options?"
# invoke() replaces the deprecated direct call on the chain object.
result = qa.invoke({"question": question})
# Display the answer text as this cell's output.
result["answer"]

'The options for lenses include UV treatment, tint (solid and gradient), standard plastic scratch coating, standard polycarbonate for adults, standard anti-reflective coating, polarized, and other add-ons. There are also options for contact lenses, including conventional, disposable, and medically necessary, with specific copays and allowances.'