In [2]:
## Load environment variables

from dotenv import load_dotenv
load_dotenv()

True

In [3]:
## Define the LLM Model and the Embedding model

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
llm=ChatOpenAI(model="gpt-4o-mini",temperature=0)
embeddings=OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1536)

In [4]:
## Load the documents

from langchain_community.document_loaders import Docx2txtLoader
loader_bsw = Docx2txtLoader("BSW SOM 0.1.docx")
loader_npac = Docx2txtLoader("NPAC COREX SOM v2.0.docx")
data_bsw = loader_bsw.load()
data_npac = loader_npac.load()

In [5]:
## Define a custom splitter

import re
from langchain_openai import ChatOpenAI
from typing import Any, List
from langchain_text_splitters import TextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document

class GPTSplitter(TextSplitter):
    def __init__(self, model_name: str = "gpt-4o-mini", **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.model = ChatOpenAI(model=model_name)

        self.prompt = ChatPromptTemplate.from_template(
            """You are an expert in analyzing and structuring technical documentation, particularly Service Operation Manuals (SOMs). Your task is to divide the given text into coherent, meaningful chunks that preserve both the document's structure and its detailed content. Each chunk should encapsulate a complete section or subsection, including all relevant details, explanations, and bullet points.

                Follow these guidelines:
                1. Preserve the hierarchical structure (e.g., main sections, subsections) and include all content under each header.
                2. Keep related information together (e.g., lists, tables, contact information, detailed explanations).
                3. Ensure each chunk is self-contained and includes all necessary context and details.
                4. Aim for chunks that cover complete sections or logical groupings of subsections.
                5. Include all text content, not just headers or bullet points.
                6. Maintain the original formatting and structure within each chunk.

                Wrap each chunk in <<<>>> markers.

                Example:
                <<<3. Centiq Managed Service
                3.1. Introduction
                • Centiq provides comprehensive managed services for SAP environments
                • Our team of experts ensures 24/7 monitoring and support
                • We offer proactive maintenance and optimization
                [Include all bullet points and any additional explanatory text here]

                3.2. Managed Service Description
                3.2.1. Service Operations
                • 24/7 monitoring of SAP systems and infrastructure
                • Incident management and resolution
                • Performance tuning and optimization
                [Include all bullet points and detailed descriptions of each service operation]>>>

                Now, process the following text from the Service Operation Manual, ensuring to include ALL content and details:

                {text}"""
        )
        self.output_parser = StrOutputParser()
        self.chain = (
            {"text": RunnablePassthrough()}
            | self.prompt
            | self.model
            | self.output_parser
        )

    # def split_text(self, text: str) -> List[str]:
    #     response = self.chain.invoke({"text": text})
    #     # Use regex to split properly by <<< and >>> markers
    #     chunks = re.findall(r'<<<(.*?)>>>', response, re.DOTALL)
    #     return [chunk.strip() for chunk in chunks]
    
    def split_text(self, text: str) -> List[Document]:
      chunks = self.chain.invoke(text).split('<<<')[1:]  # Split the result and remove the first empty element
      return [Document(page_content=chunk.strip('>>>').strip()) for chunk in chunks]

In [6]:
## Split the documents using the custom splitter

gpt_splitter = GPTSplitter()
bsw_docs = gpt_splitter.split_text(data_bsw)
npac_docs = gpt_splitter.split_text(data_npac)

In [7]:
for doc in bsw_docs:
    doc.metadata["source"] = "BSW"

for doc in npac_docs:
    doc.metadata["source"] = "NPAC"

In [8]:
all_chunks = bsw_docs + npac_docs

In [9]:
# Create Chroma vector store
from langchain_chroma import Chroma

Chroma.from_documents(
    documents=all_chunks,
    embedding=embeddings,
    persist_directory="./SOM"
)

<langchain_chroma.vectorstores.Chroma at 0x240e31a5df0>

In [10]:
## loading the vector database from local db

vectordb=Chroma(persist_directory="SOM",embedding_function=embeddings)

In [11]:
retriever=vectordb.as_retriever(search_kwargs={"k": 10})

In [12]:
retriever.invoke("Is client copy in scope for BSW?")

[Document(metadata={'source': 'BSW'}, page_content='2. Customer\n\nBSW Timber (“BSW”) has acquired the Building and Supply Solutions division of SCA Wood UK, which includes the manufacture and distribution of timber and associated products to the home improvement and builders merchant sectors. \n\nAs part of the transition of the Building and Supply Solutions division to BSW there has been a transfer of the company data previously maintained as company 629 in the SCA Wood SAP ECC system. This transfer has been realised through the following high-level activities for which Centiq (and its partner SNP) have been engaged:\n\n- Creation of a target SAP landscape in Azure (“Target System”)\n- Analysis, mapping and migration (“Carve-out”) of company 629 data to Target System\n- Upgrade of SAP ECC in the Target System to SAP S/4\n\nFollowing the completion of the Carve-out Centiq is providing technical support, and systems management and maintenance services (“Managed Services”) for a period 

In [20]:
## Generating the multiquery retriever

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
import re


QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an expert query rewriter specializing in SAP Basis managed services. Your task is to take an original query and generate at least 3 different versions of the same query. These variations should:

                1. Maintain the core intent of the original query
                2. Use different phrasings, synonyms, and technical terms
                3. Vary in complexity, from simple to more detailed
                4. Include relevant SAP Basis-specific terminology where appropriate
                5. Consider different aspects or perspectives of the original question
                6. Utilize common SAP Basis synonyms and alternative terms where possible
                7. Consider using alternate terms where appropriate, e.g system refresh for system copy, etc

                Please provide at least 3 rewritten versions of this query, ensuring that each version is distinct and adds value to the search process. Label each rewritten query clearly.

                Provide these alternative question like this:
                    <<question1>>
                    <<question2>>
                Only provide the query, no numbering.

                Remember to consider various user roles (e.g., SAP Basis administrators, support staff, managers) and their potential ways of asking the same question.

                Original question: {question}""",
)


def split_and_clean_text(input_text):
    return [item for item in re.split(r"<<|>>", input_text) if item.strip()]

multiquery_chain = (
    QUERY_PROMPT | llm | StrOutputParser() | RunnableLambda(split_and_clean_text)
)

In [21]:
list_of_questions=multiquery_chain.invoke("How many client copies are allowed in a year?")
list_of_questions

['What is the maximum number of client copies permitted annually in SAP systems?',
 'How many times can we perform client copies within a single year in our SAP environment?',
 'What is the annual limit for executing client copies in SAP Basis management?']

In [22]:
## Creating a function for returning unique documents(remove duplicate docs)

def flatten_and_unique_documents(documents):
    flattened_docs = [doc for sublist in documents for doc in sublist]

    unique_docs = []
    unique_contents = set()
    for doc in flattened_docs:
        if doc.page_content not in unique_contents:
            unique_docs.append(doc)
            unique_contents.add(doc.page_content)

    return unique_docs

def find_rel_docs(query):
    list_of_questions = multiquery_chain.invoke(query)
    docs = [retriever.invoke(q) for q in list_of_questions]
    final_docs=flatten_and_unique_documents(documents=docs)
    return final_docs

In [23]:
found_docs=find_rel_docs(query="How many system copies are allowed in a year for BSW?")
found_docs

[Document(metadata={'source': 'BSW'}, page_content='3.4. Ticket Volume\n\nThe Managed Service anticipates the following ticket volumes, which will be subject to the Fair Usage Policy and will be reviewed on a quarterly basis in line with the requirements set out in Clause 3 of the Agreement:\n\n- Triage - 40 per annum\n- Service Requests - 100 per annum\n- Tickets Annual - 140 per annum\n\nThe annual ticket volume covers inbound tickets from BSW only. Any tickets created by Centiq as part of the routine delivery of the Managed Services, e.g. Release Management, do not contribute to these volumes.>>>'),
 Document(metadata={'source': 'BSW'}, page_content='16. Appendix\n\nA. Technical Solution\n\nBSW Asset List\n\nThe copy attached is uncontrolled. For the latest version refer to:\n\nTo be updated\n\nBSW Azure Infrastructure Landscape Diagram\n\nChris to provide\n\nThe copy attached is uncontrolled. For the latest version refer to:\n\nTo be updated\n\nBSW MIM process\n\nAwaited from BSW\n

In [24]:
retrieve_chain=multiquery_chain | RunnableLambda(find_rel_docs)

In [25]:
retrieve_chain.invoke("How many system copies are allowed in a year?")

[Document(metadata={'source': 'BSW'}, page_content='10. Release Management\n\nRelease Management\n\nIncluded as BAU\n\nProject \n\nOperating System Patching\n\n(SUSE)\n\nOne upgrade per annum to a new version or service pack\n\nBi-annual minor release updates\n\nMinimum quarterly patching (security & hotfixes, features) based on SUSE Manager baseline\n\nHANA Database Patching \n\nOne major HANA release per annum\n\nBi-annual minor release update\n\nMinimum quarterly but as required patching (security & hotfixes, features)\n\nSAP Kernel Patching \n\nBi-annually\n\nRequires scoping and pricing\n\nSAP ST Plugin patching \n\nBi-annually\n\nRequires scoping and pricing\n\nInstallation of new add-ons or plugins\n\nNot currently included in BAU\n\nRequires scoping and pricing\n\nFeature pack installation/update\n\nNot currently included in BAU\n\nRequires scoping and pricing\n\nEHP upgrades\n\nNot currently included in BAU\n\nRequires scoping and pricing\n\nSupport Pack Patching\n\nNot curren

In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

answer_template = """You are an AI assistant tasked with answering questions based on the provided documents. Your goal is to generate accurate and informative answers using only the information contained in these documents.

When answering, please follow these guidelines:
1. Analyze the given question and the provided context documents carefully.
2. Formulate your answer in a clear, concise, and structured manner.
3. Present your answer in a point-wise format, using numbered points for main ideas or steps.
4. Use sub-points (a, b, c) if necessary to provide additional details under a main point.
5. Ensure each point is directly relevant to the question and supported by the context documents.
6. If the information in the documents is insufficient to fully answer the question, state this clearly and provide any partial information that is available.
7. Do not contradict yourself or provide inconsistent information.

Question: {question} 

Context: {context}

Answer:"""

answer_prompt = ChatPromptTemplate.from_template(answer_template)

llm=ChatOpenAI(model="gpt-4o-mini",temperature=0.1)

query="Who is the main contact at Centiq?"

def format_docs(relevant_docs):
    return "\n\n".join(doc.page_content for doc in relevant_docs)



answer_chain = (
    {"context": retrieve_chain| format_docs, "question": RunnablePassthrough()} # type: ignore
    | answer_prompt
    | llm
    | StrOutputParser()
)

In [27]:
def chatbot():
    print("Welcome to the Service Operations Manual Chatbot!")
    print("Ask questions about BSW or NPAC, or type 'quit' to exit.")
    
    while True:
        query = input("\nYour question: ").strip()
        
        if query.lower() == 'quit':
            print("Thank you for using the chatbot. Goodbye!")
            break
        
        if not query:
            print("Please enter a valid question.")
            continue
        
        print("Thinking...")
        
        try:
            for chunk in answer_chain.stream(query):
                print(chunk, end="", flush=True)
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            print("Please try asking your question in a different way.")

# Run the chatbot
if __name__ == "__main__":
    chatbot()


Welcome to the Service Operations Manual Chatbot!
Ask questions about BSW or NPAC, or type 'quit' to exit.
Thinking...
The main contacts for NPAC (Nordic Packaging and Container Holdings Limited) are as follows:

1. **Gavin McKay**
   - **Job Title:** Group IT Manager
   - **Email Address:** gavin.mckay@corexgroup.com
   - **Contact Number:** 
     - Telephone: +44 (0) 1204 323918
     - Mobile: +44 (0) 7342 997352

2. **Ismo Savolainen**
   - **Job Title:** Global SAP Manager
   - **Email Address:** ismo.savolainen@corexgroup.com
   - **Contact Number:** +358 40 9618945

These individuals are the primary points of contact for any inquiries or communications related to NPAC.Thinking...
The provided documents do not specify the exact number of system copies that are in scope for BSW. However, they do mention the following relevant points:

1. **System Management Activities**:
   - The documents outline various activities related to system management, including local client creation, cop