In [47]:

from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Extract the list part from the string
import re
from IPython.display import Image, display, Markdown

# from config import faiss_folder
import time

from langchain_core.messages import HumanMessage 
from operator import itemgetter


from dotenv import load_dotenv

load_dotenv(override=True)


def get_faiss_files(directory: str) -> list:
    """Return the names of the FAISS index files found in ``directory``.

    Only the immediate entries of ``directory`` are inspected (no recursion);
    an entry qualifies when its name ends with ``.faiss``. Names are returned
    without the directory prefix and sorted alphabetically, so the result does
    not depend on the arbitrary ordering of ``os.listdir``.

    Parameters:
        directory (str): Folder to scan for ``*.faiss`` files.

    Returns:
        list: Matching file names. Empty when nothing matches or when the
        directory cannot be read (the problem is printed, not raised).
    """
    faiss_files = []

    try:
        faiss_files = sorted(
            file_name
            for file_name in os.listdir(directory)
            if file_name.endswith(".faiss")
        )
        # The list repr already carries its own brackets; wrapping it in
        # literal brackets produced "[['x.faiss']]" in the log.
        print(f"[INFO]: Found the following faiss indexes in the {directory} : {faiss_files}")
    except FileNotFoundError:
        print(f"Error: The directory '{directory}' does not exist.")
    except PermissionError:
        print(f"Error: Permission denied to access '{directory}'.")
    except Exception as e:
        # Best-effort helper: report anything else (e.g. path is a file)
        # and fall through to return the empty list.
        print(f"An unexpected error occurred: {e}")

    return faiss_files



def extract_docs_db(faiss_folder_path: str) -> list:
    """Load every FAISS index stored in ``faiss_folder_path``.

    The folder is scanned with ``get_faiss_files``; each ``<name>.faiss`` file
    is loaded through ``FAISS.load_local`` using ``OpenAIEmbeddings``. An index
    that fails to load is reported on stdout and skipped, so one bad file does
    not prevent the others from loading.

    Parameters:
        faiss_folder_path (str): Folder containing the ``*.faiss`` index files.

    Returns:
        list: The loaded FAISS vector stores, one per index that loaded
        successfully. Empty when the folder holds no index or none could be
        loaded. (Previously an error string was returned in the "no index"
        case, which callers then iterated as if it were a list of stores.)
    """
    suffix = ".faiss"
    # Extracting all the files present in the faiss folder
    vector_names = get_faiss_files(faiss_folder_path)

    # Handle the case when no vector files are found
    if not vector_names:
        print("No vector files found in the specified directory.")
        print("[ERROR]: No faiss indexes found, Please try again or check if you have stored the data into vector store.! or Check if the faiss_folder_path is correct or not")
        return []

    loaded_dbs = []
    embeddings = OpenAIEmbeddings()

    for filename in vector_names:
        if not filename.endswith(suffix):
            continue

        # load_local expects the index name without the '.faiss' extension
        index_name = filename[: -len(suffix)]
        print("index", index_name)
        try:
            # NOTE(review): allow_dangerous_deserialization=True permits
            # pickle-based loading of the stored index data. Only point this
            # at folders whose contents you created and trust.
            loaded_db = FAISS.load_local(
                folder_path=faiss_folder_path,
                index_name=index_name,
                embeddings=embeddings,
                allow_dangerous_deserialization=True,
            )
        except FileNotFoundError:
            print(f"Error: The file '{filename}' was not found.")
        except Exception as e:
            print(f"An error occurred while loading '{filename}': {e}")
        else:
            loaded_dbs.append(loaded_db)

    return loaded_dbs



def query_documents_context_extraction(query: str, combined_db: list, k_doc: int = 6, score_threshold: float = 0.6):
    """
    Retrieve the chunks most relevant to ``query`` from several vector stores.

    Each entry of ``combined_db`` is turned into a retriever with
    ``search_type="similarity_score_threshold"`` and invoked with ``query``.
    The hits from every store are concatenated in the order the stores were
    given. Duplicates across stores are not removed.

    Parameters:
        query (str): The user query to search for.
        combined_db (list): Vector store objects exposing ``as_retriever``
            (e.g. langchain_community.vectorstores.faiss.FAISS).
        k_doc (int): Maximum number of chunks requested from each store.
        score_threshold (float): Minimum relevance score passed to the
            retriever for a chunk to be included.

    Returns:
        list: The retrieved documents (no scores attached). An empty list is
        returned when nothing matches or when retrieval fails; the failure is
        printed rather than raised so callers can always treat the result as
        a list.

    Example:
        results = query_documents_context_extraction("example query", [db], 5, 0.75)
    """
    results = []

    try:
        for db_ in combined_db:
            retriever = db_.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"score_threshold": score_threshold, "k": k_doc},
            )

            docs_ = retriever.invoke(query)
            print("DOCS_:", docs_)
            results.extend(docs_)
    except Exception as e:
        # "\\" keeps the literal backslash of the original message without
        # relying on the invalid escape sequence "\ ".
        print(f"[ERROR]: Error occured during the chunks retreival.\n query_documents_context_extraction \\ {e} ")
        # Returning None here made the caller's len(...) raise TypeError.
        return []

    # Need to include this logic for removing any duplicates if present - not a issue of concern for now

    print("[INFO]: Successfully extracted the relevant chunks for answer generation")
    return results

# Answer generation: retrieve the relevant chunks and ask the LLM

def ASK_Question_On_Your_Documents(user_query: str, document_wise_dbs, model_name: str = "gpt-4o"):
    '''Generate an answer to the user query from a list of vector stores.

    The reason for accepting a list of vector stores is that more sources
    (documents, texts, etc.) can be added later as context for answering.

    Logic:
    1- Run a similarity search for the user query on every vector store
       (score threshold 0.45).
    2- Send the retrieved chunks, each labelled with its source, to the llm.
    3- Return the llm answer together with the sources of the chunks used.

    Parameters:
    - user_query (str): The user query in the UI
    - document_wise_dbs: The loaded vector stores to search
    - model_name (str): GPT model to be used, e.g. gpt-3.5-turbo

    Returns:
    - dict on success: {"chatbot_response": <answer>, "sources": [<source>, ...]}
    - str when no relevant chunk is found (an apology message)
    - str when answer generation fails (an HTML error snippet)
    '''

    # extracting the relevant chunks to the query
    docs_ = query_documents_context_extraction(user_query, document_wise_dbs, score_threshold=0.45)
    # Guard against a retrieval helper that reports failure with None
    docs_ = docs_ or []
    print(f"[INFO]: No of chunks extracted: {len(docs_)}")

    if not docs_:
        return "Sorry, I couldn't find any relevant information related to your query. Kindly contact the Aptus team!"

    try:
        llm = ChatOpenAI(model=model_name)
        sources = []
        context_parts = []
        for doc in docs_:
            # A chunk without a 'source' entry should not abort the whole answer
            source = doc.metadata.get("source")
            if source:
                sources.append(source)
            label = source if source else "unknown"
            context_parts.append(f"source:{label} {doc.page_content} \n\n")
        context_ = "".join(context_parts).strip()

        instructions = """ Give a short answer to the user query based on the provided documents.           
                            """

        # The user query and the documents are passed as template variables.
        # Interpolating the query into the template text would make any
        # '{' or '}' typed by the user be parsed as a template placeholder.
        final_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    "You are a bot that provides concise, direct answers. You can elaborate if the user requests it, but keep responses short by default. Always format answers in Markdown.",
                ),
                ("human", instructions),
                ("human", "{user_query}"),
                ("human", "documents: {input}"),
            ]
        )

        print("[INFO]: Generating answer to the given query.")

        chain = final_prompt | llm | StrOutputParser()
        ans = chain.invoke({"input": context_, "user_query": user_query})

        return {"chatbot_response": ans, "sources": sources}

    except Exception as e:
        print("[ERROR]: Could not Generate answer to this question")
        # Log the cause before returning; it used to sit after the return
        # statement and was never executed.
        print(e)
        return "<p style='color: red; font-weight: bold;'>Could not Generate answer to this question</p>"
 

In [19]:
cd .


d:\projects_aptus\Aptus_BOT


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [36]:
# Folder that holds the persisted FAISS index files used in this session
faiss_folder_path = "./VECTOR_STORE/faiss_store"
# Load every index found in that folder; later cells reuse document_wise_dbs
document_wise_dbs = extract_docs_db(faiss_folder_path)
# NOTE: printed unconditionally - it does not check what extract_docs_db returned
print("[INFO]: Successfully loaded the vector store in the system")

[INFO]: Found the following faiss indexes in the ./VECTOR_STORE/faiss_store : [['db_Aptusdatalabs.faiss']]
index db_Aptusdatalabs
[INFO]: Successfully loaded the vector store in the system


In [37]:
document_wise_dbs

[<langchain_community.vectorstores.faiss.FAISS at 0x2047fa96d70>]

In [48]:
 # Ask the LLM a sample question against the vector stores loaded earlier
user_query = "APTUS DATA LABS SERVICES"
# Retrieval-only variant, kept for debugging the chunk extraction step:
# ans = query_documents_context_extraction(user_query, document_wise_dbs)
# ans 
ans =  ASK_Question_On_Your_Documents(user_query, document_wise_dbs, "gpt-4o") 
# Rendering as Markdown is disabled (the line below is incomplete as written):
# display(Markdown(ans[]))

DOCS_: [Document(metadata={'source': 'https://aptusdatalabs.com/amazon-web-services/', 'loc': 'https://aptusdatalabs.com/amazon-web-services/', 'lastmod': '2024-09-14T11:13:51+00:00'}, page_content="Amazon Web Services – Aptus Data Labs\n\nCOMPANY\n\nAptus Data Labs\n\nAptus Data Labs delivers tailored Data Science and AI solutions, leveraging expert teams and advanced technology to transform data into actionable insights. We enhance business outcomes, provide ongoing support, and ensure top security standards.\n\nAbout Us\n\nCareer Opportunities\n\nContact us\n\nQuick contact\n\nAddress\n\nNovel Tech Park, #46/4, GB Palya, Near Kudlu Gate, Hosur Main Road, Bengaluru, India, Pin – 560068\n\nCall Us\n\n+91 8861769911\n\n+91 8861799911\n\nMail Us\n\n[email\xa0protected]\n\nWorking Hours\n\nMonday to Friday : 9:30am - 6pm\n\nSaturday & Sunday : Closed\n\nHO Directions\n\nPLATFORMS\n\nAptPlan.ai\n\nIntelligent Supply Chain Planning\n\nAptCheck\n\nAI powered documentation compilation-evalua

In [49]:
ans

{'chatbot_response': 'Aptus Data Labs offers a range of services including advisory services, data engineering and value management, cloud solutions, artificial intelligence and analytics, generative AI, analytics modernization and migration, intelligent application development and integrations, operationalizing data and AI platforms, and production support. They are particularly recognized for leveraging Amazon Web Services (AWS) to provide cloud computing solutions, data management, AI/ML solutions, and cloud-native application development.',
 'sources': ['https://aptusdatalabs.com/amazon-web-services/',
  'https://aptusdatalabs.com/cloud-solutions/',
  'https://aptusdatalabs.com/',
  'https://aptusdatalabs.com/terms-and-conditions/',
  'https://aptusdatalabs.com/case-study/',
  'https://aptusdatalabs.com/industry-business-focus/']}

In [None]:
ans

In [39]:
# retriever = document_wise_dbs[0].as_retriever(
#                 search_type="similarity_score_threshold",
#                 search_kwargs={"score_threshold": 0.6,
#                                 "k": 5}    
#             )
            
            
# docs = retriever.invoke(user_query)       
# docs

In [33]:
docs[1].metadata

{'source': 'https://aptusdatalabs.com/cloud-solutions/',
 'loc': 'https://aptusdatalabs.com/cloud-solutions/',
 'lastmod': '2024-09-16T10:19:53+00:00'}

In [29]:
# Plain similarity search (no score threshold) on the first loaded vector
# store, limited to the top 2 chunks
results = document_wise_dbs[0].similarity_search(
    "Aptus data labs services",
    k=2,
    # filter={"source": "tweet"},
)
# Print each chunk's text followed by its metadata dict
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Aptus Data Labs

COMPANY

Aptus Data Labs

Aptus Data Labs delivers tailored Data Science and AI solutions, leveraging expert teams and advanced technology to transform data into actionable insights. We enhance business outcomes, provide ongoing support, and ensure top security standards.

About Us

Career Opportunities

Contact us

Quick contact

Address

Novel Tech Park, #46/4, GB Palya, Near Kudlu Gate, Hosur Main Road, Bengaluru, India, Pin – 560068

Call Us

+91 8861769911

+91 8861799911

Mail Us

[email protected]

Working Hours

Monday to Friday : 9:30am - 6pm

Saturday & Sunday : Closed

HO Directions

PLATFORMS

AptPlan.ai

Intelligent Supply Chain Planning

AptCheck

AI powered documentation compilation-evaluation Application

Aptveri5

AI-powered Auditing solutions

AptSpend

Track your Income and Expenses with Ease

AptGenAI

Generative AI Framework for Smart Business Solutions

Use Cases

Our ready solutions can be implemented in various industries as business processes

In [None]:
           
if __name__ == "__main__":
    # Script entry point: load the vector stores from ``faiss_folder`` and
    # answer one sample query with them.

    # NOTE(review): the notebook cell above loads from
    # "./VECTOR_STORE/faiss_store"; this path has no "./" separator and a
    # different folder name - confirm which folder is intended.
    faiss_folder = ".VECTOR_STORE/vector_idx"

    from IPython.display import Markdown, display

    # Sample questions kept for manual experiments (see the commented-out
    # ``query=questions[0]`` below).
    questions = [
    "Compare the quarterly earnings reports of Tesla, Ford, and General Motors for the last two years to identify trends in profitability and market share.",
    "Analyze the marketing strategies of Coca-Cola, PepsiCo, and Dr Pepper Snapple Group from their recent marketing plans to determine the effectiveness of their campaigns.",
    "Evaluate the specifications and user reviews of the latest smartphone models from Apple, Samsung, and Google to compare performance, camera quality, and battery life.",
    "How do the compliance documents for the COVID-19 vaccine trials of Pfizer, Moderna, and Johnson & Johnson differ in terms of safety protocols and regulatory approvals?",
    "Compare the customer feedback reports from Airbnb, Vrbo, and Booking.com to analyze trends in user experience and service quality across platforms.",
    "Evaluate the sustainability reports of Unilever, Procter & Gamble, and Nestlé to compare their environmental impact initiatives and progress towards sustainability goals.",
    "How do the R&D expenditure reports of Google, Microsoft, and Amazon compare in the fields of artificial intelligence and machine learning over the past three years?",
    "Analyze the supply chain management strategies of Walmart, Target, and Costco, focusing on efficiency, logistics, and sustainability practices based on their annual reports.",
    "Compare the employee engagement surveys and HR policies of Amazon, Google, and Microsoft to identify differences in workplace culture and employee retention strategies.",
    "How do the patent filings in AI technology from IBM, Oracle, and NVIDIA compare over the past five years in terms of innovation and market impact?"
    ]

    # query=questions[0]
    query = "tell me about jv "

    # extract_docs_db only takes the folder path; the former ``query=``
    # keyword raised TypeError.
    document_wise_dbs = extract_docs_db(faiss_folder_path=faiss_folder)

    if not isinstance(document_wise_dbs, list) or not document_wise_dbs:
        # Covers both an empty result and an error-string result
        print(f"[ERROR]: No vector stores could be loaded from '{faiss_folder}'.")
    else:
        # Calling the llm for analysis - pass the loaded stores, not the
        # folder path.
        ans = ASK_Question_On_Your_Documents(query, document_wise_dbs)

        # Display the result as Markdown in Jupyter
        # display(Markdown(ans))
        print(ans)