In [8]:
# !pip install lxml
# !pip install bs4

In [1]:
import nest_asyncio
nest_asyncio.apply()
import pickle
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_openai import OpenAIEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
# from doc_ingest_scripts.config import debug_path
import os
from langchain.docstore.document import Document
from pathlib import Path
import json
import re
from uuid import uuid4
import tiktoken
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from typing import Dict
from langchain.memory import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage, BaseMessage





def remove_extra_newlines(text):

    if isinstance(text, str):
        return "\n\n".join([line.strip() for line in text.splitlines() if line.strip()])
    return text 



def filter_website_content_and_save_vectorstore(remove_links : list[str], documents: Document, embeddings, faiss_folder_path: str, index_name: "db_companywebsite"):
    """
    Removes extra newlines characters from the page_content of each document in the list of documents.
    Filters out document objects whose metadata source link is in the remove_links list, 
    creates a FAISS vector store, and saves it locally.

    Parameters:
    - remove_links (list): A list of links to remove.
    - documents (list): A list of document objects where each object has 'page_content' and 'metadata'.
                        'metadata' is a dictionary containing a 'source' key (link to the document).
    - embeddings: The embedding model to be used for creating vector embeddings.
    - folder_path_ (str): Path to save the FAISS index locally.
    - index_name (str): The name to use when saving the FAISS index.

    Returns:
    - filtered_documents (list): A list of document objects excluding those with links in remove_links.
    """ 

    valid_documents = [
        doc for doc in documents if hasattr(doc, 'metadata') and isinstance(doc.metadata, dict)
    ]
    
    for doc in valid_documents:
        if hasattr(doc, 'page_content'):
            doc.page_content = remove_extra_newlines(doc.page_content)

    # print(documents[0])

    filtered_documents = [
        doc for doc in valid_documents
        if doc.metadata.get('source') not in remove_links
    ]
    
    len(filtered_documents)
    
    embedding_dim = len(embeddings.embed_query("hello world"))  
    index = faiss.IndexFlatL2(embedding_dim)
    
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),  
        index_to_docstore_id={},
    )
    
    uuids = [str(uuid4()) for _ in range(len(filtered_documents))]

    vector_store.add_documents(documents= filtered_documents, ids=uuids)
    vector_store.save_local(folder_path= faiss_folder_path, index_name= index_name)

    return filtered_documents, vector_store





USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:

if __name__ == "__main__":

    embeddings = OpenAIEmbeddings()

    
    sitemap_loader = SitemapLoader(web_path= "https://aptusdatalabs.com/sitemap.xml")
    docs = sitemap_loader.load()
    print(docs[0])

    with open('doc_objects.pkl', 'wb') as file:
        pickle.dump(docs, file)

    print("pickle file made!!")
    list_of_sources = []

    for doc in docs:
        list_of_sources.append(doc.metadata['source'])

    print("list of sources made!!")
    #### 
    paths_to_remove = paths_to_remove = ['https://aptusdatalabs.com/enquiry-thank-you-page/',
                       'https://aptusdatalabs.com/terms-and-conditions/',
                       'https://aptusdatalabs.com/sign-in/',
                       'https://aptusdatalabs.com/features/',
                       'https://aptusdatalabs.com/sign-up/',
                       'https://aptusdatalabs.com/get-app/', 
                       'https://aptusdatalabs.com/terms-and-conditions/',
                       'https://aptusdatalabs.com/fun-fact/', 
                       'https://aptusdatalabs.com/gallery-3-columns/', 
                       'https://aptusdatalabs.com/gallery-2-columns/', 
                       'https://aptusdatalabs.com/supply-chain-test/',
                       'https://aptusdatalabs.com/data-and-ai-accelerators/',
                       ]
    remove_links = list_of_sources[80:190]
    remove_links2 = list_of_sources[192:218]
    paths_to_remove.extend(remove_links)
    paths_to_remove.extend(remove_links2)

    
    print("length of paths to remove --> ", len(paths_to_remove))
    
    
    print("calling the function for creating vectorstore and filtering documents")
    faiss_folder_path = "./VECTOR_STORE"
    company_name = "Aptusdatalabs"
    index_name = f"db_{company_name}"
    filtered_docs, vectorstore = filter_website_content_and_save_vectorstore(paths_to_remove, docs, embeddings= embeddings,faiss_folder_path=faiss_folder_path, index_name=index_name)
    print("filtered doc", len(filtered_docs))
    print("vectorstore made!")

Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.23s/it]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.94it/s]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.00s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.27s/it]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.08it/s]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.23s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.15s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.11s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.11s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.02s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.10s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.09s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.13s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.24s/it]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.14s/it]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.05it/s]
Fetching

page_content='



 A Dummy’s Guide to Generative AI – Aptus Data Labs



































































































 















 










COMPANY  










Aptus Data Labs 



 
Aptus Data Labs delivers tailored Data Science and AI solutions, leveraging expert teams and advanced technology to transform data into actionable insights. We enhance business outcomes, provide ongoing support, and ensure top security standards.

 



 







About Us










Career Opportunities










Contact us




 







Quick contact 



 




 



                    Address                
Novel Tech Park, #46/4, GB Palya, Near Kudlu Gate, Hosur Main Road, Bengaluru, India, Pin – 560068


 



 




 



                    Call Us                
+91 8861769911 
+91 8861799911



 



 




 



                    Mail Us                
[email protected]


 



 




 



                    Working Hours                
Monday to 

In [7]:
print(len(filtered_docs[0].page_content))

10232


In [8]:

from langchain.text_splitter import CharacterTextSplitter
model="gpt-3.5-turbo-0125" 
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def text_splitting(model_name,text):
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=15000, chunk_overlap=1000)

    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]
    return docs

In [10]:
chunk_sizes = [num_tokens_from_string(filtered_doc.page_content, model) for filtered_doc in filtered_docs]

In [15]:
import numpy as np
print(f"max_size: {max(chunk_sizes)}, min size: {min(chunk_sizes)}, avg size: {np.mean(chunk_sizes)}")

max_size: 2918, min size: 992, avg size: 1684.3150684931506


 ### We have successfully created and saved vector index in the system

## Testing if the vector stores are working or not

In [5]:
cd .

d:\projects_aptus\Aptus_BOT\VECTOR_STORE


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [6]:
def load_document_vectors(faiss_folder_path: str, session_state: bool, model: str = 'gpt-4o'):
    """
    This function load the document vectors present in our folder_path and make a list of faiss objects on which we can run our query.
    These vectors are loaded and used for retrieval of relevant chunks during the document comparison query
    Parameters:
        folder_path (str): The path to the folder containing the vectors of uplodaded document
        
    Returns:
        list: A list of faiss db objects containing the indexed chunks
             
    Example:
        results = load_document_vectors(./debug) 
    """
    
    session_state = True
    embeddings =  OpenAIEmbeddings(model=model)
    loaded_document_wise_dbs = []

    # Load all index files matching the format "db_{i}"
    try:
        # Iterate through all the files in the specified directory
        for filename in os.listdir(faiss_folder_path):
            if filename.startswith("db_") and filename.endswith(".faiss"):
                print(filename)
                index_name = os.path.splitext(filename)[0]
                print(index_name)
                loaded_db = FAISS.load_local(folder_path = faiss_folder_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
                loaded_document_wise_dbs.append(loaded_db)
    except FileNotFoundError:
        print(f"Error: The directory '{faiss_folder_path}' does not exist.")
    except PermissionError:
        print(f"Error: Permission denied to access '{faiss_folder_path}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return session_state, loaded_document_wise_dbs 

faiss_folder_path = "./faiss_store" 
model_name = "gpt-4o"
session_state = True
session_state, vector_stores = load_document_vectors(faiss_folder_path,session_state, model_name)

db_Aptusdatalabs.faiss
db_Aptusdatalabs


In [10]:
vector_stores

[<langchain_community.vectorstores.faiss.FAISS at 0x1357d1cabf0>]

In [8]:
def query_documents_context_extraction(query: str, combined_db: list, k_doc: int = 5, score_threshold: float = 0.65):
    """
    Extracts the most relevant chunks across multiple vector databases representing each uploaded file based on a the user query in the converstation.

    This function performs similarity searches on the provided 
    document-wise databases objetcs #langchain_community.vectorstores.faiss.FAISS
    for each query in sub_queries: this function retrieves the most relevant context (documents). 
    and filters the results based on a score threshold.

    Parameters:
        user_query (str): List of the search queries to be compared against the documents.
        combined_db (list): A list of document-wise faiss object to search through.
        k_doc (int): The number of top documents to retrieve from each database.
        score_threshold (float): The minimum score required for a document to be included in the results.

    Returns:
        list: A list of tuples containing the filtered documents and their scores.
              Each tuple is in the format (document, score).
    
    Example:
        results = compare_documents("example query", [Combined_DB.faiss], 5, 0.75)
    """
    
    try:
        results = []
        
        for db_ in combined_db:
            retriever = db_.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"score_threshold": score_threshold,
                                "k": k_doc}    
            )
            
            
            docs_ = retriever.invoke(query)
            # print("DOCS_:", docs_)       
            results.extend(docs_)

        # Need to include this logic for removing any duplicates if present - not a issue of concern for now

        print("[INF0]: Successfully extracted the relevant chunks for answer generation")
        
        # return docs_without_duplicates
        return results
    except Exception as e:
        print(f"[ERROR]: Error occured during the chunks retreival.\n query_documents_context_extraction \ {e} ")


In [12]:
from dotenv import load_dotenv
load_dotenv(override=True)
query = "aptplan"
combined_db = vector_stores
res= query_documents_context_extraction(query, combined_db)
res

[ERROR]: Error occured during the chunks retreival.
 query_documents_context_extraction \ Error code: 403 - {'error': {'message': 'You are not allowed to generate embeddings from this model', 'type': 'invalid_request_error', 'param': None, 'code': None}} 


In [13]:


def get_faiss_files(directory: str)-> list:

    '''This function will return all the files present in the directory folder
    ending with .faiss
    '''
    faiss_files = []
    
    try:
        # Iterate through the files in the specified directory
        for file_name in os.listdir(directory):
            # Check if the file starts with db_ ends with .faiss
            # if file_name.startswith("combined_") and file_name.endswith(".faiss"):
            if file_name.endswith(".faiss"):
                faiss_files.append(file_name)
                
        print(f"[INFO]: Found the following faiss indexes in the {directory} : [{faiss_files}]")
    except FileNotFoundError:
        print(f"Error: The directory '{directory}' does not exist.")
    except PermissionError:
        print(f"Error: Permission denied to access '{directory}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return faiss_files

def extract_docs_db(faiss_folder_path: str)-> list:
    """
    This function will extract all the faiss objects representing the document chunks from my vector store ( faiss_folder_path)
    
    """
    try:
        faiss_folder = faiss_folder_path
        vector_names = get_faiss_files(faiss_folder) # Extracting all the files present in the faiss folder
        

        # Handle the case when no vector files are found
        if not vector_names:
            print("No vector files found in the specified directory.")
            error_ = "[Error]: No faiss indexes found, Please try again or check if you have stored the data into vector store.! or Check if the faiss_folder_path is correct or not"
            return   error_# Return an empty list if no vector names are found

        loaded_relevant_dbs_name = [] # List of all the vector dbs present in VECTOR_STORE/faiss_store are loaded
        embeddings = OpenAIEmbeddings()

        for filename in vector_names: 
            if filename.endswith(".faiss"):
                index_name = filename[:-6]  # Remove '.faiss' for loading
                print("Loading index: ", index_name)
                try:
                    loaded_db = FAISS.load_local(folder_path = faiss_folder, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
                    loaded_relevant_dbs_name.append(loaded_db)
                    
                except FileNotFoundError:
                    print(f"[Error]: The file '{filename}' was not found.")
                except Exception as e:
                    print(f"[Error]: An error occurred while loading '{filename}': {e}")

        return loaded_relevant_dbs_name
    
    except Exception as e:
        print(f"[Error]: Could not extract the relevant vectors from the vector store: {e}")



if faiss_folder_path:
    print("[INFO]: Successfully loaded the vector store in the system")
try:
    document_wise_dbs = extract_docs_db(faiss_folder_path)
except Exception as e:
    print(f"[Error]: Could not load the document vectors, {e}")


[INFO]: Successfully loaded the vector store in the system
[INFO]: Found the following faiss indexes in the ./faiss_store : [['db_Aptusdatalabs.faiss']]
Loading index:  db_Aptusdatalabs


In [15]:
query = "aptplan"
combined_db = document_wise_dbs
res= query_documents_context_extraction(query, combined_db)
res

[INF0]: Successfully extracted the relevant chunks for answer generation


[Document(metadata={'source': 'https://aptusdatalabs.com/aptplan/', 'loc': 'https://aptusdatalabs.com/aptplan/', 'lastmod': '2024-09-16T09:49:00+00:00'}, page_content='AptPlan – Aptus Data Labs\n\nCOMPANY\n\nAptus Data Labs\n\nAptus Data Labs delivers tailored Data Science and AI solutions, leveraging expert teams and advanced technology to transform data into actionable insights. We enhance business outcomes, provide ongoing support, and ensure top security standards.\n\nAbout Us\n\nCareer Opportunities\n\nContact us\n\nQuick contact\n\nAddress\n\nNovel Tech Park, #46/4, GB Palya, Near Kudlu Gate, Hosur Main Road, Bengaluru, India, Pin – 560068\n\nCall Us\n\n+91 8861769911\n\n+91 8861799911\n\nMail Us\n\n[email\xa0protected]\n\nWorking Hours\n\nMonday to Friday : 9:30am - 6pm\n\nSaturday & Sunday : Closed\n\nHO Directions\n\nPLATFORMS\n\nAptPlan.ai\n\nIntelligent Supply Chain Planning\n\nAptCheck\n\nAI powered documentation compilation-evaluation Application\n\nAptveri5\n\nAI-powered 

In [18]:
res[1].metadata

{'source': 'https://aptusdatalabs.com/aptcheck/',
 'loc': 'https://aptusdatalabs.com/aptcheck/',
 'lastmod': '2024-09-17T06:00:06+00:00'}

In [19]:
docs_ = res

In [20]:
sources = []
context_ = """ """
for doc in docs_:
    # Adding only the page content into the context 
    sources.append(doc.metadata['source'])

sources

['https://aptusdatalabs.com/aptplan/',
 'https://aptusdatalabs.com/aptcheck/',
 'https://aptusdatalabs.com/aptspend/',
 'https://aptusdatalabs.com/maximizing-profits-with-route-optimization-analytics-case-study/',
 'https://aptusdatalabs.com/ai-ml-based-automatic-checklist-scoring-application-case-study/']