In [19]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os, tempfile, glob, random
from pathlib import Path
from IPython.display import Markdown
from getpass import getpass
import numpy as np
from itertools import combinations

# LLM: HuggingFace
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceHub

# langchain prompts, memory, chains...
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from operator import itemgetter
from langchain.memory import ConversationBufferMemory
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain.schema import Document, format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string

# Document loaders
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    DirectoryLoader,
    CSVLoader,
    UnstructuredExcelLoader,
    Docx2txtLoader,
)

# Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# OutputParser
from langchain_core.output_parsers import StrOutputParser

# Chroma: vectorstore
from langchain_community.vectorstores.chroma import Chroma

# Contextual Compression
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_transformers import EmbeddingsRedundantFilter,LongContextReorder
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever

# Cohere
from langchain.retrievers.document_compressors import CohereRerank
from langchain_community.llms import Cohere

In [9]:
# Data Directories: where temp files and vectorstores will be saved

# Both locations are anchored on the parent of the resolved "./DataDocs" and
# "./db" paths, then extended with the "/" path operator.
TMP_DIR = Path("./DataDocs").resolve().parent / "DataDocs" / "CSV"
LOCAL_VECTOR_STORE_DIR = Path("./db").resolve().parent / "db" / "vector_stores"

In [10]:
# SECURITY: API tokens must never be written into the notebook source, because
# notebooks are routinely shared and committed. Any token that has ever been
# stored in a notebook should be revoked and regenerated.
#
# Each credential is taken from the environment when it is already set to a
# non-empty value; otherwise it is requested interactively with getpass (the
# typed text is not echoed). The value is exported through os.environ so the
# next cell, which reads os.environ, can pick it up.
for _secret_name in ("HUGGINGFACEHUB_API_TOKEN", "COHERE_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Insert your {_secret_name}: ")

In [11]:
# Get environment variables: HUGGINGFACEHUB_API_TOKEN, and COHERE_API_KEY

def get_environment_variable(key):
    """Return the value of environment variable `key`, prompting when it is unusable.

    Parameters
    ----------
    key : str
        Name of the environment variable to look up.

    Returns
    -------
    str
        The non-empty value found in ``os.environ``; otherwise whatever the
        user types at the hidden ``getpass`` prompt.
    """
    value = os.environ.get(key)
    # A variable that is set but empty is as unusable as a missing one, so both
    # cases fall through to the interactive prompt.
    if value:
        print(f"\n[INFO]: {key} retrieved successfully.")
    else:
        print(f"\n[ERROR]: {key} is not found in your environment variables.")
        value = getpass(f"Insert your {key}")
    return value

# Resolve both credentials in order: HuggingFace first, then Cohere.
HF_key, cohere_api_key = (
    get_environment_variable(variable_name)
    for variable_name in ("HUGGINGFACEHUB_API_TOKEN", "COHERE_API_KEY")
)


[INFO]: HUGGINGFACEHUB_API_TOKEN retrieved successfully.

[INFO]: COHERE_API_KEY retrieved successfully.


In [12]:
def langchain_document_loader(TMP_DIR):
    """
    Collect every supported file found under TMP_DIR into one list of documents.

    One DirectoryLoader is built and loaded per file type, always in this
    order: txt, pdf, csv, docx. The CSV loader is given encoding "utf8".
    Returns the concatenation of the four load() results.
    """
    root = TMP_DIR.as_posix()

    # (glob pattern, loader class, extra DirectoryLoader keyword arguments)
    loader_specs = (
        ("**/*.txt", TextLoader, {}),
        ("**/*.pdf", PyPDFLoader, {}),
        ("**/*.csv", CSVLoader, {"loader_kwargs": {"encoding": "utf8"}}),
        ("**/*.docx", Docx2txtLoader, {}),
    )

    documents = []
    for pattern, loader_cls, extra_kwargs in loader_specs:
        directory_loader = DirectoryLoader(
            root, glob=pattern, loader_cls=loader_cls, show_progress=True, **extra_kwargs
        )
        documents += directory_loader.load()
    return documents

In [13]:
# load documents

# Load every txt, pdf, csv and docx file under TMP_DIR into one flat list.
documents = langchain_document_loader(TMP_DIR)
# Report how many document objects the loaders returned in total.
print(f"\nNumber of documents: {len(documents)}")

0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 7/7 [00:00<00:00, 30.77it/s]
0it [00:00, ?it/s]


Number of documents: 4350





In [14]:
# Display a random document

# `random` is already imported in the notebook's import cell, so it is not
# imported again here.

# Fail with a clear message instead of a cryptic "empty sequence" error when
# the loading step returned nothing.
if not documents:
    raise ValueError("No documents were loaded; there is nothing to display.")

random_document_id = random.randrange(len(documents))
random_document = documents[random_document_id]

# The Markdown object is the cell's last expression so Jupyter renders it.
Markdown(
    f"**Document[{random_document_id}]** \n\n **Page content** (first 1000 character):\n\n"
    + random_document.page_content[0:1000] + " ..."
    + "\n\n**Metadata:**\n\n" + str(random_document.metadata)
)

**Document[3612]** 

 **Page content** (first 1000 character):

L.D. COLLEGE OF ENGINEERING-BATCH 2024: 167   200280116068           KHOJA NAVIZ ASHRAFALI      IT   2024         INCUBYTE ...

**Metadata:**

{'source': 'D:\\Assets\\Gen AI\\ChatBot\\DataDocs\\CSV\\PLACEMENT-2024.csv', 'row': 173}

In [15]:
# Create a RecursiveCharacterTextSplitter
# Splitter settings: chunks of at most 1600 characters with a 200-character
# overlap, trying the separators in the listed order (paragraph, line, word,
# then single characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1600,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)

# Split every loaded document and report the resulting number of chunks.
chunks = text_splitter.split_documents(documents)
print(f"number of chunks: {len(chunks)}")

number of chunks: 4350


In [16]:
def select_embeddings_model(LLM_service="OpenAI"):
    """Connect to the embeddings API endpoint by specifying the name of the embedding model.

    Parameters
    ----------
    LLM_service : str
        Name of the embeddings provider. Only "HuggingFace" is implemented
        here; the default value is kept for backward compatibility.

    Returns
    -------
    HuggingFaceInferenceAPIEmbeddings
        Client for the "thenlper/gte-large" model, authenticated with HF_key.

    Raises
    ------
    ValueError
        If LLM_service is anything other than "HuggingFace". Without this
        check the function would reach its return statement with no
        embeddings object bound and raise an UnboundLocalError.
    """
    if LLM_service == "HuggingFace":
        return HuggingFaceInferenceAPIEmbeddings(
            api_key=HF_key,
            model_name="thenlper/gte-large",
        )

    raise ValueError(
        f"Unsupported LLM_service {LLM_service!r}: only 'HuggingFace' is implemented."
    )
   

embeddings_HuggingFace = select_embeddings_model(LLM_service="HuggingFace")

sentences = [
    "DATA",
    "2012",
    "CHAUHAN RIDHAM VIJAYKUMAR",
]

# 1. One embedding vector per sentence, in the same order as `sentences`
embedding_vectors = list(map(embeddings_HuggingFace.embed_query, sentences))

# 2. For every unordered pair of sentences, use the dot product of their
#    vectors (rounded to 3 decimals) as the similarity score
for pair in combinations(range(len(sentences)), 2):
    first, second = pair
    similarity = round(np.dot(embedding_vectors[first], embedding_vectors[second]), 3)
    print(f"Similarty of sentences {pair}: {similarity}")

Similarty of sentences (0, 1): 0.793
Similarty of sentences (0, 2): 0.742
Similarty of sentences (1, 2): 0.772


In [23]:
def create_vectorstore(embeddings, documents, vectorstore_name):
    """Build a Chroma vector store from `documents` and return it.

    Chroma receives "<LOCAL_VECTOR_STORE_DIR>/<vectorstore_name>" as its
    persist_directory. Any exception raised while building the store is
    printed and then re-raised unchanged.
    """
    target_dir = "/".join((LOCAL_VECTOR_STORE_DIR.as_posix(), vectorstore_name))
    try:
        store = Chroma.from_documents(
            persist_directory=target_dir,
            embedding=embeddings,
            documents=documents,
        )
    except Exception as error:
        # Surface the failure in the cell output, then let it propagate.
        print(f"Error: {error}")
        raise
    else:
        return store


In [None]:
%%time

# Switch for building the vector store in this cell. When it is False,
# nothing is built here and this cell does not define `vector_store_HF`.
create_vectorstores = True # set to False to skip building the vector store

if create_vectorstores:
    # Embed all chunks with the HuggingFace embeddings and store them in the
    # Chroma collection named "Vit_All_HF_Embeddings".
    vector_store_HF = create_vectorstore(
        embeddings=embeddings_HuggingFace,
        documents = chunks,
        vectorstore_name="Vit_All_HF_Embeddings"
    )
    # Optional sanity check, currently disabled:
    # print("vector_store_HF:",vector_store_HF._collection.count(),"chunks.")
    # print("")