https://medium.com/@octaviopavon7/local-rag-with-local-llm-huggingface-chroma-5e0fc3b6133a

In [1]:
# chroma.py
from chromadb import Client, ClientAPI

In [2]:
class Chroma():
    """
      Chroma class to instantiate a vector db in memory.
    """
    def __init__(self, default_database: str = "default", first_collection_name: str = "test", top_k: int = 1):
        self.api: ClientAPI = Client()
        self.collection_pointer = self.api.create_collection(first_collection_name)
        self.top_k = top_k
    
    def new_collection(self ,name: str, **kwargs):
        try:
            self.api.create_collection(name, **kwargs)
        except Exception as e:
            print(e)
    
    def add_data_to(self, data):
        try:
            self.collection_pointer.add(
                embeddings=data.get("embeddings"),
                documents=data.get("contents"),
                metadatas=data.get("metadatas"),
                ids=data.get("ids")
            )
        except Exception as e:
            print(e)
    
    def switch_collection(self, new_pointer: str):
        try:
            self.collection_pointer = self.api.get_collection(new_pointer)
        except Exception as e:
            print(e)
    
    def query(self, embedding: list[float], **kwargs):
        try:
            result = self.collection_pointer.query(query_embeddings=embedding, n_results=self.top_k, **kwargs)
            print(result)
        except Exception as e:
            print(e)

In [3]:
# chunking.py
import os
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, TextLoader

In [4]:
def extract(file, ext: str, chunking:int=400,overlap:int=20, **kwargs):
    if ext == "pdf":
        return extract_pdf(file, chunking, overlap, **kwargs)
    if ext == "txt":
        return extract_text(file,chunking, overlap, **kwargs)
    return extract_excel(file,chunking, overlap, **kwargs)

def extension(file_name:str):
    return file_name.split(".")[-1]

def extract_pdf(file,chunking, overlap, **kwargs):
    loader = UnstructuredPDFLoader(file)
    return load(loader, chunking, overlap, **kwargs)

def extract_text(file,chunking, overlap, **kwargs):
    loader = TextLoader(file, 'utf-8')
    return load(loader, chunking, overlap, **kwargs)

def load(loader: TextLoader | UnstructuredPDFLoader, chunking, overlap, **kwargs):
    return loader.load_and_split(text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunking, chunk_overlap=overlap),**kwargs)


def get_file_chunks(file_bytes: bytes, file_name: str, chunk_size: int = 200, chunk_overlap: int = 10):
    try:
        # Using temp-directory to store our read file for a little while
        # This is because langchain loaders only acept file paths and not its raw bytes.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = os.path.join(temp_dir, file_name)
            
            with open(temp_file_path, "wb") as temp_file:
                temp_file.write(file_bytes)
                
            chunks = extract(temp_file_path, extension(file_name), chunk_size, chunk_overlap)

            return chunks
    except Exception as e:
        print("ERROR TRYING TO GET CHUNKS FROM FILE")

In [5]:
# embedding.py
from openai import OpenAI   
from os import environ as env

In [6]:

api_key = "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"

client = OpenAI(api_key=api_key)

def embedding(input: str):
    return client.embeddings.create(
        model="text-embedding-ada-002",
        input=input,
    ).data[0].embedding

In [7]:
# data_loader.py
#from chroma import Chroma
#from chunking import get_file_chunks
from langchain.schema import Document
#from embeddings import embedding

In [8]:
class Loader():

    def __init__(self, chroma_instance: Chroma):
        self.chroma = chroma_instance
        self.files_read = []
    
    def load_document(self, path: str):
        chunks = self.chunk_document(path)
        self.insert_data_to_chroma(chunks)
        print("Data added successfully.")

    def chunk_document(self, path: str):
        file = open(path, "rb")
        bytes = file.read()
        if bytes:
            self.files_read.append(path)
            print("File read successfully. Loading into vector store...")

        chunks: list[Document] = get_file_chunks(bytes, path)
        return chunks
    
    def insert_data_to_chroma(self, chunks: list[Document]):
        documents = []
        for index, chunk in enumerate(chunks):
            data={
                    "embedding": embedding(chunk.page_content),
                    "content": chunk.page_content,
                    "metadata": chunk.metadata,
                    "id": index
                }
            documents.append(data)

        embeddings = [data["embedding"] for data in documents]
        contents = [data["content"] for data in documents]
        metadatas = [data["metadata"] for data in documents]
        ids = [str(data["id"]) for data in documents]

        data={
            "embeddings":embeddings,
            "contents":contents,
            "metadatas":metadatas,
            "ids":ids
        }
        try:
            self.chroma.add_data_to(
                "test",
                data=data
            )
        except Exception as e:
            print(e)

In [9]:
from transformers import pipeline

In [10]:
class LLM():
    """
        LLM Base class - Use this class to instance multiple-llms.
        [See HF repository to view all llm models]
    """
    # text generation
    def __init__(self, default_model: str, tasks : list[str], initial_config: dict):
        self.tasks = tasks
        self.config = initial_config
        self.default_model = default_model
        try:
            self.actions = {}
            for pipe in self.task_create_pipelines(model=default_model, initial_config=initial_config):
                self.actions[pipe.task] = pipe
        except Exception as e:
            print(e)
            raise Exception("Error creating pipelines. Logs in console.")
    
    def task_create_pipelines(self, model: str, initial_config: dict):
        for task in self.tasks:
            yield pipeline(task, model=model, trust_remote_code=True, model_kwargs=initial_config)

    def task(self, task: str, prompt: str, **kwargs):
        if task in self.tasks:
            return self.actions[task](prompt, **kwargs)
        return "Task not available."
    
    def task_text(self, context:str, question:str):
        generation = self.actions["question-answering"](
            context=context,
            question=question
        )
        return generation

In [11]:
#from data_loader import Loader
#from chroma import Chroma
#from embeddings import embedding
#from llms import LLM

In [None]:
#ChromaInstance = Chroma()
LoaderInstance = Loader(chroma_instance=ChromaInstance)

# change it with your text file or pdf file path.
example_path = "LocalDocs/MIL-STD-882E.pdf"

LoaderInstance.load_document(example_path)

# Use in True whether you wan't that the model uses less vram but you need a GPU.
# Use in False whether you want to use it fully fp precision (will use more vram)
quantized = True

llm = LLM(default_model="timpal0l/mdeberta-v3-base-squad2", tasks=["question-answering"], initial_config={
    "device_map":"auto"
})

while True:
    user_query = input("- Prompt: ")
    context = ChromaInstance.query(embedding(user_query), {})
    
    user_query_template = create_template(context, user_query)
  
    completion = llm.task("question-answering", user_query_template)

    print(completion)

File read successfully. Loading into vector store...
ERROR TRYING TO GET CHUNKS FROM FILE
