In [3]:
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
db = Chroma(collection_name="Docs", embedding_function=embedder)

ids = db.get()['ids']
if ids:
    db.delete(ids)
    print(f"Deleted {len(ids)} documents")
else:
    print("No documents found")

2024-12-21 14:36:21,439 - INFO - Use pytorch device_name: cpu
2024-12-21 14:36:21,439 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


No documents found


In [4]:
import logging
from pathlib import Path
import psutil
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

class Settings:
    def __init__(self):
        self.embedder = "BAAI/bge-small-en-v1.5"
        self.ranker = "cross-encoder/ms-marco-TinyBERT-L-2"
        self.model = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
        self.gpuDepth = 0
        self.threads = min(12, psutil.cpu_count(logical=False))
        self.batch = 512
        self.chunk = 384
        self.overlap = 64
        self.context = 1024
        self.temp = 0.7
        self.topk = 3
        self.retrieve = 8
        self.rerank = 3
        self.db = "Docs"
        self.fusion = 60

class TextProcessor:
    def __init__(self, cfg):
        self.cfg = cfg
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=cfg.chunk,
            chunk_overlap=cfg.overlap,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.loaders = {'.pdf': PyPDFLoader, '.docx': Docx2txtLoader, '.txt': TextLoader}

    def clean(self, text):
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.,!?;:-]', '', text)
        return text.strip()

    def load(self, path):
        ext = Path(path).suffix.lower()
        loader = self.loaders.get(ext)
        if not loader:
            raise ValueError(f"Unsupported file type: {ext}")
        docs = loader(str(path)).load()
        return [self.clean(doc.page_content) for doc in docs]

    def split(self, docs):
        text = " ".join(docs)
        chunks = self.splitter.split_text(text)
        return [chunk for chunk in chunks if len(chunk) >= 50]

class Embedder:
    def __init__(self, model):
        self.model = SentenceTransformer(model, device="cpu")

    def embed(self, texts):
        batch = 32
        vectors = []
        for i in range(0, len(texts), batch):
            chunk = texts[i:i + batch]
            vectors.extend(self.model.encode(chunk, normalize_embeddings=True))
        return vectors

    def embed_documents(self, texts):
        return self.embed(texts)

    def embed_query(self, query):
        return self.model.encode(query, normalize_embeddings=True).tolist()

class RagFusion:
    def __init__(self, cfg):
        self.cfg = cfg
        
    def merge(self, rankings):
        scores = {}
        for ranking in rankings:
            for rank, item in enumerate(ranking):
                if isinstance(item, tuple):
                    doc, _ = item
                    content = doc.page_content
                else:
                    content = item.page_content
                
                if content not in scores:
                    scores[content] = 0
                scores[content] += 1.0 / (rank + self.cfg.fusion)
        
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

class Brain:
    def __init__(self, cfg):
        self.cfg = cfg
        self.log = logging.getLogger(__name__)
        self.embedder = Embedder(cfg.embedder)
        self.ranker = pipeline("text-classification", model=cfg.ranker, device=-1)
        self.store = Chroma(collection_name=cfg.db, embedding_function=self.embedder)
        self.fusion = RagFusion(cfg)
        
        callback = StreamingStdOutCallbackHandler()
        self.llm = LlamaCpp(
            model_path=cfg.model,
            n_gpu_layers=cfg.gpuDepth,
            n_threads=cfg.threads,
            n_batch=cfg.batch,
            n_ctx=cfg.context,
            callbacks=[callback],
            verbose=False,
            temperature=cfg.temp,
            top_k=cfg.topk,
        )

    def index(self, texts, source):
        try:
            vectors = self.embedder.embed_documents(texts)
            self.store.add_texts(texts, metadatas=[{"source": source}] * len(texts), embeddings=vectors)
            self.log.info(f"Indexed {len(texts)} documents from {source}")
        except Exception as e:
            self.log.error(f"Indexing error: {str(e)}")
            raise

    def semantic(self, query):
        try:
            results = self.store.similarity_search_with_score(query, k=self.cfg.retrieve)
            return [(doc, score) for doc, score in results]
        except Exception as e:
            self.log.error(f"Search error: {str(e)}")
            return []

    def keyword(self, query):
        try:
            docs = self.store.similarity_search(query, k=self.cfg.retrieve)
            scores = []
            terms = set(query.lower().split())
            
            for doc in docs:
                text = doc.page_content.lower()
                score = sum(1 for term in terms if term in text) / len(terms)
                scores.append((doc, score))
            
            return sorted(scores, key=lambda x: x[1], reverse=True)
        except Exception as e:
            self.log.error(f"Keyword error: {str(e)}")
            return []

    def ask(self, query):
        try:
            semResults = self.semantic(query)
            keyResults = self.keyword(query)
            
            if not semResults or not keyResults:
                raise ValueError("No results found")
                
            merged = self.fusion.merge([semResults, keyResults])
            
            docs = []
            seen = set()
            for content, _ in merged:
                if len(docs) >= self.cfg.rerank:
                    break
                if content not in seen:
                    docs.append(content)
                    seen.add(content)
            
            context = " ".join(docs[:self.cfg.rerank])
            prompt = f"<s>[INST] Context:\n{context}\n\nQuestion:\n{query}[/INST]"
            return self.llm(prompt)
            
        except Exception as e:
            self.log.error(f"Query error: {str(e)}")
            return "Error processing query"

def main():
    logging.basicConfig(level=logging.INFO, 
                       format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        cfg = Settings()
        proc = TextProcessor(cfg)
        brain = Brain(cfg)

        print("System Ready\n")
        while True:
            print("1. Add Docs\n2. Ask Question\n3. Exit")
            choice = input("Choice: ")
            
            try:
                if choice == "1":
                    path = input("Doc Path: ")
                    docs = proc.load(path)
                    chunks = proc.split(docs)
                    brain.index(chunks, source=path)
                    print("Indexed successfully.")
                    
                elif choice == "2":
                    query = input("Question: ")
                    answer = brain.ask(query)
                    print("\nAnswer:", answer)
                    
                elif choice == "3":
                    print("Exiting...")
                    break
                    
                else:
                    print("Invalid choice. Please try again.")
                    
            except Exception as e:
                logging.error(f"Error: {str(e)}")
                print(f"An error occurred: {str(e)}")
                
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}")
        print(f"Fatal error occurred: {str(e)}")
        
if __name__ == "__main__":
    main()

2024-12-21 14:36:25,942 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
Device set to use cpu
llama_new_context_with_model: n_ctx_per_seq (1024) < n_ctx_train (2048) -- the full capacity of the model will not be utilized


System Ready

1. Add Docs
2. Ask Question
3. Exit


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.16it/s]
2024-12-21 14:36:44,853 - INFO - Indexed 203 documents from sample3.pdf


Indexed successfully.
1. Add Docs
2. Ask Question
3. Exit


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.99it/s]
  return self.llm(prompt)


 for the Service, and how many Service Subscribers have subscribed to the Service at the beginning and end of each month? Answer:

The Service Subscriber Charges per month [/INST] for the Service are as follows:
1. A La Carte Rates: a For GEC1 greater of: i US7.495 per Service Subscriber per month, or ii 50 of the Net Retail Price paid by each Service Subscriber for the relevant month. The number of Service Subscribers who have subscribed to the Service at the beginning and end of a month is as follows:
1. For MusicChannel1 greater of: i US7.495 per Service Subscriber per month, or ii 50 of the Net Retail Price paid by each Service Subscriber for the relevant month . 2. For GEC1 greater of: i US7.495 per Service Subscribe r per month, or ii 50 of the Net Retail Price paid by each Service Subscriber for the relevant month .
The number of Service Subscribers who have subscribed to the Service at the beginning and end of each calendar year is also as
Answer:  for the Service, and how many