In [1]:
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
db = Chroma(collection_name="Docs", embedding_function=embedder)

ids = db.get()['ids']
if ids:
    db.delete(ids)
    print(f"Deleted {len(ids)} documents")
else:
    print("No documents found")

  from .autonotebook import tqdm as notebook_tqdm


No documents found


In [2]:
import logging
from pathlib import Path
import psutil
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

class Settings:
    def __init__(self):
        self.embedder = "BAAI/bge-small-en-v1.5"
        self.ranker = "cross-encoder/ms-marco-TinyBERT-L-2"
        self.model = "models/Llama-2-7b-chat-hf.Q8_0.gguf"
        self.gpuDepth = 0
        self.threads = min(12, psutil.cpu_count(logical=False))
        self.batch = 512
        self.chunk = 384
        self.overlap = 64
        self.context = 1024
        self.temp = 0.7
        self.topk = 3
        self.retrieve = 8
        self.rerank = 3
        self.db = "Docs"
        self.fusion = 60

class TextProcessor:
    def __init__(self, cfg):
        self.cfg = cfg
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=cfg.chunk,
            chunk_overlap=cfg.overlap,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.loaders = {'.pdf': PyPDFLoader, '.docx': Docx2txtLoader, '.txt': TextLoader}

    def clean(self, text):
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.,!?;:-]', '', text)
        return text.strip()

    def load(self, path):
        ext = Path(path).suffix.lower()
        loader = self.loaders.get(ext)
        if not loader:
            raise ValueError(f"Unsupported file type: {ext}")
        docs = loader(str(path)).load()
        return [self.clean(doc.page_content) for doc in docs]

    def split(self, docs):
        text = " ".join(docs)
        chunks = self.splitter.split_text(text)
        return [chunk for chunk in chunks if len(chunk) >= 50]

class Embedder:
    def __init__(self, model):
        self.model = SentenceTransformer(model, device="cpu")

    def embed(self, texts):
        batch = 32
        vectors = []
        for i in range(0, len(texts), batch):
            chunk = texts[i:i + batch]
            vectors.extend(self.model.encode(chunk, normalize_embeddings=True))
        return vectors

    def embed_documents(self, texts):
        return self.embed(texts)

    def embed_query(self, query):
        return self.model.encode(query, normalize_embeddings=True).tolist()

class RagFusion:
    def __init__(self, cfg):
        self.cfg = cfg
        
    def merge(self, rankings):
        scores = {}
        for ranking in rankings:
            for rank, item in enumerate(ranking):
                if isinstance(item, tuple):
                    doc, _ = item
                    content = doc.page_content
                else:
                    content = item.page_content
                
                if content not in scores:
                    scores[content] = 0
                scores[content] += 1.0 / (rank + self.cfg.fusion)
        
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

class Brain:
    def __init__(self, cfg):
        self.cfg = cfg
        self.log = logging.getLogger(__name__)
        self.embedder = Embedder(cfg.embedder)
        self.ranker = pipeline("text-classification", model=cfg.ranker, device=-1)
        self.store = Chroma(collection_name=cfg.db, embedding_function=self.embedder)
        self.fusion = RagFusion(cfg)
        
        callback = StreamingStdOutCallbackHandler()
        self.llm = LlamaCpp(
            model_path=cfg.model,
            n_gpu_layers=cfg.gpuDepth,
            n_threads=cfg.threads,
            n_batch=cfg.batch,
            n_ctx=cfg.context,
            callbacks=[callback],
            verbose=False,
            temperature=cfg.temp,
            top_k=cfg.topk,
        )

    def index(self, texts, source):
        try:
            vectors = self.embedder.embed_documents(texts)
            self.store.add_texts(texts, metadatas=[{"source": source}] * len(texts), embeddings=vectors)
            self.log.info(f"Indexed {len(texts)} documents from {source}")
        except Exception as e:
            self.log.error(f"Indexing error: {str(e)}")
            raise

    def semantic(self, query):
        try:
            results = self.store.similarity_search_with_score(query, k=self.cfg.retrieve)
            return [(doc, score) for doc, score in results]
        except Exception as e:
            self.log.error(f"Search error: {str(e)}")
            return []

    def keyword(self, query):
        try:
            docs = self.store.similarity_search(query, k=self.cfg.retrieve)
            scores = []
            terms = set(query.lower().split())
            
            for doc in docs:
                text = doc.page_content.lower()
                score = sum(1 for term in terms if term in text) / len(terms)
                scores.append((doc, score))
            
            return sorted(scores, key=lambda x: x[1], reverse=True)
        except Exception as e:
            self.log.error(f"Keyword error: {str(e)}")
            return []

    def ask(self, query):
        try:
            semResults = self.semantic(query)
            keyResults = self.keyword(query)
            
            if not semResults or not keyResults:
                raise ValueError("No results found")
                
            merged = self.fusion.merge([semResults, keyResults])
            
            docs = []
            seen = set()
            for content, _ in merged:
                if len(docs) >= self.cfg.rerank:
                    break
                if content not in seen:
                    docs.append(content)
                    seen.add(content)
            
            context = " ".join(docs[:self.cfg.rerank])
            prompt = f"<s>[INST] Context:\n{context}\n\nQuestion:\n{query}[/INST]"
            return self.llm(prompt)
            
        except Exception as e:
            self.log.error(f"Query error: {str(e)}")
            return "Error processing query"

def main():
    logging.basicConfig(level=logging.INFO, 
                       format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        cfg = Settings()
        proc = TextProcessor(cfg)
        brain = Brain(cfg)

        print("System Ready\n")
        while True:
            print("1. Add Docs\n2. Ask Question\n3. Exit")
            choice = input("Choice: ")
            
            try:
                if choice == "1":
                    path = input("Doc Path: ")
                    docs = proc.load(path)
                    chunks = proc.split(docs)
                    brain.index(chunks, source=path)
                    print("Indexed successfully.")
                    
                elif choice == "2":
                    query = input("Question: ")
                    answer = brain.ask(query)
                    print("\nAnswer:", answer)
                    
                elif choice == "3":
                    print("Exiting...")
                    break
                    
                else:
                    print("Invalid choice. Please try again.")
                    
            except Exception as e:
                logging.error(f"Error: {str(e)}")
                print(f"An error occurred: {str(e)}")
                
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}")
        print(f"Fatal error occurred: {str(e)}")
        
if __name__ == "__main__":
    main()

2024-12-22 15:18:44,425 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
Device set to use cpu
llama_new_context_with_model: n_ctx_per_seq (1024) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


System Ready

1. Add Docs
2. Ask Question
3. Exit


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.04it/s]
2024-12-22 15:19:08,953 - INFO - Indexed 203 documents from sample3.pdf


Indexed successfully.
1. Add Docs
2. Ask Question
3. Exit


Batches: 100%|██████████| 1/1 [00:00<00:00, 17.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.41it/s]
  return self.llm(prompt)


  Based on the provided context, the answer to question 2 is:

Programmer shall have no later than 30 days from the respective date that Programmer grants any additional non-economic terms to Other Distributor.
Answer:   Based on the provided context, the answer to question 2 is:

Programmer shall have no later than 30 days from the respective date that Programmer grants any additional non-economic terms to Other Distributor.
1. Add Docs
2. Ask Question
3. Exit


Batches: 100%|██████████| 1/1 [00:00<00:00, 126.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 113.57it/s]


  Based on the given information, here are the service charges for a period of 3 months:

Month 1:

* Average number of Service Subscribers: .1 (as per the agreement)
* Number of Bulk-Bill Service Subscribers: x (to be determined based on the actual number of units subject to the Bulk-Bill Arrangement)
* Monthly retail rate XYZ charges per unit for programming only: $X
* Quotient obtained by dividing the monthly retail rate by the number of Service Subscribers: $Y
* License Fee payable by XYZ for Month 1: $0.1 x ($X / Y)

Month 2:

* Average number of Service Subscribers: .1 (as per the agreement)
* Number of Bulk-Bill Service Subscribers: x (to be determined based on the actual number of units subject to the Bulk-Bill Arrangement)
* Monthly retail rate XYZ charges per unit for programming only: $X
* Quotient obtained by dividing the monthly retail rate by the number
Answer:   Based on the given information, here are the service charges for a period of 3 months:

Month 1:

* Average nu

Batches: 100%|██████████| 1/1 [00:00<00:00, 30.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.59it/s]


  Sure! Based on the given text, here are two parties involved in the agreement:

1. XYZ
2. Programmer
Answer:   Sure! Based on the given text, here are two parties involved in the agreement:

1. XYZ
2. Programmer
1. Add Docs
2. Ask Question
3. Exit
Invalid choice. Please try again.
1. Add Docs
2. Ask Question
3. Exit


Batches: 100%|██████████| 1/1 [00:00<00:00, 99.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.05it/s]


  Based on the given text, XYZ is a party to the agreement. Specifically, it is mentioned that XYZ acknowledges receiving two receivers and decoders (IRDs) from the Programmer, and later in the text, XYZ grants the non-exclusive right and license to receive and distribute each linear Service, including all feeds and versions of each Service in the Territory via the Hi-Tech Distribution System to Subscribers during the Term. Therefore, XYZ is the company or entity that is party to the agreement with the Programmer.
Answer:   Based on the given text, XYZ is a party to the agreement. Specifically, it is mentioned that XYZ acknowledges receiving two receivers and decoders (IRDs) from the Programmer, and later in the text, XYZ grants the non-exclusive right and license to receive and distribute each linear Service, including all feeds and versions of each Service in the Territory via the Hi-Tech Distribution System to Subscribers during the Term. Therefore, XYZ is the company or entity that

Batches: 100%|██████████| 1/1 [00:00<00:00, 23.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.23it/s]


  Based on the provided context, XYZ is a company.
Answer:   Based on the provided context, XYZ is a company.
1. Add Docs
2. Ask Question
3. Exit
Exiting...


In [2]:
import logging
from pathlib import Path
import psutil
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import HuggingFaceEmbeddings
import numpy as np

class Config:
    def __init__(self):
        self.embed = "intfloat/e5-base-v2"
        self.rank = "cross-encoder/ms-marco-MiniLM-L-6-v2"
        self.model = "models/llama-2-13b-chat.Q4_K_M.gguf"
        self.gpu = 0
        self.cpu = min(8, psutil.cpu_count(logical=False))
        self.batch = 32
        self.chunk = 384
        self.overlap = 64
        self.ctx = 2048
        self.temp = 0.1
        self.top = 5
        self.get = 10
        self.best = 3
        self.db = "Knowledge"
        self.blend = 40
        self.hist = 8

class DocProc:
    def __init__(self, cfg):
        self.cfg = cfg
        self.split = RecursiveCharacterTextSplitter(
            chunk_size=cfg.chunk,
            chunk_overlap=cfg.overlap,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.load = {'.pdf': PyPDFLoader, '.docx': Docx2txtLoader, '.txt': TextLoader}

    def clean(self, txt):
        txt = re.sub(r'\s+', ' ', txt)
        txt = re.sub(r'[^\w\s\.,!?;:-]', '', txt)
        return txt.strip()

    def read(self, path):
        ext = Path(path).suffix.lower()
        if ext not in self.load:
            raise ValueError(f"Cannot load: {ext}")
        docs = self.load[ext](str(path)).load()
        return [self.clean(d.page_content) for d in docs]

    def chunk(self, docs):
        txt = " ".join(docs)
        parts = self.split.split_text(txt)
        return [p for p in parts if len(p) >= 50]

class ChatMem:
    def __init__(self, size):
        self.size = size
        self.hist = []

    def add(self, msg):
        self.hist.append(msg)
        if len(self.hist) > self.size:
            self.hist.pop(0)

    def get(self):
        return self.hist

class Brain:
    def __init__(self, cfg):
        self.cfg = cfg
        self.log = logging.getLogger(__name__)
        
        model_kwargs = {'device': 'cpu'}
        encode_kwargs = {'normalize_embeddings': True, 'batch_size': 32}
        self.embed = HuggingFaceEmbeddings(
            model_name=cfg.embed,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
        )
        
        self.rank = pipeline("text-classification", model=cfg.rank, device=-1)
        self.store = Chroma(
            collection_name=cfg.db,
            embedding_function=self.embed
        )
        self.mem = ChatMem(cfg.hist)
        
        self.llm = LlamaCpp(
            model_path=cfg.model,
            n_gpu_layers=cfg.gpu,
            n_threads=cfg.cpu,
            n_batch=cfg.batch,
            n_ctx=cfg.ctx,
            callbacks=[StreamingStdOutCallbackHandler()],
            verbose=False,
            temperature=cfg.temp,
            top_k=cfg.top,
            use_mmap=True,
            use_mlock=True
        )

    def save(self, txts, src):
        try:
            self.store.add_texts(txts, metadatas=[{"src": src}] * len(txts))
            self.log.info(f"Saved {len(txts)} from {src}")
        except Exception as e:
            self.log.error(f"Save failed: {str(e)}")
            raise

    def deep(self, q):
        try:
            hits = self.store.similarity_search_with_score(q, k=self.cfg.get)
            pairs = []
            for doc, score in hits:
                rank = self.rank([q, doc.page_content])[0]
                pairs.append((doc.page_content, float(rank["score"])))
            return sorted(pairs, key=lambda x: x[1], reverse=True)
        except Exception as e:
            self.log.error(f"Deep search failed: {str(e)}")
            return []

    def ask(self, q):
        try:
            hits = self.deep(q)
            if not hits:
                return "No relevant information found"

            ctx = "\n".join([d[0] for d in hits[:self.cfg.best]])
            hist = "\n".join(self.mem.get())
            
            prompt = f"<s>[INST] Chat History:\n{hist}\n\nContext:\n{ctx}\n\nQuestion:\n{q}[/INST]"
            ans = self.llm(prompt)
            
            self.mem.add(f"User: {q}\nAssistant: {ans}")
            return ans
            
        except Exception as e:
            self.log.error(f"Query failed: {str(e)}")
            return "Error processing query"

def main():
    logging.basicConfig(level=logging.INFO, 
                       format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        cfg = Config()
        proc = DocProc(cfg)
        brain = Brain(cfg)

        print("System Ready\n")
        while True:
            print("1. Add Docs\n2. Ask Question\n3. Exit")
            pick = input("Choice: ")
            
            try:
                if pick == "1":
                    path = input("Doc Path: ")
                    docs = proc.read(path)
                    parts = proc.chunk(docs)
                    brain.save(parts, path)
                    print("Added successfully")
                    
                elif pick == "2":
                    q = input("Question: ")
                    ans = brain.ask(q)
                    print("\nAnswer:", ans)
                    
                elif pick == "3":
                    print("Exiting...")
                    break
                    
                else:
                    print("Invalid choice")
                    
            except Exception as e:
                logging.error(f"Error: {str(e)}")
                print(f"Error: {str(e)}")
                
    except Exception as e:
        logging.error(f"Fatal: {str(e)}")
        print(f"Fatal error: {str(e)}")
        
if __name__ == "__main__":
    main()

  self.embed = HuggingFaceEmbeddings(
2024-12-22 18:55:32,956 - INFO - Load pretrained SentenceTransformer: intfloat/e5-base-v2
Device set to use cpu
llama_new_context_with_model: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


System Ready

1. Add Docs
2. Ask Question
3. Exit


2024-12-22 18:55:54,977 - INFO - Saved 203 from sample3.pdf


Added successfully
1. Add Docs
2. Ask Question
3. Exit


2024-12-22 18:56:23,070 - ERROR - Error: Cannot load: 


Error: Cannot load: 
1. Add Docs
2. Ask Question
3. Exit


2024-12-22 18:56:46,425 - INFO - Saved 203 from sample3.pdf


Added successfully
1. Add Docs
2. Ask Question
3. Exit


  ans = self.llm(prompt)


  Based on the chat history provided, the name of the distributor is:

Other Distributor
Answer:   Based on the chat history provided, the name of the distributor is:

Other Distributor
1. Add Docs
2. Ask Question
3. Exit




  Based on the chat history provided, the name of the distributor is "Other Distributor".

To answer your question, we need to know which specific aspect of the service subscriber charges you would like to know. Here are some options:

1. A La Carte Rates: The a la carte rates for GEC1 and MusicChannel1 are as follows:
	* For GEC1: US$7.495 per Service Subscriber per month, or 50% of the Net Retail Price paid by each Service Subscriber for the relevant month.
	* For MusicChannel1: US$7.495 per Service Subscriber per month, or 50% of the Net Retail Price paid by each Service Subscriber for the relevant month.
2. Package Rates: The rates for packages comprising of the service are as follows:
	* For GEC1 and MusicChannel1, the rate is US$7.495 per Service Subscriber per month.
3. Number of Service Subscribers: At the beginning of the relevant month, there were x number of Service Subscribers who had subscribed to the service
Answer:   Based on the chat history provided, the name of the di



Exiting...
