In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
import chromadb
import os




#Parallel Computing Libraries
import concurrent.futures


In [6]:
global embeddings
global text_Splitter
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
text_Splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)



In [21]:
import chromadb.config


def Vector_DB_Retriever(file_name):
    loader=PyPDFLoader(f"Data/{file_name}")
    file=loader.load()
    doc=text_Splitter.split_documents(file)
    client_settings=chromadb.config.Settings(
        is_persistent=False,
        anonymized_telemetry=False,
    )
    Doc=Chroma.from_documents(doc,embeddings,client_settings=client_settings,
                              collection_name=file_name.split('.')[0],
                              collection_metadata={'hnsw':'cosine'})
    retriever_Doc=Doc.as_retriever(search_type='mmr',search_kwargs={'k':10,"include_metadata":True})
    return retriever_Doc

In [9]:
files=[s for s in os.listdir('.//Data') if s.endswith('.pdf')]

In [22]:
with concurrent.futures.ThreadPoolExecutor(max_workers=len(files)) as excecutor:
    results=list(excecutor.map(Vector_DB_Retriever,files))

In [25]:
MGR=MergerRetriever(retrievers=results)

In [28]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever

In [29]:
cross_encoder_model=HuggingFaceCrossEncoder(model_name='BAAI/bge-reranker-base')
rerank_compressor=CrossEncoderReranker(model=cross_encoder_model,top_n=5)
pipeline=DocumentCompressorPipeline(transformers=[rerank_compressor])
Final_Retriever=ContextualCompressionRetriever(base_compressor=pipeline,base_retriever=MGR)



config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [38]:
question="Give me architecture of Comparison between GPT3 and LLAMA2"
context=Final_Retriever.invoke(question)

In [39]:
context

[Document(page_content='(such as BLOOM (Scao et al., 2022), LLaMa-1 (Touvron et al., 2023), and Falcon (Penedo et al., 2023)) that\nmatch the performance of closed pretrained competitors like GPT-3 (Brown et al., 2020) and Chinchilla\n(Hoffmann et al., 2022), but none of these models are suitable substitutes for closed “product” LLMs, such\nasChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyfine-tunedtoalignwithhuman', metadata={'page': 2, 'source': 'Data/LLAMA_2.pdf'}),
 Document(page_content='distribution, recovering strong performance in the few-shot setting.\nOn Natural Questions (NQs) GPT-3 achieves 14.6% in the zero-shot setting, 23.0% in the one-shot setting, and 29.9% in\nthe few-shot setting, compared to 36.6% for ﬁne-tuned T5 11B+SSM. Similar to WebQS, the large gain from zero-shot\nto few-shot may suggest a distribution shift, and may also explain the less competitive performance compared to', metadata={'page': 13, 'source': 'Data/GPT3.pdf'}),
 Document(page_content='

In [34]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

In [36]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
model_path="C:\\Users\\rouna\\Desktop\\OpenSource_Model\\mistral-7b-v0.1.Q8_0.gguf"

In [37]:
n_gpu_layers = 30  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.1,
    repeat_penalty=1.5,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=8192,  # Context window size
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from C:\Users\rouna\Desktop\OpenSource_Model\mistral-7b-v0.1.Q8_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:       

In [44]:
question="Give me architecture of Comparison between GPT3 and LLAMA2?"
context=Final_Retriever.invoke(question)


prompt_template=f"""
You are a chatbot that answer question mentioned in the end of the Prompt using Steps and Instruction given below

Steps:
1. First Understand the question.
2. Then find out the texts from the context which can asnwer the question.
3. Construct a detail asnwer from the Context given below
4. Summarize the answer and into a final answer and provide that as anwer nothing else

Instruction:
1. Don't give steps you done in between only give the final answer which you summarized
2. Always try to give elaborate answer
3. If possible make bullet points

Context:{context}

Question:{question}

Answer:
"""

In [45]:
print(prompt_template)


You are a chatbot that answer question mentioned in the end of the Prompt using Steps and Instruction given below

Steps:
1. First Understand the question.
2. Then find out the texts from the context which can asnwer the question.
3. Construct a detail asnwer from the Context given below
4. Summarize the answer and into a final answer and provide that as anwer nothing else

Instruction:
1. Don't give steps you done in between only give the final answer which you summarized
2. Always try to give elaborate answer
3. If possible make bullet points

Context:[Document(page_content='(such as BLOOM (Scao et al., 2022), LLaMa-1 (Touvron et al., 2023), and Falcon (Penedo et al., 2023)) that\nmatch the performance of closed pretrained competitors like GPT-3 (Brown et al., 2020) and Chinchilla\n(Hoffmann et al., 2022), but none of these models are suitable substitutes for closed “product” LLMs, such\nasChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyfine-tunedtoalignwithhuman', metadata=

In [46]:
llm.invoke(prompt_template)

The comparison is based on the following factors:- Architecture- The architectures are different in terms of their size as well as number\nof layers used for training. In case of LLama 1, it has a total of around \u00a735 billion parameters,\nto be precise.\nit uses an architecture that is similar to GPT but with some modifications such as the use of larger hidden states and attention heads which makes\nLLAMA more efficient in terms of performance. In case of Llama 2, it has a total number \u03bcmore parameters than LLAM1.\nit uses an architecture that is similar to GPT but with some modifications such as the use of larger hidden states and attention heads which makes\nLLAMA more efficient in terms of performance. In case of Llama 2, it has a total number \u03bcmore parameters than LLAM1.\nit uses an architecture that is similar to GPT but with some modifications such as the use of larger hidden states and attention heads which makes\nLLAMA more efficient in terms of performance. In ca


llama_print_timings:        load time =   49816.71 ms
llama_print_timings:      sample time =      83.84 ms /   256 runs   (    0.33 ms per token,  3053.36 tokens per second)
llama_print_timings: prompt eval time =  107180.70 ms /  1101 tokens (   97.35 ms per token,    10.27 tokens per second)
llama_print_timings:        eval time =  115268.58 ms /   255 runs   (  452.03 ms per token,     2.21 tokens per second)
llama_print_timings:       total time =  223306.08 ms /  1356 tokens


'The comparison is based on the following factors:- Architecture- The architectures are different in terms of their size as well as number\\nof layers used for training. In case of LLama 1, it has a total of around \\u00a735 billion parameters,\\nto be precise.\\nit uses an architecture that is similar to GPT but with some modifications such as the use of larger hidden states and attention heads which makes\\nLLAMA more efficient in terms of performance. In case of Llama 2, it has a total number \\u03bcmore parameters than LLAM1.\\nit uses an architecture that is similar to GPT but with some modifications such as the use of larger hidden states and attention heads which makes\\nLLAMA more efficient in terms of performance. In case of Llama 2, it has a total number \\u03bcmore parameters than LLAM1.\\nit uses an architecture that is similar to GPT but with some modifications such as the use of larger hidden states and attention heads which makes\\nLLAMA more efficient in terms of perfor