### Importing required modules, functions and libraries

In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does not rely on automagic.
%pip install tiktoken

In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does not rely on automagic.
%pip install torch

In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does not rely on automagic.
# NOTE(review): the requirements path is an absolute, machine-specific location — adjust it when running elsewhere.
%pip install -r C:\Programming\Gadgeon\requirements.txt


In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does not rely on automagic.
%pip install sentence_transformers

In [None]:

# Explicit %pip magic: installs into the running kernel's environment and does not rely on automagic.
%pip install numpy scipy

In [None]:
import numpy as np
import pandas as pd
import tiktoken
import torch
from langchain import LLMChain
from langchain import PromptTemplate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

### Text processing: chunking the document and retrieving context with the embedding model

In [None]:

class TextProcessor:
    """Retrieve the passages of a text document that best match a question.

    The document is split into sentences on '.', the sentences are packed into
    chunks of at most ``max_tokens`` tokens (counted with tiktoken's
    ``cl100k_base`` encoding), every chunk is embedded with a
    sentence-transformers model, and the ``top_k`` chunks with the highest
    cosine similarity to the question are concatenated into ``self.context``.
    """

    def __init__(self, document_path, model_name='msmarco-distilbert-base-tas-b', max_tokens=100, top_k=3,
                 encoding=None):
        """Load the tokenizer and the embedding model.

        Args:
            document_path: Path of the text file to index.
            model_name: Name passed to ``SentenceTransformer``.
            max_tokens: Upper bound on the token count of one chunk.
            top_k: Maximum number of chunks placed in the context.
            encoding: Text encoding used to read the document. ``None``
                (the default) keeps Python's platform-dependent default.
        """
        self.document_path = document_path
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.top_k = top_k
        self.encoding = encoding
        self.tokenizer = tiktoken.get_encoding('cl100k_base')
        self.model = SentenceTransformer(model_name)
        self.texts = ""          # raw document text, filled by read_document()
        self.sentences = []      # stripped, non-empty sentences
        self.df = None           # DataFrame of chunks, filled by process()
        self.embeddings = None   # chunk embeddings, filled by encode_sentences()
        self.context = ""

    def read_document(self):
        """Read the whole document into ``self.texts``."""
        with open(self.document_path, 'r', encoding=self.encoding) as f:
            self.texts = f.read()

    def tokenize_sentences(self):
        """Split ``self.texts`` into sentences on '.'.

        Surrounding whitespace is stripped and empty fragments (for example
        the one after a trailing period) are dropped, so chunks are not
        polluted with blank sentences or stray newlines.
        """
        stripped = (piece.strip() for piece in self.texts.split('.'))
        self.sentences = [sentence for sentence in stripped if sentence]

    def calculate_tokens(self):
        """Return the token count of every sentence, in order."""
        n_tokens = [len(self.tokenizer.encode(" " + sentence)) for sentence in self.sentences]
        return n_tokens

    def split_sentences(self):
        """Pack consecutive sentences into chunks of at most ``max_tokens`` tokens.

        A sentence that is longer than ``max_tokens`` on its own is skipped;
        the chunk being built is closed first so that the sentences before and
        after the skipped one never end up in the same chunk.

        Returns:
            list[str]: The chunks, each ending with a period.
        """
        chunks = []
        current = []
        tokens_so_far = 0

        for sentence, n_tok in zip(self.sentences, self.calculate_tokens()):
            if n_tok + tokens_so_far > self.max_tokens:
                # Close the running chunk, but never emit an empty "." chunk.
                if current:
                    chunks.append(". ".join(current) + ".")
                current = []
                tokens_so_far = 0

            if n_tok > self.max_tokens:
                continue

            current.append(sentence)
            # +1 accounts for the separator added when the chunk is joined.
            tokens_so_far += n_tok + 1

        # Flush whatever is left; without this the tail of the document
        # (or the entire document when it fits in one chunk) would be lost.
        if current:
            chunks.append(". ".join(current) + ".")

        return chunks

    def create_dataframe(self, chunks):
        """Build a DataFrame with a ``text`` column and its ``n_tokens`` count."""
        df = pd.DataFrame(chunks, columns=['text'])
        df['n_tokens'] = df.text.apply(lambda x: len(self.tokenizer.encode(x)))
        return df

    def encode_sentences(self):
        """Embed every chunk in ``self.df`` and store the result in ``self.embeddings``."""
        chunk_texts = self.df['text'].tolist()
        self.embeddings = self.model.encode(chunk_texts)

    def encode_question(self, question):
        """Return the embedding of ``question``."""
        embed_q = self.model.encode(question)
        return embed_q

    def find_top_results(self, embed_q):
        """Return ``torch.topk`` over the cosine similarity of the question to every chunk.

        ``k`` is capped at the number of chunks, because ``torch.topk`` raises
        when asked for more elements than exist.
        """
        cosine_scores = util.cos_sim(embed_q, self.embeddings)[0]
        k = min(self.top_k, cosine_scores.shape[0])
        top_results = torch.topk(cosine_scores, k=k)
        return top_results

    def generate_context(self, top_results):
        """Concatenate the selected chunks into ``self.context``, best match first.

        Every chunk is followed by a single space.
        """
        chunk_texts = self.df['text']
        self.context = "".join(chunk_texts.iloc[idx.item()] + " " for idx in top_results.indices)

    def process(self, question):
        """Run the full pipeline for ``question`` and store the result in ``self.context``.

        The document is re-read and re-embedded on every call. When the
        document produces no chunks the context is left empty.
        """
        self.read_document()
        self.tokenize_sentences()
        chunks = self.split_sentences()
        self.df = self.create_dataframe(chunks)
        if self.df.empty:
            # Nothing to embed or rank.
            self.context = ""
            return
        self.encode_sentences()
        embed_q = self.encode_question(question)
        top_results = self.find_top_results(embed_q)
        self.generate_context(top_results)


if __name__ == "__main__":
    # Stand-alone check of the retrieval step: print the context selected for a sample question.
    # Raw string so the backslashes of the Windows path can never be read as escape sequences.
    processor = TextProcessor(r"C:\Programming\documents.txt")
    question = "what is NLP?"
    processor.process(question)
    print("Context: \n{}".format(processor.context))

### Loading the locally downloaded LLM and building the prompt chain


In [None]:

class LanguageModelProcessor:
    """Answer a question from a given context with a local LlamaCpp model.

    The model and the LangChain ``LLMChain`` are created lazily, the first
    time a response is requested, and are then reused for later calls.
    """

    def __init__(self, model_path, n_gpu_layers=10, n_batch=512):
        """Store the model settings and build the prompt template.

        Args:
            model_path: Path of the model file passed to ``LlamaCpp``.
            n_gpu_layers: Forwarded to ``LlamaCpp`` as ``n_gpu_layers``.
            n_batch: Forwarded to ``LlamaCpp`` as ``n_batch``.
        """
        self.model_path = model_path
        self.n_gpu_layers = n_gpu_layers
        self.n_batch = n_batch
        self.callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
        self.llm = None
        self.llm_chain = None
        # The continuation lines of this literal keep their source indentation inside the prompt text.
        self.template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

        {context}

        Question: {question}
        Answer:"""
        self.prompt = PromptTemplate(template=self.template, input_variables=["context", "question"])

    def initialize_llm(self):
        """Create the ``LlamaCpp`` model from the stored settings."""
        self.llm = LlamaCpp(
            model_path=self.model_path,
            n_gpu_layers=self.n_gpu_layers,
            n_batch=self.n_batch,
            callback_manager=self.callback_manager,
            verbose=True,
        )

    def initialize_llm_chain(self):
        """Create the ``LLMChain`` that binds the prompt to the current model."""
        self.llm_chain = LLMChain(prompt=self.prompt, llm=self.llm)

    def _ensure_initialized(self):
        """Create the model and the chain if they do not exist yet."""
        if self.llm is None:
            self.initialize_llm()
        if self.llm_chain is None:
            self.initialize_llm_chain()

    def get_llm_response(self, context, question):
        """Run the chain on ``context`` and ``question`` and return its output."""
        self._ensure_initialized()
        response = self.llm_chain.run({"context": context, "question": question})
        return response

    def process(self, context, question):
        """Return the model's answer to ``question`` given ``context``.

        The model is loaded only on the first call; call ``initialize_llm()``
        and ``initialize_llm_chain()`` explicitly to force a reload.
        """
        return self.get_llm_response(context, question)


### Retrieving context and generating an answer with the LLM


In [None]:

if __name__ == "__main__":
    # Step 1: retrieve the chunks of the document that best match the question.
    # Raw strings so the backslashes of the Windows paths can never be read as escape sequences.
    text_processor = TextProcessor(r"C:\Programming\documents.txt")
    question = "what's NLP"
    text_processor.process(question)
    context = text_processor.context
    print("Context: \n{}".format(context))

    # Step 2: let the local LLM answer the question from the retrieved context.
    processor = LanguageModelProcessor(model_path=r"C:\Programming\Gadgeon\GPT4All-13B-snoozy.ggmlv3.q4_0.bin")
    response = processor.process(context, question)
    print(response)

### Running a query

In [None]:

# Answer an interactive query with LangChain's RetrievalQA chain over a Chroma index.
# Relies on `text_processor` and `processor` from the previous cell having been run.
query = input("enter query")

# Embed with the same sentence-transformers model the TextProcessor was configured with,
# and index the chunks it produced.
embeddings = HuggingFaceEmbeddings(model_name=text_processor.model_name)
docsearch = Chroma.from_texts(text_processor.df["text"].tolist(), embeddings)

# Number of chunks retrieved from the index and handed to the LLM per query.
MIN_DOCS = 1

# Reuse the LlamaCpp model that the previous cell already loaded.
llm = processor.llm
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_type="similarity", search_kwargs={"k": MIN_DOCS}),
)
# Run the query; as the last expression of the cell the answer is displayed.
qa.run(query)
