In [None]:
# Install the third-party packages for this notebook via the %pip magic.
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# that behave differently from the ones this notebook was written against.
%pip install langchain tiktoken chromadb pypdf transformers InstructorEmbedding
%pip install accelerate bitsandbytes sentencepiece Xformers
%pip install sentence-transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the tokenizer and the causal language model from the same checkpoint.
llm_checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(llm_checkpoint)
# device_map='auto' delegates weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(llm_checkpoint, device_map='auto')

In [None]:
# imports
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
# from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

from IPython.display import Markdown, display
import chromadb

In [None]:
# Load every *.txt file found in the corpus folder as a document.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
corpus_directory = '/mnt/Senai_grp/RahulM/pfizerproj/bin/corpus_test 2'
loader = DirectoryLoader(
    corpus_directory,
    glob="*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

# Break the documents into chunks of size 1000 with an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Report how many chunks the splitter produced.
num_chunks = len(texts)
print(f"Number of text chunks: {num_chunks}")

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
# Embedding model used to vectorise the text chunks for retrieval.
model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # Use the `device` selected earlier ('cuda' when available, else 'cpu')
    # instead of a hardcoded 'cuda', so this cell also runs on CPU-only machines.
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)

#  Use the BGE embeddings to encode text chunks (texts) and perform further tasks
#  Example: get embeddings for the text chunks. LangChain embedding classes expose
#  embed_documents / embed_query (there is no .encode method) and expect strings:
# embeddings = model_norm.embed_documents([doc.page_content for doc in texts])

In [None]:
# Embed the chunks with model_norm and index them in a Chroma store backed by
# the 'db' directory.
# NOTE(review): if 'db' already holds data from an earlier run, re-executing
# this cell may add the same chunks again — confirm before re-running.
persist_directory = 'db'
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=model_norm,
    persist_directory=persist_directory,
)

In [None]:
# Expose the vector store as a retriever; k=3 is forwarded as a search argument.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 3},
)

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2048,
    # Greedy decoding, stated explicitly. The earlier temperature=0 / top_p=0.95
    # are sampling-only settings: with sampling off they are ignored (and only
    # trigger warnings), and temperature=0 is rejected once sampling is turned on.
    do_sample=False,
    repetition_penalty=1.15
)
# Adapter that lets LangChain chains call the pipeline as an LLM.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Question-answering chain: the retriever supplies context and the local LLM
# writes the answer. Source documents are returned alongside the result so they
# can be listed after the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The text is cut at each newline character, every resulting line is wrapped
    independently with ``textwrap.fill``, and the pieces are joined back
    together with newline characters.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill`` (default 110).

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the answer from a chain response, then the source of each document.

    Args:
        llm_response: Mapping with a ``'result'`` entry (the answer text) and a
            ``'source_documents'`` entry (an iterable of objects whose
            ``metadata`` mapping has a ``'source'`` key).
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])

In [None]:
# Ask one question against the corpus and print the answer with its sources.
query = "Examples of useful water soluble excipients in the present invention includes?"
# The question is passed under the chain's "query" input key.
llm_response = qa_chain({"query": query})
process_llm_response(llm_response)