In [1]:
import os
import time

import torch
import chromadb
import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores import Chroma
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering,AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig
# import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def pdf_to_text(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # The context manager closes the file even if parsing or extraction raises.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` is defensive: a page whose extraction yields nothing
        # contributes an empty string instead of breaking the join.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Source PDF and chunking parameters, named so they are easy to find and change.
PDF_PATH = './2024.pdf'
CHUNK_SIZE = 400
CHUNK_OVERLAP = 20

# Initialize the text splitter (embeddings are set up separately)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# Convert PDF to text
text = pdf_to_text(PDF_PATH)
# split_text returns plain strings; wrap each chunk in a Document because
# split_documents works on Document objects, not on str.
docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
# NOTE(review): `docs` were already chunked by this same splitter, so this second
# pass presumably returns them unchanged -- confirm before removing it.
texts = text_splitter.split_documents(docs)

In [None]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk

persist_directory = 'db'
# Embeddings are computed with a HuggingFace model (BAAI/bge-large-en-v1.5),
# not with OpenAI.
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

# Chroma.from_documents adds the chunks to whatever is already stored in
# persist_directory, so re-running this cell would index every chunk again and
# retrieval would return duplicates. Reuse an existing index when there is one.
# To rebuild (e.g. after the source PDF changed), delete persist_directory first.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)

In [None]:
# Persist the db to disk
vectordb.persist()
# Drop the in-memory handle; the store is opened again from persist_directory
# in the following cell.
vectordb = None


In [None]:
# Re-open the store that was written to persist_directory; the same embedding
# object is supplied so queries are embedded the same way as the stored chunks.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [None]:
# mistralai--Mistral-7B
# Disabled experiment: everything below is wrapped in a bare triple-quoted
# string, so none of it executes.
# NOTE(review): as the only expression in this cell the string itself may be
# echoed as cell output, and it embeds absolute local paths -- consider turning
# it into `#` comments or deleting the cell.
"""
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
# Create a tokenizer object by loading the pretrained "Intel/dynamic_tinybert" tokenizer.
# "meta-llama/Llama-2-7b-chat-hf"
# "deepset/roberta-base-squad2"
# model_path="/home/administrator/logs_jai/jai/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/73068f3702d050a2fd5aa2ca1e612e5036429398"
model_path="/home/administrator/logs_jai/jai/v/models--CohereForAI--c4ai-command-r-v01/snapshots/9c33b0976099d0f406f0d007613676fe42b78e3b"
tokenizer = AutoTokenizer.from_pretrained(model_path,cache_dir=model_path)
# tokenizer = LlamaTokenizer.from_pretrained(model_path, cache_dir=model_path)

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
# model = AutoModelForQuestionAnswering.from_pretrained(model_path,cache_dir=model_path)
model = LlamaForCausalLM.from_pretrained(model_path, cache_dir=model_path)
"""

In [None]:
# torch.cuda.empty_cache()
# torch.backends.cudnn.benchmark=True
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:16 "

In [None]:
# zephyr-7b --> HuggingFaceH4/zephyr-7b-beta
# Other checkpoints noted as alternatives:
#   NousResearch/Hermes-2-Pro-Mistral-7B
#   Xwin-LM/Xwin-LM-13B-V0.1
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# Downloaded weights/tokenizer files are cached in the working directory.
MODEL_CACHE_DIR = "./"

# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally -- confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    cache_dir=MODEL_CACHE_DIR,
    use_fast=False,
    trust_remote_code=True,
)

# 4-bit loading is requested through quantization_config; passing
# load_in_4bit=True directly to from_pretrained is the deprecated spelling.
# device_map="auto" is intentionally left unset, as in the original cell.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=MODEL_CACHE_DIR,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

In [None]:
# Specify the model name you want to use
# model_name = "./models--mistralai--Mistral-7B-Instruct-v0.1"

# Load the tokenizer associated with the specified model
# tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# Define a question-answering pipeline using the model and tokenizer
# question_answerer = pipeline(
#     "text-generation", 
#     model=model, 
#     tokenizer=tokenizer,'
#     return_tensors='pt'
# )

# Text-generation pipeline around the loaded model and tokenizer.
generate_text = pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # generation parameters are passed here too
    do_sample=True,  # without sampling enabled, temperature/top_p/top_k below are ignored
    temperature=0.1,  # 'randomness' of outputs; lower values are closer to deterministic
    top_p=0.15,  # select from top tokens whose probability add up to 15%
    top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
    max_new_tokens=2048,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

# Wrap the text-generation pipeline so it can be used as a LangChain LLM.
# Generation settings are configured on the pipeline above, so no model_kwargs
# are passed here.
llm = HuggingFacePipeline(
    pipeline=generate_text,
    # model_kwargs={"temperature": 0.1, "max_length": 512},
)

In [None]:
# Retriever over the vector store; "k" is passed through as a search kwarg.
retriever_search_kwargs = {"k": 4}
retriever = vectordb.as_retriever(search_kwargs=retriever_search_kwargs)

In [None]:
# Timestamp taken before the chain is built.
# NOTE(review): an elapsed time measured from this point also includes chain
# construction and any pause before the query cell is run.
t1 = time.time()

# Retrieval QA chain of type "stuff" over the llm and retriever; source
# documents are not included in the result.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)

In [None]:
# Question sent to the QA chain. The leading and trailing newlines are part of
# the string.
query="""
write a code for text summarization llm and that should run on NVIDA GPU and it should take only transformers/hugging face models as input
"""

In [None]:
# Restart the timer immediately before the call so that t2 - t1 covers only
# this query, not chain construction or time spent between running cells.
t1 = time.time()
l = qa_chain(query)
t2 = time.time()

In [None]:
# print(l['query'])
# Show the answer followed by the elapsed wall-clock time in seconds.
elapsed_seconds = t2 - t1
print(l['result'], elapsed_seconds)

In [None]:
# ## Cite sources
# def process_llm_response(llm_response):
#     print(llm_response['result'])
#     print('\n\nSources:')
#     for source in llm_response["source_documents"]:
#         print(source.metadata['source'])

In [None]:
# # full example
# query = """
# give me some example on pointer to pointers in c with an example
# """
# llm_response = qa_chain(query)
# print(llm_response['result'])
# # process_llm_response(llm_response)