In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Relative paths to the two plain-text articles that are loaded and indexed below.
Lufthansa = './business_1.txt'
Japan = './business_13.txt'


In [3]:
# import the LangChain plain-text document loader
from langchain.document_loaders import TextLoader


In [4]:
# One TextLoader per source file.
loaders = [TextLoader(path) for path in (Lufthansa, Japan)]

# Flatten the results of every load() call into a single list.
docs = [doc for loader in loaders for doc in loader.load()]
    

#### Create Embeddings

Video Tutorial : https://youtu.be/oppILeoM_k0?si=5yyNjWmeBp7nR0XS

We will use the open-source, state-of-the-art BGE embeddings to embed the text; you can also use the OpenAI embeddings instead.

Embedding MTEB Leaderboard: https://huggingface.co/spaces/mteb/leaderboard

Model we will use: https://huggingface.co/BAAI/bge-base-en-v1.5

In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Hugging Face checkpoint of the BGE embedding model.
model_name = "BAAI/bge-base-en-v1.5"
# Run the embedding model on the GPU.
model_kwargs = dict(device='cuda')
# Normalize the embeddings - set True to compute cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)


#### Vector Store

Only create child splits

In [6]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever


In [7]:
# Storage layer that keeps the parent documents.
store = InMemoryStore()

# Chroma collection in which the embedded child chunks are indexed.
vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="full_documents",
)

# Splitter that produces the child documents (chunk_size=200).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

# No parent splitter is passed here, so the documents themselves act as parents.
retriever = ParentDocumentRetriever(
    docstore=store,
    child_splitter=child_splitter,
    vectorstore=vectorstore,
)


In [8]:
# Add the loaded documents to the retriever; no explicit ids are supplied.
retriever.add_documents(docs)


In [9]:
## The docstore holds one key per stored parent document, so this should
## yield two keys, because we added two documents.
list(store.yield_keys())


['60bd0f50-5dc1-46c5-a912-73dbb50a45ec',
 '1bb4a985-4a7e-4ff6-9fc4-5678d291036f']

#### Let's now call the vector store search functionality -

we should see that it returns small chunks (since we're storing the small chunks).

In [10]:
# Query the Chroma index directly: this searches the stored child chunks, not the parents.
sub_docs = vectorstore.similarity_search("Tell me what happened in the the year 2014 in Japan?")


In [11]:
# Show the text of the first child chunk returned by the search.
print(sub_docs[0].page_content)


1997. The news sent Tokyo shares to an eight-month high, as investors hoped for a recovery from the three quarters of contraction seen from April 2004 on. The Nikkei 225 index ended the day up 0.7%


In [12]:
# Character count of that chunk, for comparison with the child chunk_size of 200.
print(len(sub_docs[0].page_content))


197


Let's now retrieve from the overall retriever. This should return large documents - since it returns the documents where the smaller chunks are located.

In [13]:
# Same query through the ParentDocumentRetriever, which hands back the documents
# that contain the matching child chunks.
retrieved_docs = retriever.get_relevant_documents("Tell me what happened in the the year 2014 in Japan?")


In [14]:
# Length of the first retrieved result: the full parent document, not a small child chunk.
len(retrieved_docs[0].page_content)


1704

In [15]:
# Print the text of the first retrieved parent document.
print(retrieved_docs[0].page_content)


Industrial revival hope for Japan

Japanese industry is growing faster than expected, boosting hopes that the country's retreat back into recession is over.


Within the overall industrial output figure, there were signs of a pullback from the export slowdown. Among the best-performing sectors were key overseas sales areas such as cars, chemicals and electronic goods. With US growth doing better than expected the picture for exports in early 2005 could also be one of sustained demand. Electronics were also one of the keys to the improved domestic market, with products such as flat-screen TVs in high demand during January.



#### Retrieving larger chunks

Sometimes, the full documents can be too big to want to retrieve them as is. In that case, what we really want to do is to

- first split the raw documents into larger chunks,
- and then split those larger chunks into smaller chunks.
- We then index the smaller chunks, but on retrieval we retrieve the larger chunks (but still not the full documents).

In [16]:
# Two-level splitting: the child splitter (chunk_size=200) should create
# documents smaller than the ones made by the parent splitter (chunk_size=800).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=800)

# Storage layer for the parent documents of this second setup.
store = InMemoryStore()

# Vector store used to index the child chunks, in its own collection.
vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="split_parents",
)


In [17]:
# Retriever configured with both splitters: child chunks are indexed in the
# vector store, parent chunks are kept in the docstore.
parent_retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)


In [18]:
# Add the same documents to the retriever that uses parent and child splitters.
parent_retriever.add_documents(docs)


In [19]:
# Number of entries in the docstore after adding the documents with a parent splitter.
len(list(store.yield_keys()))


6

#### Vector Store Search

In [20]:
# Search the "split_parents" collection directly; results are child chunks.
sub_docs = vectorstore.similarity_search("Tell me what happened in the the year 2014 in Japan?")


In [21]:
# Character count of the first child chunk returned by the search.
print(len(sub_docs[0].page_content))


197


In [22]:
# Show the text of the first child chunk returned by the search.
first_child = sub_docs[0]
print(first_child.page_content)


1997. The news sent Tokyo shares to an eight-month high, as investors hoped for a recovery from the three quarters of contraction seen from April 2004 on. The Nikkei 225 index ended the day up 0.7%


In [23]:
# Same query through the retriever that was built with a parent splitter.
retrieved_docs = parent_retriever.get_relevant_documents("Tell me what happened in the the year 2014 in Japan?")


In [24]:
# Parent chunks: larger than the child chunks, but not the full document.
print(len(retrieved_docs[0].page_content))


795


In [25]:
# Print the text of the first retrieved parent chunk.
print(retrieved_docs[0].page_content)


Industrial output rose 2.1% - adjusted for the time of year - in January from a month earlier. At the same time, retail sales picked up faster than at any time since 1997. The news sent Tokyo shares to an eight-month high, as investors hoped for a recovery from the three quarters of contraction seen from April 2004 on. The Nikkei 225 index ended the day up 0.7% at 11,740.60 points, with the yen strengthening 0.7% against the dollar to 104.53 yen. Weaker exports, normally the engine for Japan's economy in the face of weak domestic demand, had helped trigger a 0.1% contraction in the final three months of last year after two previous quarters of shrinking GDP. Only an exceptionally strong performance in the early months of 2004 kept the year as a whole from showing a decline. The output


#### Retrieval QA with Parent Retriever

In [26]:
import torch 
import time
import transformers # HF import
from langchain import HuggingFacePipeline # To build the HF pipeline using Llama-2
from langchain import PromptTemplate,  LLMChain # To create PromptTemplate and LLMChain
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM , AutoModel  # For creating the model and tokenizer


In [27]:
from transformers import GPTQConfig

# GPTQ-quantized checkpoint that is wrapped in a text-generation pipeline below.
mname = "TheBloke/Mistral-7B-OpenOrca-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(mname)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# `use_exllama=False` replaces the deprecated `disable_exllama=True`, which
# transformers announces for removal in version 4.37.
quantization_config_loading = GPTQConfig(bits=4,
                                         use_exllama=False,
                                         use_cuda_fp16=True,
                                         tokenizer=tokenizer)

model = AutoModelForCausalLM.from_pretrained(mname,
                                             quantization_config=quantization_config_loading,
                                             device_map="auto")

# Inference only.
model.eval()

# top_k=1 restricts sampling to the single most likely token at each step.
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 128,
                do_sample=True,
                top_k=1,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
                )

# NOTE(review): generation settings are already fixed on `pipe`; confirm whether
# the temperature entry in model_kwargs has any effect here.
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. ['use_cuda_fp16', 'use_exllama', 'max_input_length', 'exllama_config', 'disable_exllama']) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
CUDA extension not installed.
CUDA extension not installed.


In [None]:
# https://medium.com/@onkarmishra/using-langchain-for-question-answering-on-own-data-3af0a82789ed

# Markers that wrap the instruction and the system prompt.
# NOTE(review): these look like Llama-2 chat markers, while the model loaded in
# this notebook is Mistral-7B-OpenOrca - confirm this is the intended format.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# System prompt used when the caller does not supply one.
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, 
while being safe. Your answers should not include any harmful, unethical, racist, sexist, 
toxic, dangerous, or illegal content. Please ensure that your responses are socially 
unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of 
answering something not correct. If you don't know the answer to a question, please don't 
share false information.

Always say "thanks for asking!" at the end of the answer. """


def get_prompt_template(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Build the full prompt string for one instruction.

    Args:
        instruction: Instruction text placed after the system section.
        new_system_prompt: System prompt text; defaults to DEFAULT_SYSTEM_PROMPT.

    Returns:
        The string B_INST + B_SYS + new_system_prompt + E_SYS + instruction + E_INST.
    """
    pieces = (B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)

# Instruction part of the QA prompt, with {context} and {question} as the
# template's input variables. The stray "\n'" that followed {question} has been
# removed so that no literal apostrophe line ends up in the prompt sent to the model.
instruction = '''Use the following pieces of context to answer the question at the end.
{context}
Question: {question}
Helpful Answer:'''

# Wrap the instruction with the system prompt and the instruction markers.
template = get_prompt_template(instruction)

prompt = PromptTemplate(template=template,
                        input_variables=["context", "question"])



In [28]:
#### Integrate the Parent Document Retriever into a RetrievalQA chain

from langchain.chains import RetrievalQA

# The missing comma after the `retriever` argument (a SyntaxError) is fixed here.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=parent_retriever,  # parent document retriever
    chain_type_kwargs={"prompt": prompt},
)


In [29]:
# Ask the RetrievalQA chain, which uses parent_retriever to fetch the context.
# NOTE(review): the query says 2014, while the retrieved article text refers to
# 2004 and early 2005 - confirm the intended year.
qa.run("Tell me what happened in the the year 2014 in Japan?")


  warn_deprecated(


' In 2014, industrial output in Japan increased by 2.1%, and retail sales also improved significantly. This led to a rise in Tokyo stock market, reaching an eight-month high.'

In [30]:

# Second question for the same chain, about the other indexed article.
# NOTE(review): "Lufthan" looks like a typo for "Lufthansa" - confirm.
qa.run("How is Lufthan doing currently?")


' Lufthansa is currently doing well as they have returned to profit in 2004 after posting huge losses in 2003.'