## Imports


In [1]:
%%capture
!pip install transformers==4.35.2

In [2]:
%%capture
!pip install langchain  accelerate tiktoken openai gradio torch accelerate \
safetensors sentence-transformers faiss-gpu bitsandbytes pypdf typing-extensions

In [3]:
%%capture
!pip uninstall typing-extensions --yes
!pip install typing-extensions

# The Processes involved in creating the chatbot

## Import required packages

In [4]:
%%capture
!pip install PyPDF2

In [5]:

import torch
import PyPDF2 # pdf reader
import time
from pypdf import PdfReader
from io import BytesIO
from langchain.prompts import PromptTemplate # for custom prompt specification
from langchain.text_splitter import RecursiveCharacterTextSplitter # splitter for chunks
from langchain.embeddings import HuggingFaceEmbeddings # embeddings
from langchain.vectorstores import FAISS # vector store database
from langchain.chains import RetrievalQA # qa and retriever chain
from langchain.memory import ConversationBufferMemory # for model's memoy on past conversations
from langchain.document_loaders import PyPDFDirectoryLoader # loader fo files from firectory

from langchain.llms.huggingface_pipeline import HuggingFacePipeline # pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig



In [6]:
# Initialize some variables

In [7]:
CHUNK_SIZE = 100  # maximum number of characters per text chunk

# Embed on the GPU when one is available; otherwise fall back to the CPU so this
# cell still runs on a CPU-only runtime instead of failing on a missing device.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face embedding model used for both the text chunks and the queries.
# Alternative model: "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": EMBEDDING_DEVICE},
)

# Quantisation settings for the transformer model: the weights are loaded in
# 4-bit NF4 with double quantisation and bfloat16 compute. This lowers the
# model's numerical precision to save memory, at some cost in accuracy.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## The LLM in use - `DeciLM-7B-instruct`

In [8]:
# Scratch 4x4 tensor of uniform random values; no other visible cell reads it.
tensor_1 = torch.rand((4, 4))

In [9]:
model_id = "Deci/DeciLM-7B-instruct"  # Hugging Face Hub repo id of the LLM
# Run on the GPU if one is available, otherwise on the CPU. Placement of the
# model weights themselves is delegated to `device_map="auto"` below.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code=True lets transformers run the modelling code shipped in the
# model repo (see the download log below); pin a revision to avoid silent updates.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             trust_remote_code=True,
                                             device_map="auto")

# Alternative: load the weights 4-bit quantised with `quant_config` to reduce
# memory use (at some cost in accuracy).
# model = AutoModelForCausalLM.from_pretrained(model_id,
#                                              trust_remote_code=True,
#                                              device_map = "auto",
#                                              quantization_config=quant_config)

# Text-generation pipeline: beam search (5 beams), up to 200 new tokens, with
# repetition controls. return_full_text=True keeps the prompt in the raw output.
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                return_full_text=True,
                max_new_tokens=200,
                repetition_penalty=1.1,
                num_beams=5,
                no_repeat_ngram_size=4,
                early_stopping=True)

# Wrap the pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

configuration_decilm.py:   0%|          | 0.00/576 [00:00<?, ?B/s]

version_check.py:   0%|          | 0.00/383 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B-instruct:
- version_check.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


(…)sformers_v4_35_2__configuration_llama.py:   0%|          | 0.00/9.20k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B-instruct:
- transformers_v4_35_2__configuration_llama.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B-instruct:
- configuration_decilm.py
- version_check.py
- transformers_v4_35_2__configuration_llama.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_decilm.py:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

transformers_v4_35_2__modeling_llama.py:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

(…)ers_v4_35_2__modeling_attn_mask_utils.py:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B-instruct:
- transformers_v4_35_2__modeling_attn_mask_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B-instruct:
- transformers_v4_35_2__modeling_llama.py
- transformers_v4_35_2__modeling_attn_mask_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B-instruct:
- modeling_decilm.py
- transformers_v4_35_2__modeling_llama.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



## Loading the PDFs from directory with PyPDFDirectoryLoader

We load the files with the `PyPDFDirectoryLoader` document loader, which returns `Document` objects. A `Document` is an object with `page_content` and `metadata`. The metadata can include the source, page number, etc.


In [11]:
pdf_paths = "/content/"

# Load every file matching *.pdf found under `pdf_paths` as Document objects.
loader = PyPDFDirectoryLoader(path=pdf_paths, glob="*.pdf")
documents = loader.load()

# Number of Document objects loaded, followed by a look at the first one.
print(len(documents))
documents[0]

14


Document(page_content='10\nricket grew out of the many stick-and-\nball games played in England 500 years\nago. The word ‘bat’ is an old English word that\nsimply means stick or club. By the seventeenth\ncentury, cricket had evolved enough to be\nrecognisable as a distinct game. T ill the middle of\nthe eighteenth century, bats were roughly the same\nshape as hockey sticks, curving outwards at the\nbottom. There was a simple reason for this: the ball\nwas bowled underar m, along the gr ound and the\ncurve at the end of the bat gave the batsman the\nbest chance of making contact.\nOne of the peculiarities of cricket is that a\nTest match can go on for five days and still endCBefore you read\nSport is an integral part of a healthy life. It is one way in\nwhich we amuse ourselves, compete with each other and\nstay fit. Among the various sports such as hockey, football\nand tennis, cricket appears to be the most appealing\nnational entertainment today. How much do we really\nknow about the

In [12]:
# Show the text and the metadata of the first loaded document side by side.
(documents[0].page_content, documents[0].metadata)

('10\nricket grew out of the many stick-and-\nball games played in England 500 years\nago. The word ‘bat’ is an old English word that\nsimply means stick or club. By the seventeenth\ncentury, cricket had evolved enough to be\nrecognisable as a distinct game. T ill the middle of\nthe eighteenth century, bats were roughly the same\nshape as hockey sticks, curving outwards at the\nbottom. There was a simple reason for this: the ball\nwas bowled underar m, along the gr ound and the\ncurve at the end of the bat gave the batsman the\nbest chance of making contact.\nOne of the peculiarities of cricket is that a\nTest match can go on for five days and still endCBefore you read\nSport is an integral part of a healthy life. It is one way in\nwhich we amuse ourselves, compete with each other and\nstay fit. Among the various sports such as hockey, football\nand tennis, cricket appears to be the most appealing\nnational entertainment today. How much do we really\nknow about the game called ‘cricket

## Splitting the documents

We cannot pass entire documents to the language model, because a model can only process a limited number of tokens; for instance, DeciLM-7B-instruct has a limit of 4096. Large chunks also make it hard for the model to find the relevant information in very long prompts.

So we need to split the documents into small text chunks

We split the documents with `RecursiveCharacterTextSplitter`. We set:

* `chunk_size = 100`(100 characters) and

* `chunk_overlap =  100`(100 characters of overlap between chunks. The overlap helps mitigate the possibility of separating a statement from important context related to it)

In [13]:
# Cut the loaded documents into chunks of at most CHUNK_SIZE characters.
# NOTE(review): the overlap (100) equals CHUNK_SIZE (100), so neighbouring
# chunks can repeat most of each other - confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=CHUNK_SIZE,
)
splits = text_splitter.split_documents(documents)

# Report how many chunks the splitter produced.
print(f"We have, {len(splits)} chunks in memory")

We have, 483 chunks in memory


## Creating a Vector store

Now that we have the text chunks in memory (483 in the run above), we need to store and index them so that we
can search them later in our question-answering app.

We use a vector store database for this, in this case FAISS. We embed the text chunks and store them. This
makes it easy to run a similarity check between the stored chunks and our query to the bot.


We use Hugging Face embeddings.

In [14]:
# Embed every chunk and index the vectors in a FAISS store for similarity search.
vectorstore_db = FAISS.from_documents(documents=splits, embedding=embeddings)

## Next we create a Question Answering chain.

First, we want an application that lets the user ask a question, searches for documents relevant to that question, passes the retrieved documents and the initial question to a model, and finally returns an answer.

To allow this, we use a `Retriever` interface, which wraps an index that can return relevant documents given a string query. It uses the similarity search capabilities of a vector store to facilitate retrieval. In our case, we will convert the `vectorstore_db` we created above into a retriever.

In [15]:

# Retriever over the FAISS store: runs a similarity search and returns the
# 6 stored chunks whose embeddings are closest to the question's embedding.
retriever = vectorstore_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

For instance based on our PDFs, we can get the relevant documents based on a query.

PDF 1 : "History of cricket"

In [16]:
# Fetch the chunks most similar to a sample question.
retrieved_relevant_docs = retriever.get_relevant_documents("when was world's first crciket club formed")

# How many chunks came back, then the text of the best match.
print(f"Retrieved documents: {len(retrieved_relevant_docs)}")
f" {retrieved_relevant_docs[0].page_content}"


Retrieved documents: 6


' apart”. The world’s first cricket club was for med\nin Hambledon in the1760s and the Marylebone'

Now that we have seen how the retriever works, we will implement the question-answering chain using `RetrievalQA`
from LangChain. However, to improve the user experience and the quality of the conversation with the chatbot,
we need to implement `Memory`.

### Implementing memory

Large Language Models are by default `stateless` meaning each incoming query is processed independently of other interactions. The only thing that exists for a stateless agent is the current input, nothing else.

For our chatbot, remembering previous interactions is important. So we will allow the DeciLM-7B-instruct LLM to remember previous interactions with the user.


We will use the `ConversationBufferMemory` from LangChain.

First, we create a custom prompt

### Create a custom prompt

In [17]:
# Prompt sent to the LLM. It has three slots: the retrieved chunks ({context}),
# the conversation so far ({history}) and the user's question ({question}).
custom_prompt_template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context and answer the question at the end.
If you don't know the answer just say you d not know an do not try to make up the answer nor try to use outside souces to answer. Keep the answer as concise as possible.
Context= {context}
History = {history}
Question= {question}
Helpful Answer:
"""

prompt = PromptTemplate(
    input_variables=["question", "context", "history"],
    template=custom_prompt_template,
)

## Final question answering chain with memory

In [18]:
# "Stuff" RetrievalQA chain: the retrieved chunks are inserted into `prompt`
# together with the buffered conversation history, and the source documents are
# returned alongside the answer.
# NOTE(review): this builds a fresh retriever with default settings instead of
# reusing `retriever` (k=6) defined earlier - confirm that is intended.
qa_chain_with_memory = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=dict(
        verbose=True,
        prompt=prompt,
        memory=ConversationBufferMemory(
            input_key="question",
            memory_key="history",
            return_messages=True,
        ),
    ),
)



Try some queries now and see the answers

In [19]:
# First question for the chain; the returned dict is displayed as the cell output.
query = "Based on the Deep unlearning pdf,when was first cricket club formed?"
qa_chain_with_memory(dict(query=query))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are an assistant for question-answering tasks. Use the following pieces of retrieved context and answer the question at the end.
If you don't know the answer just say you d not know an do not try to make up the answer nor try to use outside souces to answer. Keep the answer as concise as possible.
Context= THE STORY OF  CRICKET /141
apart”. The world’s first cricket club was for med

Cricket Club (MCC) was founded in 1787.
During the 1760s and 1770s it became common

apart”. The world’s first cricket club was for med
in Hambledon in the1760s and the Marylebone

THE STORY OF  CRICKET /145
the foundation of the
Indian National Congress
in 1885, an organisation
History = []
Question= Based on the Deep unlearning pdf,when was first cricket club formed?
Helpful Answer:
[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


{'query': 'Based on the Deep unlearning pdf,when was first cricket club formed?',
 'result': 'The first cricket club, MCC (Marylebone Cricket Club), was formed in 1987.',
 'source_documents': [Document(page_content='THE STORY OF  CRICKET /141\napart”. The world’s first cricket club was for med', metadata={'source': '/content/cricket.pdf', 'page': 2}),
  Document(page_content='Cricket Club (MCC) was founded in 1787.\nDuring the 1760s and 1770s it became common', metadata={'source': '/content/cricket.pdf', 'page': 2}),
  Document(page_content='apart”. The world’s first cricket club was for med\nin Hambledon in the1760s and the Marylebone', metadata={'source': '/content/cricket.pdf', 'page': 2}),
  Document(page_content='THE STORY OF  CRICKET /145\nthe foundation of the\nIndian National Congress\nin 1885, an organisation', metadata={'source': '/content/cricket.pdf', 'page': 6})]}

Ask a follow-up question to see if the model has history of previous queries

In [20]:
# Second question on the same chain, so the prompt's History slot is no longer empty.
qa_chain_with_memory(dict(query="when was first indian cricket stadiim  built ?"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are an assistant for question-answering tasks. Use the following pieces of retrieved context and answer the question at the end.
If you don't know the answer just say you d not know an do not try to make up the answer nor try to use outside souces to answer. Keep the answer as concise as possible.
Context= the country’s first Test captain.
India entered the world of Test cricket in 1932,

(i)India joined the world of Test
cricket before Independence.

THE STORY OF  CRICKET /145
the foundation of the
Indian National Congress
in 1885, an organisation

Parsis founded the first Indian cricket club, the________________
________________
________________
History = [HumanMessage(content='Based on the Deep unlearning pdf,when was first cricket club formed?'), AIMessage(content='The first cricket club, MCC (Marylebone Cricket Club), was formed in 1987.')]
Que

{'query': 'when was first indian cricket stadiim  built ?',
 'result': 'I do not know.',
 'source_documents': [Document(page_content='the country’s first Test captain.\nIndia entered the world of Test cricket in 1932,', metadata={'source': '/content/cricket.pdf', 'page': 6}),
  Document(page_content='(i)India joined the world of Test\ncricket before Independence.', metadata={'source': '/content/cricket.pdf', 'page': 7}),
  Document(page_content='THE STORY OF  CRICKET /145\nthe foundation of the\nIndian National Congress\nin 1885, an organisation', metadata={'source': '/content/cricket.pdf', 'page': 6}),
  Document(page_content='Parsis founded the first Indian cricket club, the________________\n________________\n________________', metadata={'source': '/content/cricket.pdf', 'page': 4})]}

## Gradio Interface for your chatbot

In [21]:
import gradio as gr


In [22]:

# Loaded LLM wrappers keyed by model id, so that the weights are loaded at most
# once per kernel session however many times load_llm() is called.
_LLM_CACHE = {}


def load_llm():
    """Return a LangChain LLM wrapping a DeciLM-7B-instruct text-generation pipeline.

    The tokenizer, the model and the pipeline are created on the first call
    only; later calls return the same wrapper instead of loading the model again.

    Generation uses deterministic beam search (5 beams, no sampling), forbids
    repeating any 4-gram and produces at most 100 new tokens. No temperature is
    passed because it is only used by sampling-based generation.

    Returns:
        HuggingFacePipeline: the LLM object to hand to a LangChain chain.
    """
    model_id = "Deci/DeciLM-7B-instruct"

    cached_llm = _LLM_CACHE.get(model_id)
    if cached_llm is not None:
        return cached_llm

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # trust_remote_code=True lets transformers run the modelling code shipped
    # in the model repo; device_map="auto" leaves weight placement to the library.
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 trust_remote_code=True,
                                                 device_map="auto")
    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    do_sample=False,
                    num_beams=5,
                    no_repeat_ngram_size=4,
                    early_stopping=True,
                    max_new_tokens=100)

    llm = HuggingFacePipeline(pipeline=pipe)
    _LLM_CACHE[model_id] = llm
    return llm

def add_text(history, text):
    """Append the user's question to the chat history as a new, unanswered turn.

    Args:
        history: list of (question, answer) pairs shown in the chatbot so far.
        text: the question typed by the user.

    Returns:
        A new list made of `history` followed by `(text, '')`; the input list
        is not modified.

    Raises:
        gr.Error: if `text` is empty.
    """
    if text:
        # The answer slot starts empty; the bot response fills it in later.
        return history + [(text, '')]
    raise gr.Error('Enter text')

def upload_file(files):
    """Pass the uploaded files straight through, unchanged.

    It is wired as an event handler with the upload button as input and the
    "Your PDFs" file panel as output, so returning the value hands the files
    to that panel.
    """
    return files

def _extract_pdf_text(files):
    """Return the text of every page of every given PDF, joined by newlines."""
    page_texts = []
    for file in files:
        # Accept either an object exposing its path as `.name` (what the
        # original code required) or a plain path.
        pdf = PyPDF2.PdfReader(getattr(file, "name", file))
        for page in pdf.pages:
            # Guard against a page yielding no text at all.
            page_texts.append(page.extract_text() or "")
    # A newline between pages keeps the last word of one page from being glued
    # to the first word of the next.
    return "\n".join(page_texts)


def process_file(files):
    """Build a question-answering chain with memory over the uploaded PDFs.

    The text of every page of every file is extracted, split into chunks of at
    most CHUNK_SIZE characters, embedded with `embeddings` and indexed in a
    FAISS vector store. A RetrievalQA chain is then created on top of that
    store, using the LLM from load_llm(), a custom prompt and a
    ConversationBufferMemory.

    Args:
        files: the uploaded PDF files (objects with a `.name` path, or paths).

    Returns:
        The RetrievalQA chain; call it with {"query": ...} to get an answer.

    Raises:
        gr.Error: if no text could be extracted from the files.
    """
    pdf_text = _extract_pdf_text(files)
    if not pdf_text.strip():
        # Without any text there is nothing to index; fail with a clear message.
        raise gr.Error('No extractable text found in the uploaded PDF(s)')

    # split into smaller chunks
    # The overlap must not exceed the chunk size (the splitter rejects that),
    # so the intended 200 characters are capped at half of CHUNK_SIZE.
    chunk_overlap = min(200, CHUNK_SIZE // 2)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=chunk_overlap)
    splits = text_splitter.create_documents([pdf_text])

    # create a FAISS vector store db: embed the chunks and store them
    vectorstore_db = FAISS.from_documents(splits, embeddings)

    # create a custom prompt
    custom_prompt_template = """You have been given the following documents to answer the user's question.
    If you do not have information from the files given to answer the questions just say I don't have information from the given files to answer. Do not try to make up an answer.
    Context: {context}
    History: {history}
    Question: {question}

    Helpful answer:
    """
    prompt = PromptTemplate(template=custom_prompt_template, input_variables=["question", "context", "history"])

    # set QA chain with memory
    qa_chain_with_memory = RetrievalQA.from_chain_type(llm=load_llm(),
                                                       chain_type='stuff',
                                                       return_source_documents=True,
                                                       retriever=vectorstore_db.as_retriever(),
                                                       chain_type_kwargs={"verbose": True,
                                                                          "prompt": prompt,
                                                                          "memory": ConversationBufferMemory(
                                                                              input_key="question",
                                                                              memory_key="history",
                                                                              return_messages=True)})
    return qa_chain_with_memory

# The most recently built QA chain, keyed by the set of uploaded file paths.
# Reusing it keeps the chain's conversation memory alive between questions and
# avoids re-indexing the PDFs for every question.
# NOTE(review): this cache is module-wide, so concurrent users of the app would
# share one conversation history - acceptable for a single-user demo only.
_QA_CHAIN_CACHE = {}


def generate_bot_response(history, query, btn):
    """Answer `query` from the uploaded PDFs and stream the answer into the chat.

    The QA chain is built by process_file() the first time a given set of files
    is seen and reused afterwards; uploading a different set of files replaces it.

    Args:
        history: chat history whose last entry is the pending (question, '') turn.
        query: the user's question.
        btn: the files currently held by the upload button.

    Yields:
        (history, '') after each character of the answer is appended to the
        last turn; the empty string clears the question box.

    Raises:
        gr.Error: if no file has been uploaded.
    """
    if not btn:
        raise gr.Error(message='Upload a PDF')

    cache_key = tuple(sorted(str(getattr(f, "name", f)) for f in btn))
    qa_chain_with_memory = _QA_CHAIN_CACHE.get(cache_key)
    if qa_chain_with_memory is None:
        # Keep a single chain only, so stale indexes do not pile up in memory.
        _QA_CHAIN_CACHE.clear()
        qa_chain_with_memory = process_file(btn)
        _QA_CHAIN_CACHE[cache_key] = qa_chain_with_memory

    bot_response = qa_chain_with_memory({"query": query})

    # simulate streaming: reveal the answer one character at a time
    user_message, partial_answer = history[-1][0], history[-1][-1]
    for char in bot_response['result']:
        partial_answer += char
        # Replace the whole turn so that immutable (tuple) turns work as well.
        history[-1] = [user_message, partial_answer]
        time.sleep(0.05)
        yield history, ''

# The Gradio interface: a chatbot panel, a panel listing the uploaded PDFs with
# an upload button, a question box and an "Ask" button, wired to the handlers
# defined above.
with gr.Blocks() as demo:
    with gr.Row():
            with gr.Row():
              # Chatbot panel; starts with an empty conversation
              chatbot = gr.Chatbot(label="DeciLM-7B-instruct bot",
                                   value=[],
                                   elem_id='chatbot')
            with gr.Row():
              # Panel showing the uploaded PDFs
              file_output = gr.File(label="Your PDFs")

              with gr.Column():
                # PDF upload button; accepts several .pdf files at once
                btn = gr.UploadButton("📁 Upload a PDF(s)",
                                      file_types=[".pdf"],
                                      file_count="multiple")

    with gr.Column():
        with gr.Column():
          # Input field for the user's question
          txt = gr.Text(show_label=False, placeholder="Enter question")

        with gr.Column():
          # Button that submits the question to the bot
          submit_btn = gr.Button('Ask')

    # Event handler for uploading a PDF: show the uploaded files in the file panel
    btn.upload(fn=upload_file, inputs=[btn], outputs=[file_output])

    # Event handler for submitting a question, as three chained steps:
    #   1. add_text appends the question to the chat history,
    #   2. generate_bot_response streams the answer into the chat and clears
    #      the question box,
    #   3. upload_file refreshes the file panel from the upload button.
    # The steps are chained with .success(), presumably so that each one runs
    # only when the previous one finished without raising - confirm against
    # the Gradio docs.
    submit_btn.click(
        fn= add_text,
        inputs=[chatbot, txt],
        outputs=[chatbot],
        queue=False
        ).success(
          fn=generate_bot_response,
          inputs=[chatbot, txt, btn],
          outputs=[chatbot, txt]
        ).success(
          fn=upload_file,
          inputs=[btn],
          outputs=[file_output]
        )

# Launch the app when this cell is run (see the launch log below).
if __name__ == "__main__":
    demo.launch() # launch app

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://c4122ef7fc70e537ec.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
