In [None]:
# Overview of the steps performed by the code below
# 1. Install necessary packages.
#    - huggingface/transformers: For accessing and using pre-trained language models.
#    - datasets: For loading and handling datasets.
#    - loralib: Reference implementation of LoRA (low-rank adaptation) layers.
#    - sentencepiece: For tokenization.
#    - bitsandbytes: For quantization methods in neural networks.
#    - accelerate: For high-performance training and inference.
#    - xformers: For transformer models and related utilities.
#    - langchain: Framework for building LLM applications (prompts, chains, memory, retrievers).
#    - gradio: For creating UI for the chat bot.
#    - peft: Hugging Face library for parameter-efficient fine-tuning (e.g. LoRA adapters).
#    - chromadb: For creating vector stores.
#    - unstructured: For extracting text from unstructured documents (PDF, HTML, ...).
#    - sentence_transformers: For generating sentence embeddings.
#    - pypdf: For working with PDF documents.
#    - nvidia-smi (system tool, not a pip package): For checking GPU status.

# 2. Login to Hugging Face Hub.
#    - huggingface_hub: For accessing models and resources from the Hugging Face model hub.

# 3. Import required libraries.
#    - torch: For tensor operations.
#    - PeftModel, PeftConfig: From peft, for loading parameter-efficient adapters (imported, but not used in the cells shown).
#    - AutoModelForCausalLM, AutoTokenizer: For loading pre-trained language models and tokenization.
#    - pipeline: For creating pipelines for text generation.
#    - HuggingFaceEmbeddings, MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter, PyPDFLoader:
#      From langchain library for language processing tasks.
#    - gradio: For creating UI components.

# 4. Set up model configuration and load pre-trained conversational model with quantization enabled.
#    - BitsAndBytesConfig: Configuration for quantization method.
#    - AutoModelForCausalLM, AutoTokenizer: For loading pre-trained conversational model and tokenizer.
#    - model_id: Identifier for pre-trained model.
#    - quantization_config: Configuration for model quantization.
#    - tokenizer: Tokenizer for pre-trained model.
#    - model: Pre-trained conversational model.

# 5. Define functions to generate prompts combining system and user instructions.
#    - get_prompt(): Function to generate prompt templates.

# 6. Set up language processing libraries and load PDF documents.
#    - HuggingFaceEmbeddings, MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter, PyPDFLoader:
#      From langchain library for language processing tasks.
#    - loader: Load PDF documents.
#    - text_splitter: Split text into chunks.
#    - db: Create vector store from loaded documents.

# 7. Create text generation pipeline.
#    - create_pipeline(): Function to create text generation pipeline.

# 8. Define class for chat bot handling memory, prompt, and retrieval.
#    - uniBot: Class for handling chat bot functionality.

# 9. Create chat bot object and initialize conversational retrieval chain.
#    - uni_bot: Object of uniBot class.
#    - bot: Chat bot object.

# 10. Define Gradio UI components for user interaction.
#     - Gradio components: Textbox, Chatbot, Button.

# 11. Define functions for responding to user inputs, clearing memory, and updating system prompt.
#     - Functions: respond(), clear_llm_memory(), update_prompt().

# 12. Launch the UI with debugging enabled.
#     - demo.launch(): Launch the Gradio UI.



In [None]:
# @title
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate xformers
!pip -q install langchain
!pip -q install gradio

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━

In [None]:
!pip -q install peft chromadb
!pip -q install unstructured
!pip install -q sentence_transformers
!pip -q install pypdf

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/199.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/199.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [None]:
!nvidia-smi

Sat May 11 19:11:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               8W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from huggingface_hub import login
# Interactive Hugging Face Hub login; in the notebook this renders a widget
# (see the VBox output below). Presumably needed because the model loaded
# later is a gated repository — confirm.
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

In [None]:
# Choose the 4-bit compute dtype from the GPU that is actually present.
# bfloat16 is only natively supported from compute capability 8.0 (Ampere)
# upwards; older cards such as the Tesla T4 (7.5) reported by nvidia-smi in
# this notebook fall back to float16.
_has_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                # keep the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",        # NF4 quantisation data type
    bnb_4bit_compute_dtype=torch.bfloat16 if _has_native_bf16 else torch.float16,
    bnb_4bit_use_double_quant=False,  # no second quantisation pass
)

In [None]:
# Hugging Face Hub repository id of the chat model that is downloaded and loaded below.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# Tokenizer that belongs to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Causal LM loaded with the quantisation settings from `bnb_config`;
# the {"": 0} device map assigns the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
import json
import textwrap

# Markers that delimit the instruction and the system prompt in the
# prompt string assembled by get_prompt().
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# System prompt used when the caller does not pass one of their own.
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Assemble a single prompt string from a system prompt and an instruction.

    The pieces are concatenated without any extra separators, in this order:
    B_INST, B_SYS, the system prompt, E_SYS, the instruction, E_INST.

    Args:
        instruction: Instruction text; may contain template placeholders
            such as ``{context}`` / ``{question}``, which are left untouched.
        new_system_prompt: System prompt to embed; defaults to
            DEFAULT_SYSTEM_PROMPT.

    Returns:
        The assembled prompt string.
    """
    pieces = (B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

In [None]:
# PDF loader for the source document. The path is an absolute /content/... path,
# so the file is presumably uploaded to the Colab runtime beforehand — confirm.
loader = PyPDFLoader("/content/The master examination consists of module and sub.pdf")

In [None]:
# Splitter for the PDF text: target chunk size of 500 with an overlap of 20
# between neighbouring chunks, both measured with len() (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20, length_function=len)

In [None]:
# Load the PDF and split it into chunks with `text_splitter`.
pages = loader.load_and_split(text_splitter)

In [None]:
# Embed the chunks with a default-constructed HuggingFaceEmbeddings and index
# them in a Chroma vector store persisted under /content/db.
# NOTE(review): re-running this cell against an existing /content/db presumably
# adds the same chunks a second time — confirm, and clear the directory if so.
db = Chroma.from_documents(pages, HuggingFaceEmbeddings(), persist_directory = '/content/db')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Instruction part of the prompt; {context} and {question} are placeholders
# that are filled in later by the prompt template.
instruction = (
    "Given the context that has been provided. \n"
    " {context}, Answer the following question - \n"
    "{question}"
)

# System prompt for the university assistant, one sentence group per line.
system_prompt = "\n".join([
    "You are an expert in university information.",
    "You will be given a context to answer from. Be precise in your answers wherever possible.",
    "In case you are sure you don't know the answer then you say that based on the context you don't know the answer.",
    "In all other instances you provide an answer to the best of your capability. Cite urls when you can access them related to the context.",
])

# Preview the assembled prompt as the cell output.
get_prompt(instruction, system_prompt)

"[INST]<<SYS>>\nYou are an expert in university information.\nYou will be given a context to answer from. Be precise in your answers wherever possible.\nIn case you are sure you don't know the answer then you say that based on the context you don't know the answer.\nIn all other instances you provide an answer to the best of your capability. Cite urls when you can access them related to the context.\n<</SYS>>\n\nGiven the context that has been provided. \n {context}, Answer the following question - \n{question}[/INST]"

In [None]:
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory

In [None]:
# Assemble the full prompt text and show it for inspection.
template = get_prompt(instruction, system_prompt)
print(template)

# Wrap the text in a PromptTemplate that declares the two placeholders
# contained in `instruction`.
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

[INST]<<SYS>>
You are an expert in university information.
You will be given a context to answer from. Be precise in your answers wherever possible.
In case you are sure you don't know the answer then you say that based on the context you don't know the answer.
In all other instances you provide an answer to the best of your capability. Cite urls when you can access them related to the context.
<</SYS>>

Given the context that has been provided. 
 {context}, Answer the following question - 
{question}[/INST]


In [None]:

# Windowed conversation memory (k=5) stored under the "chat_history" key and
# returned as message objects rather than a single string.
memory = ConversationBufferWindowMemory(k=5, memory_key="chat_history", return_messages=True)

In [None]:
# Expose the Chroma store as a retriever, using as_retriever()'s default settings.
retriever = db.as_retriever()

In [None]:
def create_pipeline(max_new_tokens=512):
    """Build the text-generation pipeline that backs the chat bot.

    Uses the module-level ``model`` and ``tokenizer``.

    Decoding is made deterministic with ``do_sample=False`` (greedy
    decoding). A temperature of 0 is not a valid way to ask for this:
    transformers requires a strictly positive temperature when sampling
    and ignores the value when sampling is off.

    Args:
        max_new_tokens: Upper bound on the number of tokens generated per call.

    Returns:
        The object returned by ``transformers.pipeline("text-generation", ...)``.
    """
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

In [None]:
class uniBot:
  """Holds the memory, prompt and retriever used to build the retrieval chat chain."""

  def __init__(self, memory, prompt, task:str = "text-generation", retriever = None):
    """Store the building blocks of the chat bot.

    Args:
      memory: Conversation memory object handed to the chain.
      prompt: Prompt template used when combining retrieved documents.
      task: Accepted for backward compatibility; not used by this class.
      retriever: Document retriever. When omitted (``None``), the
        module-level ``retriever`` is looked up at construction time.
    """
    self.memory = memory
    self.prompt = prompt
    # Resolve the default here rather than in the signature: a default of
    # `retriever = retriever` is evaluated once, when the class is defined,
    # and would keep pointing at an outdated retriever after a rebuild.
    if retriever is None:
      retriever = globals()["retriever"]
    self.retriever = retriever

  def create_chat_bot(self, max_new_tokens = 512):
    """Create the conversational retrieval chain.

    Args:
      max_new_tokens: Generation limit forwarded to ``create_pipeline``.

    Returns:
      The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    hf_pipe = create_pipeline(max_new_tokens)
    llm = HuggingFacePipeline(pipeline=hf_pipe)
    return ConversationalRetrievalChain.from_llm(
      llm=llm,
      retriever=self.retriever,
      memory=self.memory,
      combine_docs_chain_kwargs={"prompt": self.prompt},
    )

In [None]:
# Bundle the conversation memory and prompt template; the retriever argument is left at its default.
uni_bot = uniBot(memory = memory, prompt = prompt)

In [None]:
# Build the conversational retrieval chain (default max_new_tokens) used by the UI callbacks below.
bot = uni_bot.create_chat_bot()

In [None]:
import gradio as gr
import random
import time

def clear_llm_memory():
  """Reset the conversation memory attached to the module-level ``bot``."""
  conversation_memory = bot.memory
  conversation_memory.clear()

def update_prompt(sys_prompt):
  """Swap the system prompt used by the running chat bot.

  Rebuilds the prompt template from the module-level ``instruction`` and the
  given system prompt, then installs it on the chain's document-combining
  LLM chain.

  Args:
    sys_prompt: New system prompt text. ``None``, an empty string or a
      whitespace-only string selects the module-level ``system_prompt``.
  """
  # Treat any blank input (not only "") as "use the default", so that a stray
  # space in the textbox does not install an empty system prompt.
  if sys_prompt is None or not sys_prompt.strip():
    sys_prompt = system_prompt
  template = get_prompt(instruction, sys_prompt)

  bot.combine_docs_chain.llm_chain.prompt = PromptTemplate(
    template=template, input_variables=["context", "question"]
  )

In [None]:
with gr.Blocks() as demo:
    # Widgets, created in the order they are laid out on the page.
    update_sys_prompt = gr.Textbox(label="Update System Prompt")
    chatbot = gr.Chatbot(label="Uni Bot", height=300)
    msg = gr.Textbox(label="Question")
    # Clears the question box and the visible chat only; the LLM memory has
    # its own button below.
    clear = gr.ClearButton([msg, chatbot])
    clear_memory = gr.Button(value="Clear LLM Memory")

    def respond(message, chat_history):
        """Answer one question and append the exchange to the chat history.

        Returns an empty string (which blanks the question box) together
        with the updated history for the chatbot widget.
        """
        answer = bot({"question": message})["answer"]
        chat_history.append((message, answer))
        return "", chat_history

    # Event wiring.
    update_sys_prompt.submit(update_prompt, inputs=update_sys_prompt)
    clear_memory.click(clear_llm_memory)
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(share=False, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Keyboard interruption in main thread... closing server.


