## **How to create a chatbot with Private Knowledge-base with RAG**

### **What is it?**

* **The chatbot can answer questions related to a particular document, specific business, product or domain**

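* **Unlike a general-purpose GPT model, a personal chatbot is not retrained; it uses RAG (Retrieval-Augmented Generation) to answer from your own documents**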

### **How this thing can be done**

* **The user should be allowed to upload a document**
  * **System should be able to read the document**
-----------------------
* **Stem and Split all the data**
* **Each chunk will be converted to a numerical representation**

In [None]:
!pip install PyPDF2



In [None]:
import os
import nltk
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup as bs
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize

In [None]:
# Fetch the NLTK data packages at notebook start-up; presumably these back
# sent_tokenize, which process_text calls (confirm against the NLTK version in use).
nltk.download("punkt")

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Default upper bound, in characters, on one text segment built by process_text.
segments_SIZE = 99999
# Default number of best-matching segments returned by find_best_matches.
NUMBER_OF_MATCHES = 3

In [None]:
# Shared Porter stemmer instance; process_text uses it to stem every word.
ps = PorterStemmer()

### **This function will help us pre-process**

In [None]:
def process_text(txt, segments_size = segments_SIZE):
  """Split text into sentence-aligned segments and build a stemmed copy of each.

  Sentences are accumulated into the current segment until adding the next
  sentence (plus the joining space) would push it past ``segments_size``
  characters; the segment is then closed and a new one starts with that
  sentence.

  Args:
    txt: Raw text to split.
    segments_size: Maximum segment length in characters. A single sentence
      longer than this is kept whole in a segment of its own.

  Returns:
    A tuple ``(original_text, processed_text)`` of two equal-length lists:
    the segments as read, and the same segments with every
    whitespace-separated word run through the Porter stemmer.
  """
  original_text = []
  processed_text = []

  def close_segment(segment):
    # Keep both lists index-aligned: position i is the same segment in each.
    original_text.append(segment)
    processed_text.append(" ".join(ps.stem(word) for word in segment.split()))

  segment = ""
  for sentence in sent_tokenize(txt):
    if not segment:
      # Starting a fresh segment: never emit an empty one, even when this
      # first sentence is already longer than the limit.
      segment = sentence
    elif len(segment) + 1 + len(sentence) > segments_size:
      close_segment(segment)
      segment = sentence
    else:
      segment = f"{segment} {sentence}"

  # Whatever is left after the last sentence forms the final segment.
  if segment:
    close_segment(segment)

  return original_text, processed_text

### **Load the PDF**

In [None]:
def read_pdf(file_path):
  """Extract the text of every page of a PDF and segment it.

  Args:
    file_path: Path of the PDF file to read.

  Returns:
    The ``(original_text, processed_text)`` tuple produced by process_text.
  """
  with open(file_path, "rb") as file:
    reader = PdfReader(file)
    # Extraction has to happen while the file is still open.
    # `or ""` guards against a page that yields no text (assumption: some
    # PyPDF2 versions may return None for such pages — confirm).
    page_texts = [page.extract_text() or "" for page in reader.pages]
  # Join pages with a newline so the last word of one page is not fused with
  # the first word of the next.
  return process_text("\n".join(page_texts))

### **Read the HTML**

In [None]:
def read_HTML(file_path):
  """Extract the visible text of an HTML file and segment it.

  Args:
    file_path: Path of the HTML file to read.

  Returns:
    The ``(original_text, processed_text)`` tuple produced by process_text.
  """
  # Open in binary mode so BeautifulSoup decodes the bytes itself instead of
  # relying on the platform's default text encoding.
  with open(file_path, "rb") as file:
    soupFile = bs(file, "html.parser")
  text = soupFile.get_text()
  return process_text(text)

### **Read the Text File**

In [None]:
def read_text(file_path):
  """Read a plain-text file and segment it.

  Args:
    file_path: Path of the text file to read.

  Returns:
    The ``(original_text, processed_text)`` tuple produced by process_text.
  """
  # Decode explicitly as UTF-8 rather than the locale default; undecodable
  # bytes are replaced so one stray byte does not abort the whole upload.
  with open(file_path, "r", encoding="utf-8", errors="replace") as file:
    text = file.read()
  return process_text(text)

## **Processing the Content**

### **Finding the best matches (similarity)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Single shared TF-IDF vectorizer: add_document fits it on the stored segments,
# and find_best_matches uses it to transform the query.
vectorizer = TfidfVectorizer()

In [None]:
# Stemmed segments; this is the corpus the vectorizer is fitted on.
documents = []

# Unstemmed segments, extended alongside `documents` by process_and_add_document.
original_docs = []

# TF-IDF matrix for `documents`; None until a document has been added.
vectors = None

In [None]:
def add_document(text):
  """Add stemmed segments to the corpus and rebuild the TF-IDF matrix.

  Args:
    text: List of stemmed segment strings to append to ``documents``.

  Returns:
    The TF-IDF matrix fitted on every segment currently in ``documents``.
    The same matrix is also stored in the module-level ``vectors``.
  """
  # Without this declaration the assignment below creates a local variable
  # and the module-level `vectors` read by find_best_matches is never updated
  # unless every caller remembers to re-assign the return value.
  global vectors
  documents.extend(text)
  # Refit on the whole corpus: the vocabulary and weights depend on all segments.
  vectors = vectorizer.fit_transform(documents)
  return vectors

In [None]:
def process_and_add_document(file_path, file_type):
  """Read a file with the reader for its type and add it to the index.

  Args:
    file_path: Path of the file to load.
    file_type: One of "pdf", "html"/"htm" or "text"/"txt". Matching is
      case-insensitive and a leading dot is ignored.

  Returns:
    The TF-IDF matrix returned by add_document.

  Raises:
    ValueError: If ``file_type`` is not one of the supported types.
  """
  # Normalise so that raw extensions such as "TXT" or ".pdf" are accepted;
  # initialize() passes the bare file extension, which is "txt" for .txt files.
  kind = file_type.lower().lstrip(".")

  if kind == "pdf":
    original_data, processed_text = read_pdf(file_path=file_path)
  elif kind in ("html", "htm"):
    original_data, processed_text = read_HTML(file_path=file_path)
  elif kind in ("text", "txt"):
    original_data, processed_text = read_text(file_path=file_path)
  else:
    raise ValueError("Unsupported File Format recieved! Please pass relevant File format!!")

  # Unstemmed segments are kept index-aligned with the stemmed ones that
  # add_document appends to `documents`.
  original_docs.extend(original_data)
  return add_document(processed_text)

In [None]:
def find_best_matches(query, top_n = NUMBER_OF_MATCHES):
  """Return the indexed segments most similar to a query.

  The query is stemmed the same way as the documents, transformed with the
  fitted vectorizer, and scored against every stored segment by the product
  of the TF-IDF vectors.

  Args:
    query: The user's question.
    top_n: Maximum number of segments to return.

  Returns:
    A tuple ``(original_segments, stemmed_segments)`` ordered from best to
    worst match. Both lists are empty when the query contains no text.

  Raises:
    RuntimeError: If no document has been indexed yet.
  """
  if vectors is None:
    raise RuntimeError("No document has been indexed yet; load a file before asking questions.")

  # process_text may split a long query into several segments; join them so
  # the whole query is scored instead of only its first segment.
  stemmed_query = " ".join(process_text(query)[1])
  if not stemmed_query:
    # A blank query yields no vector row to score.
    return [], []

  query_vector = vectorizer.transform([stemmed_query])
  similarity = (query_vector * vectors.T).toarray()[0]

  # argsort is ascending: take the last top_n indexes and reverse them.
  best_match_indexes = similarity.argsort()[-top_n:][::-1]

  return [original_docs[i] for i in best_match_indexes], [documents[i] for i in best_match_indexes]

## **Constructing the Prompt for LLM**

### **Engineering a Prompt**

In [None]:
!pip install cohere



In [None]:
import cohere
from google.colab import userdata

In [None]:
# Cohere client used by get_resp; the API key is looked up under the name
# "CohereKey" via google.colab's userdata.
co = cohere.ClientV2(api_key=userdata.get("CohereKey"))

In [None]:
def get_resp(query, context):
  """Ask the Cohere chat model to answer a query from the given context.

  Args:
    query: The user's question, sent as the user message.
    context: Retrieved text, sent as a second system message.

  Returns:
    The text of the first content item of the model's reply, stripped of
    surrounding whitespace.
  """
  instructions = "You are a AI assistant. Use the provided context to answer the user's query accurately and precisely. Try to keep answer concise."
  conversation = [
      {"role": "system", "content": instructions},
      {"role": "system", "content": context},
      {"role": "user", "content": query},
  ]

  reply = co.chat(model="command-r-plus-08-2024", messages=conversation)

  first_part = reply.message.content[0]
  return first_part.text.strip()

## **Put all this together**

In [None]:
def reset_database():
  """Discard every indexed segment and the TF-IDF matrix built from them."""
  global documents, original_docs, vectors
  # Rebind to fresh objects instead of clearing the old lists in place.
  documents, original_docs, vectors = [], [], None

In [None]:
def initialize(file_name):
  """Index a file, choosing the reader from the file's extension.

  Args:
    file_name: Path of the file to load. The extension is matched
      case-insensitively; ".txt" is treated as type "text" and ".htm" as
      "html".

  Returns:
    The TF-IDF matrix returned by process_and_add_document.
  """
  # splitext looks only at the final path component, so a dot in a directory
  # name cannot be mistaken for the extension.
  extension = os.path.splitext(file_name)[1].lstrip(".").lower()
  # Map extensions onto the type names process_and_add_document expects;
  # anything else is passed through unchanged for it to accept or reject.
  file_type = {"txt": "text", "htm": "html"}.get(extension, extension)
  return process_and_add_document(file_path=file_name, file_type=file_type)

In [None]:
def chat(user_query, is_debug = False):
  """Answer a query using the best-matching indexed segments as context.

  Args:
    user_query: The user's question.
    is_debug: When true, print the retrieved context before calling the model.

  Returns:
    The model's answer as returned by get_resp.
  """
  # Only the unstemmed segments are shown to the model; the stemmed copies
  # returned alongside them are not needed here.
  matched_originals, _ = find_best_matches(user_query)
  context = "\n\n".join(matched_originals)

  if is_debug:
    print(f"Context: {context}")

  return get_resp(user_query, context)

## **Test**

In [None]:
import requests

def download_files():
  """Download the sample documents into the working directory.

  Returns:
    The list of local file names that were written, in download order.

  Raises:
    requests.RequestException: If a download times out or the server answers
      with an HTTP error status.
  """
  samples_files = [
      {
          "url" : "https://www.ipcc.ch/report/ar6/wg1/downloads/outreach/IPCC_AR6_WGI_SummaryForAll.pdf",
          "file_name":"climateChange.pdf"
      },
      {
          "url":"https://medium.com/illumination/i-tried-10-decaf-coffees-as-a-first-time-coffee-drinker-heres-what-i-found-a8c5fb93a40e",
          "file_name": "coffee.html"
      }
  ]

  for sample in samples_files:
    # A timeout keeps an unresponsive server from hanging the notebook.
    resp = requests.get(sample["url"], timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as the document.
    resp.raise_for_status()
    with open(sample["file_name"], "wb") as f:
      f.write(resp.content)

  return [sample["file_name"] for sample in samples_files]

In [None]:
# Fetch the sample documents and list the local file names that were written.
files_names = download_files()

for file_name in files_names:
  print(file_name)

climateChange.pdf
coffee.html


In [None]:
# Start from an empty index before loading the first sample.
reset_database()

In [None]:
# Index the PDF and keep the returned matrix in the module-level `vectors`,
# which find_best_matches reads.
vectors = initialize("climateChange.pdf")

In [None]:
# Ask one question against the indexed PDF.
resp = chat("Who are the authors of the report")

In [None]:
# Show the model's answer.
print(resp)

The report is written by members of the Working Group I Technical Support Unit (WGI TSU) and several authors of the report. The authors are:

- Sarah Connors (WGI TSU)
- Sophie Berger (WGI TSU)
- Clotilde Péan (WGI TSU)
- Govindasamy Bala (Chapter 4 author)
- Nada Caud (WGI TSU)
- Deliang Chen (Chapter 1 author)
- Tamsin Edwards (Chapter 9 author)
- Sandro Fuzzi (Chapter 6 author)
- Thian Yew Gan (Chapter 8 author)
- Melissa Gomis (WGI TSU)
- Ed Hawkins (Chapter 1 author)
- Richard Jones (Atlas Chapter author)
- Robert Kopp (Chapter 9 author)
- Katherine Leitzell (WGI TSU)
- Elisabeth Lonnoy (WGI TSU)
- Douglas Maraun (Chapter 10 author)
- Valérie Masson-Delmotte (WGI Co-Chair)
- Tom Maycock (WGI TSU)
- Anna Pirani (WGI TSU)
- Roshanka Ranasinghe (Chapter 12 author)
- Joeri Rogelj (Chapter 5 author)
- Alex C. Ruane (Chapter 12 author)
- Sophie Szopa (Chapter 6 author)
- Panmao Zhai (WGI Co-Chair)


In [None]:
# Replace the index with the HTML sample, then chat until the user quits.
reset_database()
vectors = initialize("coffee.html")

while True:
  # Strip surrounding whitespace so " quit " is still recognised.
  user_query = input("Hi, Please ask! (type 'quit' or 'exit' to stop): ").strip()
  if not user_query:
    # A blank line has nothing to retrieve for; prompt again instead of
    # passing an empty query to chat().
    continue
  if user_query.lower() in ["quit", "exit"]:
    print("Thanks, Hope it helped you!. (PrivateAI left the conversation)..")
    break

  print("========================================")
  print(f"User: \"{user_query}\"")
  resp = chat(user_query)
  print("PrivateAI: ", resp, flush = True)

Hi, Please ask! (type 'quit' or 'exit' to stop): What is main idea in the document
User: "What is main idea in the document"
PrivateAI:  The author, Kory Becker, shares their experience as a first-time coffee drinker, trying out 10 different decaf coffee brands. They provide a personal review of these coffees based on taste, experience, and price, offering insights into their preferences as a newcomer to the world of coffee.
Hi, Please ask! (type 'quit' or 'exit' to stop): Who is author
User: "Who is author"
PrivateAI:  The author of the article is Kory Becker.
Hi, Please ask! (type 'quit' or 'exit' to stop): quit
Thanks, Hope it helped you!. (PrivateAI left the conversation)..


In [None]:
# prompt: Create a interface that help to upload file and then start the conversation with chatbot

import ipywidgets as widgets
from IPython.display import display, clear_output

# ... (Your existing code) ...

# File picker; the chosen file is handled by on_file_upload, which writes it to
# disk and passes its name to initialize().
uploader = widgets.FileUpload(
    accept='.pdf,.html,.txt',  # Extensions offered in the browser's file dialog
    multiple=False  # Allow only one file at a time
)

# Single-line text box for the user's question; submitted via on_submit.
text_input = widgets.Text(placeholder='Ask your question here...')

# Output area that both handlers print into.
output = widgets.Output()

# Function to handle file upload
def on_file_upload(change):
  """Save the uploaded file to disk and rebuild the index from it.

  Args:
    change: The traitlets change dict for the uploader's ``value``;
      ``change['new']`` holds the uploaded file(s).
  """
  global vectors
  with output:
    clear_output()  # Clear previous output
    uploads = change['new']
    if not uploads:
      # The value is also observed when it becomes empty; nothing to load.
      return

    if isinstance(uploads, dict):
      # ipywidgets 7 layout: {filename: {'metadata': {...}, 'content': bytes}}
      uploaded_file = list(uploads.values())[0]
      file_name = uploaded_file['metadata']['name']
    else:
      # ipywidgets 8 layout: a sequence of {'name': ..., 'content': ..., ...}
      uploaded_file = uploads[0]
      file_name = uploaded_file['name']

    # The name comes from the browser; keep only its final component so the
    # file is always written into the working directory.
    file_name = os.path.basename(file_name)

    try:
      with open(file_name, 'wb') as f:
        f.write(uploaded_file['content'])

      reset_database()
      vectors = initialize(file_name)
      print(f"File '{file_name}' uploaded successfully.")
      print("Ready for your questions!")
    except Exception as e:
      # Notebook UI boundary: report the failure in the output area rather
      # than losing it inside the widget callback.
      print(f"Error processing the uploaded file: {e}")


# Function to handle user queries
def on_submit(change):
  """Answer the question typed into the text box and print it to the output.

  Args:
    change: The Text widget that was submitted (ipywidgets passes the widget
      itself to on_submit callbacks).
  """
  with output:
    clear_output(wait=True)  # Clear output and wait for new output
    # Read from the widget that fired the event, not the global `text_input`:
    # that name is rebound to a Gradio Textbox later in the notebook.
    user_query = change.value.strip()
    change.value = ''  # Clear the input field after submission

    if not user_query:
      # Nothing was typed; there is nothing to retrieve for.
      return

    if user_query.lower() in ["quit", "exit"]:
      print("Thanks, Hope it helped you!. (PrivateAI left the conversation)..")
      return

    print(f"User: \"{user_query}\"")
    resp = chat(user_query)
    print("PrivateAI: ", resp, flush=True)

# Attach event handlers: a change of the uploader's `value` triggers
# on_file_upload; submitting the text box triggers on_submit.
uploader.observe(on_file_upload, names='value')
text_input.on_submit(on_submit)

# Display the widgets in order: file picker, question box, output area.
display(uploader)
display(text_input)
display(output)


FileUpload(value={}, accept='.pdf,.html,.txt', description='Upload')

Text(value='', placeholder='Ask your question here...')

Output()

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradi

In [None]:
import gradio as gr

# Assuming reset_database(), initialize(file_path), chat(query) are defined elsewhere

def process_file_and_chat(file_obj, query):
    """Index the uploaded file and, if a question was typed, answer it.

    Args:
        file_obj: Value of the Gradio File component: an object with a
            ``name`` attribute holding the path, a plain path string, or
            None when nothing was uploaded.
        query: Text of the question box; may be empty.

    Returns:
        A tuple ``(status, history)``. ``status`` is a message for the status
        box; ``history`` is a list of ``(user, bot)`` pairs for the Chatbot
        component, in the same shape ``respond`` appends.
    """
    # The matrix must land in the module-level `vectors` that retrieval reads.
    global vectors

    if file_obj is None:
        return "Please upload a file first.", []

    # Accept either an object exposing `.name` or a bare path string.
    file_path = getattr(file_obj, "name", file_obj)
    try:
        reset_database()
        vectors = initialize(file_path)

        question = (query or "").strip()
        if not question:
            # The file is indexed; with no question there is nothing to answer yet.
            return "File processed. Ready for questions!", []

        response = chat(question)
        # The Chatbot output expects a message list, not a bare string.
        return "File processed. Ready for questions!", [(question, response)]
    except Exception as e:
        # UI boundary: surface the failure in the status box.
        return f"Error processing file: {e}", []

def respond(message, chat_history):
    """Answer one chat message and append the exchange to the history.

    Args:
        message: Text submitted from the question box.
        chat_history: Current list of ``(user, bot)`` pairs, or None.

    Returns:
        A tuple ``("", chat_history)``: the empty string clears the text box
        and the list becomes the Chatbot's new value.
    """
    if chat_history is None:
        chat_history = []

    if not message or not message.strip():
        # Nothing to retrieve for; leave the conversation unchanged.
        return "", chat_history

    bot_message = chat(message)
    chat_history.append((message, bot_message))
    return "", chat_history

# Layout: upload + question side by side, then a button, a status box and the
# chat transcript.
with gr.Blocks() as demo:
    gr.Markdown("# File Upload and Chatbot Interface")
    with gr.Row():
        file_input = gr.File(label="Upload File", file_types=[".pdf", ".html", ".txt"])
        # NOTE: this rebinds the module-level name `text_input`, which the
        # ipywidgets cell earlier in the notebook also assigns.
        text_input = gr.Textbox(label="Enter your question")
    process_button = gr.Button("Process File and Start Chat")
    output_message = gr.Textbox(label="Status")
    chatbot = gr.Chatbot(label="Chatbot")
    clear = gr.ClearButton([text_input, chatbot])

    # Button: index the uploaded file; the two return values of
    # process_file_and_chat go to the status box and the Chatbot respectively.
    process_button.click(
        fn=process_file_and_chat,
        inputs=[file_input, text_input],
        outputs=[output_message, chatbot]
    )

    # Submitting the text box: respond() returns the new text box value and
    # the updated chat history.
    text_input.submit(respond, [text_input, chatbot], [text_input, chatbot])

demo.launch()

  chatbot = gr.Chatbot(label="Chatbot")


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://146e28f722bbb37fea.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


