# Scrape web pages into a ChromaDB vector database and use it for an OpenAI (GPT-3.5) chatbot that answers questions about them



## Import Libraries 


In [1]:
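# Install the dependencies once (uncomment the lines below if they are missing):
# pip install langchain
# pip install openai
# pip install unstructured
# pip install chromadb
# pip install tiktoken
# pip install beautifulsoup4 requests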

In [2]:
# Import libraries
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader

## Get the text data from a website URL

In [3]:
from bs4 import BeautifulSoup
import requests
import re

# Function for getting the text data from a website url
def get_data(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch. Leading/trailing whitespace is
            stripped before the request is sent.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout a stalled connection can block the cell forever.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url.strip(), timeout=timeout)
    # Fail loudly on error statuses so that an error page is never saved and
    # later embedded as if it were real content.
    response.raise_for_status()
    return response.text

# Function to extract and clean text content
def extract_and_clean_text(html_content):
    """Convert an HTML document into a single line of readable text.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        The text of the page with every run of whitespace collapsed to one
        space and no leading/trailing whitespace.
    """
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop elements whose content is code or fallback markup rather than
    # prose, so that it cannot end up in the text regardless of the installed
    # BeautifulSoup version.
    for non_text_tag in soup(['script', 'style', 'noscript']):
        non_text_tag.decompose()

    # Join the text nodes with a space: without a separator the text of
    # adjacent elements is glued together (e.g. a heading directly followed
    # by a paragraph becomes "HeadingParagraph"). The surplus spaces this
    # introduces are removed by the whitespace clean-up below.
    text_content = soup.get_text(separator=' ')

    # Clean up the text (remove extra whitespace, newline characters, etc.)
    cleaned_text = re.sub(r'\s+', ' ', text_content).strip()

    return cleaned_text

# Function to save the text content to a unique text file
def save_to_file(text_content, filename):
    """Write ``text_content`` to ``filename`` as UTF-8 text.

    The file is opened in write mode, so an existing file with the same name
    is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as output_handle:
        output_handle.write(text_content)

# Main function
def main(url='https://beautyinsider.sg/traditional-chinese-medicine-in-singapore/',
         filename='FullTCMService.txt'):
    """Scrape one web page and store its cleaned text in a text file.

    Args:
        url: Page to download. Defaults to the article used in this notebook.
        filename: Path of the text file to write. It is overwritten if it
            already exists. Defaults to 'FullTCMService.txt'.
    """
    # Get the HTML content from the URL
    html_content = get_data(url)

    # Extract and clean the text content
    cleaned_text = extract_and_clean_text(html_content)

    # Save the cleaned text to a text file
    save_to_file(cleaned_text, filename)

if __name__ == "__main__":
    main()


# Q&A bot with langchain over a directory

In [4]:
# The OpenAI API key must not be written into the notebook: anything stored in
# a cell is visible to everyone who can read the file. The key is taken from
# the OPENAI_API_KEY environment variable, and only asked for interactively
# when that variable is missing or empty.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    # getpass does not echo the typed value, so the key appears neither in the
    # cell source nor in the saved cell output.
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# set up openai api key
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Database creation with ChromaDB

https://www.youtube.com/watch?v=3yPBVii7Ct0

In [5]:
# Directory holding the .txt files to index. It can be overridden with the
# VECTOR_DB_DOCS_DIR environment variable so the notebook is not tied to one
# machine; the original location is kept as the default.
DOCS_DIR = os.environ.get(
    "VECTOR_DB_DOCS_DIR",
    'C:/Users/65917/Desktop/FYP Code/Vector Database',
)

# Load every .txt file found directly in that directory
loader = DirectoryLoader(DOCS_DIR, glob="./*.txt")
doc = loader.load()

# Display the number of loaded documents
len(doc)

27

In [6]:
# Split the loaded documents into overlapping chunks (size and overlap are
# set by the two keyword arguments below).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=75,
)
texts = text_splitter.split_documents(doc)

# Display how many chunks were produced
len(texts)

862

In [7]:
# Build the vector store from the chunks using OpenAI embeddings.
# Because a persist_directory is supplied, the embeddings are stored on disk.
persist_directory = 'db'
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [8]:
# Persist the db to disk
vectordb.persist()
# Drop the in-memory reference; the store is re-created from
# `persist_directory` in the following cell.
vectordb = None

In [9]:
# Re-open the database persisted in `persist_directory` and use it as normal;
# the same embedding object is passed in again as the embedding function.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

# Create retriever

In [10]:
# Wrap the vector store in a retriever, using its default search settings.
retriever = vectordb.as_retriever()

In [11]:
# Sanity check: fetch the documents the retriever returns for a sample question.
# NOTE(review): `docs` is not used again in the cells visible here.
docs = retriever.get_relevant_documents("Which hospitals in Singapore offer TCM service?")

In [12]:
# Replace the retriever with one configured with k=20
# (presumably the number of chunks returned per query; confirm against the Chroma retriever docs).
retriever = vectordb.as_retriever(search_kwargs={"k": 20})

# Create a question answering chain

In [13]:
# Build the question-answering chain on top of the retriever, using the
# default OpenAI LLM. The source documents are returned with each answer
# so that they can be cited.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [14]:
# Modified function to also display publication date
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of every document used.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry (an iterable of objects exposing a
            ``metadata`` mapping that contains a 'source' key).

    For documents whose metadata also has a 'published_date' key, that date
    is printed on the line after the source.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for document in llm_response["source_documents"]:
        details = document.metadata
        print(details['source'])
        # The publication date is optional metadata
        if 'published_date' in details:
            print(f"Published Date: {details['published_date']}")

# Ask a sample question, then print the answer together with its sources.
query = "Which hospitals in Singapore offer TCM service?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 Singapore General Hospital, Tan Tock Seng Hospital, Kwong Wai Shiu Hospital, Singapore Chinese Medical Institution, National University Hospital, Ng Teng Fong General Hospital, Changi General Hospital, Sengkang General Hospital, Jurong Health Services, Khoo Teck Puat Hospital, Raffles Medical.


Sources:
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_Joint5.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_service8.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_service8.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FY

In [15]:
# Break it down
# Same question as in the previous cell, sent through the chain once more.
query = "Which hospitals in Singapore offer TCM service?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Uncomment the next line to also display the raw response object:
#llm_response



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 Singapore General Hospital, Tan Tock Seng Hospital, Kwong Wai Shiu Hospital, Singapore Chinese Medical Institution, National University Hospital, Ng Teng Fong General Hospital, Changi General Hospital, Sengkang General Hospital, Jurong Health Services, Khoo Teck Puat Hospital, Raffles Medical.


Sources:
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_Joint5.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_service8.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_service8.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\FullTCMService.txt
C:\Users\65917\Desktop\FY

In [16]:
# Inspect which search type and which vector store the chain's retriever uses.
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x1ff45a28a90>)

In [17]:
# Show the prompt template the chain fills with the retrieved context and the question.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


# Backing up the DB as a zip archive

In [18]:
# Shell command: recursively pack the ./db directory into db.zip.
!zip -r db.zip ./db

  adding: db/ (164 bytes security) (stored 0%)
  adding: db/74f404ba-5e9f-403f-8ec0-59ce64032473/ (164 bytes security) (stored 0%)
  adding: db/74f404ba-5e9f-403f-8ec0-59ce64032473/data_level0.bin (164 bytes security) (deflated 100%)
  adding: db/74f404ba-5e9f-403f-8ec0-59ce64032473/header.bin (164 bytes security) (deflated 61%)
  adding: db/74f404ba-5e9f-403f-8ec0-59ce64032473/length.bin (164 bytes security) (stored 0%)
  adding: db/74f404ba-5e9f-403f-8ec0-59ce64032473/link_lists.bin (164 bytes security) (stored 0%)
  adding: db/chroma.sqlite3 (164 bytes security) (deflated 38%)


# Starting again loading the db

In [19]:
# Start again from the files on disk: open the persisted database with a
# fresh embeddings object and build a retriever (k=20) on top of it.
persist_directory = 'db'
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

retriever = vectordb2.as_retriever(
    search_kwargs={"k": 20},
)

#### Using the GPT-3.5 Turbo API

In [20]:
# Chat model used for the second chain: gpt-3.5-turbo with temperature 0
turbo_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [21]:
# Rebuild the question-answering chain, this time on top of the turbo chat
# model and the retriever created from the reloaded database.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [22]:
# Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer, then the 'source' of each source document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry (an iterable of objects exposing a
            ``metadata`` mapping that contains a 'source' key).
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # The generator is lazy, so each source is looked up right before it is printed
    source_paths = (document.metadata['source']
                    for document in llm_response["source_documents"])
    for path in source_paths:
        print(path)

In [23]:
# Question
# Run the query through the turbo-based chain, then print the answer and its sources.
query = "Do I still need to wear a mask on the bus or MRT?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
From 13 Feb 2023, mask-wearing on public transport will no longer be compulsory. However, commuters, especially the elderly and immunocompromised, as well as those unwell with symptoms of COVID-19 or other respiratory infections, are encouraged to continue wearing masks on their public transport journeys.


Sources:
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_Joint7.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\WEARMASK.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\TCM_Joint7.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\WEARMASK.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\social_distance_requirement.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\wear_mask_hospital.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\social_distance_requirement.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\social_distance_requirement.txt
C:\Users\65917\Desktop\FYP Code\Vector Database\so

In [24]:
# Show the template of the first message (index 0) of the chain's chat prompt.
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [25]:
# Show the template of the second message (index 1) of the chain's chat prompt.
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}


## Evaluation 

In [26]:
# Install the bert-score package for the evaluation section.
# NOTE(review): no version is pinned, and the next cell installs bert_score
# again from GitHub; one of the two installs is probably enough.
pip install bert-score

Note: you may need to restart the kernel to use updated packages.


In [27]:
# Install bert_score directly from its GitHub repository.
# NOTE(review): no commit or tag is pinned, so the installed revision can change between runs.
pip install git+https://github.com/Tiiiger/bert_score

Collecting git+https://github.com/Tiiiger/bert_score
  Cloning https://github.com/Tiiiger/bert_score to c:\users\65917\appdata\local\temp\pip-req-build-larxf21j
  Resolved https://github.com/Tiiiger/bert_score to commit dbcf6db37e8bd6ff68446f06b0ba5d0763b62d20
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/Tiiiger/bert_score 'C:\Users\65917\AppData\Local\Temp\pip-req-build-larxf21j'
