In [4]:
# !pip install -U langchain langchain-community langchainhub bs4

In [None]:
import os
import bs4
# from getpass import getpass
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS, Chroma
import util

: 

In [None]:
# HUGGINGFACEHUB_API_TOKEN = getpass()
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

: 

In [None]:
web_path = ["https://ucsd.libguides.com/gis/gisdata",
    #       "https://ucsd.libguides.com/az.php"
            ]

: 

In [5]:
config = util.load_config('config/config.yaml')
os.environ["HUGGINGFACEHUB_API_TOKEN"] = config['HUGGINGFACEHUB_API_TOKEN']
config

{'HUGGINGFACEHUB_API_TOKEN': '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',
 'emb_model_name': 'sentence-transformers/all-MiniLM-l6-v2',
 'llm_repo_id': 'HuggingFaceH4/zephyr-7b-beta',
 'llm_task': 'text-generation',
 'response_max_length': 1000}

In [6]:
def load_webpage(web_path, use_strainer=False, strainer_class=None):
    loader = None
    if use_strainer:
        bs4_strainer = bs4.SoupStrainer(class_=(strainer_class))
        loader = WebBaseLoader(
            web_paths=(web_path),
            bs_kwargs={"parse_only": bs4_strainer},
        )
    else: 
        loader = WebBaseLoader(web_paths=(web_path))
    docs = loader.load()
    return docs

In [7]:
docs = load_webpage(["https://ucsd.libguides.com/gis/gisdata"], use_strainer=True, strainer_class='s-lib-box-content')

In [8]:
docs

[Document(page_content="\nBrowse the GIS data holdings on X drive\n\nUC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area of geography, be sure to check the folder with the larger geography first.\xa0 An example would be if you are looking for data for only one of the states in the United States, you would want to look in the United States Data folder as well as the North America data folder.\nVPN (Cisco AnyConnect Client)Download VPN software on your computer to get off-campus access for everything you'd get using

In [9]:
def split_documents(docs, chunk_size=1000, chunk_overlap=200, add_start_index=True):
    # split document
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
    )
    return text_splitter.split_documents(docs)

In [10]:
split_docs = split_documents(docs)
embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=config['HUGGINGFACEHUB_API_TOKEN'], model_name=config['emb_model_name'])

In [11]:
vectorstore = Chroma.from_documents(documents=split_docs, embedding=embeddings,collection_metadata={"hnsw:space": "cosine"})

KeyError: 0

In [None]:
retriever = vectorstore.as_retriever(search_type="mmr")

In [193]:
retriever.get_relevant_documents("What is X drive?")

[Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
 Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
 Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
 Document(page_content="UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area

In [188]:
vectorstore.similarity_search_with_score("What is X drive?")

[(Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
  0.4245591163635254),
 (Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
  0.4245591163635254),
 (Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
  0.4245591163635254),
 (Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
  0.4245591163635254)]

In [204]:
vectorstore = FAISS.from_documents(documents=split_docs, embedding=embeddings)
# retriever = vectorstore.as_retriever(search_type="similarity")
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": -0.02})

: 

In [None]:
retriever.get_relevant_documents("What is X drive?")



[Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1})]

In [142]:
vectorstore.similarity_search_with_score("Who is the president of UC San Diego?")

[(Document(page_content="UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area of geography, be sure to check the folder with the larger geography first.\xa0 An example would be if you are looking for data for only one of the states in the United States, you would want to look in the United States Data folder as well as the North America data folder.", metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 42}),
  1.4470344),
 (Document(page_content='California State GeoportalThe California Sta

In [13]:
def get_retriever(config, embeddings, documents, search_type="similarity", topk=2):
    vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
    
    retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": topk})
    return retriever

In [14]:
# store as a vector
retriever = get_retriever(config, embeddings, split_docs)

In [15]:
def create_llm(config):
    llm = HuggingFaceHub(
        repo_id=config['llm_repo_id'], 
        task=config['llm_task'],
        model_kwargs={"temperature": 0.1, "max_length": config['response_max_length']}
    )
    
    return llm

In [16]:
llm = create_llm(config)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
def create_custom_prompt():
    template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    Always say "thanks for asking!" at the end of the answer.

    {context}

    Question: {question}
    Answer:
    """

    custom_rag_prompt = PromptTemplate.from_template(template)
    
    return custom_rag_prompt

In [18]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [19]:
custom_rag_prompt = create_custom_prompt()

In [20]:
custom_rag_prompt

PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for question-answering tasks. \n    Use the following pieces of retrieved context to answer the question.\n    If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n    Use three sentences maximum and keep the answer as concise as possible.\n    Always say "thanks for asking!" at the end of the answer.\n\n    {context}\n\n    Question: {question}\n    Answer:\n    ')

In [21]:
question = "What is X drive?"

In [22]:
custom_rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
    )

In [23]:
custom_rag_chain.invoke(question)

'You are an assistant for question-answering tasks. \n    Use the following pieces of retrieved context to answer the question.\n    If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n    Use three sentences maximum and keep the answer as concise as possible.\n    Always say "thanks for asking!" at the end of the answer.\n\n    Browse the GIS data holdings on X drive\n\nUC San Diego affiliate can browse the Library\'s geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area of geography, be sure to check the folder with t

In [77]:
class RAG():
    def __init__(self, config):
        self.config = config
        self.llm = self.create_llm(config)
        self.embeddings = self.create_embeddings(config)
        self.prompt = None
        self.docs = None
        self.split_docs = None
        self.retriever = None
        self.called = False
        
    def __call__(self, web_path, use_strainer=False, strainer_class=None, split=True, chunk_size=1000, chunk_overlap=200, add_start_index=True, search_type="similarity", topk=2):
        self.load_webpage(web_path, use_strainer, strainer_class)
        self.split_documents(chunk_size, chunk_overlap, add_start_index)
        self.set_retriever(search_type, topk)
        self.create_custom_prompt()
        self.custom_rag_chain = (
                                    {"context": self.retriever | self.format_docs, "question": RunnablePassthrough()}
                                    | self.prompt
                                    | self.llm
                                    # | StrOutputParser()
                                )
        self.called = True
    
    def clear():
        self.prompt = None
        self.docs = None
        self.split_docs = None
        self.retriever = None
        self.called = False

    def ask(self, question):
        if not self.called:
            assert False, "RAG not initialized with context. Please provide web paths"
        
        # for chunk in self.custom_rag_chain.stream(question):
        #     print(chunk, end="", flush=True)
        answer = self.custom_rag_chain.invoke(question)
        
        # cleaning up the answer 
        if "Helpful Answer" in answer:
            return re.search(r"Helpful Answer:\s*(.*)", rag.ask(question)).group(1)
        return answer
            
    
    def load_webpage(self, web_path, use_strainer, strainer_class):
        loader = None
        if use_strainer:
            bs4_strainer = bs4.SoupStrainer(class_=(strainer_class))
            loader = WebBaseLoader(
                web_paths=(web_path),
                bs_kwargs={"parse_only": bs4_strainer},
            )
        else: 
            loader = WebBaseLoader(web_paths=(web_path))
        self.docs = loader.load()
    
    def split_documents(self, chunk_size, chunk_overlap, add_start_index):
        # split document
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
        )
        self.split_docs = text_splitter.split_documents(self.docs)
    
    def set_retriever(self, search_type, topk):
        vectorstore = FAISS.from_documents(documents=self.split_docs, embedding=self.embeddings)
        retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": topk})
        self.retriever = retriever

    def create_llm(self, config):
        llm = HuggingFaceHub(
            repo_id=config['llm_repo_id'], 
            task=config['llm_task'],
            model_kwargs={"temperature": 0.1, "max_length": config['response_max_length']}
        )

        return llm
    
    def create_embeddings(self, config):
        return HuggingFaceInferenceAPIEmbeddings(api_key=config['HUGGINGFACEHUB_API_TOKEN'], model_name=config['emb_model_name'])
    
    def create_custom_prompt(self):
        template = """You are an assistant for question-answering tasks. \
        Use the following pieces of retrieved context to answer the question. \
        If you don't know the answer, just say that you don't know, don't try to make up an answer. \
        Use three sentences maximum and keep the answer as concise as possible. \
        Always say "thanks for asking!" at the end of the answer. \

        {context}

        Question: {question}

        Helpful Answer:"""

        custom_rag_prompt = PromptTemplate.from_template(template)

        self.prompt = custom_rag_prompt
    
    def format_docs(self, docs):
        return "\n\n".join(doc.page_content for doc in docs)

In [78]:
rag = RAG(config)

In [79]:
rag(["https://ucsd.libguides.com/gis/gisdata"], use_strainer=True, strainer_class='s-lib-box-content')

In [85]:
rag.ask(question)

'You are an assistant for question-answering tasks.         Use the following pieces of retrieved context to answer the question.         If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.         Use three sentences maximum and keep the answer as concise as possible.         Always say "thanks for asking!" at the end of the answer. \n        Browse the GIS data holdings on X drive\n\nUC San Diego affiliate can browse the Library\'s geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area of geography, be sure to check the

In [84]:
import re

In [97]:
print(re.search(r"Helpful Answer:\s*(.*)", rag.ask(question)).group(1))

X drive is a storage location where UC San Diego affiliates can browse the Library's geospatial data holdings. However, you cannot download any files from X drive; you must visit the Data & GIS Lab to access files until an online portal is available. Thanks for asking!


In [98]:
"Helpful Answer" in rag.ask(question)

True