In [None]:
# !pip install -U langchain langchain-community langchainhub bs4

In [9]:
import os
import bs4
from getpass import getpass
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
import util

In [2]:
HUGGINGFACEHUB_API_TOKEN = getpass()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

········


In [3]:
web_path = ["https://ucsd.libguides.com/gis/gisdata",
    #       "https://ucsd.libguides.com/az.php"
            ]

In [42]:
config = util.load_config('config/config.yaml')
config

{'HUGGINGFACEHUB_API_TOKEN': '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',
 'emb_model_name': 'sentence-transformers/all-MiniLM-l6-v2',
 'llm_repo_id': 'HuggingFaceH4/zephyr-7b-beta',
 'llm_task': 'text-generation',
 'response_max_length': 1000}

'sentence-transformers/all-MiniLM-l6-v2'

In [5]:
def load_webpage(web_path, use_strainer=False, strainer_class=None):
    loader = None
    if use_strainer:
        bs4_strainer = bs4.SoupStrainer(class_=(strainer_class))
        loader = WebBaseLoader(
            web_paths=(web_path),
            bs_kwargs={"parse_only": bs4_strainer},
        )
    else: 
        loader = WebBaseLoader(web_paths=(web_path))
    docs = loader.load()
    return docs

In [16]:
docs = load_webpage(["https://ucsd.libguides.com/gis/gisdata"], use_strainer=True, strainer_class='s-lib-box-content')

In [8]:
def split_documents(docs, chunk_size=1000, chunk_overlap=200, add_start_index=True):
    # split document
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
    )
    return text_splitter.split_documents(docs)

In [33]:
split_docs = split_documents(docs)
embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=config['HUGGINGFACEHUB_API_TOKEN'], model_name=config['emb_model_name'])

In [34]:
def get_retriever(config, embeddings, documents, search_type="similarity", topk=2):
    vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
    
    retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": topk})
    return retriever

In [36]:
# store as a vector
get_retriever(config, embeddings, split_docs)

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f311e0fab20>, search_kwargs={'k': 2})

In [31]:
def create_llm(config):
    llm = HuggingFaceHub(
        repo_id=config['llm_repo_id'], 
        task=config['llm_task'],
        model_kwargs={"temperature": 0.1, "max_length": config['response_max_length']}
    )
    
    return llm

In [32]:
llm = create_llm(config)



In [24]:
def create_custom_prompt():
    template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    Always say "thanks for asking!" at the end of the answer.

    {context}

    Question: {question}

    Helpful answer:"""

    custom_rag_prompt = PromptTemplate.from_template(template)
    
    return custom_rag_prompt

In [30]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [25]:
create_custom_prompt()

PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for question-answering tasks. \n    Use the following pieces of retrieved context to answer the question.\n    If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n    Use three sentences maximum and keep the answer as concise as possible.\n    Always say "thanks for asking!" at the end of the answer.\n\n    {context}\n\n    Question: {question}\n\n    Helpful answer:')

In [29]:
question = "What is X drive?"

In [31]:
custom_rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

In [32]:
custom_rag_chain.invoke(question)

" The X drive is a storage location where UC San Diego affiliates can browse the Library's geospatial data holdings. However, you cannot download any files from there. You must be connected to UCSD VPN and visit the Data & GIS Lab to access files until an online portal is developed. Thanks for asking!"

In [46]:
class RAG():
    def __init__(self, config):
        self.config = config
        self.llm = create_llm(config)
        self.embeddings = create_embeddings(config)
        self.prompt = create_custom_prompt()
        self.docs = None
        self.split_docs = None
        self.retriever = None
        self.called = False
        
    def __call__(self, web_path, use_strainer=False, strainer_class=None, split=True, chunk_size=1000, chunk_overlap=200, add_start_index=True, search_type="similarity", topk=2):
        load_webpage(web_path, use_strainer, strainer_class)
        split_documents(chunk_size, chunk_overlap, add_start_index)
        get_retriever()
        self.custom_rag_chain = (
                                    {"context": self.retriever | format_docs, "question": RunnablePassthrough()}
                                    | self.prompt
                                    | self.llm
                                    | StrOutputParser()
                                )
    def ask(self, question):
        if not self.called:
            assert False, "RAG not initialized with context. Please provide web paths"
        return self.custom_rag_chain.invoke(question)
    
    def load_webpage(self, web_path, use_strainer, strainer_class):
        loader = None
        if use_strainer:
            bs4_strainer = bs4.SoupStrainer(class_=(strainer_class))
            loader = WebBaseLoader(
                web_paths=(web_path),
                bs_kwargs={"parse_only": bs4_strainer},
            )
        else: 
            loader = WebBaseLoader(web_paths=(web_path))
        self.docs = loader.load()
    
    def split_documents(self, chunk_size, chunk_overlap, add_start_index):
        # split document
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
        )
        self.split_docs = text_splitter.split_documents(self.docs)
    
    def get_retriever(self, search_type, topk):
        vectorstore = FAISS.from_documents(documents=self.split_docs, embedding=self.embeddings)
        retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": topk})
        self.retriever = retriever

    def create_llm(self, config):
        llm = HuggingFaceHub(
            repo_id=config['llm_repo_id'], 
            task=config['llm_task'],
            model_kwargs={"temperature": 0.1, "max_length": config['response_max_length']}
        )

        return llm
    
    def create_embeddings(self, config):
        return HuggingFaceInferenceAPIEmbeddings(api_key=config['HUGGINGFACEHUB_API_TOKEN'], model_name=config['emb_model_name'])
    
    def create_custom_prompt(self):
        template = """You are an assistant for question-answering tasks. 
        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.
        Use three sentences maximum and keep the answer as concise as possible.
        Always say "thanks for asking!" at the end of the answer.

        {context}

        Question: {question}

        Helpful answer:"""

        custom_rag_prompt = PromptTemplate.from_template(template)

        return custom_rag_prompt
    
    def format_docs(self, docs):
        return "\n\n".join(doc.page_content for doc in docs)

In [47]:
rag = RAG(config)



NameError: name 'create_embeddings' is not defined