In [1]:
from langchain.document_loaders import DirectoryLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embed and store
from langchain.vectorstores.elasticsearch import ElasticsearchStore 
from langchain.embeddings import GPT4AllEmbeddings
from langchain.embeddings import OllamaEmbeddings # We can also try Ollama embeddings

from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
from elqm.utils.dataFinder import get_data_dir
from elqm.backend.utils import get_es_connection
import os
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
import time

In [3]:
import torch
# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
# Resolve the "eur_lex_data" and "preprocessed" data directories via the project helper.
DATA_DIR = get_data_dir("eur_lex_data")
PREPROCESSED_DATA_DIR = get_data_dir("preprocessed")

# Echo both locations as absolute paths, one per line.
print(*map(os.path.abspath, (DATA_DIR, PREPROCESSED_DATA_DIR)), sep="\n")

/home/psaegert/Projects/elqm-INLPT-WS2023/elqm-raw/eur_lex_data
/home/psaegert/Projects/elqm-INLPT-WS2023/elqm-raw/preprocessed


In [5]:
# Convert every raw JSON document in DATA_DIR into a text-only copy: the 'html'
# field is replaced by its plain-text rendering under 'text', and the result is
# written to PREPROCESSED_DATA_DIR under the same filename.
for filename in tqdm(os.listdir(DATA_DIR)):
    # Ignore anything that is not a JSON document.
    if not filename.endswith(".json"):
        continue

    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which can mis-decode non-ASCII text (e.g. on Windows).
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as f:
        record = json.load(f)

    # Let BeautifulSoup strip the markup and keep only the text content.
    record['text'] = BeautifulSoup(record['html'], 'html.parser').get_text()
    del record['html']

    with open(os.path.join(PREPROCESSED_DATA_DIR, filename), 'w', encoding='utf-8') as f:
        json.dump(record, f)

In [6]:
# Requires the jq package (pip install jq).
# Keyword arguments forwarded to JSONLoader: the jq expression '.text' selects
# the top-level "text" field of each JSON file.
schema = dict(jq_schema='.text')

In [7]:
# Directory loader over all *.json files below PREPROCESSED_DATA_DIR (recursive glob),
# delegating each file to JSONLoader configured with `schema`.
loader = DirectoryLoader(
    PREPROCESSED_DATA_DIR,
    glob='**/*.json',
    loader_cls=JSONLoader,
    loader_kwargs=schema,
    show_progress=True,
)

In [8]:
# Run the directory loader; the loaded documents are split into chunks in the next cell.
data = loader.load()

In [9]:
# Break the loaded documents into overlapping chunks before they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(data)
print(f"Split into {len(all_splits)} chunks")

In [10]:
# Connection object from the project helper; it is used below to delete the index
# and is handed to ElasticsearchStore as `es_connection`.
es = get_es_connection()

In [11]:
# 100 docs, 1814 chunks, 1m 1.5s
# 300 docs, 6127 chunks, 3m 29.1s
# 508 docs, 12018 chunks, 6m 52s

In [12]:
# True: delete the "eurlex-langchain" index and embed all chunks into it again.
# False: attach to the existing index without re-embedding.
REINDEX = False

In [13]:
# pip install gpt4all
# Needs Ubuntu > 22.04 because of glibc

# Single source of truth for the index name used by every call below.
INDEX_NAME = "eurlex-langchain"

if REINDEX:
    # Clear the index. 400/404 responses (e.g. the index does not exist yet) are tolerated.
    # Per-request `ignore=` is deprecated in elasticsearch-py 8.x; transport options
    # are set through `.options()` instead.
    es.options(ignore_status=[400, 404]).indices.delete(index=INDEX_NAME)

    # Time the whole embed-and-index run (including construction of the embedding
    # object) with a monotonic clock, so system clock adjustments cannot skew it.
    start_time = time.perf_counter()
    vectorstore = ElasticsearchStore.from_documents(
        documents=all_splits,
        embedding=GPT4AllEmbeddings(),
        index_name=INDEX_NAME,
        show_progress=True,
        es_connection=es,
    )
    print(f"Embedding took {time.perf_counter() - start_time} seconds")
else:
    # Attach to the existing index without adding documents; an embedding object
    # is still passed to the store.
    vectorstore = ElasticsearchStore(
        index_name=INDEX_NAME,
        es_connection=es,
        embedding=GPT4AllEmbeddings(),
    )

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [14]:
# Retrieve
# question = "What is the capital of france?"
# docs = vectorstore.similarity_search(question)

In [15]:
# RAG prompt: pull the "rlm/rag-prompt-llama" prompt via langchain's hub module.
# Requires the langchainhub package (pip install langchainhub).
from langchain import hub
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-llama")

In [16]:
# Show the raw template text of the pulled prompt's first message (it is overridden in the next cell).
print(QA_CHAIN_PROMPT.messages[0].prompt.template)

[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> 
Question: {question} 
Context: {context} 
Answer: [/INST]


In [17]:
# Override the pulled prompt's template in place with an ELQM-specific system prompt.
# The {question} and {context} placeholders are kept because they are the variables
# used by the original template.
QA_CHAIN_PROMPT.messages[0].prompt.template = """[INST]<<SYS>> You are ELQM, a helpful and specialized assistant for question-answering tasks in the domain of energy law.
Use the following pieces of retrieved context comprised of EU regulations and other legal documents to answer the question.
If you don't know the answer or the question cannot be answered with the context, admit that you cannot answer the question due to the limited available context.
Furthermore, if the user asks a generic question or other situations occur, in which the context is not helpful, kindly remind the user of your purpose.
In addition to the retrieved context, you may also consider the previous conversation history to interact with the user.
Use three sentences maximum and keep the answer concise.<</SYS>> 
Question: {question} 
Context: {context} 
Answer: [/INST]"""

In [18]:
from queue import SimpleQueue, Empty
# Module-level queue meant for streamed LLM tokens; in the visible cells it is only
# referenced from commented-out code.
q = SimpleQueue()

In [19]:
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import LLMResult
from typing import Any, Union

job_done = object()  # unique sentinel enqueued when generation has finished or failed


class StreamingGradioCallbackHandlerQ(BaseCallbackHandler):
    """Callback handler that publishes streamed LLM tokens to a queue.

    Each new token is put on the queue as it arrives. The module-level
    ``job_done`` sentinel is enqueued both when the LLM finishes and when it
    errors, so a consumer blocking on the queue is always released.
    """

    def __init__(self, q: SimpleQueue):
        """Store the queue that tokens and the sentinel are put on."""
        self.q = q

    def on_llm_start(
        self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any
    ) -> None:
        """Run when LLM starts running. Clean the queue."""
        # Drain with non-blocking gets until the queue itself reports Empty,
        # rather than polling empty() before every get.
        while True:
            try:
                self.q.get(block=False)
            except Empty:
                break

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.q.put(token)

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when LLM ends running. Signal completion with the sentinel."""
        self.q.put(job_done)

    def on_llm_error(
        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
    ) -> None:
        """Run when LLM errors. Signal completion so consumers do not wait forever."""
        self.q.put(job_done)

In [20]:
from langchain.callbacks.streaming_stdout_final_only import FinalStreamingStdOutCallbackHandler

In [21]:
# LLM
# Streaming callbacks are currently disabled; the previously used argument was:
#   callbacks=[FinalStreamingStdOutCallbackHandler(), StreamingGradioCallbackHandlerQ(q)]
llm = Ollama(
    verbose=True,
    model="llama2",
)
print(f"Loaded LLM model {llm.model}")

Loaded LLM model llama2


In [22]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory

In [23]:
# Conversation memory shared by the chains built below, exposed under the "chat_history" key.
# NOTE(review): k=5 is presumably the number of most recent exchanges kept — confirm against the LangChain docs.
memory = ConversationBufferWindowMemory(k=5, memory_key="chat_history")

In [24]:
from langchain.chains import ConversationalRetrievalChain

class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
    """ConversationalRetrievalChain subclass that wraps ``run``.

    The override currently adds no behavior of its own. It is the intended place
    for storing retrieved documents in memory once that works (see the TODO).
    """

    def run(self, *args, **kwargs):
        """Delegate to the parent ``run`` and return its result unchanged.

        Positional arguments are forwarded as well as keyword arguments, so any
        call form the parent ``run`` accepts also works on this subclass.
        """
        result = super().run(*args, **kwargs)

        # TODO: store the retrieved documents in the memory. The earlier attempt did not work:
        #   question = kwargs.get('question')
        #   self.memory.save_context({"question": question}, {"retrieved_docs": result})

        return result

In [25]:
# Build the retrieval chain from the custom subclass.
# NOTE(review): the following cell assigns `qa_chain` again using the stock
# ConversationalRetrievalChain, so this instance is replaced when cells run in order.
qa_chain = CustomConversationalRetrievalChain.from_llm(
    llm,
    memory=memory,
    retriever=vectorstore.as_retriever(),
    # Hand the chat history to the chain exactly as it is received.
    get_chat_history=lambda history: history,
    combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
    # callbacks=[FinalStreamingStdOutCallbackHandler(), StreamingGradioCallbackHandlerQ(q)]
)

In [26]:
# Build the chain that `answer_question` calls. This assignment replaces the
# `qa_chain` created from CustomConversationalRetrievalChain in the previous cell.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    memory=memory,
    retriever=vectorstore.as_retriever(),
    # Hand the chat history to the chain exactly as it is received.
    get_chat_history=lambda history: history,
    combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [27]:
# Ask a question
# question = f"How loud are air conditioners?"
# result = qa_chain({"question": question})

In [28]:
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
def answer_question(question, history):
    """Answer one chat message using the global ``qa_chain``.

    Args:
        question: The user's message.
        history: Chat history supplied by the caller; forwarded to the chain
            under the ``chat_history`` key.

    Returns:
        The value stored under ``'answer'`` in the chain's result.
    """
    chain_inputs = {"question": question, "chat_history": history}
    return qa_chain(chain_inputs)['answer']

In [30]:
# from threading import Thread

# def answer_question_stream(question, history):
#     # Empty the queue
#     while not q.empty():
#         try:
#             q.get(block=False)
#         except Empty:
#             continue
#     thread = Thread(target=qa_chain.run, kwargs={"question": question, "chat_history": history})
#     thread.start()
    
#     result = ""
#     while True:
#         next_token = q.get(block=True) # Blocks until an input is available
#         if next_token is job_done:
#             break
#         result += next_token
#         yield result
#     thread.join()

#     return result

In [31]:
# from threading import Thread

# def answer_question_stream_with_context(question, history):
#     context = memory.load_memory_variables({})
#     retrieved_docs = context['retrieved_docs']

#     thread = Thread(target=qa_chain.run, kwargs={"question": question, "chat_history": history, "retrieved_docs": retrieved_docs})
#     thread.start()
    
#     result = ""
#     while True:
#         next_token = q.get(block=True) # Blocks until an input is available
#         if next_token is job_done:
#             break
#         result += next_token
#         yield result
#     thread.join()

#     return qa_chain.memory.chat_memory.messages[-1].content

In [32]:
# Custom CSS passed to gr.Blocks below: gives the element with id "chatbot" full height.
# NOTE(review): no element in the visible code sets elem_id="chatbot" — confirm the selector matches anything.
css = """
#chatbot {
   height: 100%;
}
"""

In [33]:
# Wrap the chat UI in a Blocks app that receives the custom CSS; answer_question handles each message.
with gr.Blocks(css=css) as demo:
   gr.ChatInterface(fn=answer_question, title="ELQM")

# Start the Gradio app; the trailing ";" keeps the notebook from echoing launch()'s return value.
demo.launch();

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


In [34]:
# Inspect the memory object attached to the chain.
qa_chain.memory

ConversationBufferWindowMemory(memory_key='chat_history')