## General Imports

In [2]:
from elqm.utils import get_data_dir
from elasticsearch import Elasticsearch
import os
import time

## Get directories for data

In [3]:
DATA_DIR = get_data_dir("eur_lex_data")
PREPROCESSED_DATA_DIR = get_data_dir("preprocessed")

print("DATA_DIR: ", DATA_DIR)
print("PREPROCESSED_DATA_DIR: ", PREPROCESSED_DATA_DIR)

DATA_DIR:  /home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/eur_lex_data
PREPROCESSED_DATA_DIR:  /home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed


## Initilize the LLM Model

In [4]:
from langchain.llms import Ollama

# Initilize the LLM model
llm = Ollama(model="llama2")

## There are several options for promts, here are several

In [4]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("Answer the question based on the context: {context} \n\nQuestion: {question}")

In [7]:
# Construct a prompt for the LLM model
# There are a lot of ways to construct a prompt

from langchain.prompts.chat import ChatPromptTemplate

template = "You are a helpful assistant that translates {input_language} to {output_language}."
human_template = "{text}"

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

prompt.format_messages(input_language="English", output_language="French", text="I love programming.")
print(type(prompt))

<class 'langchain_core.prompts.chat.ChatPromptTemplate'>


In [4]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt-llama")

prompt.messages[0].prompt.template = """[INST]<<SYS>> You are ELQM, a helpful and specialized assistant for question-answering tasks in the domain of energy law.
Use the following pieces of retrieved context comprised of EU regulations and other legal documents to answer the question.
If you don't know the answer or the question cannot be answered with the context, admit that you cannot answer the question due to the limited available context.
Furthermore, if the user asks a generic question or other situations occur, in which the context is not helpful, kindly remember the user of your purpose.
In addition to the retrieved context, you may also consider the previous conversation history to interact with the user.
Use three sentences maximum and keep the answer concise.<</SYS>> 
Question: {question} 
Context: {context} 
Answer: [/INST]"""

print(type(prompt))

<class 'langchain_core.prompts.chat.ChatPromptTemplate'>


  warn_beta(


## This thing converts a langchain message type to a string (Do we want this?)

In [5]:
# Convert message into a string (AT LEAST IT CLAIMS THIS BUT I SPOTTED NO DIFFERNECE)

from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

## Construct a chain and test it

In [32]:
chain = prompt | llm | output_parser

In [36]:
result = chain.invoke({"context": "Africa is a continent.", "question": "What is Africa?"})
print(type(result))
print(result)

<class 'str'>
Africa is a continent located in the Eastern Hemisphere, bounded by the Mediterranean Sea to the north, the Atlantic Ocean to the west, and the Indian Ocean to the south. It is home to 54 recognized sovereign states, with a diverse range of cultures, languages, and landscapes. Africa is the second-largest continent in the world by area and is home to over 1.3 billion people, making it the second-most populous continent after Asia. The continent is known for its rich cultural heritage, including music, art, and literature, as well as its diverse wildlife and natural beauty.


## Load the preprocessed data using langchain directoryLoader

In [5]:
from langchain.document_loaders import DirectoryLoader, JSONLoader

schema = {"jq_schema": ".\"File content\""}
print(os.path.abspath(PREPROCESSED_DATA_DIR))

# USE A DIR WHERE THE FINISHED PREPROCESSED DATA IS STORED
loader = DirectoryLoader(os.path.abspath(PREPROCESSED_DATA_DIR),
                        glob='*final.json', # [0-9]final.json
                         show_progress=True,
                         loader_cls=JSONLoader,
                         loader_kwargs=schema)

/home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed


In [7]:
data = loader.load()

100%|██████████| 508/508 [00:00<00:00, 567.69it/s]


In [8]:
print("Data storage type:", type(data))
print("Data length:", len(data))
print("Data type:", type(data[0]))
print("Data content:", data[0])

Data storage type: <class 'list'>
Data length: 508
Data type: <class 'langchain_core.documents.base.Document'>
Data content: page_content="\nEUR-Lex - 31953D0030 - EN\nAvis juridique important\n|\n31953D0030\nECSC High Authority: Decision No 30-53 of 2 May 1953 on practices prohibited by Article 60 (1) of the Treaty in the common market for coal and steel  \nOfficial Journal 006 , 04/05/1953 P. 0109 - 0110\n Danish special edition: Series I Chapter 1952-1958 P. 0009 \n English special edition: Series I Chapter 1952-1958 P. 0009 \n Greek special edition: Chapter 08 Volume 1 P. 0005 \n Spanish special edition: Chapter 08 Volume 1 P. 0005 \n Portuguese special edition Chapter 08 Volume 1 P. 0005 \n Finnish special edition: Chapter 12 Volume 3 P. 0003 \n Swedish special edition: Chapter 12 Volume 3 P. 0003 \nDECISION No 30-53  of 2 May 1953  on practices prohibited by Article 60 (1) of the Treaty in the common market for coal and steel\nTHE HIGH AUTHORITY, \nHaving regard to Article 60 and

## Chunk the data

#### TODO: Move this as a function to the preprocessing

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=20)
all_splits = text_splitter.split_documents(data)
print(f"Split into {len(all_splits)} chunks")

Split into 79206 chunks


In [10]:
print("Data storage type:", type(all_splits))
print("Data length:", len(all_splits))
print("Data type:", type(all_splits[0]))
print("Data content:", all_splits[0])

Data storage type: <class 'list'>
Data length: 79206
Data type: <class 'langchain_core.documents.base.Document'>
Data content: page_content='EUR-Lex - 31953D0030 - EN\nAvis juridique important\n|\n31953D0030\nECSC High Authority: Decision No 30-53 of 2 May 1953 on practices prohibited by Article 60 (1) of the Treaty in the common market for coal and steel' metadata={'source': '/home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed/307final.json', 'seq_num': 1}


Chunking does not change the data type, just the size of the list!

## Retriever approach 1: Elastic search magic

In [60]:
es = Elasticsearch('http://localhost:9200')
print(es.info)

<bound method Elasticsearch.info of <Elasticsearch(['http://localhost:9200'])>>


In [114]:
from langchain.vectorstores.elasticsearch import ElasticsearchStore
help(ElasticsearchStore.from_documents)

Help on method from_documents in module langchain_community.vectorstores.elasticsearch:

from_documents(documents: List[langchain_core.documents.base.Document], embedding: Optional[langchain_core.embeddings.Embeddings] = None, bulk_kwargs: Optional[Dict] = None, **kwargs: Any) -> 'ElasticsearchStore' method of abc.ABCMeta instance
    Construct ElasticsearchStore wrapper from documents.
    
    Example:
        .. code-block:: python
    
            from langchain_community.vectorstores import ElasticsearchStore
            from langchain_community.embeddings.openai import OpenAIEmbeddings
    
            db = ElasticsearchStore.from_documents(
                texts,
                embeddings,
                index_name="langchain-demo",
                es_url="http://localhost:9200"
            )
    
    Args:
        texts: List of texts to add to the Elasticsearch index.
        embedding: Embedding function to use to embed the texts.
                  Do not provide if using a

In [40]:
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores.elasticsearch import ElasticsearchStore

REINDEX = True

if REINDEX:
    # Clear the index
    es.indices.delete(index="eurlex-langchain", ignore=[400, 404])

    start_time = time.time()
    vectorstore = ElasticsearchStore.from_documents(documents=all_splits,
                                                    embedding=GPT4AllEmbeddings(),
                                                    index_name="eurlex-langchain",
                                                    show_progress=True,
                                                    es_connection=es)
    print(f"Embedding took {time.time() - start_time} seconds")
else:
    vectorstore = ElasticsearchStore(index_name="eurlex-langchain",
                                     es_connection=es,
                                     embedding=GPT4AllEmbeddings())

  es.indices.delete(index="eurlex-langchain", ignore=[400, 404])


bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522
Embedding took 22.170400619506836 seconds


## Retriever approach 2: Langchain retriever using faiss vector store

In [11]:
from langchain.embeddings import GPT4AllEmbeddings

embeddings = GPT4AllEmbeddings()

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [18]:
# https://github.com/facebookresearch/faiss
# https://arxiv.org/pdf/1702.08734.pdf
# pip install faiss-cpu ; faiss-gpu didn't work for me ; Ah the reason is it is only supported for conda
# According to the repo "Flat" index is the most effiecient one for small ammounts of data

# Maybe we can even do better than a vectorstore: https://python.langchain.com/docs/modules/data_connection/retrievers
from langchain_community.vectorstores import FAISS

In [113]:
help(FAISS.from_documents)

Help on method from_documents in module langchain_core.vectorstores:

from_documents(documents: 'List[Document]', embedding: 'Embeddings', **kwargs: 'Any') -> 'VST' method of abc.ABCMeta instance
    Return VectorStore initialized from documents and embeddings.



In [13]:
import math
from tqdm import tqdm
# The FAISS functions will not show progress, so process in batches and then merge vectorstores

# make 100 batches if possible
if (len(all_splits) >= 1000):
    batchNum = 1000
else:
    batchNum = len(all_splits)

# calculate the size of the batches
size_of_batches = math.floor(len(all_splits) / batchNum)
rest_size = len(all_splits) - (batchNum * size_of_batches)

batches = [all_splits[i:i + size_of_batches] for i in range(0, batchNum * size_of_batches, size_of_batches)]
rest_batch = all_splits[-rest_size:]

print("Number of splits: ", len(all_splits))
print("Number of batches: ", batchNum)
print("Size of batches: ", size_of_batches)
print("Size of rest batch: ", rest_size)

Number of splits:  79206
Number of batches:  1000
Size of batches:  79
Size of rest batch:  206


In [20]:
# populate the vectorstores
vectorstores = []
for batch in tqdm(batches):
    vectorstores.append(FAISS.from_documents(documents=batch,
                                             embedding=embeddings))
    
rest_vectorstore = FAISS.from_documents(documents=rest_batch,
                                        embedding=embeddings)

# combine the vectorstores
vectorstore = vectorstores[0]

for vec in vectorstores[1:]:
    vectorstore.merge_from(vec)
vectorstore.merge_from(rest_vectorstore)

100%|██████████| 1000/1000 [08:01<00:00,  2.07it/s]


In [21]:
size_of_index = vectorstore.index.ntotal
print("Size of the FAISS index:", size_of_index)

Size of the FAISS index: 79206


In [22]:
print("Vectorstore type:", type(vectorstore))
print("Vectorstore content:", vectorstore)

Vectorstore type: <class 'langchain_community.vectorstores.faiss.FAISS'>
Vectorstore content: <langchain_community.vectorstores.faiss.FAISS object at 0x7936753c7dd0>


In [23]:
retriever = vectorstore.as_retriever()

#### Use retriever like:

In [24]:
query = "This is a test"

In [25]:
docs = retriever.invoke(query)
print("Number of docs:", len(docs))
for i, doc in enumerate(docs):
    print(f"Document {i}")
    print(doc.page_content)
    print(doc.metadata)
    print()

Number of docs: 4
Document 0
enters the test operation.
{'source': '/home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed/277final.json', 'seq_num': 1}

Document 1
shall be injected to perform the test;
{'source': '/home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed/158final.json', 'seq_num': 1}

Document 2
testing method is available.
{'source': '/home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed/180final.json', 'seq_num': 1}

Document 3
performance of the tests.
{'source': '/home/computerman/Desktop/NLPT/elqm-INLPT-WS2023/elqm-raw/preprocessed/158final.json', 'seq_num': 1}



#### Use FAISS like:

In [26]:
query ="This is a test"

In [27]:
docs_and_scores = vectorstore.similarity_search_with_score(query)

print("Number of docs:", len(docs_and_scores))
for i, (doc, score) in enumerate(docs_and_scores):
    print(f"Document {i}")
    print(doc.page_content)
    print(f"Score: {score}")
    print()

Number of docs: 4
Document 0
enters the test operation.
Score: 0.850996732711792

Document 1
shall be injected to perform the test;
Score: 0.9017623662948608

Document 2
testing method is available.
Score: 0.9205477237701416

Document 3
performance of the tests.
Score: 0.9209227561950684



In [28]:
embedding_vector = embeddings.embed_query(query)
print("Query embedding length:", len(embedding_vector))
print("Query embedding:", embedding_vector)

docs = vectorstore.similarity_search_by_vector(embedding_vector)

print("Number of docs:", len(docs))
for i, doc in enumerate(docs):
    print(f"Document {i}")
    print(doc.page_content)
    print()

Query embedding length: 384
Query embedding: [0.02008598856627941, 0.020369146019220352, -0.03328794240951538, 0.05006341636180878, 0.010319828987121582, -0.03074713610112667, -0.006307267118245363, 0.020657578483223915, -0.0006196335889399052, 0.01820748671889305, 0.03446296229958534, -0.08538782596588135, 0.007782005239278078, 0.04672839492559433, -0.051374878734350204, -0.051072221249341965, 0.020909084007143974, -0.05579216033220291, -0.059465520083904266, 0.05841923505067825, 0.04251153767108917, 0.017066456377506256, -0.05490635335445404, 0.018923688679933548, 0.03195365145802498, -0.004465232603251934, -0.0431266613304615, 0.011368070729076862, 0.03462895005941391, -0.035111717879772186, 0.012774482369422913, 0.03956768661737442, 0.05025430768728256, 0.018407776951789856, 0.06511792540550232, -0.01580517180263996, 0.03316093981266022, 0.017037540674209595, 0.03508692607283592, 0.029834900051355362, 0.014514746144413948, -0.10974641144275665, 0.043374933302402496, 0.0261569339781

In [29]:
filename = "faiss_index"
dir = "./"
path = os.path.join(dir, filename)

In [30]:
# Saving the vectorstore
print("Saving file to: ", path)
vectorstore.save_local(path)

Saving file to:  ./faiss_index


In [19]:
# Loading the vectorstore
vectorstore = FAISS.load_local(path, embeddings)

Remark: You can search the vectorstore by filtering their metadata

## Build another chain and test it

In [41]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(k=5, memory_key="chat_history")

In [26]:
from langchain.chains import ConversationalRetrievalChain

qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    combine_docs_chain_kwargs={"prompt": prompt},
    memory=memory,
    get_chat_history=lambda h : h,
    )

In [27]:
question = "What is your job?"
history = ""

In [28]:
result = qa_chain({"question": question, "chat_history": history})
print(result)

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


{'question': 'What is your job?', 'chat_history': '', 'answer': 'As ELQM, my job is to assist in answering question-answering tasks related to energy law, using the context provided by EU regulations and other legal documents. My expertise lies in providing information on employment, operation, and maintenance of energy-related systems, as well as the fulfillment of duties within the domain of energy law. I am here to help you navigate these complex legal issues and provide accurate answers to your questions. How may I assist you today?'}


## Stream output word for word vs output whole answer at once

In [None]:
# Streamed
# Did not work
for chunk in qa_chain.stream({"question": question, "chat_history": history}):
    print(chunk, end="", flush=True)

# I don't know how to write a function that streams to gradio

In [None]:
# Whole
def answer_question(question, history):
    result = qa_chain({"question": question, "chat_history": history})
    return result['answer']

## Gradio interface

In [31]:
import gradio as gr

css = """#chatbot {height: 100%;}"""

with gr.Blocks(css=css) as demo:
   gr.ChatInterface(fn=answer_question, title="ELQM")

demo.launch();

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
