In [1]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.document_loaders import UnstructuredFileLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [2]:
# Collect the files under ./Docs/ that match the PDF glob and parse each one
# with UnstructuredFileLoader (progress bar on, multithreaded loading on).
loader = DirectoryLoader(
    './Docs/',
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
    show_progress=True,
    use_multithreading=True,
)
documents = loader.load()

100%|██████████| 1/1 [00:18<00:00, 18.32s/it]


In [3]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)
texts

[Document(page_content='14* Inlenmlional Scientific and Technical Conference "From imagery\n\nlo map: digital photoR, ammetric\n\nleclmologies"\n\nFeatures of Point Clouds and Functionalities of Processing Software M.J.P.M. Lemmens Delft University of Technology, Faculty of Architecture and the Built Environment, The Netherlands', metadata={'source': 'Docs\\Paper_Lemmens_2014.pdf'}),
 Document(page_content='To be useful, the millions or even billions of 3D points generated by a variety of active and passive sensors need to be stored, organised, combined, geo-referenced, measured, analysed and distributed within organisations or outward. Initially, the data is unorganised; software has been designed, developed and put on the market place to organise the unorganised and to extract information from the point clouds. In principle, the curved 2D surface can represent any instance such as soil pollution, forest biomass, rainfall, terrain elevation or the seabed. In the field of geomatics, th

In [4]:
# Sentence-transformers model used both to index the chunks and to embed queries.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [5]:
# Build (chunk text, embedding vector) pairs.
# The embedding model object must not be zipped directly against the chunks:
# iterating it does not yield vectors. The vectors have to be computed first
# with `embed_documents`, which takes the raw chunk strings.
# NOTE: this embeds every chunk, so it costs about as much as building the index.
chunk_texts = [doc.page_content for doc in texts]
chunk_vectors = embedding.embed_documents(chunk_texts)

text_embedding_pairs = zip(chunk_texts, chunk_vectors)
# Materialise the pairs so they can be inspected or reused (a zip is single-use).
text_embedding_pairs_list = list(text_embedding_pairs)

In [7]:

# Embed every chunk and store the vectors in an in-memory FAISS index.
faiss_db = FAISS.from_documents(
    documents=texts,
    embedding=embedding,
)

In [17]:
# Sample question used to sanity-check retrieval against the index.
query = "What is geo referencing"

In [18]:
# Embed the query once, then search the index with that vector.
# Searching by vector reuses `embedding_vector` instead of handing the raw
# string to the store, which would embed the same query a second time and
# leave the precomputed vector unused.
embedding_vector = embedding.embed_query(query)
docs_and_scores = faiss_db.similarity_search_with_score_by_vector(embedding_vector)
docs_and_scores

[(Document(page_content='Geo-referencing 3D patches of point clouds - i.e. individual scans - must be transformed to a single coordinate system to enable geo-referencing or registration, which involves identifying common points in the overlaps. As a geometric transformation model, the 3D similarity (Helmert) transformation is usually applied (three translation parameters, three rotation parameters and one scale parameter). This process requires at least seven coordinates, i.e. two 3D ground control points (GCP) and one heightGCP. To obtain redundancy and thus the ability to detect outliers and to compute precision through error propagation, seven or more GCPs that are evenly distributed along the border of the site are commonly measured, usually with differential GNSS or total stations. The stitching of the single point clouds requires tie points in the overlaps. These are distinctive points in the scene or consist of targets placed, marked or painted in the overlaps. The latter requir

In [10]:
# Expose the FAISS index as a retriever; it is handed to the RetrievalQA chain below.
retriever = faiss_db.as_retriever()

In [11]:

# Load the tokenizer and the seq2seq weights from the same checkpoint.
flan_t5_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(flan_t5_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_t5_checkpoint)

In [12]:
# Wrap the locally loaded model in a text2text-generation pipeline ...
pipe = pipeline(
    task="text2text-generation",
    tokenizer=tokenizer,
    model=model,
    max_length=1024,
)

# ... and adapt that pipeline to LangChain's LLM interface.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [13]:
# "stuff" chain: the retrieved chunks are placed into a single prompt for the
# local LLM. The source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [14]:
def process_llm_response(llm_response):
    """Print the answer held under the ``'result'`` key of a chain response.

    Args:
        llm_response: Mapping returned by the QA chain; only ``'result'`` is read.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    answer = llm_response['result']
    print(answer)

In [16]:
# Ask the full retrieval + generation chain the question and print its answer.
query = "What is geo referencing?"
llm_response = qa_chain({"query": query})
process_llm_response(llm_response)

Geo-referencing 3D patches of point clouds - i.e. individual scans - must be transformed to a single coordinate system to enable geo-referencing or registration, which involves identifying common points in the overlaps.
