In [5]:
!pip install langchain
!pip install pypdf
!pip install chromadb
!pip install tiktoken
!pip install openai



Importing the required libraries and setting up the OpenAI API key

In [6]:
import numpy as np
import os
import sys
from langchain.text_splitter import TokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo


# Add the relative path '../..' to the module search path.
# NOTE(review): no import in the visible notebook appears to rely on this — confirm before removing.
sys.path.append('../..')

In [2]:
# Take the OpenAI key from the environment when it is already set; otherwise
# prompt for it without echoing, so the secret is never saved in the notebook.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

**Loading pdf file**

In [8]:
# Create a PDF loader for the dissertation report and load it into `pages`.
# len(pages) is reported further down as the page count of the file.
Loader = PyPDFLoader("/content/Dissertation_Report_2128468-4.pdf")
pages = Loader.load()

**Splitting the content**

Defining the text splitter: it breaks the text on newlines, with a chunk size of 1000 characters and an overlap of 150 characters.

In [9]:
# Character-based splitter configuration:
#   separator       - text is broken on newline characters
#   chunk_size      - chunk size of 1000, as measured by length_function
#   chunk_overlap   - overlap of 150 between chunks
#   length_function - built-in len, i.e. sizes are counted in characters
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size= 1000,
    chunk_overlap=150,
    length_function=len
)

Applying the splitter to the loaded pages

In [10]:
# Split the loaded pages into chunks with the character splitter.
docs = text_splitter.split_documents(pages)

In [11]:
# Count the chunks currently held in `docs` and report the total.
total_splits = len(docs)
print(f'Total number of chunks: {total_splits}')

Total number of chunks: 86


In [12]:
# Count the entries returned by the PDF loader and report the total.
page_length = len(pages)
print(f'Page length of a pdf file: {page_length}')

Page length of a pdf file: 38


In [13]:
# Rebind text_splitter to a token-based splitter with chunk_size=1 and no overlap.
# NOTE(review): chunk_size is presumably counted in tokens, so this yields one
# token per chunk — fine as a demonstration, confirm it is intentional.
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [14]:
# Re-split the loaded pages with the token splitter; this rebinds `docs`,
# replacing the character-based chunks produced earlier.
docs = text_splitter.split_documents(pages)

In [15]:
# Show the metadata of the chunk at index 10.
docs[10].metadata

{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 0}

In [16]:
# Alias the token-level chunks under a second name.
# NOTE(review): markdown_document is not used again in the visible notebook.
markdown_document = docs

In [17]:
# Pair each markdown heading marker ("#", "##", "###") with its label
# ("Header 1" .. "Header 3").
headers_to_split_on = [
    ("#" * level, f"Header {level}")
    for level in (1, 2, 3)
]

**Vectorstores and Embedding multiple pdf files**

In [65]:
# Load every PDF that feeds the vector store.
# The notebook's later cells expect more than one source: the metadata
# description lists these three paths and a recorded search result comes from
# the Layman's guide, so all three files are loaded here.
# NOTE(review): confirm this is the exact set (and order) used for the recorded outputs.
pdf_paths = [
    "/content/Dissertation_Report_2128468-4.pdf",
    "/content/Crop_Growth_Assessment_Using_Sentinel-1_GRD_SAR_Descriptors.pdf",
    "/content/Laymans_SAR_Interpretation_Guide_2.0a.pdf",
]
loaders = [PyPDFLoader(path) for path in pdf_paths]

# Collect the output of every loader into one flat list (rebinds `docs`).
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [19]:
# Splitting data
# Rebind text_splitter to a recursive character splitter with
# chunk_size 1500 and chunk_overlap 150.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

**Initializing parameters to create vector database**

In [20]:
# Embedding model used both for the ad-hoc sentence comparison and for the vector store.
embedding = OpenAIEmbeddings()
# Directory (relative to the working directory) where the Chroma store is kept.
persist_directory = 'docs/chroma/'
#Applying the split function
# `splits` holds the chunks that are later written to the vector store.
splits = text_splitter.split_documents(docs)

**Creating 3 sentences to compare their embeddings**

In [21]:
# Three sample sentences; they are embedded and compared pairwise in the next cells.
sentence1 = "Active and passive are the two types of remote sensing observation"
sentence2 = "Active sensors also record the time required to reach back to the sensor as delay time"
sentence3 = "Due to relief displacement in Sentinel-1 radar images, they are subjected to geometric distortion: Layover and Foreshortening"

In [22]:
# Embed the three sample sentences, in order, with the same embedding model.
embedding1, embedding2, embedding3 = map(
    embedding.embed_query,
    (sentence1, sentence2, sentence3),
)

In [23]:
# Dot product of the embeddings of sentence1 and sentence2, used as a similarity score.
# NOTE(review): this equals cosine similarity only if the vectors are unit-length — confirm.
np.dot(embedding1, embedding2)

0.7826158365914484

In [24]:
# Dot product of the embeddings of sentence1 and sentence3, used as a similarity score.
np.dot(embedding1, embedding3)

0.7754425646497681

In [25]:
# Dot product of the embeddings of sentence2 and sentence3, used as a similarity score.
np.dot(embedding2, embedding3)

0.7678525658979849

**Creating vector database**

In [26]:
# Remove old database files, if any, before building the store. Using
# shutil.rmtree on persist_directory keeps the deleted path and the path
# handed to Chroma in sync and does not depend on a Unix shell.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

# Embed every chunk in `splits` and store the vectors under persist_directory.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [27]:
# Print the number of entries in the store's underlying collection
# (read through the private _collection attribute).
print(vectordb._collection.count())

138


**Similarity Search**

In [28]:
# Query text used for the similarity search below.
question = "How many stages of preprocessing steps are there in analysing SAR data?"

In [29]:
# Retrieve the 3 chunks most similar to the question.
# Note: this rebinds `docs`, which previously held the loaded PDF pages.
docs = vectordb.similarity_search(question,k=3)

In [30]:
# Number of results returned by the search.
len(docs)

3

In [31]:
# Text of the first search result.
docs[0].page_content

'TABLE OF CONTENTS\nLIST OF FIGURES . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . iv\nLIST OF TABLES . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v\nACKNOWLEDGEMENTS . . . . . . . . . . . . . . . . . . . . . . . . . . . . viii\nCHAPTER\n1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1\n2. Study Area and Field Information . . . . . . . . . . . . . . . . . . . . 4\n2.1 Study Area and Ground Truth Data . . . . . . . . . . . . . . . . 4\n2.2 Rice Crop Calendar . . . . . . . . . . . . . . . . . . . . . . . . 4\n3. Literature Review . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\n3.1 Introduction and Background . . . . . . . . . . . . . . . . . . . 6\n3.2 SAR Data Pre-processing . . . . . . . . . . . . . . . . . . . . . 7\n3.3 Polarisation and Backscatter Coefficient . . . . . . . . . . . . . 8\n3.4 Phenology . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9\n3.5 Vegetation Indices . . . . . . . . 

In [32]:
# Persist the vector store (presumably to persist_directory — confirm against the Chroma wrapper).
vectordb.persist()

In [33]:
# Print the metadata of each search result, one per line.
for doc in docs:
    print(doc.metadata)

{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 1}
{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 14}
{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 1}


In [34]:
# Print the text of the second search result.
print(docs[1].page_content)

tain geometric distortion. Secondly, SAR images contain unwanted noise also known as
speckle. Speckle noise is a grainy salt-and-pepper pattern present in SAR images. Prepro-
cessing of data is necessary to remove noise from the images.
3.2 SAR Data Pre-processing
SAR data includes unwanted radiometric noise and geometric distortion. Preprocess-
ing of SAR images needs to be performed for a wide range of applications. Speckle noise
reduction and geometric correction need to be removed by several preprocessing steps.
The geometry of the Earth’s surface in an image looks different from point to point in the
range direction. Slant range distortion, geometric distortion(layover and foreshortening),
radiometric distortion and speckle are the major factors that affect radar backscatter [27].
The side-looking nature of radar results in geometric distortion or images. As the
satellite moves from near range to far range, Earth’s surface near the radar signal looks
compressed and results in slan

**Retrieval**

In [35]:
# New query; plain similarity search returning the 3 closest chunks.
question = "Pre-processing of SAR data?"
docs_ss = vectordb.similarity_search(question,k=3)

**Max Marginal Relevance**

In [36]:
# Max-marginal-relevance search for the same question: fetch_k=3 candidates, k=2 results.
docs_mmr = vectordb.max_marginal_relevance_search(question,k=2, fetch_k=3)

In [37]:
# First 100 characters of the first MMR result.
docs_mmr[0].page_content[:100]

'tain geometric distortion. Secondly, SAR images contain unwanted noise also known as\nspeckle. Speckl'

In [38]:
# First 100 characters of the second MMR result.
docs_mmr[1].page_content[:100]

'2 \n1 About this guide  \nThis “layman’s” guide was developed to provide users who have  little or no '

**Similarity search Using filter and printing metadata**

In [39]:
# Similarity search restricted by a metadata filter: only chunks whose
# "source" is the dissertation PDF are passed as candidates. Rebinds `docs`.
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source":"/content/Dissertation_Report_2128468-4.pdf"}
)

In [40]:
# Print the metadata of each filtered result to check the source of every hit.
for d in docs:
    print(d.metadata)

{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 14}
{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 15}
{'source': '/content/Dissertation_Report_2128468-4.pdf', 'page': 1}


**Using SelfQueryRetriever to filter data.**

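It uses an LLM to turn a natural-language question into a query string plus a metadata filter, so the filter does not have to be written by hand. The search itself still runs against the vector store built above.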

In [54]:
# Descriptions of the two metadata fields attached to every chunk ("source" and "page").
# NOTE(review): metadata_field_info is not passed to any retriever in the visible
# notebook — confirm whether a SelfQueryRetriever cell is missing.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="This document is of my dissertation report and references `/content/Dissertation_Report_2128468-4.pdf`, `/content/Crop_Growth_Assessment_Using_Sentinel-1_GRD_SAR_Descriptors.pdf`, or `/content/Laymans_SAR_Interpretation_Guide_2.0a.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page is from the dissertation report",
        type="integer",
    ),
]

In [55]:
# Short description of what the stored documents contain.
document_content_description = "Dissertation Report"
# Completion model used by the QA chains below; created with temperature=0.
llm = OpenAI(temperature=0)


In [58]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt: answer from the retrieved context only, in at most three sentences,
# and close every answer with the fixed sign-off.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Retrieval QA chain over the vector store; the source chunks are returned with the answer.
question = "How many crop-cycles?"
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Ask the question and display the answer text.
result = qa_chain({"query": question})
result["result"]

' Rice fields in An Giang province, Vietnam are usually cultivated three times a year. The harvesting of the rice crops is normally done in April, August and December. Thanks for asking!'

In [59]:
from langchain.memory import ConversationBufferMemory
# Conversation memory handed to the conversational chain below; the history is
# stored under the key "chat_history", with return_messages=True.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [60]:
from langchain.chains import ConversationalRetrievalChain
# Retriever view of the vector store, with default settings.
retriever=vectordb.as_retriever()
# Conversational QA chain: combines the LLM, the retriever and the memory object.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [61]:
# First question to the conversational chain.
question = "How many satellite data are available?"
result = qa({"question": question})

In [62]:
# Answer text for the first question.
result['answer']

' Two satellites are available, Sentinel-1A and Sentinel-1B.'

In [63]:
# Second question to the same chain, which was built with the memory object above.
question = "Do we need to preprocess the SAR data?"
result = qa({"question": question})

In [64]:
# Answer text for the second question.
result['answer']

' Yes, preprocessing of Sentinel-1A and Sentinel-1B SAR data is necessary to remove noise and correct geometric distortion.'