In [None]:
# Attach Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the mangrove PDFs.
pdf_folder_path = "/content/drive/MyDrive/LLM_1/MangrovePDFs"

In [None]:
!pip install openai[embeddings]==0.27.6
!pip install langchain==0.0.155
!pip install PyPDF2==3.8.1
!pip install tiktoken==0.3.3
!pip install faiss-cpu==1.7.4
!pip install unstructured==0.6.2
!pip install chromadb==0.3.21
!pip install llama-index==0.6.1
!pip install jupyterlab


In [None]:
import getpass
import logging
import os
import sys

# Never hardcode the API key in the notebook: anything committed or shared leaks it.
# Use the key already present in the environment, otherwise prompt for it without echo.
# NOTE: the key that was previously embedded in this cell must be considered
# compromised and should be revoked in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
## load the PDF using pypdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
!pip install pypdf

In [None]:
import pypdf

In [None]:
# Build the PDF path from the shared folder variable instead of repeating the
# absolute Drive path, so relocating the data only requires changing pdf_folder_path.
pdf_file_name = 'Simulating-spatial-change-of-m-8cd5240a-17b3-4292-957d-5005ea324adc.pdf'
loader = PyPDFLoader(os.path.join(pdf_folder_path, pdf_file_name))


In [None]:
# Load the PDF, then cut it into chunks of at most 1000 characters with no overlap.
# RecursiveCharacterTextSplitter is the recommended splitter for generic text: it is
# parameterized by a list of characters and tries to split on them in order until
# the chunks are small enough.
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_documents(documents=data)

# Display the first chunk as the cell output.
texts[0]

Document(page_content='Simulating spatial change of mangrove habitat under the impact of\ncoastal land use: Coupling MaxEnt and Dyna-CLUE models\nYuyu Wanga,1, Bixiao Chaoa,b,1,P e n gD o n gc,D i a nZ h a n gb, Weiwei Yub,d,W e n j i aH ub,d,⁎, Zhiyuan Mab,d,\nGuangcheng Chenb,d, Zhenghua Liub, Bin Chenb,d,⁎\naSchool of Ecology and Nature Conservation, Beijing Forestry University, Beijing 100083, PR China\nbThird Institute of Oceanography, Ministry of Natural Resources, Xiamen 361005, PR China\ncAerospace Information Research Institute, Chinese Academy of Sciences, Beijing 100094, PR China\ndFujian Provincial Key Laboratory of Marine Ecological Conservation and Restoration, Xiamen 361005, PR China\nHIGHLIGHTS\n•We predicted mangrove habitat\nchanges in Guangdong, China consider-\ning land-use change.\n•Species distribution and land-use\nmodels were coupled as an integrated\nframework.\n•Under the current trend, only 70% of the\nmangrove habitats would remain in2030.\n•Potential mangro

In [None]:
# import Chroma and OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

In [None]:
# Create the OpenAI embedding client that is handed to the vector store below.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

In [None]:
# Use Chroma to create an in-memory embedding database from the chunks.
# Each chunk's metadata is taken from the Document objects themselves (the QA
# answers cite the PDF path as their source), so the extra `metadatas=` keyword
# that used to be passed here had no effect and has been removed.
docsearch = Chroma.from_documents(texts, embeddings)



In [None]:
## retrieve the chunks most similar to the question
query = "Where are mangroves?"
docs = docsearch.similarity_search(query=query)

In [None]:
# Show the chunks returned by the similarity search as the cell output.
docs

In [None]:
## importing necessary framework
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain

from langchain.chat_models import ChatOpenAI

In [None]:
## build a question-answering chain on top of the chat model
qa_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.2)
chain = load_qa_chain(qa_llm, chain_type="stuff")

In [None]:
# Question passed to the QA chain in the next cell (same text as the retrieval query above).
query = "Where are mangroves?"


In [None]:
# Answer the question using only the retrieved chunks as context.
chain.run(question=query, input_documents=docs)

'Mangroves are distributed along the coasts in tropical and subtropical regions between latitudes of approximately 30° N and 30° S. They can be found in various countries around the world, particularly in Asia, Africa, and the Americas.'

In [None]:
# Rebind `chain` to a variant whose answer also lists the sources it drew from.
sources_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.2)
chain = load_qa_with_sources_chain(sources_llm, chain_type="stuff")

In [None]:
# Question for the sources chain; the next cell assigns the same value again before calling it.
query = "Where are mangroves?"

In [None]:
# Run the sources chain on the retrieved chunks and keep only its output fields.
query = "Where are mangroves?"
chain_inputs = {"input_documents": docs, "question": query}
chain(chain_inputs, return_only_outputs=True)

{'output_text': "Mangroves are distributed along the coasts in tropical and subtropical regions between latitudes of approximately 30° N and 30° S. They can be found in countries such as China, where 57.3% of China's mangroves are scattered along the coastline of Guangdong. However, mangroves are facing extinction in many countries due to land use and land cover changes, such as tree harvesting, logging, and clear-cutting for aquaculture ponds. Mangrove clearance for urban expansion and coastal infrastructure development has also contributed to their loss. Efforts are being made to conserve and restore mangrove habitats through improved management and policies. \nSOURCES: /content/drive/MyDrive/LLM_1/MangrovePDFs/Simulating-spatial-change-of-m-8cd5240a-17b3-4292-957d-5005ea324adc.pdf"}

In [None]:
# Retrieval QA: the chain fetches context from the vector store itself, so only
# the question needs to be supplied.
retrieval_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.2)
retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=retrieval_llm,
    chain_type="stuff",
    retriever=retriever,
)
query = "Where are mangroves?"
qa.run(query)

'Mangroves are distributed along the coasts in tropical and subtropical regions between latitudes of approximately 30° N and 30° S. They can be found in various countries around the world, particularly in Asia, Africa, Australia, and the Americas.'