In [26]:
#!pip install langchain
#!pip install pypdf
#!pip install python-dotenv
#!pip install pinecone-client
#!pip install tiktoken
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

load_dotenv()

True

### Load data



In [9]:
# PyPDFLoader will split a pdf by page and return a list of pages. Append all lists pages from all books to data.
data = []
file_path = "Algae-Chatbot/"
file_names = os.listdir(file_path)

for file_name in file_names:
    if file_name.endswith('.pdf'):
        try:
            loader = PyPDFLoader(file_path+file_name)
            data.append(loader.load())
            print(f'{file_name} added')
        
        except:
            print(f"{file_name} can't be loaded correctly")



1-s2.0-S0196890409000764-main.pdf added
1-s2.0-S0196890410002207-main.pdf added
1-s2.0-S0360319916320717-main.pdf added
1-s2.0-S0734975012001048-main.pdf added
1-s2.0-S1319016409000462-main.pdf added
1-s2.0-S1364032114005413-main.pdf added
1-s2.0-S1364032114009915-main.pdf added
978-1-4939-2684-8.pdf added
978-3-030-57911-1.pdf added
978-3-031-33144-2 (1).pdf added
978-3-031-33144-2.pdf added
978-3-031-42026-9.pdf added
978-3-319-17031-2.pdf added
978-3-319-19018-1.pdf added
978-3-319-51010-1.pdf added
978-3-319-52666-9.pdf added
978-3-319-68717-9.pdf added
978-3-319-74703-3.pdf added
978-81-322-3965-9.pdf added
978-94-007-7494-0.pdf added
978-981-16-6162-4.pdf added
978-981-16-8682-5.pdf added
978-981-19-6810-5.pdf added


invalid pdf header: b'\n1702'


978-981-32-9607-7.pdf added
ajol-file-journals_82_articles_128906_submission_proof_128906-973-349244-1-10-20160120.pdf can't be loaded correctly
Algal-biodiesel-promising-source-to-power-CI-e_2020_Materials-Today--Proceed.pdf added
BF00446694.pdf added
Comparative-studies-on-liquefaction-of-low-lipid-microalgae-into-bio_2019_Fu.pdf added
dring-2003-photocontrol-of-development-in-algae.pdf added
Enhanced-lipid-productivity-approaches-in-microalgae-_2016_Journal-of-the-En.pdf added
Freshwater Microbiology - 2004 - Sigee - Algae  The Major Microbial Biomass in Freshwater Systems.pdf added
FULLTEXT01.pdf added
identification_of_algea_growth_kinetics-groen_kennisnet_9247.pdf added
Impact-of-cultivation-conditions-on-the-biomass-and-lipid-in-microal_2021_Fu.pdf added
Journal of Phycology - 2012 - Clerck - Algal taxonomy  a road to nowhere.pdf added
Journal of Phycology - 2012 - Guiry - HOW MANY SPECIES OF ALGAE ARE THERE.pdf added
Life-cycle-assessment-on-microalgal-biodiesel-production-_201

Then let's actually check out what's been loaded

In [11]:
#Unfold the list of pages
list_of_pages = [item for sublist in data for item in sublist]
print(type(list_of_pages))
print (f'There are {len(list_of_pages)} page(s) in the data')
print (f'There are {len(list_of_pages[0].page_content)} characters in a sample document')
print (f'A sample: {list_of_pages[340].page_content[:10000]}')

<class 'list'>
There are 8218 page(s) in the data
There are 4681 characters in a sample document
A sample: 239
 23. To set up the chromatographic conditions, first run a series of 
TLCs to find the solvent system that will give a good separa -
tion of the components of the mixture under study. If two 
components of interest are running close together, a differ -
ence of Rf values around 0.2–0.3 between them will indicate a 
satisfactory solvent system. There are often irrelevant impuri -
ties that could be either very polar or very nonpolar; these can 
be largely ignored. The next step is the amount of silica to be 
used. Usually, if the component required is well separated 
from other components, a ratio of 20:1 (silica:sample) should 
be used. If the Rf difference between spots is less than 0.2, the 
separation may be improved by increasing the amount of silica 
according to the ratios shown in Fig.  3.
∆Rf
∆Rf0.30 0.14 0.10 0.08
Approximate 
ratio silica (g)/
sample (g)20:1 50:1 100

### Chunk data up into smaller documents
We can't attached a whole book as context to our LLM. It will recieve to much noise, and will likely not answer our question that well. Instead we divide the book into chunks. Then we will later only attach the really relevant chunks as context to the question we send til the llm. This will give us a sharper respons.

In [12]:
# We'll split our data into chunks around 2056 characters each with a 256 character overlap.
# 2056 characters is completely arbitrary, around a half page. 
# - Play around wither larger and smaller chunks to maybe improve results.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2056, chunk_overlap=256)
texts = text_splitter.split_documents(list_of_pages)

In [13]:
# Number of chunks we have
print (f'Now you have {len(texts)} chunks')

Now you have 15228 chunks


In [14]:
#lets see what is inside the chunks:
print(texts[8000])

page_content='described in rice seedlings under cold stress, showing that the cold-induced ABAincrease is linked to CKs signaling (Maruyama et al. 2014 ).\nInterestingly, exogenous application of certain molecules increases the ABA\nlevels leading to an alleviation of cold stress. In this sense, melatonin, which wasrecently discovered in plants and it is present in high amounts in several medicinalplants such as feverfew ( Tanacetum parthenium ), St John ’s wort ( Hypericum\nperforatum ), and Huang-qin ( Scutellaria baicalensis ) (Bajwa et al. 2014 ), was\nshown to induce the antioxidant defense via ABA-dependent andABA-independent pathways in the grass Elymus nutans (Fu et al. 2017 ). Melatonin\nincreases ABA levels but there were no evidence of a reciprocal effect of ABA onmelatonin endogenous content, suggesting that this compound should be actingupstream of ABA (Fu et al. 2017 ). Similarly, celastrol, a triterpenoid isolated from\nTripterygium wilfordii which is used to treat some 

### Create embeddings for document chunks to get ready for semantic search

The technic for finding only the relevant chunks for our question is to do a semantic search. The way we prepare the cunks for this is through embedding them (getting a vector per document).

This will help us compare documents later on.
We use openAI's embeddingmodel for this.


In [15]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [16]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'YourAPIKey')
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

  warn_deprecated(


Let's test it out. I want to see which documents are most closely related to a query.



### Pinecone (a vectorstore in the cloud)


In [29]:
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', 'YourAPIKey')
PINECONE_ENV = os.getenv('PINECONE_ENV', 'gcp-starter') 

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  
    environment=PINECONE_ENV  
)
index_name = "algaeopenai" 

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(name=index_name, metric="cosine", dimension=1536)

# The OpenAI embedding model `text-embedding-ada-002 uses 1536 dimensions`

#docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)


#already existing index:
docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [30]:
index = pinecone.Index(index_name)


In [31]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.15228,
 'namespaces': {'': {'vector_count': 15228}},
 'total_vector_count': 15228}

In [32]:
text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embeddings, text_field
)

### Query those docs to get your answer back

Great, those are just the docs which should hold our answer. Now we can pass those to a LangChain chain to query the LLM.

We could do this manually, but a chain is a convenient helper for us.

In [33]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [34]:
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [35]:
query = "In the context of hydrogen, what does THEUS stand for?"
docs = vectorstore.similarity_search(query, k=3)
docs[2]

Document(page_content='Chapter 31\nWater Is the Ultimate Source of Hydrogen\nEnergy: Scientiﬁc Citations and Quotations\nHussein K. Abdel-Aal and Nejat Veziroglu\nAbstract From the ancient times, mankind has always been aware that all life on\nearth depends upon water, the principal ingredient of all living cells. Its importancein forming the creation in all of its aspects in general and the living creatures in\nparticular, as well as human kind, animals, and plants, is evident to all of us. It is\nhighly important to know that water has played a major role for human kind. When\nthe Ionian philosopher Thales of Miletus (624–545 BC) replaced the Gods withNatural Laws as the force governing all phenomena; he made water the central\nelement in his theory.\nThis chapter addresses the issue that water, besides its indispensable usage by\nhuman beings in drinking and in everyday life, is the key element in providing lifewith energy, in the form of hydrogen .\nThe main aim of this work is to 

In [36]:
print('User:')
print(query)
print("Algaefessor:")
print(chain.run(input_documents=docs, question=query))
print(f"Source: {docs[2].metadata['source']} on page: {int(docs[2].metadata['page'])}")

User:
In the context of hydrogen, what does THEUS stand for?
Algaefessor:
THEUS stands for Totalized Hydrogen Energy Utilization System.
Source: Algae-Chatbot/978-3-319-17031-2.pdf on page: 449


In [37]:
query = "What is cyanobacteria and why is it classified as an algae"
docs = vectorstore.similarity_search(query, k=3)
print('User:')
print(query)
print("Algaefessor:")
print(chain.run(input_documents=docs, question=query))
print(f"Source: {docs[2].metadata['source']} on page: {int(docs[2].metadata['page'])}")

User:
What is cyanobacteria and why is it classified as an algae
Algaefessor:
Cyanobacteria, also known as blue-green algae, are a group of bacteria that have the ability to perform photosynthesis. They are classified as algae because they contain chlorophyll a and related compounds, which are characteristic of algae. Additionally, cyanobacteria share many structural features with bacteria, but their ability to convert atmospheric nitrogen into ammonia and their production of nitrogenous compounds and cyclic polyethers are more similar to algae.
Source: Algae-Chatbot/978-3-319-51010-1.pdf on page: 118
