In [1]:
# Import the necessary functions
from pi_pubs import Researcher
from chat import load_papers_pdf,text_splitter,vectorstore
import os

The steps to create a RAG application according to the LangChain question-answering templates (https://python.langchain.com/docs/use_cases/question_answering/) are the following:

- Get the PDFs for the PIs (for our particular case)
- Loading
- Splitting
- Storage
- Retrieval
- Generation
- Chat history

The following code is intended to create a first pipeline for a RAG application with the papers from different PIs. Dr. Adrian Jinich is used as a test case.

In [2]:
# Get the PDFs for PIs (for our particular case)
PI_name = "Adrian Jinich"
CONTACT_EMAIL = "richiam@uchicago.edu"  # an email is required to access the PubMed IDs

test = Researcher(PI_name)
# Two explicit steps: presumably the first gathers the PI's PubMed IDs and the
# second downloads the matching PDFs -- confirm against pi_pubs.Researcher.
test._pmdid_collection(CONTACT_EMAIL)
test._download_papers()
# test.main() can be used instead of the two calls above.
# Note: all papers are currently downloaded into a predetermined folder
# (the download log printed by this cell reports its location).

2023-11-01 17:38:08,473 - INFO - pubmed2pdf.cli - Trying to fetch pmid 37405387
2023-11-01 17:38:10,620 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36945430
2023-11-01 17:38:10,620 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36802628
2023-11-01 17:38:11,083 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36247217
2023-11-01 17:38:13,345 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35337802
2023-11-01 17:38:14,277 - INFO - pubmed2pdf.cli - Trying to fetch pmid 34782606
2023-11-01 17:38:14,277 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33376214
2023-11-01 17:38:14,806 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33068537
2023-11-01 17:38:16,117 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31733221
2023-11-01 17:38:17,955 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31404220
2023-11-01 17:38:18,792 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30898926
2023-11-01 17:38:19,412 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30356318
2023-11-01 17:38:19,412 - INFO - pubmed2

Done downloading. All downloaded can be found in /home/richiam/pubmed2pdf


2023-11-01 17:38:19,964 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25387603
2023-11-01 17:38:19,964 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24586198
2023-11-01 17:38:19,964 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22383887
2023-11-01 17:38:19,964 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19851443


In [3]:
# Loading

# Resolve the papers folder from the current user's home directory instead of
# hard-coding one user's absolute path. The download step above reports
# "<home>/pubmed2pdf" as its output location, so "~/pubmed2pdf" expands to the
# same folder for the original author and also works on other machines.
PAPERS_DIR = os.path.expanduser("~/pubmed2pdf")

docs = load_papers_pdf(PAPERS_DIR)
# Showing some of the documents
print(docs[0])

page_content='High-Resolution Profiling of Stationary-Phase Survival\nReveals Yeast Longevity Factors and Their Genetic\nInteractions\nErika Garay, Sergio E. Campos, Jorge Gonza ´lez de la Cruz, Ana P. Gaspar, Adrian Jinich,\nAlexander DeLuna *\nLaboratorio Nacional de Geno ´mica para la Biodiversidad, Centro de Investigacio ´n y de Estudios Avanzados del IPN, Irapuato, Guanajuato, Mexico\nAbstract\nLifespan is influenced by a large number of conserved proteins and gene-regulatory pathways. Here, we introduce a\nstrategy for systematically finding such longevity factors in Saccharomyces cerevisiae and scoring the genetic interactions\n(epistasis) among these factors. Specifically, we developed an automated competition-based assay for chronologicallifespan, defined as stationary-phase survival of yeast populations, and used it to phenotype over 5,600 single- or double-\ngene knockouts at unprecedented quantitative resolution. We found that 14% of the viable yeast mutant strains were\naf

In [4]:
# Splitting
# The two numbers are forwarded positionally to text_splitter; presumably they
# are the chunk size and the chunk overlap -- confirm against chat.text_splitter.
SPLIT_ARGS = (1200, 150)
splits = text_splitter(docs, *SPLIT_ARGS)
print(splits[0])
# Possible improvement: cut off the "head" of each paper (title, authors,
# affiliations) and store that important information as metadata, so that the
# treatment is consistent across all papers.
print(f"There are a total of {len(splits)} chunks")

page_content='High-Resolution Profiling of Stationary-Phase Survival\nReveals Yeast Longevity Factors and Their Genetic\nInteractions\nErika Garay, Sergio E. Campos, Jorge Gonza ´lez de la Cruz, Ana P. Gaspar, Adrian Jinich,\nAlexander DeLuna *\nLaboratorio Nacional de Geno ´mica para la Biodiversidad, Centro de Investigacio ´n y de Estudios Avanzados del IPN, Irapuato, Guanajuato, Mexico\nAbstract\nLifespan is influenced by a large number of conserved proteins and gene-regulatory pathways. Here, we introduce a\nstrategy for systematically finding such longevity factors in Saccharomyces cerevisiae and scoring the genetic interactions\n(epistasis) among these factors. Specifically, we developed an automated competition-based assay for chronologicallifespan, defined as stationary-phase survival of yeast populations, and used it to phenotype over 5,600 single- or double-\ngene knockouts at unprecedented quantitative resolution. We found that 14% of the viable yeast mutant strains were' me

In [5]:
# Storage

# Imports used only by this cell are kept local so the cell runs on its own.
import getpass
import shutil

from langchain.embeddings.openai import OpenAIEmbeddings

# The OpenAI key must never be written into the notebook. Take it from the
# environment, and only prompt (without echoing) when it is not already set.
# NOTE(review): the key that used to be hard-coded in this cell is exposed in
# the notebook history and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# The directory where the vector database is located. It is emptied first,
# presumably so that chunks persisted by a previous run are not kept next to
# the new ones. shutil.rmtree replaces the shell-only `rm -rf`;
# ignore_errors=True keeps the "missing directory is fine" behaviour.
persist_directory = 'docs/chroma/'
shutil.rmtree(persist_directory, ignore_errors=True)

vectordb = vectorstore(
    documents=splits,
    embedding_function=OpenAIEmbeddings(),
    directory=persist_directory,
)


In [6]:
# Sanity check: the number of entries stored in the vector database should
# match the number of chunks produced by the splitting step.
stored_count = vectordb._collection.count()
chunk_count = len(splits)
print(f"The Vectordatabese has a total collection of {stored_count} for a total of {chunk_count} chunks" )

The Vectordatabese has a total collection of 489 for a total of 489 chunks


In [7]:
# Retrieval

# Expose the vector database as a retriever. as_retriever() is called without
# arguments, so whatever defaults the vector store defines are used.
retriever = vectordb.as_retriever()

# TODO: the original note here was left unfinished ("Prompt to get information about ...").



In [8]:
# Generate

# Everything this cell needs from LangChain, gathered in one place.
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Prompt for the LLM: {context} receives the retriever output and {question}
# receives the user's question unchanged.
template = """You are an assistant for question-answering tasks. 
The information provided are published scientific articles that were published by Adrian Jinich.
Use this information to answer questions about the work, life, and publications of Adrian Jinich.
If you do not know the answer, just say that you do not know the answer. Use ten sentences maximum and use technical terms in the answers
Context:{context}
Question: {question}
"""
rag_prompt_custom = PromptTemplate.from_template(template)

# temperature=0 is requested for the chat model.
llm = ChatOpenAI(model_name = "gpt-3.5-turbo", temperature=0)

# The question is sent both to the retriever (to fetch context) and, untouched,
# to the prompt; the filled-in prompt is then passed to the chat model.
# NOTE(review): the retriever output is interpolated into the prompt as-is;
# consider joining only the page_content of the retrieved documents -- confirm.
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = chain_inputs | rag_prompt_custom | llm

# First query; being the last expression, the model's reply is shown as the cell output.
rag_chain.invoke("What are the main topics that Adrian Jinich has been working on ?")

AIMessage(content='The main topics that Adrian Jinich has been working on include thermodynamic constraints in carbon fixation pathways, acetogenesis, redox biochemistry, biochemical reaction networks, and computational chemistry.')

In [9]:
# Counting question. The recorded reply says the provided information does not
# specify the total number of papers.
rag_chain.invoke("Can you give a number of the total papers that Adrian Jinich has published?")

AIMessage(content="I'm sorry, but the provided information does not specify the total number of papers that Adrian Jinich has published.")

In [12]:
# Topic-specific question: ask for a detailed answer about one organism.
rag_chain.invoke("What has Adrian Jinich published about Escherichia coli?, explain in detail")

AIMessage(content='Adrian Jinich has published scientific articles about Escherichia coli. One of the articles, titled "Invariant Distribution of Promoter Activities in Escherichia coli," was co-authored by Alon Zaslaver, Shai Kaplan, Anat Bren, Avi Mayo, Erez Dekel, Uri Alon, and Shalev Itzkovitz. This article discusses the need for cells to allocate their limited resources to express a wide range of genes. The authors employed a robotic assay using a comprehensive reporter strain library for E. coli to measure promoter activity on a genomic scale. This allowed them to track promoter activity continuously as cells changed their growth rate from exponential to stationary phase. The article provides insights into how E. coli partitions its transcriptional resources between different promoters. The study was conducted at the Weizmann Institute of Science in Rehovot, Israel, where Adrian Jinich was affiliated with the Department of Computer Science and Applied Mathematics and the Departme

In [13]:
# Broad question: ask for a detailed summary of the PI's work.
rag_chain.invoke("Can you summarize the work of Adrian Jinich ?, explain in detail")

AIMessage(content='Adrian Jinich is a researcher who has contributed to the field of redox biochemistry. He has published several scientific articles on the topic, including "Thermodynamic constraints shape the structure of carbon fixation pathways" and "Does acetogenesis really require especially low reduction potential?". In these articles, Jinich explores the thermodynamic principles underlying metabolic reactions and the requirements for acetogenesis.\n\nJinich\'s work involves the use of quantum chemical approaches to estimate the thermodynamics of metabolic reactions. He collaborates with other researchers, such as Haniu Ren, Sung-Jin Kim, Elad Noor, and Benjamin Sanchez-Lengeling, to carry out the research and analyze the data. The research is supervised by Ala´n Aspuru-Guzik and Arren Bar-Even.\n\nJinich\'s research has practical implications for understanding and manipulating metabolic pathways. By studying the thermodynamic constraints of carbon fixation and acetogenesis, Jin

In [14]:
# Biographical question. The only context the chain receives is what the
# retriever returns from the stored paper chunks.
rag_chain.invoke("Who is Adrian Jinich?")

AIMessage(content='Adrian Jinich is a researcher who has published scientific articles on topics related to thermodynamics and biochemistry.')

In [15]:
# Recency question.
# NOTE(review): the chain only sees the chunks returned by the retriever, so
# the recorded answer may not be the most recent paper -- verify it against
# the list of downloaded papers.
rag_chain.invoke("What was the last paper that Adrian Jinich published?")

AIMessage(content='The last paper that Adrian Jinich published is titled "Quantum Chemical Approach to Estimating the Thermodynamics of Metabolic Reactions" and was published in the journal Scientific Reports in 2014.')

In [17]:
# Adding Conversation history:

