In [1]:
import os
import pickle

from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS

In [2]:
# Load index from file

In [3]:
# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read back via os.getenv when the LLM is created below.
load_dotenv()

True

In [4]:
# Reload the persisted FAISS vector store from disk. The embedding model is
# passed in so the store can embed incoming queries at search time.
embedding_model = OpenAIEmbeddings()
faiss_index_dir = "./vectordb/faiss/cord19/"
loaded_faiss_vs = FAISS.load_local(folder_path=faiss_index_dir, embeddings=embedding_model)

In [5]:
# Expose the vector store as a retriever that returns the top 5 matches per query.
search_options = {'k': 5}
retriever = loaded_faiss_vs.as_retriever(search_kwargs=search_options)

In [6]:
# Define the RAG pipeline

In [7]:
# Text-completion model used both inside the RAG chain and as the no-context baseline.
model = 'gpt-3.5-turbo-instruct'
# The key comes from the environment so it never appears in the notebook.
api_key = os.getenv("OPENAI_API_KEY")
llm = OpenAI(openai_api_key=api_key, model_name=model)

In [8]:
# Prompt layout: instruction plus retrieved context on the first line,
# the user's question on the second. {context} and {question} are filled
# in by the chain at run time.
template = (
    "Answer the question or Explain the topic given this additional context: {context}\n"
    "Question: {question}"
)

In [9]:
# `llm` is a text-completion model (langchain.llms.OpenAI), which consumes a
# plain string. A ChatPromptTemplate would be flattened into a "Human: ..."
# chat transcript before being sent, so build a string PromptTemplate instead.
prompt = PromptTemplate.from_template(template)

In [10]:
# Load the pickled document store that format_docs looks retrieved hits up in.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point this at files you produced yourself or otherwise trust.
docs_file_path = './dataset/cord19/documents.pkl'
with open(docs_file_path, mode='rb') as docs_fh:
    docs = pickle.load(docs_fh)
print("Documents loaded successfully.")

Documents loaded successfully.


In [11]:
def format_docs(_docs, doc_store=None, max_chars=800):
    """Map retrieved hits to truncated source texts for the prompt context.

    Args:
        _docs: Iterable of retrieved documents. Each item's ``page_content``
            is used as the lookup key into the document store.
        doc_store: Mapping from key to a record that has a ``"text"`` entry.
            When omitted, the notebook-level ``docs`` mapping is used, which
            keeps the single-argument call made by the chain working.
        max_chars: Maximum number of characters kept from each text
            (``None`` keeps the whole text). Defaults to 800.

    Returns:
        list[str]: One snippet per hit found in the store, in retrieval
        order. Hits whose key is absent from the store are skipped.
    """
    # Resolve the store lazily so the global is only touched when needed.
    store = docs if doc_store is None else doc_store
    return [
        store[doc.page_content]["text"][:max_chars]
        for doc in _docs
        if doc.page_content in store
    ]

In [12]:
# Stage 1: fan the incoming query out into the two prompt variables.
#   - "context": retrieve matching documents, then turn them into text snippets
#   - "question": the query itself, passed through unchanged
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Stage 2: fill the prompt, call the model, and return the answer as a string.
chain = rag_inputs | prompt | llm | StrOutputParser()

In [13]:
# Run the RAG pipeline

In [14]:
# Load the pickled query records; each entry is indexed by query id and
# exposes the query string under "text" (as used in the cells below).
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point this at files you produced yourself or otherwise trust.
query_file_path = './openai_embeddings/cord19/query_embeddings.pkl'
with open(query_file_path, mode='rb') as queries_fh:
    loaded_queries = pickle.load(queries_fh)
print("Query embeddings loaded successfully.")

Query embeddings loaded successfully.


In [15]:
# Show the text of the query stored under id "1".
loaded_queries["1"]["text"]

"what is the origin of COVID-19 (seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans)"

In [16]:
# Answer query "1" with the retrieval-augmented chain (retrieved context + LLM).
chain.invoke(loaded_queries["1"]["text"])

'\n\nAnswer: The origin of COVID-19, caused by the SARS-CoV-2 virus, is still under investigation. However, based on current evidence, it is believed that the virus likely evolved from a bat coronavirus and was transmitted to humans through an intermediate animal host, possibly a pangolin. The first known cases of COVID-19 were reported in Wuhan, China in December 2019, with evidence suggesting that the virus may have originated from a wet market where live animals were sold for food. Further research is needed to fully understand the exact origins and evolution of the SARS-CoV-2 virus.'

In [17]:
# Baseline for comparison: send the same query straight to the LLM with no
# retrieved context. Uses the Runnable `.invoke` interface, matching the
# chain call above, instead of calling the LLM object directly.
llm.invoke(loaded_queries["1"]["text"])

'\n\nThe origin of COVID-19 can be traced back to a wet market in Wuhan, China, where the first cases were reported in December 2019. The virus responsible for COVID-19 is called SARS-CoV-2 and it is a type of coronavirus, a large family of viruses that also includes the viruses responsible for SARS and MERS.\n\nScientists believe that the SARS-CoV-2 virus originated in bats and then jumped to an intermediate animal host, possibly a pangolin, before being transmitted to humans. This is similar to the origin of SARS, which is believed to have originated in bats and then spread to civet cats before infecting humans.\n\nThe exact mechanism of how the virus was transmitted from animals to humans is still under investigation. It is possible that the virus was transmitted through direct contact with an infected animal or through the consumption of contaminated animal products.\n\nThe SARS-CoV-2 virus is thought to have evolved from a virus found in bats, but it is not yet clear how it acquir

In [18]:
# loaded_queries["PLAIN-23"]["text"]

In [19]:
# chain.invoke(loaded_queries["PLAIN-23"]["text"])

In [20]:
# loaded_queries["PLAIN-33"]["text"]

In [21]:
# chain.invoke(loaded_queries["PLAIN-33"]["text"])

In [22]:
# loaded_queries["PLAIN-78"]["text"]

In [23]:
# chain.invoke(loaded_queries["PLAIN-78"]["text"])

In [24]:
# loaded_queries["PLAIN-143"]["text"]

In [25]:
# chain.invoke(loaded_queries["PLAIN-143"]["text"])

In [26]:
# loaded_queries["PLAIN-165"]["text"]

In [27]:
# chain.invoke(loaded_queries["PLAIN-165"]["text"])

In [28]:
# llm("If I have inflammatory bowel disease, how will smoking affect my risk of developing colorectal cancer?")

In [29]:
# chain.invoke("If I have inflammatory bowel disease, how will smoking affect my risk of developing colorectal cancer?")
