# Get the Top 5 Chunks to Check Paper's Stance

## Step 0: Environment Setup

In [1]:
from dotenv import load_dotenv  # type: ignore
import os
from langchain_neo4j import Neo4jGraph  # type: ignore
from libs import create_vector_index
import pandas as pd  # type: ignore
from conn import connect2Googlesheet
from annolibs import get_all_chunks_per_paper, compare_embeddings
from openai import OpenAI

load_dotenv()

True

In [2]:
# Connect to Neo4j database
try:
    graph = Neo4jGraph(
        url=os.getenv("NEO4J_URL"),
        username=os.getenv("NEO4J_USERNAME"),
        password=os.getenv("NEO4J_PASSWORD"),
    )
    print("Connected to Neo4j database successfully.")
except ValueError as e:
    print(f"Could not connect to Neo4j database: {e}")

Connected to Neo4j database successfully.


In [3]:
# Check if the entities index exists
index_name = "entities"
query = "SHOW INDEXES YIELD name, type WHERE type = 'VECTOR' AND name = $index_name"

result = graph.query(query, params={"index_name": index_name})
if result:
    print("The 'entities' index already exists.")
else:
    create_vector_index(graph, "entities")

## Step 1: Load Questions from Google Sheet

In [4]:
spreadsheet = connect2Googlesheet()

# Select the worksheet: relevance
worksheet = spreadsheet.get_worksheet(2)

# Get all records as a list of dictionaries
data = worksheet.get_all_records()

# Convert to Pandas DataFrame
df_Paper = pd.DataFrame(data)
df_Paper.head()

Unnamed: 0,condition,number,docs,Question,Mahmud's Note,status,comments,Unnamed: 8
0,ARDS,1,ACURASYS,Does early administration of neuromuscular blo...,Like,,,
1,ARDS,2,ACURASYS,Do patients with severe ARDS being treated wit...,Replace,fixed,,
2,ARDS,3,ROSE,"In patients with moderate to severe ARDS, does...",Maybe this question: In patients with moderate...,fixed,,
3,ARDS,4,ROSE,Do patients with moderate-to-severe ARDS have ...,Local question (not sure if this is the aim of...,fixed,Wrong concept since PEEP by itself is mandator...,Does the use of neuromuscular blockers in pati...
4,ARDS,5,FACTT,"Among patients with ALI/ARDS, does a conservat...",Local question (not sure if this is the aim of...,fixed,Check if studies defined conservative by CVP <...,


## Step 2: Get the Chunks from Each Paper

In [5]:
# Filter papers with 'Sepsis' in condition
sepsis_papers = df_Paper[
    df_Paper["condition"].str.contains("Sepsis", case=False, na=False)
]

# Display the filtered papers
print(f"Found {len(sepsis_papers)} papers related to Sepsis:")
# display(sepsis_papers)

# Get unique paper names
sepsis_paper_names = sepsis_papers["docs"].str.strip().unique()
print(f"Number of unique paper names: {len(sepsis_paper_names)}")
sepsis_papers_chunks = get_all_chunks_per_paper(graph, sepsis_paper_names)

Found 35 papers related to Sepsis:
Number of unique paper names: 35
Found 100 chunks in paper ADRENAL
Found 100 chunks in paper ANNANE
Found 98 chunks in paper APROCCHSS
Found 99 chunks in paper CORTICUS
Found 100 chunks in paper HEAT
Found 100 chunks in paper PROWESS
Found 100 chunks in paper ALBIOS
Found 100 chunks in paper SAFE
Found 99 chunks in paper ProMISe
Found 100 chunks in paper PROWESS-SHOCK
Found 0 chunks in paper DPSMVAS
Found 100 chunks in paper NEvsVP
Found 100 chunks in paper LOVIT
Found 100 chunks in paper PLMALDHSS
Found 100 chunks in paper SADASS
Found 100 chunks in paper ULS
Found 95 chunks in paper SS3vsSS2
Found 100 chunks in paper EHPSS
Found 100 chunks in paper FRESHS
Found 100 chunks in paper NSS
Found 100 chunks in paper SEPSISPAM
Found 100 chunks in paper SOAP-II
Found 100 chunks in paper TRISS
Found 100 chunks in paper VASST
Found 100 chunks in paper Rivers Trial
Found 99 chunks in paper ProCESS
Found 100 chunks in paper ARISE
Found 100 chunks in paper HYPRE

In [6]:
# # check if the expected number of papers match the actual number of papers
# # Get list of expected papers
# expected_papers = list(sepsis_papers_chunks.values())  # 딕셔너리의 값을 리스트로 변환

# # Get list of actual papers from chunks_of_paper directory
# actual_papers = [
#     f.replace("chunks_of_", "").replace(".csv", "")
#     for f in os.listdir("./chunks_of_paper")
#     if f.endswith(".csv")
# ]

# # Find missing papers
# missing_papers = set(expected_papers) - set(actual_papers)

# print("Missing papers:")
# for paper in missing_papers:
#     print(f"- {paper}")

## Step 3: Compare Question Embedding and Paper Chunk Embeddings

In [42]:
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

client = OpenAI(api_key=api_key)

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

idx = 0  # index of the question. Change this to test different questions
test_question = df_Paper["Question"].iloc[idx]
print(f"Question {idx+1}: {test_question}")

for i, paper in enumerate(sepsis_paper_names):
    print(f"\nPaper {i+1}: {paper}")
    paper_name = str(paper)  # without .pdf extension
    top5chunks = compare_embeddings(question=test_question, paper=paper_name, top_k=5)

    prompt = (
        f"Does the following question have any relevance to the text chunks below? Keep your answer short\n\nQuestion: {test_question}\n\nText Chunks:\n"
        + "\n".join(
            [f"{i+1}. {text}" for i, text in enumerate(top5chunks["chunk_text"])]
        )
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
    except Exception as e:
        print(f"Error: {e}")
        continue

    output = response.choices[0].message.content.strip()
    display(output)

Question 1: Does early administration of neuromuscular blocking agents increases the ventilator free days?

Paper 1: ADRENAL


'No, the text chunks do not directly address the question about the effect of early administration of neuromuscular blocking agents on ventilator-free days.'


Paper 2: ANNANE


'No, the question does not have relevance to the provided text chunks.'


Paper 3: APROCCHSS


'No, the text chunks do not address the effect of neuromuscular blocking agents on ventilator-free days.'


Paper 4: CORTICUS


'No, the text chunks do not provide information relevant to the question about early administration of neuromuscular blocking agents and ventilator-free days.'


Paper 5: HEAT


'Yes, the question is relevant as it pertains to ventilator-free days, which are mentioned in the text chunks discussing outcomes related to mechanical ventilation.'


Paper 6: PROWESS


'No, the text chunks do not provide relevant information regarding the effects of early administration of neuromuscular blocking agents on ventilator-free days.'


Paper 7: ALBIOS


'No, the text chunks do not contain relevant information regarding the effect of early administration of neuromuscular blocking agents on ventilator-free days.'


Paper 8: SAFE


'No, the question about neuromuscular blocking agents and ventilator-free days is not relevant to the provided text chunks, which focus on fluid administration and its effects in a clinical setting.'


Paper 9: ProMISe


'No, the text chunks do not provide relevant information regarding the effect of early administration of neuromuscular blocking agents on ventilator-free days.'


Paper 10: PROWESS-SHOCK


'No, the text chunks do not provide relevant information regarding the effect of early administration of neuromuscular blocking agents on ventilator-free days.'


Paper 11: DPSMVAS
No chunk data found for paper: DPSMVAS


'No, the question does not have relevant text chunks provided for context.'


Paper 12: NEvsVP


'No, the question about neuromuscular blocking agents and ventilator-free days does not have relevance to the provided text chunks, which primarily focus on vasopressor management in septic shock.'


Paper 13: LOVIT


'No, the provided text chunks do not address the influence of early administration of neuromuscular blocking agents on ventilator-free days.'


Paper 14: PLMALDHSS


'No, the question does not have relevance to the text chunks provided. The text discusses the effects of glucocorticoids, vasopressor treatment, and mechanical ventilation but does not specifically address neuromuscular blocking agents or their impact on ventilator-free days.'


Paper 15: SADASS


'No, the question about neuromuscular blocking agents and ventilator-free days is not relevant to the provided text chunks, which discuss drotrecogin alfa and its effects on mortality and sepsis treatment.'


Paper 16: ULS


'No, the question does not have relevance to the provided text chunks.'


Paper 17: SS3vsSS2


'No, the question does not have relevance to the provided text chunks. They discuss septic shock and related treatments, but do not mention neuromuscular blocking agents or ventilator-free days.'


Paper 18: EHPSS


'No, the question about neuromuscular blocking agents and ventilator-free days is not relevant to the provided text chunks.'


Paper 19: FRESHS


'Yes, the question is relevant as it involves the effects of neuromuscular blocking agents, which may relate to mechanical ventilation and ventilator-free days, but the provided text chunks do not mention neuromuscular blocking agents or their effects.'


Paper 20: NSS


'No, the question does not have relevance to the provided text chunks. The text focuses on vasopressors and their effects, without mentioning neuromuscular blocking agents or ventilator-free days.'


Paper 21: SEPSISPAM


'No, the text chunks do not relate to the question about neuromuscular blocking agents and ventilator-free days.'


Paper 22: SOAP-II


'No, the text chunks do not provide relevant information regarding the early administration of neuromuscular blocking agents and its effect on ventilator-free days.'


Paper 23: TRISS


'No, the question does not have relevance to the provided text chunks.'


Paper 24: VASST


'No, the question about neuromuscular blocking agents and ventilator-free days has no relevance to the provided text chunks, which focus on the use of vasopressin versus norepinephrine in septic shock.'


Paper 25: Rivers Trial


'No, the question about neuromuscular blocking agents does not have relevance to the provided text chunks, which mainly discuss goal-directed therapy in severe sepsis and septic shock.'


Paper 26: ProCESS


'No, the question does not have relevance to the text chunks provided.'


Paper 27: ARISE


'No, the text chunks do not address the question regarding the early administration of neuromuscular blocking agents and ventilator-free days.'


Paper 28: HYPRESS


'No, the text chunks do not directly address the question regarding the early administration of neuromuscular blocking agents and its effect on ventilator-free days.'


Paper 29: IDEAL-ICU


'Yes, the question has relevance as it pertains to the outcomes discussed in the text chunks, specifically regarding mechanical ventilation days and ICU stays.'


Paper 30: FEAST


'No, the question does not have relevance to the provided text chunks.'


Paper 31: NSEPSIS


'No, the question does not have any relevance to the provided text chunks.'


Paper 32: UHSEPSIS


'No, the question does not have relevance to the text chunks provided.'


Paper 33: CABS


'No, the question does not have relevance to the provided text chunks.'


Paper 34: DDS


'No, the question does not have relevance to the text chunks provided. The text focuses on sedation and effects of dexmedetomidine without discussing neuromuscular blocking agents.'


Paper 35: EDIS


'No, the question does not have relevance to the text chunks provided.'