# Get the Top 5 Chunks to Check Paper's Stance

## Step 0: Environment Setup

In [1]:
from dotenv import load_dotenv  # type: ignore
import os
from langchain_neo4j import Neo4jGraph  # type: ignore
from libs import create_vector_index
import pandas as pd  # type: ignore
from conn import connect2Googlesheet
from annolibs import get_all_chunks_per_paper, compare_embeddings
from openai import OpenAI

load_dotenv()

True

In [2]:
# Connect to the Neo4j database using credentials from the environment
# (NEO4J_URL / NEO4J_USERNAME / NEO4J_PASSWORD, loaded by `load_dotenv()`).
#
# Every later cell needs `graph`, so a failed connection must stop the run:
# the error is reported and then re-raised instead of being swallowed, which
# would otherwise leave `graph` undefined (or stale from a previous run).
try:
    graph = Neo4jGraph(
        url=os.getenv("NEO4J_URL"),
        username=os.getenv("NEO4J_USERNAME"),
        password=os.getenv("NEO4J_PASSWORD"),
    )
except ValueError as e:
    # ValueError is the failure type this cell has always handled;
    # presumably Neo4jGraph raises it for bad URL/credentials — confirm.
    print(f"Could not connect to Neo4j database: {e}")
    raise
else:
    print("Connected to Neo4j database successfully.")

Connected to Neo4j database successfully.


In [3]:
# Make sure the vector index exists before it is used.
# `index_name` is the single source of truth: it is used for the lookup,
# the status message, and the creation call, so changing it here cannot
# leave the check and the creation pointing at different indexes.
index_name = "entities"
query = "SHOW INDEXES YIELD name, type WHERE type = 'VECTOR' AND name = $index_name"

# The name is passed as a query parameter rather than interpolated into Cypher.
result = graph.query(query, params={"index_name": index_name})
if result:
    print(f"The '{index_name}' index already exists.")
else:
    # No matching VECTOR index was returned, so create it via the project helper.
    create_vector_index(graph, index_name)

## Step 1: Load Questions from Google Sheet

In [4]:
# Position of the worksheet to read, passed to `get_worksheet`.
# The original note labels this worksheet "relevance".
# NOTE(review): presumably a gspread spreadsheet, where this index is
# zero-based (2 = third tab) — confirm the tab order has not changed.
RELEVANCE_WORKSHEET_INDEX = 2

spreadsheet = connect2Googlesheet()

# Select the worksheet: relevance
worksheet = spreadsheet.get_worksheet(RELEVANCE_WORKSHEET_INDEX)

# Get all records as a list of dictionaries
data = worksheet.get_all_records()

# Convert to Pandas DataFrame and preview the first rows
df_Paper = pd.DataFrame(data)
df_Paper.head()

Unnamed: 0,condition,number,docs,Question,Mahmud's Note,status,comments,Unnamed: 8
0,ARDS,1,ACURASYS,Does early administration of neuromuscular blo...,Like,,,
1,ARDS,2,ACURASYS,Do patients with severe ARDS being treated wit...,Replace,fixed,,
2,ARDS,3,ROSE,"In patients with moderate to severe ARDS, does...",Maybe this question: In patients with moderate...,fixed,,
3,ARDS,4,ROSE,Do patients with moderate-to-severe ARDS have ...,Local question (not sure if this is the aim of...,fixed,Wrong concept since PEEP by itself is mandator...,Does the use of neuromuscular blockers in pati...
4,ARDS,5,FACTT,"Among patients with ALI/ARDS, does a conservat...",Local question (not sure if this is the aim of...,fixed,Check if studies defined conservative by CVP <...,


## Step 2: Get the Chunks from Each Paper

In [5]:
# Keep only the sheet rows whose "condition" mentions "Sepsis"
# (case-insensitive; rows with a missing condition count as non-matching).
is_sepsis = df_Paper["condition"].str.contains("Sepsis", case=False, na=False)
sepsis_papers = df_Paper.loc[is_sepsis]
print(f"Found {sepsis_papers.shape[0]} papers related to Sepsis:")

# Reduce the matching rows to distinct, whitespace-trimmed paper names.
trimmed_docs = sepsis_papers["docs"].str.strip()
sepsis_paper_names = trimmed_docs.unique()
print(f"Number of unique paper names: {len(sepsis_paper_names)}")

# Load the chunks for every selected paper through the project helper.
sepsis_papers_chunks = get_all_chunks_per_paper(graph, sepsis_paper_names)

Found 35 papers related to Sepsis:
Number of unique paper names: 35
Found 100 chunks in paper ADRENAL
Found 100 chunks in paper ANNANE
Found 98 chunks in paper APROCCHSS
Found 99 chunks in paper CORTICUS
Found 100 chunks in paper HEAT
Found 100 chunks in paper PROWESS
Found 100 chunks in paper ALBIOS
Found 100 chunks in paper SAFE
Found 99 chunks in paper ProMISe
Found 100 chunks in paper PROWESS-SHOCK
Found 0 chunks in paper DPSMVAS
Found 100 chunks in paper NEvsVP
Found 100 chunks in paper LOVIT
Found 100 chunks in paper PLMALDHSS
Found 100 chunks in paper SADASS
Found 100 chunks in paper ULS
Found 95 chunks in paper SS3vsSS2
Found 100 chunks in paper EHPSS
Found 100 chunks in paper FRESHS
Found 100 chunks in paper NSS
Found 100 chunks in paper SEPSISPAM
Found 100 chunks in paper SOAP-II
Found 100 chunks in paper TRISS
Found 100 chunks in paper VASST
Found 100 chunks in paper Rivers Trial
Found 99 chunks in paper ProCESS
Found 100 chunks in paper ARISE
Found 107 chunks in paper HYPRE

In [6]:
# NOTE: this whole cell is disabled (commented out) and kept for reference only.
# It would compare the papers in `sepsis_papers_chunks` against the CSV files
# found in ./chunks_of_paper and print the ones that are missing.
# # check if the expected number of papers match the actual number of papers
# # Get list of expected papers
# expected_papers = list(sepsis_papers_chunks.values())  # convert the dictionary's values to a list

# # Get list of actual papers from chunks_of_paper directory
# actual_papers = [
#     f.replace("chunks_of_", "").replace(".csv", "")
#     for f in os.listdir("./chunks_of_paper")
#     if f.endswith(".csv")
# ]

# # Find missing papers
# missing_papers = set(expected_papers) - set(actual_papers)

# print("Missing papers:")
# for paper in missing_papers:
#     print(f"- {paper}")

## Step 3: Compare Question Embedding and Paper Chunk Embeddings

In [6]:
# Read the OpenAI key from the environment and stop immediately if it is absent.
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in .env file")

client = OpenAI(api_key=api_key)

# Remove pandas' display limits so long chunk texts and wide frames show in full.
for _display_option in ("display.max_colwidth", "display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Positional index of the question to test; edit this to try another question.
idx = 10
question_column = df_Paper["Question"]
test_question = question_column.iloc[idx]
print(f"Question {idx+1}: {test_question}")

Question 11: Patients with septic shock undergoing mechanical ventilation, did continuous infusion of hydrocortisone result in lower 90-day mortality?


In [None]:
# NOTE: this whole cell is disabled (commented out) and kept for reference only.
# It is the batch variant of the relevance check: for every name in
# `sepsis_paper_names` it would fetch the top 5 chunks for `test_question`,
# build the relevance prompt, and display gpt-4o-mini's short answer.
# for i, paper in enumerate(sepsis_paper_names):
#     print(f"\nPaper {i+1}: {paper}")
#     paper_name = str(paper)  # without .pdf extension
#     top5chunks = compare_embeddings(question=test_question, paper=paper_name, top_k=5)

#     prompt = (
#         f"Does the following question have any relevance to the text chunks below? Keep your answer short\n\nQuestion: {test_question}\n\nText Chunks:\n"
#         + "\n".join(
#             [f"{i+1}. {text}" for i, text in enumerate(top5chunks["chunk_text"])]
#         )
#     )

#     try:
#         response = client.chat.completions.create(
#             model="gpt-4o-mini",
#             messages=[{"role": "user", "content": prompt}],
#             max_tokens=200,
#         )
#     except Exception as e:
#         print(f"Error: {e}")
#         continue

#     output = response.choices[0].message.content.strip()
#     display(output)

In [8]:
# Pick one paper to inspect by its position in `sepsis_paper_names`.
# NOTE: the index is zero-based, so 11 selects the 12th unique paper name
# (not the first one); change PAPER_IDX to inspect a different paper.
PAPER_IDX = 11
paper_name = str(sepsis_paper_names[PAPER_IDX])

print(f"\nProcessing Paper: {paper_name}")

# Fetch the top 5 chunks of the selected paper for the test question.
# `compare_embeddings` is a project helper; presumably it ranks the paper's
# chunks by embedding similarity to the question — confirm in annolibs.
top5chunks = compare_embeddings(question=test_question, paper=paper_name, top_k=5)

# Defaults so both names exist whichever branch runs below.
prompt = None
response = None

if top5chunks is None or len(top5chunks) == 0:
    # With no chunks, the prompt would end in an empty "Text Chunks:" list and
    # the model would be judging relevance against nothing, so skip the call.
    print(f"No chunks returned for paper {paper_name}; skipping relevance check.")
else:
    # Build the prompt: the question followed by the chunk texts numbered from 1.
    numbered_chunks = [
        f"{rank}. {text}"
        for rank, text in enumerate(top5chunks["chunk_text"], start=1)
    ]
    prompt = (
        f"Does the following question have any relevance to the text chunks below? Keep your answer short\n\nQuestion: {test_question}\n\nText Chunks:\n"
        + "\n".join(numbered_chunks)
    )

    # Send the request to gpt-4o-mini. A failure is reported and leaves
    # `response` as None so the cell finishes without displaying an answer.
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
    except Exception as e:
        print(f"Error: {e}")

# Display the model's answer, if one was received.
if response is not None:
    # `message.content` may be None in the OpenAI SDK; fall back to an empty
    # string so `.strip()` cannot raise AttributeError.
    output = (response.choices[0].message.content or "").strip()
    display(output)


Processing Paper: NEvsVP


'Yes, the question is relevant as it pertains to the investigation of treatment effects (specifically hydrocortisone) on mortality in patients with septic shock, which is a common theme in the provided text chunks discussing septic shock and treatment outcomes.'

In [9]:
# Bare last expression: shows the chunks fetched for the selected paper as this cell's output.
top5chunks

Unnamed: 0,paper_name,position,chunk_text,similarity_score
90,NEvsVP.pdf,91,", Cooper DJ, Holmes CL, Mehta S, Granton JT, Storms MM, et al.: Vasopressin versus norepinephrine infusion in patients with septic shock. N Engl J Med 358:877–887, 2008. 4. Levy MM, Evans LE, Rhodes A: The Surviving Sepsis Campaign Bundle: 2018 update. Intensive Care Med 44:925–928, 2018. 5. Bauer SR, Aloi JJ, Ahren",0.6554
40,NEvsVP.pdf,41,"Copyright © 2019 by the Shock Society. Unauthorized reproduction of this article is prohibited. We compared the effect of the discontinuation order on mortality in septic shock and detected no significant difference in either overall mortality or ICU mortality between the two groups (OR 1.28, 95% CI 0.77 to 2.10, P ¼ 0.34; OR 0.99, 95% CI 0.74 to 1.34, P ¼ 0.96) (Fig. 3",0.633
81,NEvsVP.pdf,82,"a less severe septic shock (3, 31). In a subgroup analysis, we divided the studies into two subgroups according to a predefined corticosteroids usage rate of 75%. A higher incidence of hypotension was observed when VP was discon- tinued first in studies in which patients received corticosteroid therapy percentage below 75%. It is worth noting that only the study by Jeon et al. (7) indicated that patients for whom NE was discontinued first were more likely",0.626
64,NEvsVP.pdf,65,"the critically ill. Accumulated evidence dem- onstrates that targeting and maintaining the goal MAP along with the early initiation of vasoactive agents in patients with septic shock is associated with reduced mortality rates (14, 15). FIG. 3. The effect of different vasopressor discontinuation order on overall mortality (A) and ICU mortality (B) in patients with septic shock. CI indicates confidence interval; DC, discontinued; M-H, Mantel-Haenszel",0.6175
87,NEvsVP.pdf,88,"not been associated with poor outcomes in septic shock patient who receives concomitant VP and NE therapy. The use of corticosteroids may mitigate this effect. However, the TSA indicated a lack of firm evidence for these results. A multicenter, prospective, RCT is warranted to con- firm these findings. REFERENCES 1. Singer M, Deutschman CS, Seymour CW, Shankar-Hari M, Annane D, Bauer M, Bellomo R, Bernard",0.6169
