In [1]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from config import *
import fitz
from tqdm import tqdm
import shutil

In [2]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

# OpenAI client used by get_embedding below; the API key is read from the
# OPENAI_API_KEY environment variable (os.getenv returns None when it is unset).
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` as a NumPy array.

    ``text`` is sent as a one-element batch to the embeddings endpoint of the
    module-level ``client`` using ``model``; the first (only) returned
    embedding is converted with ``np.array``.
    """
    result = client.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return np.array(first_item.embedding)

def cos_sim(vec1, vec2):
    """Return the cosine similarity between two vectors as a single number."""
    # cosine_similarity compares rows of two 2-D inputs, so each vector is
    # wrapped as a one-row matrix and the only cell of the result is returned.
    similarity_matrix = cosine_similarity([vec1], [vec2])
    return similarity_matrix[0][0]

# Texts under comparison: an inclusion statement, an exclusion statement,
# both joined into one string, and a candidate chunk to score against them.
inclusion_text = "studies using simulation modeling such as agent-based models, Monte Carlo, and Markov chains."
exclusion_text = "studies using only regression analysis or descriptive statistics without simulation."
combined_text = " \n ".join((inclusion_text, exclusion_text))
chunk_text = "This study uses logistic regression to model heart disease outcomes and not just regression analysis."

# One get_embedding call per text, in the order inclusion, exclusion, combined, chunk.
embedding_inclusion, embedding_exclusion, embedding_combined, embedding_chunk = (
    get_embedding(text)
    for text in (inclusion_text, exclusion_text, combined_text, chunk_text)
)

# Similarity of the chunk to each of the three reference embeddings.
sim_incl, sim_excl, sim_comb = (
    cos_sim(embedding_chunk, reference)
    for reference in (embedding_inclusion, embedding_exclusion, embedding_combined)
)

# Report each score rounded to four decimals; the last line is the
# inclusion-minus-exclusion difference.
for label, value in (
    ("Cosine Similarity (Chunk ↔ Inclusion):", sim_incl),
    ("Cosine Similarity (Chunk ↔ Exclusion):", sim_excl),
    ("Cosine Similarity (Chunk ↔ Combined) :", sim_comb),
    ("Contrastive Relevance Score (incl - excl):", sim_incl - sim_excl),
):
    print(label, round(value, 4))


Cosine Similarity (Chunk ↔ Inclusion): 0.319
Cosine Similarity (Chunk ↔ Exclusion): 0.495
Cosine Similarity (Chunk ↔ Combined) : 0.4291
Contrastive Relevance Score (incl - excl): -0.176


In [8]:
# Folder that later cells scan for "*_highlighted.pdf" files.
OUTPUT_FOLDER = "../data/output"
# os.listdir returns names in arbitrary order; sorting makes every later
# iteration (and anything derived from it) reproducible between runs.
output_files = sorted(os.listdir(OUTPUT_FOLDER))

In [9]:
# Numeric annotation type code to translate back into its fitz constant name.
target_value = 11

# dir(fitz) is scanned in order; the first PDF_ANNOT_* attribute whose value
# equals the target wins, and nothing is printed when there is no match.
matching_name = next(
    (
        name
        for name in dir(fitz)
        if name.startswith("PDF_ANNOT_") and getattr(fitz, name) == target_value
    ),
    None,
)
if matching_name is not None:
    print(f"{target_value} corresponds to: {matching_name}")


11 corresponds to: PDF_ANNOT_STRIKE_OUT


In [None]:
# Audit every "*_highlighted.pdf" in OUTPUT_FOLDER.
#
# For each text-markup annotation (highlight, underline, squiggly, strike-out)
# the stroke colour is rounded to one decimal and compared with the first four
# values of CRITERIA_COLORS, which this cell treats, in order, as Population,
# Intervention, Outcome and Study approach. A criterion counts as met for a
# file when at least one annotation in that colour has a comment starting with
# "YES" or "MAYBE".
#
# Results:
#   compliant_dict  - {filename: {criterion: bool}} for files with any markup
#   compliant_files - filenames for which all four colours were seen
#   df_compliant    - the same information as a DataFrame
#   yes_count       - criterion-coloured annotations with a YES/MAYBE comment
#   no_count        - criterion-coloured annotations with any other comment
#                     (the original `else` branch was unreachable, so this
#                     counter could never leave 0)
#   total_count     - all text-markup annotations inspected
criteria_names = ['Population', 'Intervention', 'Outcome', 'Study approach']
# Built once instead of on every comparison; kept as (colour, name) pairs so
# matching still uses == against the first four CRITERIA_COLORS values in order.
color_to_criterion = list(zip(list(CRITERIA_COLORS.values())[:4], criteria_names))
markup_types = (
    fitz.PDF_ANNOT_HIGHLIGHT,
    fitz.PDF_ANNOT_UNDERLINE,
    fitz.PDF_ANNOT_SQUIGGLY,
    fitz.PDF_ANNOT_STRIKE_OUT,
)

compliant_files = []
compliant_dict = {}
yes_count = 0
no_count = 0
total_count = 0

for filename in tqdm(output_files):
    if not filename.endswith("_highlighted.pdf"):
        continue

    pdf_path = os.path.join(OUTPUT_FOLDER, filename)
    found_colors = set()
    has_highlights = False
    criteria_met = dict.fromkeys(criteria_names, False)

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            for highlight in page.annots():
                if highlight.type[0] not in markup_types:
                    continue

                has_highlights = True
                total_count += 1

                # An annotation without a usable 3-component stroke colour
                # cannot be mapped to a criterion; skip it instead of crashing.
                stroke = highlight.colors.get("stroke")
                if not stroke or len(stroke) < 3:
                    continue
                color_tuple = tuple(round(component, 1) for component in stroke[:3])

                criterion = next(
                    (name for reference, name in color_to_criterion if color_tuple == reference),
                    None,
                )
                if criterion is None:
                    continue

                # The decision is the leading word of the annotation comment.
                comment = highlight.info.get("content", "")
                if comment.startswith(("YES", "MAYBE")):
                    found_colors.add(color_tuple)
                    criteria_met[criterion] = True
                    yes_count += 1
                else:
                    no_count += 1
    finally:
        # Release the file handle even if an annotation raised.
        doc.close()

    # Store criteria for all files with highlights
    if has_highlights:
        compliant_dict[filename] = criteria_met

        # A file is listed as compliant only when all four colours were found
        if len(found_colors) == 4:
            compliant_files.append(filename)

# Create DataFrame showing all files and their criteria status
df_compliant = pd.DataFrame.from_dict(compliant_dict, orient='index')

# Add a column showing if all criteria are met
df_compliant['All Criteria Met'] = df_compliant.all(axis=1)
df_compliant = df_compliant.reset_index(names='File')

# Display summary statistics
print(f"Found {len(compliant_files)} compliant files:")
for file in compliant_files:
    print(f"- {file}")

print(f"\nTotal files analyzed: {len(df_compliant)}")

# Columns between 'File' and 'All Criteria Met' are the per-criterion flags
for criterion in df_compliant.columns[1:-1]:
    count = df_compliant[criterion].sum()
    print(f"{criterion}: {count} files")

print(f"\nFiles meeting all criteria: {df_compliant['All Criteria Met'].sum()}")


100%|██████████| 21/21 [00:10<00:00,  2.05it/s]

Found 3 compliant files:
- Burke, Copeland, Sussman, Hayward, Gross, Briceno, Whitney, Giordani, Elkind, Manly, Gottesman, Gaskin, Sidney, Yaffe, Sacco, Heckbert, Hughes, Galecki, Levine_PLOS One_2024_highlighted.pdf
- Fouladi, Asadi, Sherer, Madadi__2024_highlighted.pdf
- Mihaylova, Wu, Zhou, Williams, Schlackow, Emberson, Reith, Keech, Robson, Parnell, Armitage, Gray, Simes, Baigent_Health Technol. Assess. (winch. Engl.)_2024_highlighted.pdf

Total files analyzed: 16
Population: 9 files
Intervention: 12 files
Outcome: 13 files
Study approach: 8 files

Files meeting all criteria: 3





In [42]:
# Reference highlight colours on a 0-255 integer scale, mapped to the flag
# name used for each. The first four CRITERIA_COLORS values are taken in
# order; every component is multiplied by 255 and truncated with int().
_criteria_color_values = list(CRITERIA_COLORS.values())
color_to_flagname = {
    tuple(int(component * 255) for component in _criteria_color_values[position]): flagname
    for position, flagname in enumerate(
        ("Population", "Intervention", "Outcome", "Study approach")
    )
}

def extract_highlighted_text(page, annot, words=None):
    """Return the page words covered by an annotation, joined by spaces.

    The annotation's ``vertices`` are read four points at a time; each group
    is turned into a quad and then into its bounding rectangle, and every
    word whose rectangle intersects it is collected in the order given by
    ``words``.

    Args:
        page: Page the annotation belongs to. Only used to fetch the word
            list when ``words`` is not supplied.
        annot: Annotation whose ``vertices`` describe the marked area.
        words: Optional result of ``page.get_text("words")`` - tuples whose
            first four items are the word box and whose fifth is the word.
            Pass it to avoid re-extracting the page text for every annotation
            on the same page.

    Returns:
        str: The covered text, or ``""`` when the annotation has no vertices
        or no word intersects any of its quads.
    """
    vertices = annot.vertices
    if not vertices:
        return ""
    if words is None:
        words = page.get_text("words")

    # Build each word's rectangle once instead of once per quad.
    word_boxes = [(fitz.Rect(entry[:4]), entry[4]) for entry in words]

    pieces = []
    # Leftover points that do not form a full group of four are ignored.
    for start in range(0, 4 * (len(vertices) // 4), 4):
        quad_rect = fitz.Quad(vertices[start:start + 4]).rect
        text = " ".join(word for box, word in word_boxes if box.intersects(quad_rect))
        if text:
            pieces.append(text)
    return " ".join(pieces)

# Collect, per highlighted PDF, the highlight annotations grouped by their
# comment text. pdf_comments_dict maps filename -> list of
# {"flag", "chunk_text", "comment_text"} dicts; highlights sharing the same
# comment are merged into one entry whose chunk_text is the concatenation.
#
# NOTE(review): color_tuple_close is not defined in any cell visible in this
# notebook; presumably it comes from `from config import *` or a removed
# cell - confirm it is available on a fresh kernel.
pdf_comments_dict = {}
highlight_buffer = {}

# Files are processed in reverse listing order.
for pdf_filename in tqdm(output_files[::-1]):
    if not pdf_filename.endswith("_highlighted.pdf"):
        continue

    pdf_path = os.path.join(OUTPUT_FOLDER, pdf_filename)
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            for annot in page.annots() or []:
                if annot.type[0] != fitz.PDF_ANNOT_HIGHLIGHT:
                    continue

                comment_text = annot.info.get("content", "")

                # Stroke colour on a 0-255 scale; None when the annotation
                # has no usable 3-component stroke colour.
                stroke = (annot.colors or {}).get('stroke')
                if stroke and len(stroke) >= 3:
                    color_tuple = tuple(int(component * 255) for component in stroke[:3])
                else:
                    color_tuple = None

                # First reference colour that matches decides the flag.
                flag = None
                if color_tuple:
                    for ref_tuple, flagname in color_to_flagname.items():
                        if color_tuple_close(color_tuple, ref_tuple):
                            flag = flagname
                            break
                if flag is None:
                    flag = "Unknown"

                # Extract highlighted text
                chunk_text = extract_highlighted_text(page, annot)

                # Buffer by comment to aggregate: the first highlight with a
                # given comment fixes the flag, later ones only add text.
                entry = highlight_buffer.get(comment_text)
                if entry is None:
                    highlight_buffer[comment_text] = {
                        "flag": flag,
                        "chunk_text": chunk_text,
                        "comment_text": comment_text
                    }
                else:
                    entry["chunk_text"] += " " + chunk_text
    finally:
        # Release the file handle even if an annotation raised.
        doc.close()

    # After all annotations processed, save to dict
    file_comments = list(highlight_buffer.values())
    pdf_comments_dict[pdf_filename] = file_comments
    highlight_buffer.clear()  # Clear for next file


100%|██████████| 18/18 [12:28<00:00, 41.58s/it]


In [44]:
# Lay the collected highlights out as a wide table: for every PDF and every
# flag there is one "Chunk" row and one "Comment" row, with the individual
# chunks / comments spread across the trailing columns.
flag_map = {
    "Population": "Population",
    "Intervention": "Intervention",
    "Outcome": "Outcome",
    "Study approach": "Study Design",
    "Study Design": "Study Design"
}
flags = ["Population", "Intervention", "Outcome", "Study Design"]

rows = []
for filename, highlights in pdf_comments_dict.items():
    title = filename.replace("_highlighted.pdf", "")
    # Group highlights by normalized flag
    flag_to_chunks = {flag: [] for flag in flags}
    flag_to_comments = {flag: [] for flag in flags}
    for h in highlights:
        flag = flag_map.get(h["flag"], h["flag"])
        if flag not in flag_to_chunks:
            continue  # e.g. "Unknown" colours are left out of the table
        # NOTE(review): chunk and comment are appended independently, so a
        # highlight with an empty comment (or empty text) shifts the two rows
        # out of column alignment - confirm this is acceptable downstream.
        if h.get("chunk_text"):
            flag_to_chunks[flag].append(h["chunk_text"])
        if h.get("comment_text"):
            flag_to_comments[flag].append(h["comment_text"])
    # Build table rows for each flag; the title appears on the first row only
    for position, flag in enumerate(flags):
        rows.append([title if position == 0 else "", flag, "Chunk"] + flag_to_chunks[flag])
        rows.append(["", "", "Comment"] + flag_to_comments[flag])

# Pad every row to the widest one. default=3 keeps the three fixed columns
# (and avoids a ValueError from max()) when there are no rows at all.
max_cols = max((len(row) for row in rows), default=3)
for row in rows:
    row.extend([""] * (max_cols - len(row)))

# The third column and all chunk/comment columns are deliberately unnamed.
columns = ["Title", "CID", ""] + [""] * (max_cols - 3)
df = pd.DataFrame(rows, columns=columns)
# df.to_excel("test.xlsx", index=False)
df.head(2)

Unnamed: 0,Title,CID,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,"Trieu, Huang, Aminde, Cobiac, Coyle, Wanjau, T...",Population,Chunk,WHO recommends lowering population sodium inta...,0 20 40 60 80 –10 10 0 Cumulative health-care ...,,,,,,...,,,,,,,,,,
1,,,Comment,YES \nScore: 75 \nThe chunk discusses the ge...,NO \nScore: 10 \nThe chunk primarily present...,,,,,,...,,,,,,,,,,


In [19]:
# Display the per-file criteria table (one boolean column per criterion plus 'All Criteria Met').
df_compliant

Unnamed: 0,File,Population,Intervention,Outcome,Study approach,All Criteria Met
0,"Alqahtani, Al-Omar, Alshehri, Abanumay, Alabdu...",True,True,True,False,False
1,"Burkart, Brauer, Aravkin, Godwin, Hay, He, Ian...",True,True,True,True,True
2,"Burke, Copeland, Sussman, Hayward, Gross, Bric...",True,True,True,True,True
3,"Emmert-Fees, Felea, Staudigel, Ananthapavan, L...",False,True,False,True,False
4,"Fouladi, Asadi, Sherer, Madadi__2024_highlight...",True,True,True,True,True
5,"Gibbs, Griffin, Gutacker, Villaseñor, Walker__...",True,False,True,True,False
6,"Henkel, Burger, Sletner, Pedersen__2024_highli...",False,True,True,False,False
7,"Kim, Wang, Lauren, Liu, Marklund, Lee, Micha, ...",False,True,True,False,False
8,Kulhanova et al. - 2018 - The fraction of lung...,True,True,True,False,False
9,"Mihaylova, Wu, Zhou, Williams, Schlackow, Embe...",True,True,True,True,True


In [20]:
# Write the criteria table to an Excel file, without the DataFrame index.
df_compliant.to_excel("../data/excels/compliant_files.xlsx", index=False)

In [8]:
# Load the exported items CSV and preview its first row.
# Note: the name `df` is also assigned by the highlight-table cell, so the
# value it holds depends on which of the two cells ran last.
df = pd.read_csv("../data/excels/Exported Items.csv")
df.head(1)

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,7Z3S4GYD,journalArticle,2021.0,"Jayasekera, Jinani; Sparano, Joseph A.; O'Neil...",Development and Validation of a Simulation Mod...,Journal of Clinical Oncology,,0732-183X,10.1200/JCO.21.00651,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,...,,,,,,,,,,


In [13]:
# Keep items tagged 'obj2_include' that also carry one of the two test-search
# tags, retaining only the columns needed to locate the attached files.
manual_tags = df['Manual Tags']
# na=False: rows without any tag are treated as non-matching explicitly,
# instead of relying on `&` to coerce NaN in the masks to False.
is_included = manual_tags.str.contains('obj2_include', regex=False, na=False)
is_test_item = manual_tags.str.contains('test_hand_search|test_medline_search', regex=True, na=False)
df_human_included = df.loc[is_included & is_test_item, ['Manual Tags', 'File Attachments', 'Title']]
df_human_included.shape

(26, 3)

In [14]:
# Display the raw 'File Attachments' value of the row at position 5.
df_human_included.iloc[5]['File Attachments']

'C:\\Users\\pouri\\Zotero\\storage\\C9VIG9GK\\Knudsen et al. - 2016 - Estimation of Benefits, Burden, and Harms of Color.pdf; '

In [24]:
# df_human_included[df_human_included['File Attachments'].str.contains('The modelled impact of')]

In [25]:
import ntpath
import os
import shutil

# Paths
# NOTE(review): absolute, machine-specific locations; consider deriving them
# from a shared data directory so the notebook runs elsewhere.
archive_path = r'C:\Users\pouri\Python\Lang_Chain\Projects\NLP_pipeline_full_text\data\papers\Archive'
destination_path = r'C:\Users\pouri\Python\Lang_Chain\Projects\NLP_pipeline_full_text\data\papers'

# Ensure destination path exists
os.makedirs(destination_path, exist_ok=True)

# File names of every attachment of the included items.
# A 'File Attachments' cell looks like 'C:\...\name.pdf; ' - one or more
# Windows paths separated by ';', usually with a trailing separator. Splitting
# on ';' first keeps the separator out of the file name (otherwise the name
# ends in ';' and never matches a file on disk) and picks up every attachment,
# not just the last one. ntpath is used so the Windows-style paths are parsed
# the same way on any OS. Rows without attachments are skipped.
included_files = {
    ntpath.basename(part.strip())
    for cell in df_human_included['File Attachments'].dropna().astype(str)
    for part in cell.split(';')
    if part.strip()
}

# Loop through files and copy if in included list
for file in os.listdir(archive_path):
    if file.strip() in included_files:
        shutil.copy2(os.path.join(archive_path, file), os.path.join(destination_path, file))


## Evaluate the highlight decisions with an LLM

In [62]:
# Load the "Reference" sheet of the evaluation workbook and preview its first two rows.
df_reference = pd.read_excel("../data/excels/full_text_fuzzy_eval.xlsx", sheet_name="Reference")
df_reference.head(2)

Unnamed: 0,Title,DOI,Unnamed: 2,CID,Unnamed: 4,1,2,3,4,5,...,71,72,73,74,75,76,77,78,79,80
0,"Trieu, Huang, Aminde, Cobiac, Coyle, Wanjau, T...",https://doi.org/10.1016/s2468-2667(24)00221-4,10.1016/s2468-2667(24)00221-4,Population,Chunk,WHO recommends lowering population sodium inta...,0 20 40 60 80 –10 10 0 Cumulative health-care ...,,,,...,,,,,,,,,,
1,,,,,Comment,YES \nScore: 75 \nThe chunk discusses the ge...,NO \nScore: 10 \nThe chunk primarily present...,,,,...,,,,,,,,,,


In [87]:
# Walk the reference sheet two rows at a time. For each row, the cells from
# column position 5 onward are kept (missing values dropped) as a list, so
# every entry of grouped_pairs is [first_row_values, second_row_values];
# a trailing unpaired row yields a one-element entry.
grouped_pairs = []
row_total = len(df_reference)
for start in range(0, row_total, 2):
    stop = min(start + 2, row_total)
    grouped_pairs.append(
        [df_reference.iloc[position, 5:].dropna().tolist() for position in range(start, stop)]
    )


In [113]:
# Inclusion criteria, one text block per criterion, in the order Population,
# Intervention, Outcome, Study approach. The evaluation loop picks an entry by
# position (pair index modulo 4) and uses the same index for
# EXCLUSION_CRITERIA, so the two lists must stay in the same order.
INCLUSION_CRITERIA = [
    """
    Population - Populations at risk of developing an NCD (Non-Communicable Diseases)
    Studies of real or generalizable human populations at risk of developing NCDs. These may be defined by geographic, demographic, or social characteristics (e.g., national population, region, age group).
    """,
    """
    Intervention, exposure, or scenario (includes comparator) 
    Studies evaluating health impacts of exposures, interventions, or policies on NCD outcomes through simulation of hypothetical scenarios. Includes burden-of-disease and comparative risk assessment studies.
    """,
    """
    Outcome - Selected non-communicable diseases (NCDs - Non-Communicable Diseases) or NCD risk factors 
    Studies reporting on outcomes related to major NCDs (e.g., cardiovascular disease, cancer, diabetes, chronic respiratory diseases, mental health, neurological disorders, injury, or musculoskeletal conditions) or their risk factors.
    """,
    """
    Study approach - Computational simulation modelling 
    Studies that use computational simulation modeling (e.g., system dynamics, agent-based models, microsimulation, Markov models, or attributable risk models) as the primary method of analysis.
    """
]

# Exclusion criteria, one text block per entry. The evaluation loop reads this
# list with the same index it uses for INCLUSION_CRITERIA, so entry k here is
# shown in the prompt together with inclusion entry k.
EXCLUSION_CRITERIA = [
    """
    Studies focusing exclusively on individuals already diagnosed with NCDs or using highly specific clinical cohorts without generalizability.
    """,
    """
    Studies that focus exclusively on tertiary prevention or do not simulate hypothetical scenarios (e.g., purely observational studies).
    """,
    """
    Studies that are primarily health economic evaluations (e.g., cost-effectiveness analyses) or those focusing only on non-NCD outcomes.
    """,
    """
    Studies using regression, observational methods, trend forecasts, or risk prediction models that do not simulate interventions.
    """
]

In [125]:
from langchain_openai import ChatOpenAI
from collections import defaultdict

# Chat model invoked by the evaluation loop; constructed with model "gpt-4.1" and temperature=0.
llm = ChatOpenAI(model="gpt-4.1", temperature=0)

In [126]:
# Ask the LLM, for every (chunk, provided decision) pair, whether it agrees
# with the decision. Answers are stored per pair under the 1-based pair number:
# responses[k] is the list of answer texts for grouped_pairs[k - 1].
#
# The criterion shown in the prompt is chosen by position (i % 4).
# NOTE(review): this presumes the reference sheet lists the four criteria in a
# fixed, repeating order for every paper - confirm against the sheet.
responses = defaultdict(list)
for i, pair in enumerate(tqdm(grouped_pairs)):
    # The grouping cell emits a one-element pair when the sheet has an odd
    # number of rows; without a decision row there is nothing to evaluate,
    # and indexing pair[1] would raise IndexError.
    if len(pair) < 2:
        continue
    # zip stops at the shorter list, so unmatched chunks/decisions are skipped.
    for row1_text, row2_text in zip(pair[0], pair[1]):
        prompt = f"""
            You are a systematic review assistant.
            Here are the inclusion criteria:
            {INCLUSION_CRITERIA[i%4]}

            Here are the exclusion criteria:
            {EXCLUSION_CRITERIA[i%4]}

            First row (study info): {row1_text}
            Second row (provided decision): {row2_text}

            Task:
            1. State if you agree with the second row's decision (YES=INCLUDE, NO=EXCLUDE).
            2. Give a short reason.
            
            Answer in this format:
            Agrees: ...
            Reason: ...
        """

        response = llm.invoke(prompt)
        responses[i+1].append(response.content)
        

100%|██████████| 68/68 [13:17<00:00, 11.72s/it]


In [205]:
# Look up the answers for pair 71. .get() is used because indexing a
# defaultdict with a missing key would insert an empty list under that key,
# silently changing len(responses) and the keys seen by later code.
responses.get(71, [])

[]