In [1]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from config import *
import fitz
from tqdm import tqdm
import shutil

In [2]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

# Shared OpenAI client used by get_embedding below. The key is taken from the
# OPENAI_API_KEY environment variable (None is passed through if it is unset),
# so no credential is stored in the notebook.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` as a NumPy array.

    ``text`` is sent as a single-item batch to ``client.embeddings.create``
    using the given ``model``; the embedding of the first (only) item in the
    response is converted with ``np.array`` and returned.
    """
    result = client.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return np.array(first_item.embedding)

def cos_sim(vec1, vec2):
    """Return the cosine similarity between two vectors as a scalar.

    Each vector is wrapped in a list so that ``cosine_similarity`` receives two
    one-row inputs; the single entry of the resulting matrix is returned.
    """
    pairwise = cosine_similarity([vec1], [vec2])
    return pairwise[0][0]

# Texts under comparison: an inclusion criterion, an exclusion criterion, both
# joined into one string, and a sample chunk to score against each of them.
inclusion_text = "studies using simulation modeling such as agent-based models, Monte Carlo, and Markov chains."
exclusion_text = "studies using only regression analysis or descriptive statistics without simulation."
combined_text = f"{inclusion_text} \n {exclusion_text}"
chunk_text = "This study uses logistic regression to model heart disease outcomes and not just regression analysis."

# Embed the four texts (one get_embedding call each, in this order).
embedding_inclusion, embedding_exclusion, embedding_combined, embedding_chunk = (
    get_embedding(text)
    for text in (inclusion_text, exclusion_text, combined_text, chunk_text)
)

# Similarity of the chunk to each criterion embedding.
sim_incl, sim_excl, sim_comb = (
    cos_sim(embedding_chunk, reference)
    for reference in (embedding_inclusion, embedding_exclusion, embedding_combined)
)

# Report each score rounded to 4 decimals; the last line is the difference
# between the inclusion and exclusion similarities.
for label, score in (
    ("Cosine Similarity (Chunk ↔ Inclusion):", sim_incl),
    ("Cosine Similarity (Chunk ↔ Exclusion):", sim_excl),
    ("Cosine Similarity (Chunk ↔ Combined) :", sim_comb),
    ("Contrastive Relevance Score (incl - excl):", sim_incl - sim_excl),
):
    print(label, round(score, 4))


Cosine Similarity (Chunk ↔ Inclusion): 0.319
Cosine Similarity (Chunk ↔ Exclusion): 0.495
Cosine Similarity (Chunk ↔ Combined) : 0.4291
Contrastive Relevance Score (incl - excl): -0.176


In [2]:
# Folder (relative to the notebook's working directory) that is scanned below
# for "*_highlighted.pdf" files.
OUTPUT_FOLDER = "../data/output"
# os.listdir returns entries in arbitrary, platform-dependent order; sorting
# makes the processing order and the printed file lists reproducible.
output_files = sorted(os.listdir(OUTPUT_FOLDER))

In [13]:
# Numeric annotation-type code to translate back into its fitz constant name.
target_value = 11

# First PDF_ANNOT_* attribute of the fitz module (in dir() order) whose value
# equals the target; None when no constant matches, in which case nothing is
# printed.
matching_name = next(
    (
        name
        for name in dir(fitz)
        if name.startswith("PDF_ANNOT_") and getattr(fitz, name) == target_value
    ),
    None,
)
if matching_name is not None:
    print(f"{target_value} corresponds to: {matching_name}")


11 corresponds to: PDF_ANNOT_STRIKE_OUT


In [18]:
# Scan every "*_highlighted.pdf" in OUTPUT_FOLDER and record which screening
# criteria have at least one markup annotation whose comment starts with "YES".
#
# Results:
#   compliant_dict  - {filename: {criterion: bool}} for every file that has at
#                     least one markup annotation
#   compliant_files - filenames for which all four criterion colours were found
#   df_compliant    - the same information as a DataFrame, plus an
#                     'All Criteria Met' column

# Criteria tracked per file, in the same order as the first four entries of
# CRITERIA_COLORS: the colour at position i is treated as the highlight colour
# of criterion i. CRITERIA_COLORS is not defined in this notebook; presumably
# it is provided by `from config import *` -- confirm.
criteria_names = ['Population', 'Intervention', 'Outcome', 'Study approach']

# Text-markup annotation types that count as a "highlight".
markup_types = (
    fitz.PDF_ANNOT_HIGHLIGHT,
    fitz.PDF_ANNOT_UNDERLINE,
    fitz.PDF_ANNOT_SQUIGGLY,
    fitz.PDF_ANNOT_STRIKE_OUT,
)

# Colour -> criterion lookup, built once instead of re-listing the dict values
# for every annotation. setdefault keeps the first criterion should two
# criteria share a colour, which is what the former if/elif chain did.
color_to_criterion = {}
for criterion_name, criterion_color in zip(criteria_names, list(CRITERIA_COLORS.values())[:4]):
    color_to_criterion.setdefault(tuple(criterion_color), criterion_name)

compliant_files = []
compliant_dict = {}

for filename in tqdm(output_files):
    if not filename.endswith("_highlighted.pdf"):
        continue

    pdf_path = os.path.join(OUTPUT_FOLDER, filename)

    found_colors = set()
    has_highlights = False
    # Per-file criteria flags, all False until a matching "YES" annotation is seen.
    criteria_met = dict.fromkeys(criteria_names, False)

    # The context manager closes the document even if an annotation raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for highlight in page.annots():
                if highlight.type[0] not in markup_types:
                    continue

                # Any markup annotation makes the file part of the analysis,
                # whether or not it ends up counting towards a criterion.
                has_highlights = True

                # Only annotations whose comment starts with "YES" count.
                comment = highlight.info.get("content") or ""
                if not comment.startswith("YES"):
                    continue

                # The stroke colour can be missing or empty, and a list that is
                # not three components long is not RGB; such annotations cannot
                # be matched to a criterion colour, so skip them instead of
                # failing on the index access.
                stroke = (highlight.colors or {}).get("stroke")
                if not stroke or len(stroke) != 3:
                    continue

                # Round to one decimal so the colour is comparable with the
                # CRITERIA_COLORS values.
                color_tuple = tuple(round(channel, 1) for channel in stroke)

                matched_criterion = color_to_criterion.get(color_tuple)
                if matched_criterion is not None:
                    found_colors.add(color_tuple)
                    criteria_met[matched_criterion] = True

    # Store criteria for all files with highlights
    if has_highlights:
        compliant_dict[filename] = criteria_met

        # A file is compliant when all 4 criterion colours were found
        if len(found_colors) == 4:
            compliant_files.append(filename)

# Create DataFrame showing all files and their criteria status
df_compliant = pd.DataFrame.from_dict(compliant_dict, orient='index')

# Add a column showing if all criteria are met
df_compliant['All Criteria Met'] = df_compliant.all(axis=1)
df_compliant = df_compliant.reset_index(names='File')

# Display summary statistics
print(f"Found {len(compliant_files)} compliant files:")
for file in compliant_files:
    print(f"- {file}")

print(f"\nTotal files analyzed: {len(df_compliant)}")

# Columns between 'File' (first) and 'All Criteria Met' (last) are the criteria.
for criterion in df_compliant.columns[1:-1]:
    count = df_compliant[criterion].sum()
    print(f"{criterion}: {count} files")

print(f"\nFiles meeting all criteria: {df_compliant['All Criteria Met'].sum()}")


100%|██████████| 18/18 [00:15<00:00,  1.16it/s]

Found 6 compliant files:
- Burkart, Brauer, Aravkin, Godwin, Hay, He, Iannucci, Larson, Lim, Liu, Murray, Zheng, Zhou, Stanaway_Lancet (lond. Engl.)_2021_highlighted.pdf
- Burke, Copeland, Sussman, Hayward, Gross, Briceno, Whitney, Giordani, Elkind, Manly, Gottesman, Gaskin, Sidney, Yaffe, Sacco, Heckbert, Hughes, Galecki, Levine_PLOS One_2024_highlighted.pdf
- Fouladi, Asadi, Sherer, Madadi__2024_highlighted.pdf
- Mihaylova, Wu, Zhou, Williams, Schlackow, Emberson, Reith, Keech, Robson, Parnell, Armitage, Gray, Simes, Baigent_Health Technol. Assess. (winch. Engl.)_2024_highlighted.pdf
- Rehm, Shield, Roerecke, Gmel_BMC Public Health_2016_highlighted.pdf
- Smit, Perez-Guzman, Mutai, Cassidy, Kibachio, Kilonzo, Hallett_Clin. Infect. Dis.  Off. Publ. Infect. Dis. Soc. Am._2020_highlighted.pdf

Total files analyzed: 16
Population: 11 files
Intervention: 13 files
Outcome: 13 files
Study approach: 11 files

Files meeting all criteria: 6





In [19]:
# Display the per-file criteria table: one row per analysed PDF, one boolean
# column per criterion, plus 'All Criteria Met'.
df_compliant

Unnamed: 0,File,Population,Intervention,Outcome,Study approach,All Criteria Met
0,"Alqahtani, Al-Omar, Alshehri, Abanumay, Alabdu...",True,True,True,False,False
1,"Burkart, Brauer, Aravkin, Godwin, Hay, He, Ian...",True,True,True,True,True
2,"Burke, Copeland, Sussman, Hayward, Gross, Bric...",True,True,True,True,True
3,"Emmert-Fees, Felea, Staudigel, Ananthapavan, L...",False,True,False,True,False
4,"Fouladi, Asadi, Sherer, Madadi__2024_highlight...",True,True,True,True,True
5,"Gibbs, Griffin, Gutacker, Villaseñor, Walker__...",True,False,True,True,False
6,"Henkel, Burger, Sletner, Pedersen__2024_highli...",False,True,True,False,False
7,"Kim, Wang, Lauren, Liu, Marklund, Lee, Micha, ...",False,True,True,False,False
8,Kulhanova et al. - 2018 - The fraction of lung...,True,True,True,False,False
9,"Mihaylova, Wu, Zhou, Williams, Schlackow, Embe...",True,True,True,True,True


In [20]:
# Save the criteria table as an Excel workbook, without the index column.
compliant_xlsx_path = "../data/excels/compliant_files.xlsx"
df_compliant.to_excel(compliant_xlsx_path, index=False)

In [8]:
# Load the exported reference-library items (one row per item) and preview the
# first row to check which columns are available.
exported_items_csv = "../data/excels/Exported Items.csv"
df = pd.read_csv(exported_items_csv)
df.head(1)

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,7Z3S4GYD,journalArticle,2021.0,"Jayasekera, Jinani; Sparano, Joseph A.; O'Neil...",Development and Validation of a Simulation Mod...,Journal of Clinical Oncology,,0732-183X,10.1200/JCO.21.00651,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,...,,,,,,,,,,


In [13]:
# Select the items whose 'Manual Tags' contain 'obj2_include' AND at least one
# of 'test_hand_search' / 'test_medline_search', keeping only the columns
# needed further down.
manual_tags = df['Manual Tags']

# na=False: an item without any tags matches neither condition. This makes the
# masks plain booleans instead of relying on how `&` coerces NaN.
# regex=False: 'obj2_include' is a literal tag, not a pattern.
has_include_tag = manual_tags.str.contains('obj2_include', regex=False, na=False)
in_test_search = manual_tags.str.contains('test_hand_search|test_medline_search', regex=True, na=False)

df_human_included = df.loc[has_include_tag & in_test_search, ['Manual Tags', 'File Attachments', 'Title']]
df_human_included.shape

(26, 3)

In [14]:
# Look at one raw 'File Attachments' value (the row at position 5) to see how
# attachment paths are formatted.
df_human_included['File Attachments'].iloc[5]

'C:\\Users\\pouri\\Zotero\\storage\\C9VIG9GK\\Knudsen et al. - 2016 - Estimation of Benefits, Burden, and Harms of Color.pdf; '

In [24]:
# df_human_included[df_human_included['File Attachments'].str.contains('The modelled impact of')]

In [25]:
import ntpath
import os
import shutil

# Copy every archived paper that is attached to a selected item from the
# archive folder into the papers folder.

# Paths
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
archive_path = r'C:\Users\pouri\Python\Lang_Chain\Projects\NLP_pipeline_full_text\data\papers\Archive'
destination_path = r'C:\Users\pouri\Python\Lang_Chain\Projects\NLP_pipeline_full_text\data\papers'

# Ensure destination path exists
os.makedirs(destination_path, exist_ok=True)

# Collect the bare file names of all attachments.
# A 'File Attachments' cell holds Windows-style paths terminated by "; "
# (e.g. 'C:\\...\\Knudsen et al. - 2016 - ....pdf; '). Splitting only on the
# backslash and stripping whitespace leaves the trailing ';' on the name, so it
# never equals a real file name. Split on ';' first, drop empty pieces, then
# take the base name. ntpath is used so the Windows paths are parsed the same
# way on any operating system. Missing cells are skipped rather than turned
# into the string 'nan'.
included_files = set()
for attachments in df_human_included['File Attachments'].dropna().astype(str):
    for attachment in attachments.split(';'):
        attachment = attachment.strip()
        if attachment:
            included_files.add(ntpath.basename(attachment))

# Loop through files and copy if in included list
for file in os.listdir(archive_path):
    if file.strip() in included_files:
        shutil.copy2(os.path.join(archive_path, file), os.path.join(destination_path, file))
