In [1]:
import sys
print(sys.executable)

/Users/rahul/Desktop/fact_check/fact-check/venv/bin/python


In [2]:
import os
from src.preprocess import load_marketing_claims, load_clinical_docs, process_clinical_doc
from src.matcher import build_tfidf_matrix, match_claim_to_docs, build_clinical_texts_dict
from src.utils import setup_logger, save_json


In [3]:
logger = setup_logger("MainNotebook", level=10) 

# Define paths (adjust these based on your repository structure)
marketing_claims_path = os.path.join("data", "Flublok_Claims.json")
clinical_docs_directory = os.path.join("data", "Clinical Files")
results_path = os.path.join("results", "basic_results.json")

logger.info("Paths defined successfully.")

2025-04-11 13:37:28,657 - MainNotebook - INFO - Paths defined successfully.


In [4]:
claims = load_marketing_claims(marketing_claims_path)
logger.info(f"Loaded {len(claims)} marketing claims.")

# Load clinical document file paths from the clinical docs folder
clinical_docs = load_clinical_docs(clinical_docs_directory)
logger.info(f"Found {len(clinical_docs)} clinical documents.")

2025-04-11 13:37:32,095 - MainNotebook - INFO - Loaded 9 marketing claims.
2025-04-11 13:37:32,098 - MainNotebook - INFO - Found 8 clinical documents.


In [5]:
# Process each clinical document into a tuple (document_name, cleaned_text)
clinical_texts = build_clinical_texts_dict(clinical_docs, process_clinical_doc)
logger.info("Processed all clinical documents.")

# Optional: Display a preview of one processed document
if clinical_texts:
    doc_name, text = clinical_texts[0]
    logger.info(f"Preview of {doc_name}: {text[:200]}...")

2025-04-11 13:37:38,220 - MainNotebook - INFO - Processed all clinical documents.
2025-04-11 13:37:38,225 - MainNotebook - INFO - Preview of Arunachalam et al. (2021).pdf: REVIEW ARTICLE OPEN Unique features of a recombinant haemagglutinin inﬂuenza vaccine that inﬂuence vaccine performance Arun B. Arunachalam 1✉, Penny Post2 and Deborah Rudin3 The inﬂuenza vaccine ﬁeld ...


In [6]:
# Extract the text content only, needed to build the TF-IDF matrix.
clinical_text_only = [text for _, text in clinical_texts]

# Build the TF-IDF matrix and obtain the vectorizer object.
tfidf_matrix, vectorizer = build_tfidf_matrix(clinical_text_only)
logger.info("TF-IDF matrix built for clinical documents.")


2025-04-11 13:38:26,963 - MainNotebook - INFO - TF-IDF matrix built for clinical documents.


In [7]:
# Number of top matches you want per claim
top_k = 3

# Container for the final output results
results = {"claims": []}

for claim_obj in claims:
    claim_text = claim_obj['claim']
    # Get top matching clinical documents for this claim
    matches = match_claim_to_docs(claim_text, clinical_texts, tfidf_matrix, vectorizer, top_k=top_k)
    
    # Append the result for this claim in the required format
    results["claims"].append({
        "claim": claim_text,
        "match_source": matches  # matches is a list of dictionaries as produced in matcher.py
    })
    
    logger.info(f"Processed claim: {claim_text[:50]}... with {len(matches)} matches.")

logger.info("Matching completed for all claims.")


2025-04-11 13:38:43,084 - MainNotebook - INFO - Processed claim: Flublok ensures identical antigenic match with WHO... with 3 matches.
2025-04-11 13:38:43,085 - MainNotebook - INFO - Processed claim: Flublok contains 3x the hemagglutinin (HA) antigen... with 3 matches.
2025-04-11 13:38:43,088 - MainNotebook - INFO - Processed claim: Cell- and egg-based flu vaccines have the potentia... with 3 matches.
2025-04-11 13:38:43,090 - MainNotebook - INFO - Processed claim: Recombinant technology leads to a broader immune r... with 3 matches.
2025-04-11 13:38:43,093 - MainNotebook - INFO - Processed claim: Vaccination with a higher-dose recombinant flu vac... with 3 matches.
2025-04-11 13:38:43,094 - MainNotebook - INFO - Processed claim: Flublok (quadrivalent) was evaluated in the pivota... with 3 matches.
2025-04-11 13:38:43,095 - MainNotebook - INFO - Processed claim: Flublok is produced using a novel production platf... with 3 matches.
2025-04-11 13:38:43,097 - MainNotebook - INFO - Process

In [8]:
save_json(results, results_path)
logger.info(f"Results saved to {results_path}.")

# Display a summary of one match from the results
import json
print(json.dumps(results["claims"][0], indent=4))

2025-04-11 13:38:57,774 - MainNotebook - INFO - Results saved to results/basic_results.json.
{
    "claim": "Flublok ensures identical antigenic match with WHO- and FDA-selected flu strains.",
    "match_source": [
        {
            "document_name": "FlublokPI.pdf",
            "matching_text": "HIGHLIGHTS OF PRESCRIBING INFORMATION These highlights do not include all the information needed to use Flublok\u00ae safely and effectively. See full prescribing information for Flublok. Flublok (In\ufb02uenza...",
            "score": 0.12143134130269985
        },
        {
            "document_name": "Treanor et al. (2011).pdf",
            "matching_text": "Vaccine 29 7733\u2013 7739 Contents lists available at ScienceDirect Vaccine jou rn al h om epa ge: www.elsevier.com/locate/vaccine Protective ef\ufb01cacy of a trivalent recombinant hemagglutinin protein vacci...",
            "score": 0.07467662456769332
        },
        {
            "document_name": "Arunachalam et al. (2021)