In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm

from langchain_ollama import ChatOllama

from langchain_core.prompts import ChatPromptTemplate
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

from pydantic import BaseModel, Field, constr
from typing import Literal, List, Optional

import ast
import re

In [2]:
# Locations shared by the cells below, relative to the notebook's working directory:
# DB_PATH is where the input tables and PDFs are read from, OUT_DIR is where CSV results go.
DB_PATH = r"../database"
OUT_DIR = r"results"
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUT_DIR, exist_ok=True)

### AI Screening

In [None]:
# Abstracts queued for screening: only the article key and the two text
# fields that are later inserted into the prompt are kept.
screening_csv = os.path.join(DB_PATH, "articles_to_screen.csv")
df = pd.read_csv(screening_csv)
df = df[["key", "title", "abstract"]]

# Quick visual sanity check of the loaded table and its size.
display(df.head())
print("# of articles to screen = ", len(df))

### Abstract Screening by LLM Reviewer

In [None]:
# Structured reply requested from the LLM for every screened abstract.
# NOTE: documented with `#` comments on purpose. A class docstring on a pydantic
# model is copied into the generated JSON schema, so adding one would change
# what is sent to the model.
class OutputSchema(BaseModel):
    # Free-text reasoning; declared first so it precedes the decision in the schema.
    thoughts: str = Field(
        description="thoughts of the model",
    )
    # Binary screening verdict (1 = selected, 0 = rejected); the type itself is an unbounded int.
    decision: int = Field(
        description="1 if the article is SELECTED, 0 otherwise",
    )
    # One-sentence justification of the verdict.
    reason: str = Field(
        description="generate a consise one sentence long reason for the decision",
    )
    # AI methods named in the abstract; optional, defaults to None.
    ai_method_list: Optional[List[str]] = Field(
        default=None,
        description="list all the AI methods explored in the article",
    )


# Alternative backend kept for reference (ChatCohere is not imported in the visible import cell):
# llm = ChatCohere(model="command-r", temperature=0.0)
llm = ChatOllama(
    model="deepseek-r1:32b",
    temperature=0.0,
    num_ctx=5_000,  # context size requested from Ollama for the abstract-screening prompts
)
# Wrap the chat model so that its replies are returned in the structure described by OutputSchema.
struct_llm = llm.with_structured_output(OutputSchema)

In [None]:

# System prompt, one sentence per literal, joined with a single space.
# Adjacent string literals are concatenated with NO separator, so writing the
# sentences back-to-back glued them together ("...ABSTRACT.Your decision...").
screening_system_message = " ".join([
    "You are a helpful AI reviewer that ACCURATELY SCREENS and SELECTS 'ORIGINAL RESEARCH ARTICLES' that falls within the scope of the given 'TOPIC', based on their ABSTRACT.",
    "Your decision should be '1' if SELECTED or '0' otherwise.",
    "Generate a concise, one-sentence reason to motivate your decision.",
    "If selected, list all the AI methods explored in the ORIGINAL RESEARCH ARTICLE.",
    "**Note: ORIGINAL RESEARCH ARTICLES do NOT include REVIEW ARTICLES. Be precise and objective in your evaluation.**",
])

messages = [
    ('system', screening_system_message),
    ('human', "ABSTRACT:\n\n title: {title}, \n content: {abstract}\n\n TOPIC: APPLICATIONS OF AI IN CARBON ION THERAPY")
  ]
prompt_template = ChatPromptTemplate.from_messages(messages)
chain = prompt_template | struct_llm

# Column order of the exported CSV (input columns first, then the model's fields).
decision_columns = ["key", "title", "abstract", "decision", "ai_method_list", "reason", "thoughts"]

# One plain dict per article: {"key": ..., "title": ..., "abstract": ...}.
rows = df.to_dict("records")

# Collect one record per article and build the DataFrame once at the end,
# instead of reusing a single name for both a dict of lists and a DataFrame.
decision_records = []
for row in tqdm(rows):
    # Only the fields referenced by the prompt template are passed to the chain.
    inputs = {field: row[field] for field in ["title", "abstract"]}

    output = chain.invoke(inputs)

    # Merge the article's own columns with the fields returned by the model.
    decision_records.append({**row, **dict(output)})

decision_df = pd.DataFrame(decision_records, columns=decision_columns)
decision_df.to_csv(os.path.join(OUT_DIR, "ai_decision.csv"), index=False)

### Complete Critical Review by LLM Reviewer

In [3]:

# Structured reply requested from the LLM for the full-text critical review.
# It redefines (and from here on shadows) the OutputSchema used for abstract screening.
# NOTE: documented with `#` comments on purpose. A class docstring on a pydantic
# model is copied into the generated JSON schema, so adding one would change
# what is sent to the model.
class OutputSchema(BaseModel):
    # Free-text reasoning; declared first so it precedes the answers in the schema.
    thoughts: str = Field(
        description="thoughts of the model",
    )
    # q1
    aim: str = Field(
        description="answer to q1: aim of the research article",
    )
    # q2 - must be exactly one of the four predefined category names.
    category: Literal[
        "Treatment planning, optimization and verification",
        "Synthetic imaging",
        "Tumor control probability (TCP) prediction",
        "Normal tissue complication probability (NTCP) prediction",
    ] = Field(
        description="answer to q2: article category selected from the predefined LIST",
    )
    # q3
    dataset: str = Field(
        description="answer to q3: dataset description and the strategy associated with training, validation and test involved in AI modelling. Also provide the sample counts associated with the train, val, and test sets.",
    )
    # q4 (prose answer followed by the same information as a list)
    ai_methodology: str = Field(
        description="answer to q4: the AI methodology used by the authors for their analysis",
    )
    ai_method_list: List[str] = Field(
        description="followup answer to q4: list of ai methodologies used by the authors for their analysis",
    )
    # q5
    pros_and_cons: str = Field(
        description="answer to q5: the strengths and weaknesses of the methodology followed",
    )
    # q6
    results: str = Field(
        description="answer to q6: summary of the results in terms of the performance metrics and the appropriateness of the metrics used to evaluate the AI model.",
    )
    # q7
    arguments: str = Field(
        description="answer to q7: the strong and weak arguments that the authors are point out in the discussion",
    )
    # q8
    conclusion: str = Field(
        description="answer to q8. their conclusion, and main arguments to support it",
    )
    # Summaries built on top of q1-q8; the 300-character limit is only stated
    # in the description, it is not enforced by the field type.
    critical_summary: str = Field(
        description="a critical summary combining the answers for q1-q8",
    )
    short_summary: str = Field(
        description="A concise critical summary limited to **300 characters**",
    )
    

# Alternative backend kept for reference (ChatCohere is not imported in the visible import cell):
# llm = ChatCohere(model='command-r', temperature=0.0)
llm = ChatOllama(
    model="deepseek-r1:32b",
    temperature=0.0,
    num_ctx=15_000,  # larger context than the screening step: the prompt carries a whole manuscript
)
# Wrap the chat model so that its replies are returned in the structure described by OutputSchema.
struct_llm = llm.with_structured_output(OutputSchema)

In [None]:
# from transformers import AutoTokenizer

# import json
# schema_str = json.dumps(OutputSchema.model_json_schema())
# tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
# len(tokenizer.encode(schema_str))

In [4]:
# System prompt for the full-text review. It contains no curly braces, so the
# prompt template sees no variables in it; the article and topic are injected
# through `human_message` below.
# The string is opened with exactly three quotes: a fourth quote would become
# the first character of the prompt.
system_message = '''
You are an expert AI reviewer tasked to perform an accurate literature review of the provided ARTICLE for the specified REVIEW_TOPIC.
Carefully read the entire ARTICLE (excluding the abstract) and answer each of the QUESTIONS precisely and strictly based on the information presented in the ARTICLE.
Then, generate a CRITICAL_SUMMARY that integrates all of your answers, followed by a SHORT_SUMMARY that provides a concise 300-character-long compressed version of the CRITICAL_SUMMARY.

QUESTIONS:
q1. What is the aim of this study?
q2. Select the single most appropriate category for the article from the LIST below.
    LIST:
    - Treatment planning, optimization and verification
        Focus: processes before treatment delivery, such as dose prediction, treatment planning, plan optimization, dose calculation, and plan verification.
    - Synthetic imaging
        Focus: generating or transforming images for treatment planning or dose delivery.
    - Tumor control probability (TCP) prediction
        Focus: modeling tumor response to treatment.
    - Normal tissue complication probability (NTCP) prediction
        Focus: adverse events, toxicities, complications, or quality-of-life after treatment.
    **Selection Rules:**
    i. If the study fits multiple categories, select the one explicitly stated as the primary endpoint.
    ii. If no primary endpoint is stated, choose the category most emphasized in the title and/or aim.
    iii. The answer must be exactly one element from the LIST above.
    iv. Do not include reasoning or deliberation in the answer. Only output the category name.

q3. Describe the dataset and provide the training, validation, and test strategy involved in AI modelling, including the corresponding sample counts.
q4. Define the AI methodology used by the authors in their analysis and list all AI methods that they explored.
q5. What are the strengths and weaknesses of the methodology followed?
q6. Can you summarize the results in terms of the performance metrics and assess whether the authors chose appropriate metrics to evaluate the AI model?
q7. What are the strong and weak arguments that the authors are pointing out in the discussion?
q8. What is their conclusion, and what are their main arguments to support it?

Answer each question accurately using the information solely available in the ARTICLE. 
If specific information required to answer any question is not present in the ARTICLE, state this clearly instead of speculating.
The ANSWERS should be detailed, evidence-based and written in an objective and academic tone appropriate for a scientific literature review. 

The CRITICAL_SUMMARY should integrate the answers to q1-q8 into a coherent, structured review.
The SHORT_SUMMARY should be a compressed version of CRITICAL_SUMMARY, limited to 300 characters (including spaces).
'''

# Human turn: `{article}` and `{topic}` are the two template variables filled at invoke time.
human_message = "ARTICLE:\n{article}\n\nREVIEW_TOPIC:{topic}"

In [5]:
# Two-turn prompt: fixed reviewer instructions, then the article and topic.
messages = [("system", system_message), ("human", human_message)]
prompt_template = ChatPromptTemplate.from_messages(messages)

# Fill the template, then send it to the structured-output model.
chain = prompt_template | struct_llm

In [6]:
# Compiled once, outside the loop. Matches a line that starts - after optional
# markdown '#', '*' or whitespace - with the word "references" (case-insensitive).
references_heading = re.compile(r'^[#\*\s]*references\b', re.IGNORECASE | re.MULTILINE)

outputs = []

# Table of articles to review; read from DB_PATH like every other input of the notebook.
# Each row must provide a 'path' column with the PDF location relative to DB_PATH.
articles_df = pd.read_excel(os.path.join(DB_PATH, "articles_to_review.xlsx"))

for row_dict in tqdm(articles_df.to_dict("records")):

    file_path = os.path.join(DB_PATH, row_dict['path'])

    # mode='single': all pages of the PDF are loaded as one document.
    loader = PyMuPDF4LLMLoader(file_path, mode='single')
    docs = loader.load()
    manuscript = docs[0].page_content

    # Drop the bibliography: keep the text up to and including the first
    # "References" heading and discard everything after it.
    match = references_heading.search(manuscript)
    if match:
        manuscript = manuscript[:match.end()]

    output = dict(chain.invoke({"article":manuscript, "topic":"APPLICATIONS OF AI IN CARBON ION THERAPY"}))
    output["file_path"] = file_path

    # Spreadsheet columns first; model fields override on a name clash.
    output = {**row_dict, **output}

    # Progress trace: assigned category, source file and extracted aim.
    print(output["category"], file_path, output['aim'])

    outputs.append(output)


pd.DataFrame(outputs).to_csv(os.path.join(OUT_DIR, "ai_review_final.csv"), index=False)

  0%|          | 0/19 [00:00<?, ?it/s]

Synthetic imaging ../database/selected_articles/Zhang_DR_only_CIRT_TPS_DL_PhyMed_2022.pdf The aim is to evaluate if it's possible to use just DR images with deep learning for treatment planning in carbon ion radiotherapy, using a phantom and head-and-neck patients.
Synthetic imaging ../database/selected_articles/Parrella_SyntCT_CIRT_Abdomen_Bioeng_2023.pdf The aim is to evaluate the feasibility of generating synthetic CT (sCT) volumes for carbon ion radiotherapy (CIRT) in abdominal sites using a conditional GAN (cGAN). The study focuses on creating sCT from MRI scans to avoid additional radiation exposure and improve treatment planning accuracy.
Synthetic imaging ../database/selected_articles/Knausl_synthetic_CT_in_adapative_CIRT.pdf The aim is to explore using synthetic CT (sCT) in carbon-ion therapy for better workflow efficiency without extra imaging dose, despite challenges like patient positioning and limited data.
Synthetic imaging ../database/selected_articles/Pepa_syntheticCT f