In [1]:
import os

# SECURITY: never commit credentials to a notebook. Any key that was previously pasted
# into this cell must be treated as leaked and revoked with the provider.
# The key is taken from the environment; the user is prompted (input hidden) only when
# the variable is missing or empty, so the secret never lands in the saved file.
if not os.environ.get("COHERE_API_KEY"):
    from getpass import getpass

    os.environ["COHERE_API_KEY"] = getpass("Cohere API key: ")

from langchain_cohere import ChatCohere
from langchain_ollama import ChatOllama

In [2]:
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate

In [3]:
from pydantic import BaseModel, Field

In [4]:
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# screening abstracts

In [None]:

# Structured verdict the screening model returns for one abstract.
# NOTE(review): a later cell declares another class with this same name; after running
# that cell, re-run this one before repeating the screening step.
# Documentation is kept in `#` comments on purpose: a class docstring would be exported
# by pydantic as the schema description and so become part of what the model sees.
class OutputSchema(BaseModel):
    # Free-form reasoning. Presumably listed first so the model reasons before it
    # commits to a decision - confirm against the provider's generation order.
    thoughts: str = Field(description="thoughts of the model")
    # Binary screening outcome encoded as an integer (1 = selected, 0 = rejected).
    decision: int = Field(description="1 if the abstract was selected, 0 otherwise")
    # Short justification; the description text is sent to the model, so the
    # misspelling "consise" was corrected to "concise".
    reason: str = Field(description="generate a concise one sentence long reason for the decision")


# Screening backend: the DeepSeek-R1 32B model through Ollama, with temperature 0.
# Alternative backend: ChatCohere(model="command-r", temperature=0.0)
llm_options = {"model": "deepseek-r1:32b", "temperature": 0.0}
llm = ChatOllama(**llm_options)

# Bind the schema so the chain yields OutputSchema-shaped results.
struct_llm = llm.with_structured_output(OutputSchema)

In [None]:
# Read the candidate articles and keep only the three fields used by the screening step.
wanted_columns = ["key", "title", "abstract"]
df = pd.read_csv("articles.csv").loc[:, wanted_columns]

# Quick sanity check: first rows, then the number of articles to screen.
display(df.head())
print(df.shape[0])

In [None]:
# Screening prompt: the system message states the inclusion question and the expected
# answer format; the human message carries the title and abstract of one article.
messages = [
    ('system',  "You are a helpful AI agent that assists in accurately screening the article using its abstract. "
                "Based solely on the abstract provided, determine whether the article discusses an APPLICATION OF AI METHODS IN CARBON ION THERAPY? "
                "Your decision should be '0' for NO or '1' for YES. Then, generate a concise, one-sentence reason for your decision."
    ),
    ('human', "abstract:\n\n title: {title}, \n content: {abstract}")
]

prompt_template = ChatPromptTemplate.from_messages(messages)
chain = prompt_template | struct_llm

# Fields filled in by the model, and the column order of the exported table
# (article fields first, then the model's answer).
model_fields = ["decision", "reason", "thoughts"]
output_columns = ["key", "title", "abstract"] + model_fields

# One plain dict per article.
rows = df.to_dict("records")

# Results are collected as a list of records and turned into a DataFrame once, so the
# name `decision_df` only ever refers to the final table.
screening_records = []
for row in tqdm(rows):
    inputs = {field: row[field] for field in ("title", "abstract")}

    output = chain.invoke(inputs)

    if output is None:
        # A structured-output chain may return None when the model produces nothing
        # that can be parsed into the schema (this depends on the structured-output
        # method in use). Keep the article with empty model fields rather than
        # aborting the whole run on `dict(None)`.
        parsed = dict.fromkeys(model_fields)
    else:
        parsed = dict(output)

    screening_records.append({**row, **parsed})

# `columns=` fixes the column order and keeps the header even when there are no rows.
decision_df = pd.DataFrame(screening_records, columns=output_columns)
decision_df.to_csv("ai_screening_outputs_final.csv", index=False)


In [None]:
# generating summaries

In [61]:

# Structured summary of one article, one string field per review heading.
# Field order is kept as declared; documentation lives in `#` comments because a class
# docstring would be exported by pydantic as part of the schema.
class OutputSchema(BaseModel):
    thoughts: str = Field(
        description="thoughts of the model",
    )
    study_objective: str = Field(
        description="The objective of the article",
    )
    ai_methods: str = Field(
        description="The AI method(s) used in the article in the context of carbon ion therapy",
    )
    application_domain: str = Field(
        description="carbon ion therapy application domain",
    )
    dataset_characteristics: str = Field(
        description="characteristics of the dataset, including training, validation, test, etc",
    )
    key_findings: str = Field(
        description="key findings mentioned in the article",
    )
    pros_and_cons: str = Field(
        description="strengths and limitations of the study",
    )
    future_direction: str = Field(
        description="future directions (if stated)",
    )
# Summarization backend: the DeepSeek-R1 32B model through Ollama, with temperature 0.
# Alternative backend: ChatCohere(model="command-r", temperature=0.0)
llm = ChatOllama(
    model="deepseek-r1:32b",
    temperature=0.0,
)

# Bind the summary schema so the chain yields OutputSchema-shaped results.
struct_llm = llm.with_structured_output(OutputSchema)



In [65]:
# PDF of the article to summarize (relative path, resolved against the working directory).
file_path = "1-s2.0-S1120179724002163-main.pdf"

In [37]:
# from langchain_community.document_loaders import PyPDFLoader


# loader = PyPDFLoader(file_path)
# pages = []
# async for page in loader.alazy_load():
#     pages.append(page)

# manuscript = "\n".join([page.page_content for page in pages])

In [64]:
import pymupdf4llm

import re

manuscript = pymupdf4llm.to_markdown(file_path)

# A line that holds nothing but the word "References", optionally wrapped in Markdown
# heading ("## ") or emphasis ("**", "_") markup.
_REFERENCES_HEADING = re.compile(
    r"^[ \t]*(?:#{1,6}[ \t]*)?[*_]{0,3}(?P<word>references)[*_]{0,3}[ \t]*$",
    re.IGNORECASE | re.MULTILINE,
)


def _find_references_start(text):
    """Return the index of the word "references" that opens the reference list.

    The last heading-like line is preferred. If no such line exists, the last
    case-insensitive mention of the word is used. Returns -1 when the word does
    not occur at all.

    Searching from the end matters: cutting at the *first* mention would discard
    the body of any article that uses the word "references" in its running text.
    Matching is done on the original text (not a lower-cased copy), so the index
    is always valid for slicing `text`.
    """
    headings = list(_REFERENCES_HEADING.finditer(text))
    if headings:
        return headings[-1].start("word")
    mentions = list(re.finditer("references", text, re.IGNORECASE))
    return mentions[-1].start() if mentions else -1


ref_index = _find_references_start(manuscript)

# Keep everything up to and including "References"
if ref_index != -1:
    manuscript = manuscript[:ref_index + len("references")]

# `manuscript` now ends at the References heading (or is unchanged if none was found)
print(manuscript)

[Physica Medica 124 (2024) 103421](https://doi.org/10.1016/j.ejmp.2024.103421)

Contents lists available at ScienceDirect
# Physica Medica

[journal homepage: www.elsevier.com/locate/ejmp](https://www.elsevier.com/locate/ejmp)
#### Original paper
## A dosiomics approach to treatment outcome modeling in carbon ion radiotherapy for skull base chordomas
### Giovanni Parrella [a] [,] [*], Simone Annunziata [a], Letizia Morelli [a], Silvia Molinelli [b], Giuseppe Magro [b], Mario Ciocca [b], Giulia Riva [c], Lucia Pia Ciccone [c], Alberto Iannalfi [c], Chiara Paganelli [a], Ester Orlandi [d] [,] [e], Guido Baroni [a ]

a *Politecnico di Milano, Department of Electronics, Information and Bioengineering, Milano, Italy*
b *Centro Nazionale di Adroterapia Oncologica, Medical Physics Unit, Pavia, Italy*
c *Centro Nazionale di Adroterapia Oncologica, Radiotherapy Unit, Pavia, Italy*
d *Centro Nazionale di Adroterapia Oncologica, Radiation Oncology Clinical Unit, Pavia, Italy*
e *University of Pav

In [66]:
# Headings the summary must follow, in the order they are shown to the model.
summary_headers = [
    "1. Study objective",
    "2. AI methods used",
    "3. carbon therapy application domain",
    "4. Dataset characteristics",
    "5. Key findings",
    "6. Strengths and limitations",
    "7. Future directions (if stated)",
]
quoted_headers = ", ".join(f"'{header}'" for header in summary_headers)

# System message: role, purpose of the summary, and the required headings.
system_prompt = (
    "You are a helpful AI agent that assists in accurately summarizing the article in a standardized format "
    "to be used for the literature review titled 'APPLICATIONS OF AI IN CARBON ION THERAPY'. \n"
    "Summarize the article under the following standard headers:\n"
    f"{quoted_headers} "
)
# Human message: the article text is injected through the {article} placeholder.
human_prompt = "article: \n {article}"

messages = [("system", system_prompt), ("human", human_prompt)]

prompt_template = ChatPromptTemplate.from_messages(messages)
chain = prompt_template | struct_llm

In [67]:
# Run the summarization chain on the trimmed manuscript text.
summary_inputs = {"article": manuscript}
output = chain.invoke(summary_inputs)

In [68]:
# Discussion of Python libraries for extracting text from PDFs:
# https://www.reddit.com/r/LangChain/comments/1e7cntq/whats_the_best_python_library_for_extracting_text/

# Show the structured summary as a plain dict (field name -> value).
summary_fields = dict(output)
summary_fields

{'thoughts': "Okay, so I'm trying to understand this article about using dosiomics to predict treatment outcomes for skull base chordomas treated with CIRT. Let me break it down step by step. First, the article mentions something called LETd, which stands for Linear Energy Transfer dose maps. They're comparing these to physical and RBE-weighted dose maps. I think RBE is Relative Biological Effectiveness, right? So they're looking at different ways of measuring radiation doses in the context of CIRT, which is Carbon Ion Radiation Therapy. That makes sense because proton therapy uses protons, but carbon ions are heavier particles, so their effects might be different and more complex to model. The article talks about predicting local recurrence (LR) versus local control (LC). So they're trying to see if certain dose metrics can predict whether the tumor will come back or not after treatment. They used something called a time-dependent ROC analysis for each feature. I remember ROC curves a