## Literature Search

In [1]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import json
import os
import pandas as pd
import logging
from utils.Evidence_Retrieval.pubmedretrieval import PubMedRetrieval
from camel.types import TaskType, ModelType,ModelPlatformType
from utils.logging import setup_logging

setup_logging()


# Hyperparameters
YOUR_CONFIG_PATH = 'YOUR_CONFIG_PATH' # e.g. 'config/config.json'
YOUR_DATASET_PATH = 'YOUR_DATASET_PATH' # e.g. 'data/2021ACR RA'
YOUR_QUESTION_DECOMPOSTITION_PATH =  'YOUR_QUESTION_DECOMPOSTITION_PATH' # e.g. 'data/2021ACR RA/Question_Decomposition'
save_base =  "your_results_save_base" #  e.g. 'data/2021ACR RA/Search strategies/pubmed/Results'

disease = 'disease_or_topic_of_your_question' # Disease name or clinical topic of your clinical question. e.g. 'Rheumatoid Arthritis (RA)'
pico_idx = "xxxxxxxxx" # PICO index of the clinical question, saved in your PICO_Information.json file

# Search config
additional_parameters = {'datetype': 'pdat', 'mindate': '1946', 'maxdate': '2025/03/30'} # pubmed parameters
filters = {"Just search for RCT":'''<search results> AND ("Randomized controlled trial"[pt] OR "Controlled clinical trial"[pt] OR Randomized[tiab] OR Placebo[tiab] OR "Drug therapy"[sh] OR Randomly[tiab] OR Trial[tiab] OR Groups[tiab])''', 'No review': "<search results> NOT review[pt]"} # search filters: Dict[str, str]
use_agent = True # whether to use Agentic method or not



In [None]:
config_path = os.path.join(YOUR_CONFIG_PATH)
with open(config_path, 'r', encoding="utf8") as file:
            config = json.load(file)
model_config = config['model']["literature_search_model"]
model_name = model_config['model_name']
base_url = model_config['BASE_URL']
api_key = model_config['API_KEY']


In [6]:
question_deconstruction_datapath = os.path.join(
    YOUR_QUESTION_DECOMPOSTITION_PATH, 'PICO_Information.json'
)
# load csv file
question_deconstruction_data = pd.read_json(
    question_deconstruction_datapath, dtype={'Index': str}
)
question_deconstruction_data = question_deconstruction_data[
    question_deconstruction_data['Index'] == pico_idx
]
original_qd_dict = question_deconstruction_data.to_dict(orient='records')



In [8]:
model_setting = {'search_term_formation': model_name,'search_strategy_formation':model_name}
clinical_question = original_qd_dict[0]['Question']
population = original_qd_dict[0]['P']
intervention = original_qd_dict[0]['I']
comparison = original_qd_dict[0]['C']
save_path = os.path.join(save_base,model_name, 'use_agent_'+str(use_agent))
pico_idx = original_qd_dict[0]['Index']
retriever = PubMedRetrieval(disease=disease,clinical_question=clinical_question,population=population,intervention=intervention,comparison=comparison,api_key=api_key,base_url=base_url,model_setting=model_setting,use_agent=use_agent,save_path=save_path,pico_idx=pico_idx,filters=filters, additional_parameters=additional_parameters)

In [10]:
retriever.run()

In [12]:
retriever.search_terms

{'population_terms': ['rheumatoid arthritis',
  'RA',
  'DMARD monotherapy',
  'disease-modifying antirheumatic drugs'],
 'intervention_terms': ['continuing treatment',
  'treatment duration',
  'treatment reassessment']}

In [13]:
save_results_path = os.path.join(save_path, f'PICO{pico_idx}.json')
with open(save_results_path, 'r', encoding="utf8") as file:
    search_results = json.load(file)

# Heuristic screening
# remove records without abstract
print('Total records: '+str(len(search_results)))
search_results = [record for record in search_results if record['Abstract'] != None]
print('Records with abstract: '+str(len(search_results)))
# deduplicate
pmid_set = {d["Paper_Index"] for d in search_results}
for r in search_results:
    if r["Paper_Index"] in pmid_set:
        pmid_set.remove(r["Paper_Index"])
    else:
        search_results.remove(r)
print('Records after deduplication: '+str(len(search_results)))



Total records: 2109
Records with abstract: 2066
Records after deduplication: 2066


In [14]:
publication_type_set = {type for record in search_results for type in record['Publication Types']}
print(publication_type_set)

{'Case Reports', 'Editorial', 'Equivalence Trial', 'Clinical Trial, Phase IV', 'Comparative Study', 'Comment', 'Introductory Journal Article', 'Letter', 'Guideline', 'Clinical Trial, Phase I', 'Clinical Trial', "Research Support, U.S. Gov't, P.H.S.", 'Historical Article', 'Network Meta-Analysis', 'Clinical Trial, Phase III', 'Journal Article', 'Research Support, N.I.H., Extramural', "Research Support, Non-U.S. Gov't", 'Research Support, N.I.H., Intramural', 'English Abstract', "Research Support, U.S. Gov't, Non-P.H.S.", 'Pragmatic Clinical Trial', 'Observational Study', 'Research Support, American Recovery and Reinvestment Act', 'Meta-Analysis', 'Evaluation Study', 'Multicenter Study', 'Clinical Trial, Phase II', 'Randomized Controlled Trial', 'Controlled Clinical Trial', 'Practice Guideline', 'Lecture', 'Validation Study', 'Clinical Trial Protocol', 'Clinical Study'}


In [15]:
# remove invalid publication types
invalid_publication_types = ['Comment', 'Editorial', 'Case Reports', 'News', 'Interview','Published Erratum','Observational Study','Autobiography','Address','Meta-Analysis','Retracted Publication'] # you can modify this list according to your needs
for record in search_results:
    if any(pt in invalid_publication_types for pt in record['Publication Types']):
        search_results.remove(record)
print('Records after removing invalid publication types: '+str(len(search_results)))

Records after removing invalid publication types: 1842


In [None]:
# save the data as json
quicker_data = {
    "disease": disease,
	"clinical_question": clinical_question,
    'pico_idx': pico_idx,
	"population": population,
	"intervention": intervention,
	"comparison": comparison,
	"search_results": search_results,
}

quicker_data_path = os.path.join(YOUR_DATASET_PATH, f'quicker_data(PICO_IDX{pico_idx})_ls.json')
with open(quicker_data_path, 'w', encoding="utf8") as file:
    json.dump(quicker_data, file, indent=4)
print(f"Quicker data saved to {quicker_data_path}")
logging.info(f"Quicker data saved to {quicker_data_path}")