In [1]:
import sys
import os
sys.path.append("..")

from  pathlib            import  Path
from  utils.pubmed_utils import Neural_Retriever_PubMed

PydanticUserError: If you use `@root_validator` with pre=False (the default) you MUST specify `skip_on_failure=True`. Note that `@root_validator` is deprecated and should be replaced with `@model_validator`.

For further information visit https://errors.pydantic.dev/2.7/u/root-validator-pre-skip

# Using Clinfo.AI 

In this tutorial, we will go through each step of the Clinfo.AI workflow. Before we start, we need to set up a few things. 


### 1.- Setting up enviorment:
1.a.- Install the conda enviroment using the yml file provided.

``` conda env create -f environment.yaml ```

1.b.- Select your enviorment to run notebook. I recommend using vscode: 



### 2.- Creating Accounts

You will need at least one account and at most two (depending on how many calls/hour you plan to do):
* OPENAI account: If you start a free account for the first time, you will get $5 in API credits.
* NCBI_API_KEY: This is only necessary if you plan to make more than 10 calls per hour.


Once you have created both accounts  go to **src\config.py** file and: 

* Set OPENAI_API_KEY to your openAI API key

If you created an NCBI API account add your key and email in the following values: 
* NCBI_API_KEY 
* EMAIL 
Otherwise leave them as None





### 3.- Defining your own promts:
We have designed prompts for each step of Clinfo.ai Workflow, leaveriging the power of in-contex-learning. If you want to us your own promps you can edit them **src\prompts**


In [2]:
# Make Sure you followed at least step 1-2 before running this cell.
from  config import OPENAI_API_KEY, NCBI_API_KEY, EMAIL
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [6]:

## 4.- Init Neural Retriver from path. Do not change path if you want to use base  prombts, otherwise specify your own architecture
file_path   = os.path.join("..","prompts","PubMed","Architecture_1","master.json")
nrpm        = Neural_Retriever_PubMed(architecture_path=file_path,verbose=True,debug=False,open_ai_key=OPENAI_API_KEY,email=EMAIL)

NameError: name 'Neural_Retriever_PubMed' is not defined

# Let's start!

In [14]:
### Step 0 : Ask a question ###
question    = "What is the prevalence of COVID-19 in the United States?"

## Step 1: Search PubMed: ###
# To achive this we will:
# 1.a Convert the question into a query using LLMs
# 1.b Use this Pubmed Query to look for pubmed abstracts about the topic. 
# This step will  returns a tupple:  ( [list of queries used to retrive articles] , [list of article ids  (PMIDs) that were retrieved])
pubmed_queries, article_ids = nrpm.search_pubmed(question,
                                                 num_results=10,        # Limit the max number of results you can retrive per query
                                                 num_query_attempts=1) # Number of attemps to generate query (use more than 1 for better results)


print(f"Articles retrived: {len(article_ids)}")
print(pubmed_queries)
print(article_ids)


********************************************************
Generated pubmed query: COVID-19[Title/Abstract] AND prevalence[Title/Abstract] AND United States[Title/Abstract]

Retrieved 10 IDs
['35429399', '35404682', '33518464', '36821800', '36560794', '33686893', '37542149', '36279944', '37053525', '37509543']
Search IDs: {'36560794', '35404682', '36821800', '37053525', '33686893', '36279944', '35429399', '33518464', '37542149', '37509543'}
Articles retrived: 10
['COVID-19[Title/Abstract] AND prevalence[Title/Abstract] AND United States[Title/Abstract]']
['36560794', '35404682', '36821800', '37053525', '33686893', '36279944', '35429399', '33518464', '37542149', '37509543']


In [15]:
## Step 2: Fetch article data
# Preiously, we only extracted he PMIDs. No we will use those  PMIDs to retrive the metadata:
articles = nrpm.fetch_article_data(article_ids)
print(articles)

# Print example fo first Article: 
article_num = 1
print(f"Article :{article_num}")
print(articles[article_num].keys())
print(articles[article_num]['PubmedData'])
print(articles[article_num]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"])
print(articles[article_num]["MedlineCitation"]["Article"])


[{'MedlineCitation': DictElement({'GeneralNote': [], 'OtherAbstract': [], 'CitationSubset': ['IM'], 'SpaceFlightMission': [], 'KeywordList': [ListElement([StringElement('COVID-19', attributes={'MajorTopicYN': 'N'}), StringElement('NIS', attributes={'MajorTopicYN': 'N'}), StringElement('United States', attributes={'MajorTopicYN': 'N'}), StringElement('complications', attributes={'MajorTopicYN': 'N'}), StringElement('mortality', attributes={'MajorTopicYN': 'N'}), StringElement('myocarditis', attributes={'MajorTopicYN': 'N'}), StringElement('prevalence', attributes={'MajorTopicYN': 'N'})], attributes={'Owner': 'NOTNLM'})], 'OtherID': [], 'PMID': StringElement('36560794', attributes={'Version': '1'}), 'DateCompleted': {'Year': '2022', 'Month': '12', 'Day': '26'}, 'DateRevised': {'Year': '2023', 'Month': '09', 'Day': '18'}, 'Article': DictElement({'Language': ['eng'], 'ArticleDate': [DictElement({'Year': '2022', 'Month': '12', 'Day': '14'}, attributes={'DateType': 'Electronic'})], 'ELocatio

In [17]:
# STEP 3 Summarize each article
# This step is parallelized, though it might look like one single call, it performs one call per article to summarize.
# Then the relevancy of the article (based on the original question) is provided by another LLM call.

article_summaries,irrelevant_articles =  nrpm.summarize_each_article(articles, question)

Chen C, Haupert SR, Zimmermann L, Shi X, Fritsche LG, Mukherjee B. Global Prevalence of Post-Coronavirus Disease 2019 (COVID-19) Condition or Long COVID: A Meta-Analysis and Systematic Review.. The Journal of infectious diseases. 2021;226(9):1593-1607.
~~~~~~~~~~
BACKGROUND:
This study aims to examine the worldwide prevalence of post-coronavirus disease 2019 (COVID-19) condition, through a systematic review and meta-analysis.

METHODS:
PubMed, Embase, and iSearch were searched on July 5, 2021 with verification extending to March 13, 2022. Using a random-effects framework with DerSimonian-Laird estimator, we meta-analyzed post-COVID-19 condition prevalence at 28+ days from infection.

RESULTS:
Fifty studies were included, and 41 were meta-analyzed. Global estimated pooled prevalence of post-COVID-19 condition was 0.43 (95% confidence interval [CI], .39-.46). Hospitalized and nonhospitalized patients had estimates of 0.54 (95% CI, .44-.63) and 0.34 (95% CI, .25-.46), respectively. Region

In [22]:
# Summaries for relevant articles
article_summaries

[{'title': 'Health Care Policies and COVID-19 Prevalence: Is There Any Association?',
  'url': 'https://pubmed.ncbi.nlm.nih.gov/33686893/',
  'abstract': 'The coronavirus disease 2019 (COVID-19) pandemic has affected almost all countries and territories. As of December 6, 2020, the United States of America and India have the highest prevalence. Each country has implemented different strategies to control and reduce the spread of disease. Here, the association between prevalence number and health policies is evaluated by comparing 2 groups of countries: (1) Italy, the United States of America, Germany, Spain, and India with a higher prevalence than a linear trend line; and (2) Singapore and China with a lower or equal prevalence than linear forecasts. A rapid overview revealed that many countries have similar strategies for controlling COVID-19, including the suspension of air travel, the lockdown on the cities with the most cases detected, active case findings, monitoring of close cont

In [25]:
# Articles deemed irelevant
irrelevant_articles 

[{'title': 'COVID-19 Associated Myocarditis Clinical Outcomes among Hospitalized Patients in the United States: A Propensity Matched Analysis of National Inpatient Sample.',
  'url': 'https://pubmed.ncbi.nlm.nih.gov/36560794/',
  'abstract': 'Coronavirus-19 (COVID-19), preliminarily a respiratory virus, can affect multiple organs, including the heart. Myocarditis is a well-known complication among COVID-19 infections, with limited large-scale studies evaluating outcomes associated with COVID-19-related Myocarditis. We used the National Inpatient Sample (NIS) database to compare COVID-19 patients with and without Myocarditis. A total of 1,659,040 patients were included in the study: COVID-19 with Myocarditis (<i>n</i> = 6,455, 0.4%) and COVID-19 without Myocarditis (<i>n</i> = 1,652,585, 99.6%). The primary outcome was in-hospital mortality. Secondary outcomes included mechanical ventilation, vasopressor use, sudden cardiac arrest, cardiogenic shock, acute kidney injury requiring hemodi

In [26]:
# STEP 4 do a synthesis of all summaries to answer question: 
synthesis =   nrpm.synthesize_all_articles(article_summaries, question)
print("synthesis")

=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
Literature Summary: The United States has one of the highest prevalences of COVID-19 globally, attributed to differences in the implementation of health policies and strategies (Nejadghaderi et al., 2021). A study analyzing occupational risks for COVID-19 infection in the United States found that certain industries, such as healthcare and social assistance, had elevated risks of COVID-19 compared to other industries and occupations (Koh, 2020). The prevalence of COVID-19 also increased with each additional worker in a household. A systematic review and meta-analysis estimated the regional prevalence of post-COVID-19 condition in the United States to be 0.31 (Chen et al., 2021). However, the sample sizes and risk of bias in these studies are not clearly stated, limiting the strength of the evidence.

TL;DR: The prevalence of COVID-19 in the United States is high compared to other countries, with certain occupations facing elevated risks, and a si

# Great! We answered our first question using Clinfo.AI!
## Here are all the steps condensed:

In [32]:
file_path   = os.path.join("..","prompts","PubMed","Architecture_1","master.json")
nrpm        = Neural_Retriever_PubMed(architecture_path=file_path,verbose=False,debug=False,open_ai_key=OPENAI_API_KEY,email=EMAIL)


### Step 0 : Ask a question ###
question    = "What is the prevalence of COVID-19 in the United States?"

## Step 1: Search PubMed ###
# Convert the question into a query using gpt 
# This returns a list of queries (used to retrive articles) and a list of article ids that were retrieved
pubmed_queries, article_ids = nrpm.search_pubmed(question,num_results=10,num_query_attempts=1)

## Step 1.a: Fetch article data
#  Convert  list of Ids into a list of dictionaries (populated by pumbed API)
articles = nrpm.fetch_article_data(article_ids)

###  STEP 2 Summarize each article (only if they are relevant [Step 3]) ###
article_summaries,irrelevant_articles =  nrpm.summarize_each_article(articles, question)


### STEP 4: Synthesize the results ###
synthesis =   nrpm.synthesize_all_articles(article_summaries, question)

#synthesis, article_summaries, irrelevant_articles, articles, article_ids, pubmed_queries,

print(synthesis)


Task Name: pubmed_query_prompt
------------------------------------------------------------------------

Task Name: relevance_prompt
------------------------------------------------------------------------

Task Name: summarization_prompt
------------------------------------------------------------------------

Task Name: synthesize_prompt
------------------------------------------------------------------------
********************************************************
Generated pubmed query: COVID-19 prevalence United States

Retrieved 10 IDs
['34888288', '33663642', '34281357', '37639043', '34311990', '35996224', '36508742', '35206474', '36333051', '37422043']
Search IDs: {'34888288', '34311990', '35996224', '33663642', '35206474', '37639043', '36508742', '34281357', '37422043', '36333051'}
